Skip to content
Snippets Groups Projects
Unverified Commit afe3dcbf authored by Jingyi Zhao's avatar Jingyi Zhao Committed by GitHub
Browse files

- fix: rssReader support customized user-agent (#18076)

parent 08740a62
No related branches found
No related tags found
No related merge requests found
......@@ -6,6 +6,7 @@ pip install llama-index-readers-web
This loader allows fetching text from an RSS feed. It uses the `feedparser` module
to fetch the feed and optionally the `html2text` module to sanitize it.
It also allows customizing the `User-Agent` that `feedparser` sends, via the reader's `user_agent` option.
## Usage
......
"""Rss reader."""
from typing import List
from typing import List, Any, Union
import logging
from llama_index.core.readers.base import BasePydanticReader
from llama_index.core.schema import Document
logger = logging.getLogger(__name__)
class RssReader(BasePydanticReader):
"""RSS reader.
......@@ -15,6 +18,12 @@ class RssReader(BasePydanticReader):
is_remote: bool = True
html_to_text: bool = False
user_agent: Union[str, None] = None
def __init__(self, **kwargs: Any) -> None:
    """Build the reader from keyword arguments.

    All keyword arguments are forwarded to the base reader's initializer.
    The optional ``user_agent`` keyword is then stored on the instance;
    it is ``None`` when the caller did not supply one.
    """
    super().__init__(**kwargs)
    # feedparser's user-agent handling is described at:
    # https://pythonhosted.org/feedparser/http-useragent.html
    # NOTE(review): the base initializer presumably already populates this
    # field from kwargs, which would make the assignment redundant - confirm.
    requested_agent = kwargs.get("user_agent")
    self.user_agent = requested_agent
@classmethod
def class_name(cls) -> str:
......@@ -32,6 +41,9 @@ class RssReader(BasePydanticReader):
"""
import feedparser
if self.user_agent:
feedparser.USER_AGENT = self.user_agent
if not isinstance(urls, list):
raise ValueError("urls must be a list of strings.")
......@@ -64,3 +76,12 @@ class RssReader(BasePydanticReader):
documents.append(Document(text=data, extra_info=extra_info))
return documents
if __name__ == "__main__":
    # Manual demo: request the same feed twice, first with a default reader
    # and then with a reader configured with a custom user agent.
    feed_url = "https://rsshub.app/hackernews/newest"

    default_reader = RssReader()
    default_result = default_reader.load_data(urls=[feed_url])
    print(default_result)  # 0 blocked by cloudflare

    reader = RssReader(user_agent="MyApp/1.0 +http://example.com/")
    custom_result = reader.load_data(urls=[feed_url])
    print(custom_result)
# Declares the tests in this directory as a test target that depends on the
# readers-web package's `poetry` target.
# NOTE(review): presumably a Pants BUILD file - confirm against the repo's build tooling.
python_tests(
dependencies=["llama-index-integrations/readers/llama-index-readers-web:poetry"]
)
from llama_index.readers.web import RssReader
def test_rss_reader_non_strict_sources():
    """A default reader returns at least one document for the Hacker News RSS feed."""
    # Performs a live HTTP request; no user agent is configured here.
    feed_urls = ["https://news.ycombinator.com/rss"]
    docs = RssReader().load_data(urls=feed_urls)
    assert len(docs) > 0
def test_rss_reader_rsshub():
    """A default reader is expected to get no documents from the rsshub.app feed."""
    # NOTE(review): the empty result presumably comes from the remote side
    # rejecting the default user agent; this depends on a live service and
    # may change - confirm before relying on it.
    feed_urls = ["https://rsshub.app/hackernews/newest"]
    docs = RssReader().load_data(urls=feed_urls)
    assert len(docs) == 0
def test_rss_reader_user_agent():
    """A reader with a custom user agent returns documents from the rsshub.app feed."""
    # Performs a live HTTP request against the same feed the default-reader
    # test expects to come back empty.
    custom_reader = RssReader(user_agent="MyApp/1.0 +http://example.com/")
    docs = custom_reader.load_data(urls=["https://rsshub.app/hackernews/newest"])
    assert len(docs) > 0
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment