diff --git a/llama-index-integrations/readers/llama-index-readers-telegram/README.md b/llama-index-integrations/readers/llama-index-readers-telegram/README.md index b84262540a265f3c444e4c75e275ac24a704c9c7..0ee82e3344510d384e289fe3ef55310c9aa5e6e7 100644 --- a/llama-index-integrations/readers/llama-index-readers-telegram/README.md +++ b/llama-index-integrations/readers/llama-index-readers-telegram/README.md @@ -31,7 +31,7 @@ If the `.session` file already existed, it will not login again, so be aware of To use this loader, you simply need to pass in a entity name. ```python -from llama_index import download_loader +from llama_index.core import download_loader TelegramReader = download_loader("TelegramReader") loader = TelegramReader( diff --git a/llama-index-integrations/readers/llama-index-readers-telegram/llama_index/readers/telegram/base.py b/llama-index-integrations/readers/llama-index-readers-telegram/llama_index/readers/telegram/base.py index b1a90a3178d5b151bb6c7ce8f8fc62dd11a5723a..ab0c7e10000b47f2e993229a04a01ca1a46d029c 100644 --- a/llama-index-integrations/readers/llama-index-readers-telegram/llama_index/readers/telegram/base.py +++ b/llama-index-integrations/readers/llama-index-readers-telegram/llama_index/readers/telegram/base.py @@ -1,5 +1,6 @@ """Telegram reader that reads posts/chats and comments to post from Telegram channel or chat.""" import asyncio +import re from typing import List, Union from llama_index.core.readers.base import BaseReader @@ -47,7 +48,8 @@ class TelegramReader(BaseReader): self.api_id = api_id self.api_hash = api_hash self.phone_number = phone_number - self.loop = asyncio.get_event_loop() + self.loop = asyncio.new_event_loop() + asyncio.set_event_loop(self.loop) def load_data( self, @@ -101,5 +103,15 @@ class TelegramReader(BaseReader): entity_name, reply_to=post_id, limit=limit ): if isinstance(message.text, str) and message.text != "": - results.append(Document(text=message.text)) + results.append(Document(text=self._remove_links(message.text))) return results + + def _remove_links(self, string) -> str: + """Removes all URLs from a given string, leaving only the base domain name.""" + + def replace_match(match): + text = match.group(1) + return text if text else "" + + url_pattern = r"https?://(?:www\.)?((?!www\.).)+?" + return re.sub(url_pattern, replace_match, string) diff --git a/llama-index-integrations/readers/llama-index-readers-telegram/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-telegram/pyproject.toml index 1a8b8ea4d99d72a7c9e6c671e7e18f408f48e634..72ab072a9a6aff96d177044ba3a95ef8781f911e 100644 --- a/llama-index-integrations/readers/llama-index-readers-telegram/pyproject.toml +++ b/llama-index-integrations/readers/llama-index-readers-telegram/pyproject.toml @@ -14,7 +14,7 @@ ignore_missing_imports = true python_version = "3.8" [tool.poetry] -authors = ["Your Name <you@example.com>"] +authors = ["Dias Kalkamanov <diicellman@gmail.com>"] description = "llama-index readers telegram integration" license = "MIT" name = "llama-index-readers-telegram"