From c6dbd6ea806c19864c49dde7ee29176700ca1218 Mon Sep 17 00:00:00 2001 From: Diicell <44242534+diicellman@users.noreply.github.com> Date: Tue, 13 Feb 2024 11:29:56 +0600 Subject: [PATCH] TelegramReader fixes not transferred from llama-hub (#10625) fixes from llama-hub --- .../llama-index-readers-telegram/README.md | 2 +- .../llama_index/readers/telegram/base.py | 16 ++++++++++++++-- .../llama-index-readers-telegram/pyproject.toml | 2 +- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/llama-index-integrations/readers/llama-index-readers-telegram/README.md b/llama-index-integrations/readers/llama-index-readers-telegram/README.md index b84262540a..0ee82e3344 100644 --- a/llama-index-integrations/readers/llama-index-readers-telegram/README.md +++ b/llama-index-integrations/readers/llama-index-readers-telegram/README.md @@ -31,7 +31,7 @@ If the `.session` file already existed, it will not login again, so be aware of To use this loader, you simply need to pass in a entity name. ```python -from llama_index import download_loader +from llama_index.core import download_loader TelegramReader = download_loader("TelegramReader") loader = TelegramReader( diff --git a/llama-index-integrations/readers/llama-index-readers-telegram/llama_index/readers/telegram/base.py b/llama-index-integrations/readers/llama-index-readers-telegram/llama_index/readers/telegram/base.py index b1a90a3178..ab0c7e1000 100644 --- a/llama-index-integrations/readers/llama-index-readers-telegram/llama_index/readers/telegram/base.py +++ b/llama-index-integrations/readers/llama-index-readers-telegram/llama_index/readers/telegram/base.py @@ -1,5 +1,6 @@ """Telegram reader that reads posts/chats and comments to post from Telegram channel or chat.""" import asyncio +import re from typing import List, Union from llama_index.core.readers.base import BaseReader @@ -47,7 +48,8 @@ class TelegramReader(BaseReader): self.api_id = api_id self.api_hash = api_hash self.phone_number = phone_number - self.loop = asyncio.get_event_loop() + self.loop = asyncio.new_event_loop() + asyncio.set_event_loop(self.loop) def load_data( self, @@ -101,5 +103,15 @@ class TelegramReader(BaseReader): entity_name, reply_to=post_id, limit=limit ): if isinstance(message.text, str) and message.text != "": - results.append(Document(text=message.text)) + results.append(Document(text=self._remove_links(message.text))) return results + + def _remove_links(self, string) -> str: + """Removes all URLs from a given string, leaving only the base domain name.""" + + def replace_match(match): + text = match.group(1) + return text if text else "" + + url_pattern = r"https?://(?:www\.)?((?!www\.).)+?" + return re.sub(url_pattern, replace_match, string) diff --git a/llama-index-integrations/readers/llama-index-readers-telegram/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-telegram/pyproject.toml index 1a8b8ea4d9..72ab072a9a 100644 --- a/llama-index-integrations/readers/llama-index-readers-telegram/pyproject.toml +++ b/llama-index-integrations/readers/llama-index-readers-telegram/pyproject.toml @@ -14,7 +14,7 @@ ignore_missing_imports = true python_version = "3.8" [tool.poetry] -authors = ["Your Name <you@example.com>"] +authors = ["Dias Kalkamanov <diicellman@gmail.com>"] description = "llama-index readers telegram integration" license = "MIT" name = "llama-index-readers-telegram" -- GitLab