Skip to content
Snippets Groups Projects
Unverified commit 85f9070a, authored by Diicell and committed by GitHub
Browse files

Telegram Reader. Added ability to select the time period of posts/messages. (#12078)

* Added ability to select the time period of posts/messages

* changed from Union to Optional
parent dbf3d9a3
No related branches found
No related tags found
No related merge requests found
"""Telegram reader that reads posts/chats and comments to post from Telegram channel or chat.""" """Telegram reader that reads posts/chats and comments to post from Telegram channel or chat."""
import asyncio import asyncio
import re import re
from typing import List, Union from typing import List, Optional
import datetime
from llama_index.core.readers.base import BaseReader from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document from llama_index.core.schema import Document
...@@ -54,8 +56,10 @@ class TelegramReader(BaseReader): ...@@ -54,8 +56,10 @@ class TelegramReader(BaseReader):
def load_data( def load_data(
self, self,
entity_name: str, entity_name: str,
post_id: Union[int, None] = None, post_id: Optional[int] = None,
limit: Union[int, None] = None, limit: Optional[int] = None,
start_date: Optional[datetime.date] = None,
end_date: Optional[datetime.date] = None,
) -> List[Document]: ) -> List[Document]:
"""Load posts/chat messages/comments from Telegram channels or chats. """Load posts/chat messages/comments from Telegram channels or chats.
...@@ -69,17 +73,27 @@ class TelegramReader(BaseReader): ...@@ -69,17 +73,27 @@ class TelegramReader(BaseReader):
the comments that reply to this ID will be returned.\ the comments that reply to this ID will be returned.\
Else will get posts/chat messages. Else will get posts/chat messages.
limit (int): Number of messages to be retrieved. limit (int): Number of messages to be retrieved.
start_date (datetime.date): Start date of the time period.
end_date (datetime.date): End date of the time period.
""" """
return self.loop.run_until_complete( return self.loop.run_until_complete(
self._load_data(entity_name=entity_name, post_id=post_id, limit=limit) self._load_data(
entity_name=entity_name,
post_id=post_id,
limit=limit,
start_date=start_date,
end_date=end_date,
)
) )
async def _load_data( async def _load_data(
self, self,
entity_name: str, entity_name: str,
post_id: Union[int, None] = None, post_id: Optional[int] = None,
limit: Union[int, None] = None, limit: Optional[int] = None,
start_date: Optional[datetime.date] = None,
end_date: Optional[datetime.date] = None,
) -> List[Document]: ) -> List[Document]:
"""Load posts/chat messages/comments from Telegram channels or chats. """Load posts/chat messages/comments from Telegram channels or chats.
...@@ -89,6 +103,8 @@ class TelegramReader(BaseReader): ...@@ -89,6 +103,8 @@ class TelegramReader(BaseReader):
the comments that reply to this ID will be returned.\ the comments that reply to this ID will be returned.\
Else will get posts/chat messages. Else will get posts/chat messages.
limit (int): Number of messages to be retrieved. limit (int): Number of messages to be retrieved.
start_date (datetime.date): Start date of the time period.
end_date (datetime.date): End date of the time period.
""" """
import telethon import telethon
...@@ -98,12 +114,28 @@ class TelegramReader(BaseReader): ...@@ -98,12 +114,28 @@ class TelegramReader(BaseReader):
results = [] results = []
async with client: async with client:
# Asynchronously iterate over messages if end_date and start_date:
async for message in client.iter_messages( # Asynchronously iterate over messages in between start_date and end_date
entity_name, reply_to=post_id, limit=limit async for message in client.iter_messages(
): entity_name,
if isinstance(message.text, str) and message.text != "": reply_to=post_id,
results.append(Document(text=self._remove_links(message.text))) limit=limit,
offset_date=end_date,
reverse=True,
):
if message.date < start_date:
break
if isinstance(message.text, str) and message.text != "":
results.append(Document(text=self._remove_links(message.text)))
else:
# Asynchronously iterate over messages
async for message in client.iter_messages(
entity_name,
reply_to=post_id,
limit=limit,
):
if isinstance(message.text, str) and message.text != "":
results.append(Document(text=self._remove_links(message.text)))
return results return results
def _remove_links(self, string) -> str: def _remove_links(self, string) -> str:
......
0% Loading. Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment