Skip to content
Snippets Groups Projects
Unverified Commit 85f9070a authored by Diicell's avatar Diicell Committed by GitHub
Browse files

Telegram Reader. Added ability to select the time period of posts/messages. (#12078)

* Added ability to select the time period of posts/messages

* changed from Union to Optional
parent dbf3d9a3
No related branches found
No related tags found
No related merge requests found
"""Telegram reader that reads posts/chats and comments to post from Telegram channel or chat."""
import asyncio
import re
from typing import List, Union
from typing import List, Optional
import datetime
from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document
......@@ -54,8 +56,10 @@ class TelegramReader(BaseReader):
def load_data(
self,
entity_name: str,
post_id: Union[int, None] = None,
limit: Union[int, None] = None,
post_id: Optional[int] = None,
limit: Optional[int] = None,
start_date: Optional[datetime.date] = None,
end_date: Optional[datetime.date] = None,
) -> List[Document]:
"""Load posts/chat messages/comments from Telegram channels or chats.
......@@ -69,17 +73,27 @@ class TelegramReader(BaseReader):
the comments that reply to this ID will be returned.\
Else will get posts/chat messages.
limit (int): Number of messages to be retrieved.
start_date (datetime.date): Start date of the time period.
end_date (datetime.date): End date of the time period.
"""
return self.loop.run_until_complete(
self._load_data(entity_name=entity_name, post_id=post_id, limit=limit)
self._load_data(
entity_name=entity_name,
post_id=post_id,
limit=limit,
start_date=start_date,
end_date=end_date,
)
)
async def _load_data(
self,
entity_name: str,
post_id: Union[int, None] = None,
limit: Union[int, None] = None,
post_id: Optional[int] = None,
limit: Optional[int] = None,
start_date: Optional[datetime.date] = None,
end_date: Optional[datetime.date] = None,
) -> List[Document]:
"""Load posts/chat messages/comments from Telegram channels or chats.
......@@ -89,6 +103,8 @@ class TelegramReader(BaseReader):
the comments that reply to this ID will be returned.\
Else will get posts/chat messages.
limit (int): Number of messages to be retrieved.
start_date (datetime.date): Start date of the time period.
end_date (datetime.date): End date of the time period.
"""
import telethon
......@@ -98,12 +114,28 @@ class TelegramReader(BaseReader):
results = []
async with client:
# Asynchronously iterate over messages
async for message in client.iter_messages(
entity_name, reply_to=post_id, limit=limit
):
if isinstance(message.text, str) and message.text != "":
results.append(Document(text=self._remove_links(message.text)))
if end_date and start_date:
# Asynchronously iterate over messages in between start_date and end_date
async for message in client.iter_messages(
entity_name,
reply_to=post_id,
limit=limit,
offset_date=end_date,
reverse=True,
):
if message.date < start_date:
break
if isinstance(message.text, str) and message.text != "":
results.append(Document(text=self._remove_links(message.text)))
else:
# Asynchronously iterate over messages
async for message in client.iter_messages(
entity_name,
reply_to=post_id,
limit=limit,
):
if isinstance(message.text, str) and message.text != "":
results.append(Document(text=self._remove_links(message.text)))
return results
def _remove_links(self, string) -> str:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment