diff --git a/llama-index-integrations/readers/llama-index-readers-microsoft-outlook-emails/.gitignore b/llama-index-integrations/readers/llama-index-readers-microsoft-outlook-emails/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..990c18de229088f55c6c514fd0f2d49981d1b0e7 --- /dev/null +++ b/llama-index-integrations/readers/llama-index-readers-microsoft-outlook-emails/.gitignore @@ -0,0 +1,153 @@ +llama_index/_static +.DS_Store +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +bin/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +etc/ +include/ +lib/ +lib64/ +parts/ +sdist/ +share/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +.ruff_cache + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints +notebooks/ + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. 
+#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +pyvenv.cfg + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# Jetbrains +.idea +modules/ +*.swp + +# VsCode +.vscode + +# pipenv +Pipfile +Pipfile.lock + +# pyright +pyrightconfig.json diff --git a/llama-index-integrations/readers/llama-index-readers-microsoft-outlook-emails/BUILD b/llama-index-integrations/readers/llama-index-readers-microsoft-outlook-emails/BUILD new file mode 100644 index 0000000000000000000000000000000000000000..0896ca890d8bffd60a44fa824f8d57fecd73ee53 --- /dev/null +++ b/llama-index-integrations/readers/llama-index-readers-microsoft-outlook-emails/BUILD @@ -0,0 +1,3 @@ +poetry_requirements( + name="poetry", +) diff --git a/llama-index-integrations/readers/llama-index-readers-microsoft-outlook-emails/Makefile b/llama-index-integrations/readers/llama-index-readers-microsoft-outlook-emails/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..b9eab05aa370629a4a3de75df3ff64cd53887b68 --- /dev/null +++ b/llama-index-integrations/readers/llama-index-readers-microsoft-outlook-emails/Makefile @@ -0,0 +1,17 @@ +GIT_ROOT ?= $(shell git rev-parse --show-toplevel) + +help: ## Show all Makefile targets. + @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}' + +format: ## Run code autoformatters (black). 
+ pre-commit install + git ls-files | xargs pre-commit run black --files + +lint: ## Run linters: pre-commit (black, ruff, codespell) and mypy + pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files + +test: ## Run tests via pytest. + pytest tests + +watch-docs: ## Build and watch documentation. + sphinx-autobuild docs/ docs/_build/html --open-browser --watch $(GIT_ROOT)/llama_index/ diff --git a/llama-index-integrations/readers/llama-index-readers-microsoft-outlook-emails/README.md b/llama-index-integrations/readers/llama-index-readers-microsoft-outlook-emails/README.md new file mode 100644 index 0000000000000000000000000000000000000000..95716d9c9eb6795df6895bab32b9e1238fbd59da --- /dev/null +++ b/llama-index-integrations/readers/llama-index-readers-microsoft-outlook-emails/README.md @@ -0,0 +1,40 @@ +# Microsoft Outlook Email Reader + +```bash +pip install llama-index-readers-microsoft-outlook-emails +``` + +The loader retrieves emails from an Outlook mailbox and indexes the subject and body of the emails. + +## Prerequisites + +### App Authentication using Microsoft Entra ID (formerly Azure AD) + +1. You need to create an App Registration in Microsoft Entra ID. Refer [here](https://learn.microsoft.com/en-us/azure/healthcare-apis/register-application) +2. API Permissions for the created app: + 1. Microsoft Graph --> Application Permissions --> Mail.Read (**Grant Admin Consent**) + +More info on Microsoft Graph APIs - [Refer here](https://learn.microsoft.com/en-us/graph/permissions-reference) + +## Usage + +To use this loader, `client_id`, `client_secret`, and `tenant_id` of the registered app in Microsoft Azure Portal are required. + +This loader fetches emails from a specified folder in an Outlook mailbox. 
+ +```python +from llama_index.readers.outlook_emails import OutlookEmailReader + +loader = OutlookEmailReader( + client_id="<Client ID of the app>", + client_secret="<Client Secret of the app>", + tenant_id="<Tenant ID of the Microsoft Azure Directory>", + user_email="<User Email Address>", + folder="Inbox", + num_mails=10, +) + +documents = loader.load_data() +``` + +The loader retrieves the subject and body of the emails from the specified folder in Outlook. diff --git a/llama-index-integrations/readers/llama-index-readers-microsoft-outlook-emails/llama_index/readers/outlook_emails/BUILD b/llama-index-integrations/readers/llama-index-readers-microsoft-outlook-emails/llama_index/readers/outlook_emails/BUILD new file mode 100644 index 0000000000000000000000000000000000000000..db46e8d6c978c67e301dd6c47bee08c1b3fd141c --- /dev/null +++ b/llama-index-integrations/readers/llama-index-readers-microsoft-outlook-emails/llama_index/readers/outlook_emails/BUILD @@ -0,0 +1 @@ +python_sources() diff --git a/llama-index-integrations/readers/llama-index-readers-microsoft-outlook-emails/llama_index/readers/outlook_emails/__init__.py b/llama-index-integrations/readers/llama-index-readers-microsoft-outlook-emails/llama_index/readers/outlook_emails/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..215ef3fc0a78afa53f8a6ae9f06e4a76b4e0a070 --- /dev/null +++ b/llama-index-integrations/readers/llama-index-readers-microsoft-outlook-emails/llama_index/readers/outlook_emails/__init__.py @@ -0,0 +1,3 @@ +from llama_index.readers.outlook_emails.base import OutlookEmailReader + +__all__ = ["OutlookEmailReader"] diff --git a/llama-index-integrations/readers/llama-index-readers-microsoft-outlook-emails/llama_index/readers/outlook_emails/base.py b/llama-index-integrations/readers/llama-index-readers-microsoft-outlook-emails/llama_index/readers/outlook_emails/base.py new file mode 100644 index 
import logging
from typing import List, Optional
from urllib.parse import quote

import requests
from llama_index.core.bridge.pydantic import PrivateAttr
from llama_index.core.readers.base import BasePydanticReader

logger = logging.getLogger(__name__)

# Upper bound (seconds) on every HTTP call so a stalled connection cannot
# block the reader indefinitely (requests has no timeout by default).
_REQUEST_TIMEOUT_SECONDS = 30


class OutlookEmailReader(BasePydanticReader):
    """
    Outlook Emails Reader using Microsoft Graph API.

    Authenticates with the OAuth2 client-credentials grant, then reads
    messages from one mail folder of one mailbox and renders each message
    as a ``"Subject: ...\\n\\n<body>"`` string.

    Args:
        client_id (str): The Application ID for the app registered in Microsoft Azure.
        client_secret (str): The application secret for the app registered in Azure.
        tenant_id (str): Unique identifier of the Azure Active Directory Instance.
        user_email (str): Email address of the user whose emails need to be fetched.
        folder (Optional[str]): The email folder to fetch emails from.
            Defaults to "Inbox"; ``None`` is treated as "Inbox" as well.
        num_mails (int): Maximum number of emails to retrieve. Defaults to 10.

    """

    client_id: str
    client_secret: str
    tenant_id: str
    user_email: str
    folder: Optional[str] = "Inbox"
    num_mails: int = 10

    # Cached ``{"Authorization": "Bearer <token>"}`` header; filled lazily
    # by ``_ensure_token`` on the first fetch.
    _authorization_headers: Optional[dict] = PrivateAttr(default=None)

    def __init__(
        self,
        client_id: str,
        client_secret: str,
        tenant_id: str,
        user_email: str,
        folder: Optional[str] = "Inbox",
        num_mails: int = 10,
    ) -> None:
        """Forward every argument to the pydantic base initializer."""
        super().__init__(
            client_id=client_id,
            client_secret=client_secret,
            tenant_id=tenant_id,
            user_email=user_email,
            folder=folder,
            num_mails=num_mails,
        )

    def _ensure_token(self) -> None:
        """
        Populate the cached authorization header if it is still unset.

        The token is requested once and then reused; this method does not
        detect or refresh an expired token.
        """
        if self._authorization_headers is None:
            token = self._get_access_token()
            self._authorization_headers = {"Authorization": f"Bearer {token}"}

    def _get_access_token(self) -> str:
        """
        Fetch an OAuth access token from the Microsoft login endpoint.

        Returns:
            str: The ``access_token`` value from the token response.

        Raises:
            requests.HTTPError: If the token endpoint answers with an error status.
            ValueError: If the response carries no ``access_token``; failing
                here avoids sending a meaningless ``Bearer None`` header later.

        """
        token_url = f"https://login.microsoftonline.com/{self.tenant_id}/oauth2/token"
        payload = {
            "grant_type": "client_credentials",
            "client_id": self.client_id,
            "client_secret": self.client_secret,
            "resource": "https://graph.microsoft.com/",
        }
        response = requests.post(
            token_url, data=payload, timeout=_REQUEST_TIMEOUT_SECONDS
        )
        response.raise_for_status()
        token = response.json().get("access_token")
        if not token:
            raise ValueError(
                "Token endpoint response did not contain an access_token."
            )
        return token

    def _fetch_emails(self) -> List[dict]:
        """
        Fetch up to ``num_mails`` raw message dicts from the configured folder.

        Follows ``@odata.nextLink`` while the service keeps returning pages
        and fewer than ``num_mails`` messages have been collected.

        Returns:
            List[dict]: Message objects from the ``value`` arrays of the
            responses, truncated to ``num_mails``. Empty when ``num_mails``
            is not positive (no request is made in that case).

        Raises:
            requests.HTTPError: If any request answers with an error status.

        """
        if self.num_mails <= 0:
            return []

        self._ensure_token()

        # Percent-encode the path segments so characters such as spaces,
        # "/", "?" or "#" cannot alter the structure of the request URL.
        user_segment = quote(self.user_email, safe="@")
        folder_segment = quote(self.folder or "Inbox", safe="")
        next_url: Optional[str] = (
            f"https://graph.microsoft.com/v1.0/users/{user_segment}"
            f"/mailFolders/{folder_segment}/messages?$top={self.num_mails}"
        )

        emails: List[dict] = []
        while next_url and len(emails) < self.num_mails:
            response = requests.get(
                next_url,
                headers=self._authorization_headers,
                timeout=_REQUEST_TIMEOUT_SECONDS,
            )
            response.raise_for_status()
            page = response.json()
            emails.extend(page.get("value", []))
            next_url = page.get("@odata.nextLink")

        logger.debug("Fetched %d email(s) from Microsoft Graph", len(emails))
        return emails[: self.num_mails]

    def load_data(self) -> List[str]:
        """
        Load emails as texts containing subject and body.

        Returns:
            List[str]: One ``"Subject: <subject>\\n\\n<body>"`` string per
            fetched message. A missing or null subject becomes
            ``"No Subject"``; a missing or null body/content becomes
            ``"No Content"``.

        """
        # NOTE(review): this returns plain strings rather than Document
        # objects; confirm that is what callers of this reader expect.
        email_texts = []
        for email in self._fetch_emails():
            subject = email.get("subject")
            if subject is None:
                subject = "No Subject"
            # "body" may be present but null in the JSON payload.
            content = (email.get("body") or {}).get("content")
            if content is None:
                content = "No Content"
            email_texts.append(f"Subject: {subject}\n\n{content}")
        return email_texts
a/llama-index-integrations/readers/llama-index-readers-microsoft-outlook-emails/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-microsoft-outlook-emails/pyproject.toml new file mode 100644 index 0000000000000000000000000000000000000000..71b46862c7db6438826feb2b6f85a88d335ac827 --- /dev/null +++ b/llama-index-integrations/readers/llama-index-readers-microsoft-outlook-emails/pyproject.toml @@ -0,0 +1,64 @@ +[build-system] +build-backend = "poetry.core.masonry.api" +requires = ["poetry-core"] + +[tool.codespell] +check-filenames = true +check-hidden = true +skip = "*.csv,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb" + +[tool.llamahub] +contains_example = false +import_path = "llama_index.readers.outlook_emails" + +[tool.llamahub.class_authors] +OutlookEmailReader = "llama-index" + +[tool.mypy] +disallow_untyped_defs = true +exclude = ["_static", "build", "examples", "notebooks", "venv"] +ignore_missing_imports = true +python_version = "3.8" + +[tool.poetry] +authors = ["Your Name <you@example.com>"] +description = "llama-index readers microsoft_outlook_emails integration" +exclude = ["**/BUILD"] +keywords = ["emails", "microsoft 365", "microsoft365", "outlook"] +license = "MIT" +maintainers = ["llama-index"] +name = "llama-index-readers-microsoft-outlook-emails" +readme = "README.md" +version = "0.1.0" + +[tool.poetry.dependencies] +python = ">=3.9,<4.0" +llama-index-core = "^0.12.0" + +[tool.poetry.group.dev.dependencies] +ipython = "8.10.0" +jupyter = "^1.0.0" +mypy = "0.991" +pre-commit = "3.2.0" +pylint = "2.15.10" +pytest = "7.2.1" +pytest-mock = "3.11.1" +ruff = "0.0.292" +tree-sitter-languages = "^1.8.0" +types-Deprecated = ">=0.1.0" +types-PyYAML = "^6.0.12.12" +types-protobuf = "^4.24.0.4" +types-redis = "4.5.5.0" +types-requests = "2.28.11.8" +types-setuptools = "67.1.0.0" + +[tool.poetry.group.dev.dependencies.black] +extras = ["jupyter"] +version = "<=23.9.1,>=23.7.0" + +[tool.poetry.group.dev.dependencies.codespell] +extras = 
import pytest
from unittest.mock import patch
from llama_index.core.readers.base import BaseReader
from llama_index.readers.outlook_emails import OutlookEmailReader

# Credential-style fields that must appear in the schema and survive a
# dump/validate round trip.
_CREDENTIAL_FIELDS = ("client_id", "client_secret", "tenant_id", "user_email")


def test_class():
    """OutlookEmailReader must list BaseReader among its base classes."""
    mro_names = {klass.__name__ for klass in OutlookEmailReader.__mro__}
    assert BaseReader.__name__ in mro_names


def test_serialize():
    """The reader exposes its fields in the schema and round-trips via dump."""
    reader = OutlookEmailReader(
        client_id="test_client_id",
        client_secret="test_client_secret",
        tenant_id="test_tenant_id",
        user_email="test_user@domain.com",
        num_mails=5,
    )

    # Get the JSON schema
    schema = reader.model_json_schema()
    assert schema is not None
    assert "properties" in schema
    for field_name in _CREDENTIAL_FIELDS:
        assert field_name in schema["properties"]

    # Test serialization/deserialization; the two popped keys are dropped
    # before the dump is fed back into model_validate.
    dumped = reader.model_dump()
    for extra_key in ("is_remote", "class_name"):
        dumped.pop(extra_key)
    restored = OutlookEmailReader.model_validate(dumped)
    for field_name in _CREDENTIAL_FIELDS:
        assert getattr(restored, field_name) == getattr(reader, field_name)


@pytest.fixture()
def outlook_reader():
    """Reader built from dummy credentials, limited to two emails."""
    settings = {
        "client_id": "dummy_client_id",
        "client_secret": "dummy_client_secret",
        "tenant_id": "dummy_tenant_id",
        "user_email": "dummy_user@domain.com",
        "num_mails": 2,
    }
    return OutlookEmailReader(**settings)


def mock_fetch_emails(*args, **kwargs):
    """Return two canned message dicts with a subject and a body content."""
    first = {
        "subject": "Test Email 1",
        "body": {"content": "This is the body of email 1."},
    }
    second = {
        "subject": "Test Email 2",
        "body": {"content": "This is the body of email 2."},
    }
    return [first, second]


def test_load_data(outlook_reader):
    """load_data renders each fetched message as 'Subject: ...' plus body."""
    canned_messages = mock_fetch_emails()

    # Replace _fetch_emails so that no network request is made.
    with patch.object(
        OutlookEmailReader, "_fetch_emails", return_value=canned_messages
    ):
        email_texts = outlook_reader.load_data()

    # Verify the results
    expected_texts = [
        "Subject: Test Email 1\n\nThis is the body of email 1.",
        "Subject: Test Email 2\n\nThis is the body of email 2.",
    ]
    assert len(email_texts) == 2
    assert email_texts[0] == expected_texts[0]
    assert email_texts[1] == expected_texts[1]