def __init__(
    self,
    api: str = DEFAULT_API_ENDPOINT,
    access_token: Optional[str] = None,
    max_text_length: int = 4096,
    min_text_length: int = 32,
    max_metadata_length: int = 512,
    include_xml_tags: bool = False,
    parent_hierarchy_levels: int = 0,
    parent_id_key: str = "doc_id",
    sub_chunk_tables: bool = False,
    whitespace_normalize_text: bool = True,
    docset_id: Optional[str] = None,
    document_ids: Optional[Sequence[str]] = None,
    file_paths: Optional[Sequence[Union[Path, str]]] = None,
    include_project_metadata_in_doc_metadata: bool = True,
) -> None:
    """Store the reader configuration on the instance.

    Every argument is assigned to the instance attribute of the same name.

    Args:
        api: Stored as ``self.api``; defaults to ``DEFAULT_API_ENDPOINT``.
        access_token: Stored as ``self.access_token``. When ``None`` (the
            default), the ``DOCUGAMI_API_KEY`` environment variable is read
            at call time; the attribute stays ``None`` if it is unset.
        max_text_length: Stored as ``self.max_text_length``.
        min_text_length: Stored as ``self.min_text_length``.
        max_metadata_length: Stored as ``self.max_metadata_length``.
        include_xml_tags: Stored as ``self.include_xml_tags``.
        parent_hierarchy_levels: Stored as ``self.parent_hierarchy_levels``.
        parent_id_key: Metadata key under which a chunk's parent id is
            recorded.
        sub_chunk_tables: Stored as ``self.sub_chunk_tables``.
        whitespace_normalize_text: Stored as
            ``self.whitespace_normalize_text``.
        docset_id: Stored as ``self.docset_id``.
        document_ids: Stored as ``self.document_ids``.
        file_paths: Stored as ``self.file_paths``.
        include_project_metadata_in_doc_metadata: Set to True if you want
            to include the project metadata in the doc metadata.
    """
    # A default of ``os.environ.get(...)`` in the signature is evaluated once,
    # when the ``def`` statement runs, so a key exported after import would be
    # ignored. Resolve it per call instead.
    if access_token is None:
        access_token = os.environ.get("DOCUGAMI_API_KEY")

    self.api = api
    self.access_token = access_token
    self.max_text_length = max_text_length
    self.min_text_length = min_text_length
    self.max_metadata_length = max_metadata_length
    self.include_xml_tags = include_xml_tags
    self.parent_hierarchy_levels = parent_hierarchy_levels
    self.parent_id_key = parent_id_key
    self.sub_chunk_tables = sub_chunk_tables
    self.whitespace_normalize_text = whitespace_normalize_text
    self.docset_id = docset_id
    self.document_ids = document_ids
    self.file_paths = file_paths
    self.include_project_metadata_in_doc_metadata = (
        include_project_metadata_in_doc_metadata
    )
DocugamiReader(BaseReader): if dg_chunk.parent: framework_parent_chunk = _build_framework_chunk(dg_chunk.parent) parent_id = framework_parent_chunk.metadata.get(ID_KEY) - if parent_id and framework_parent_chunk.page_content: + if parent_id and framework_parent_chunk.text: framework_chunk.metadata[self.parent_id_key] = parent_id framework_chunks[parent_id] = framework_parent_chunk diff --git a/llama-index-integrations/readers/llama-index-readers-docugami/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-docugami/pyproject.toml index 09ae77b294bb48f79e22535133f718b0c010aaff..11da0d422a41599f79322f6d3ad75ed0ead00866 100644 --- a/llama-index-integrations/readers/llama-index-readers-docugami/pyproject.toml +++ b/llama-index-integrations/readers/llama-index-readers-docugami/pyproject.toml @@ -29,7 +29,7 @@ license = "MIT" maintainers = ["tjaffri"] name = "llama-index-readers-docugami" readme = "README.md" -version = "0.1.3" +version = "0.1.4" [tool.poetry.dependencies] python = ">=3.8.1,<4.0" diff --git a/llama-index-packs/llama-index-packs-docugami-kg-rag/llama_index/packs/docugami_kg_rag/base.py b/llama-index-packs/llama-index-packs-docugami-kg-rag/llama_index/packs/docugami_kg_rag/base.py index eaa172dead41ca05ffcd2c9d547294c80611a029..128b661cf503e276ad7cd73aaf9f1e087ea4f08b 100644 --- a/llama-index-packs/llama-index-packs-docugami-kg-rag/llama_index/packs/docugami_kg_rag/base.py +++ b/llama-index-packs/llama-index-packs-docugami-kg-rag/llama_index/packs/docugami_kg_rag/base.py @@ -5,11 +5,21 @@ from llama_index.core.tools import BaseTool from llama_index.core.llama_pack import BaseLlamaPack from llama_index.core.agent import ReActAgent -from helpers.prompts import ASSISTANT_SYSTEM_MESSAGE -from config import LARGE_CONTEXT_INSTRUCT_LLM, DEFAULT_USE_REPORTS -from helpers.indexing import read_all_local_index_state, index_docset -from helpers.reports import get_retrieval_tool_for_report -from helpers.retrieval import get_retrieval_tool_for_docset 
+from llama_index.packs.docugami_kg_rag.helpers.prompts import ASSISTANT_SYSTEM_MESSAGE +from llama_index.packs.docugami_kg_rag.config import ( + LARGE_CONTEXT_INSTRUCT_LLM, + DEFAULT_USE_REPORTS, +) +from llama_index.packs.docugami_kg_rag.helpers.indexing import ( + read_all_local_index_state, + index_docset, +) +from llama_index.packs.docugami_kg_rag.helpers.reports import ( + get_retrieval_tool_for_report, +) +from llama_index.packs.docugami_kg_rag.helpers.retrieval import ( + get_retrieval_tool_for_docset, +) class DocugamiKgRagPack(BaseLlamaPack): @@ -36,7 +46,8 @@ class DocugamiKgRagPack(BaseLlamaPack): """ docsets_response = self.docugami_client.docsets.list() docset = next( - [docset for docset in docsets_response.docsets if docset.id == docset_id] + (docset for docset in docsets_response.docsets if docset.id == docset_id), + None, ) if not docset: diff --git a/llama-index-packs/llama-index-packs-docugami-kg-rag/llama_index/packs/docugami_kg_rag/helpers/fused_summary_retriever.py b/llama-index-packs/llama-index-packs-docugami-kg-rag/llama_index/packs/docugami_kg_rag/helpers/fused_summary_retriever.py index ac9961bc527595a28f717d8ed136fec43d310b1a..32d45e9f80b4c4852550a474664ad2bb752b80d7 100644 --- a/llama-index-packs/llama-index-packs-docugami-kg-rag/llama_index/packs/docugami_kg_rag/helpers/fused_summary_retriever.py +++ b/llama-index-packs/llama-index-packs-docugami-kg-rag/llama_index/packs/docugami_kg_rag/helpers/fused_summary_retriever.py @@ -12,17 +12,17 @@ from llama_index.vector_stores.chroma import ChromaVectorStore from llama_index.core.retrievers import BaseRetriever - -from config import FULL_DOC_SUMMARY_ID_KEY, SOURCE_KEY, PARENT_DOC_ID_KEY, EMBEDDINGS - from llama_index.core.readers import Document - from dataclasses import dataclass from typing import List -from config import ( +from llama_index.packs.docugami_kg_rag.config import ( RETRIEVER_K, + FULL_DOC_SUMMARY_ID_KEY, + SOURCE_KEY, + PARENT_DOC_ID_KEY, + EMBEDDINGS, ) from 
llama_index.core import QueryBundle diff --git a/llama-index-packs/llama-index-packs-docugami-kg-rag/llama_index/packs/docugami_kg_rag/helpers/indexing.py b/llama-index-packs/llama-index-packs-docugami-kg-rag/llama_index/packs/docugami_kg_rag/helpers/indexing.py index dc3c6ae305bbe14db5b756317ee745542f62497b..64784def42a491c064bc20e8e1baf1cbc3e70733 100644 --- a/llama-index-packs/llama-index-packs-docugami-kg-rag/llama_index/packs/docugami_kg_rag/helpers/indexing.py +++ b/llama-index-packs/llama-index-packs-docugami-kg-rag/llama_index/packs/docugami_kg_rag/helpers/indexing.py @@ -7,9 +7,12 @@ from typing import Dict, List from llama_index.core import StorageContext, VectorStoreIndex from llama_index.vector_stores.chroma import ChromaVectorStore -from helpers.reports import ReportDetails, build_report_details +from llama_index.packs.docugami_kg_rag.helpers.reports import ( + ReportDetails, + build_report_details, +) -from config import ( +from llama_index.packs.docugami_kg_rag.config import ( CHROMA_DIRECTORY, EMBEDDINGS, FULL_DOC_SUMMARY_ID_KEY, @@ -26,12 +29,12 @@ import chromadb from llama_index.readers.docugami import DocugamiReader from llama_index.core.readers import Document -from helpers.summaries import ( +from llama_index.packs.docugami_kg_rag.helpers.summaries import ( build_chunk_summary_mappings, build_full_doc_summary_mappings, ) -from helpers.retrieval import ( +from llama_index.packs.docugami_kg_rag.helpers.retrieval import ( LocalIndexState, docset_name_to_direct_retriever_tool_function_name, chunks_to_direct_retriever_tool_description, diff --git a/llama-index-packs/llama-index-packs-docugami-kg-rag/llama_index/packs/docugami_kg_rag/helpers/reports.py b/llama-index-packs/llama-index-packs-docugami-kg-rag/llama_index/packs/docugami_kg_rag/helpers/reports.py index 101cb9b1796e2feb8f77b0b1b9f6ed0608c8c424..67010b7ee6fd8dfc410fe640044cf131dba88ae8 100644 --- 
a/llama-index-packs/llama-index-packs-docugami-kg-rag/llama_index/packs/docugami_kg_rag/helpers/reports.py +++ b/llama-index-packs/llama-index-packs-docugami-kg-rag/llama_index/packs/docugami_kg_rag/helpers/reports.py @@ -7,7 +7,7 @@ import pandas as pd import requests import sqlite3 import tempfile -from config import REPORT_DIRECTORY, DOCUGAMI_API_KEY +from llama_index.packs.docugami_kg_rag.config import REPORT_DIRECTORY, DOCUGAMI_API_KEY from docugami import Docugami from llama_index.core import SQLDatabase diff --git a/llama-index-packs/llama-index-packs-docugami-kg-rag/llama_index/packs/docugami_kg_rag/helpers/retrieval.py b/llama-index-packs/llama-index-packs-docugami-kg-rag/llama_index/packs/docugami_kg_rag/helpers/retrieval.py index 391b75b94c0a10ba8acea0592a7aa3379a162d00..49bfb48b294f6a15428d47e32d9ad4ea59f4a4ce 100644 --- a/llama-index-packs/llama-index-packs-docugami-kg-rag/llama_index/packs/docugami_kg_rag/helpers/retrieval.py +++ b/llama-index-packs/llama-index-packs-docugami-kg-rag/llama_index/packs/docugami_kg_rag/helpers/retrieval.py @@ -1,14 +1,14 @@ from typing import Dict, List, Optional from dataclasses import dataclass -from helpers.reports import ReportDetails +from llama_index.packs.docugami_kg_rag.helpers.reports import ReportDetails from llama_index.core.readers import Document -from config import ( +from llama_index.packs.docugami_kg_rag.config import ( MAX_CHUNK_TEXT_LENGTH, LARGE_CONTEXT_INSTRUCT_LLM, ) import re -from helpers.prompts import ( +from llama_index.packs.docugami_kg_rag.helpers.prompts import ( CREATE_DIRECT_RETRIEVAL_TOOL_DESCRIPTION_QUERY_PROMPT, CREATE_DIRECT_RETRIEVAL_TOOL_SYSTEM_PROMPT, ) @@ -18,8 +18,10 @@ from llama_index.core.llms import ChatMessage, MessageRole from llama_index.core.query_engine import RetrieverQueryEngine from llama_index.core.tools import BaseTool, ToolMetadata, QueryEngineTool -from helpers.vector_store import get_vector_store -from helpers.fused_summary_retriever import FusedSummaryRetriever 
+from llama_index.packs.docugami_kg_rag.helpers.vector_store import get_vector_store +from llama_index.packs.docugami_kg_rag.helpers.fused_summary_retriever import ( + FusedSummaryRetriever, +) @dataclass diff --git a/llama-index-packs/llama-index-packs-docugami-kg-rag/llama_index/packs/docugami_kg_rag/helpers/summaries.py b/llama-index-packs/llama-index-packs-docugami-kg-rag/llama_index/packs/docugami_kg_rag/helpers/summaries.py index 9b34e4aa5613ad23527c148e355d549acd857000..d78a74489d8b6a0f2771c31282abb4aa5d65da19 100644 --- a/llama-index-packs/llama-index-packs-docugami-kg-rag/llama_index/packs/docugami_kg_rag/helpers/summaries.py +++ b/llama-index-packs/llama-index-packs-docugami-kg-rag/llama_index/packs/docugami_kg_rag/helpers/summaries.py @@ -5,7 +5,7 @@ from tqdm import tqdm from llama_index.llms.openai import OpenAI -from config import ( +from llama_index.packs.docugami_kg_rag.config import ( LARGE_CONTEXT_INSTRUCT_LLM, MAX_CHUNK_TEXT_LENGTH, INCLUDE_XML_TAGS, @@ -15,13 +15,13 @@ from config import ( ) from llama_index.core.readers import Document -from helpers.prompts import ( +from llama_index.packs.docugami_kg_rag.helpers.prompts import ( CREATE_FULL_DOCUMENT_SUMMARY_QUERY_PROMPT, CREATE_FULL_DOCUMENT_SUMMARY_SYSTEM_PROMPT, CREATE_CHUNK_SUMMARY_QUERY_PROMPT, CREATE_CHUNK_SUMMARY_SYSTEM_PROMPT, ) -from config import PARENT_DOC_ID_KEY +from llama_index.packs.docugami_kg_rag.config import PARENT_DOC_ID_KEY from llama_index.core.llms import ChatMessage, MessageRole FORMAT = ( diff --git a/llama-index-packs/llama-index-packs-docugami-kg-rag/llama_index/packs/docugami_kg_rag/helpers/vector_store.py b/llama-index-packs/llama-index-packs-docugami-kg-rag/llama_index/packs/docugami_kg_rag/helpers/vector_store.py index 3cba42f51d97021757cbb3766714d8614377cb20..68052bc05d2a2a707195e4d152174d4d6b58e235 100644 --- a/llama-index-packs/llama-index-packs-docugami-kg-rag/llama_index/packs/docugami_kg_rag/helpers/vector_store.py +++ 
b/llama-index-packs/llama-index-packs-docugami-kg-rag/llama_index/packs/docugami_kg_rag/helpers/vector_store.py @@ -3,7 +3,7 @@ from typing import Optional from llama_index.core import VectorStoreIndex from llama_index.vector_stores.chroma import ChromaVectorStore -from config import CHROMA_DIRECTORY, EMBEDDINGS +from llama_index.packs.docugami_kg_rag.config import CHROMA_DIRECTORY, EMBEDDINGS import chromadb