Docugami Bug Fixes (#12154)

f54f2bd9 · Kenzie Mihardja · GitHub · b730fb67 · f54f2bd9 · f54f2bd9
Unverified commit f54f2bd9, authored 1 year ago by Kenzie Mihardja and committed by GitHub 1 year ago.
--- a/llama-index-integrations/readers/llama-index-readers-docugami/llama_index/readers/docugami/base.py
+++ b/llama-index-integrations/readers/llama-index-readers-docugami/llama_index/readers/docugami/base.py
@@ -80,6 +80,40 @@ class DocugamiReader(BaseReader):
    include_project_metadata_in_doc_metadata: bool = True
    """Set to True if you want to include the project metadata in the doc metadata."""
+    def __init__(
+        self,
+        api: str = DEFAULT_API_ENDPOINT,
+        access_token: Optional[str] = os.environ.get("DOCUGAMI_API_KEY"),
+        max_text_length=4096,
+        min_text_length: int = 32,
+        max_metadata_length=512,
+        include_xml_tags: bool = False,
+        parent_hierarchy_levels: int = 0,
+        parent_id_key: str = "doc_id",
+        sub_chunk_tables: bool = False,
+        whitespace_normalize_text: bool = True,
+        docset_id: Optional[str] = None,
+        document_ids: Optional[Sequence[str]] = None,
+        file_paths: Optional[Sequence[Union[Path, str]]] = None,
+        include_project_metadata_in_doc_metadata: bool = True,
+    ):
+        self.api = api
+        self.access_token = access_token
+        self.max_text_length = max_text_length
+        self.min_text_length = min_text_length
+        self.max_metadata_length = max_metadata_length
+        self.include_xml_tags = include_xml_tags
+        self.parent_hierarchy_levels = parent_hierarchy_levels
+        self.parent_id_key = parent_id_key
+        self.sub_chunk_tables = sub_chunk_tables
+        self.whitespace_normalize_text = whitespace_normalize_text
+        self.docset_id = docset_id
+        self.document_ids = document_ids
+        self.file_paths = file_paths
+        self.include_project_metadata_in_doc_metadata = (
+            include_project_metadata_in_doc_metadata
+        )
    def _parse_dgml(
        self,
        content: bytes,
@@ -169,7 +203,7 @@ class DocugamiReader(BaseReader):
                if dg_chunk.parent:
                    framework_parent_chunk = _build_framework_chunk(dg_chunk.parent)
                    parent_id = framework_parent_chunk.metadata.get(ID_KEY)
-                    if parent_id and framework_parent_chunk.page_content:
+                    if parent_id and framework_parent_chunk.text:
                        framework_chunk.metadata[self.parent_id_key] = parent_id
                        framework_chunks[parent_id] = framework_parent_chunk

--- a/llama-index-integrations/readers/llama-index-readers-docugami/pyproject.toml
+++ b/llama-index-integrations/readers/llama-index-readers-docugami/pyproject.toml
@@ -29,7 +29,7 @@ license = "MIT"
 maintainers = ["tjaffri"]
 name = "llama-index-readers-docugami"
 readme = "README.md"
-version = "0.1.3"
+version = "0.1.4"
 [tool.poetry.dependencies]
 python = ">=3.8.1,<4.0"

--- a/llama-index-packs/llama-index-packs-docugami-kg-rag/llama_index/packs/docugami_kg_rag/base.py
+++ b/llama-index-packs/llama-index-packs-docugami-kg-rag/llama_index/packs/docugami_kg_rag/base.py
@@ -5,11 +5,21 @@ from llama_index.core.tools import BaseTool
 from llama_index.core.llama_pack import BaseLlamaPack
 from llama_index.core.agent import ReActAgent
-from helpers.prompts import ASSISTANT_SYSTEM_MESSAGE
+from llama_index.packs.docugami_kg_rag.helpers.prompts import ASSISTANT_SYSTEM_MESSAGE
-from config import LARGE_CONTEXT_INSTRUCT_LLM, DEFAULT_USE_REPORTS
+from llama_index.packs.docugami_kg_rag.config import (
-from helpers.indexing import read_all_local_index_state, index_docset
+    LARGE_CONTEXT_INSTRUCT_LLM,
-from helpers.reports import get_retrieval_tool_for_report
+    DEFAULT_USE_REPORTS,
-from helpers.retrieval import get_retrieval_tool_for_docset
+)
+from llama_index.packs.docugami_kg_rag.helpers.indexing import (
+    read_all_local_index_state,
+    index_docset,
+)
+from llama_index.packs.docugami_kg_rag.helpers.reports import (
+    get_retrieval_tool_for_report,
+)
+from llama_index.packs.docugami_kg_rag.helpers.retrieval import (
+    get_retrieval_tool_for_docset,
+)
 class DocugamiKgRagPack(BaseLlamaPack):
@@ -36,7 +46,8 @@ class DocugamiKgRagPack(BaseLlamaPack):
        """
        docsets_response = self.docugami_client.docsets.list()
        docset = next(
-            [docset for docset in docsets_response.docsets if docset.id == docset_id]
+            (docset for docset in docsets_response.docsets if docset.id == docset_id),
+            None,
        )
        if not docset:

--- a/llama-index-packs/llama-index-packs-docugami-kg-rag/llama_index/packs/docugami_kg_rag/helpers/fused_summary_retriever.py
+++ b/llama-index-packs/llama-index-packs-docugami-kg-rag/llama_index/packs/docugami_kg_rag/helpers/fused_summary_retriever.py
@@ -12,17 +12,17 @@ from llama_index.vector_stores.chroma import ChromaVectorStore
 from llama_index.core.retrievers import BaseRetriever
-from config import FULL_DOC_SUMMARY_ID_KEY, SOURCE_KEY, PARENT_DOC_ID_KEY, EMBEDDINGS
 from llama_index.core.readers import Document
 from dataclasses import dataclass
 from typing import List
-from config import (
+from llama_index.packs.docugami_kg_rag.config import (
    RETRIEVER_K,
+    FULL_DOC_SUMMARY_ID_KEY,
+    SOURCE_KEY,
+    PARENT_DOC_ID_KEY,
+    EMBEDDINGS,
 )
 from llama_index.core import QueryBundle

--- a/llama-index-packs/llama-index-packs-docugami-kg-rag/llama_index/packs/docugami_kg_rag/helpers/indexing.py
+++ b/llama-index-packs/llama-index-packs-docugami-kg-rag/llama_index/packs/docugami_kg_rag/helpers/indexing.py
@@ -7,9 +7,12 @@ from typing import Dict, List
 from llama_index.core import StorageContext, VectorStoreIndex
 from llama_index.vector_stores.chroma import ChromaVectorStore
-from helpers.reports import ReportDetails, build_report_details
+from llama_index.packs.docugami_kg_rag.helpers.reports import (
+    ReportDetails,
+    build_report_details,
+)
-from config import (
+from llama_index.packs.docugami_kg_rag.config import (
    CHROMA_DIRECTORY,
    EMBEDDINGS,
    FULL_DOC_SUMMARY_ID_KEY,
@@ -26,12 +29,12 @@ import chromadb
 from llama_index.readers.docugami import DocugamiReader
 from llama_index.core.readers import Document
-from helpers.summaries import (
+from llama_index.packs.docugami_kg_rag.helpers.summaries import (
    build_chunk_summary_mappings,
    build_full_doc_summary_mappings,
 )
-from helpers.retrieval import (
+from llama_index.packs.docugami_kg_rag.helpers.retrieval import (
    LocalIndexState,
    docset_name_to_direct_retriever_tool_function_name,
    chunks_to_direct_retriever_tool_description,

--- a/llama-index-packs/llama-index-packs-docugami-kg-rag/llama_index/packs/docugami_kg_rag/helpers/reports.py
+++ b/llama-index-packs/llama-index-packs-docugami-kg-rag/llama_index/packs/docugami_kg_rag/helpers/reports.py
@@ -7,7 +7,7 @@ import pandas as pd
 import requests
 import sqlite3
 import tempfile
-from config import REPORT_DIRECTORY, DOCUGAMI_API_KEY
+from llama_index.packs.docugami_kg_rag.config import REPORT_DIRECTORY, DOCUGAMI_API_KEY
 from docugami import Docugami
 from llama_index.core import SQLDatabase

--- a/llama-index-packs/llama-index-packs-docugami-kg-rag/llama_index/packs/docugami_kg_rag/helpers/retrieval.py
+++ b/llama-index-packs/llama-index-packs-docugami-kg-rag/llama_index/packs/docugami_kg_rag/helpers/retrieval.py
 from typing import Dict, List, Optional
 from dataclasses import dataclass
-from helpers.reports import ReportDetails
+from llama_index.packs.docugami_kg_rag.helpers.reports import ReportDetails
 from llama_index.core.readers import Document
-from config import (
+from llama_index.packs.docugami_kg_rag.config import (
    MAX_CHUNK_TEXT_LENGTH,
    LARGE_CONTEXT_INSTRUCT_LLM,
 )
 import re
-from helpers.prompts import (
+from llama_index.packs.docugami_kg_rag.helpers.prompts import (
    CREATE_DIRECT_RETRIEVAL_TOOL_DESCRIPTION_QUERY_PROMPT,
    CREATE_DIRECT_RETRIEVAL_TOOL_SYSTEM_PROMPT,
 )
@@ -18,8 +18,10 @@ from llama_index.core.llms import ChatMessage, MessageRole
 from llama_index.core.query_engine import RetrieverQueryEngine
 from llama_index.core.tools import BaseTool, ToolMetadata, QueryEngineTool
-from helpers.vector_store import get_vector_store
+from llama_index.packs.docugami_kg_rag.helpers.vector_store import get_vector_store
-from helpers.fused_summary_retriever import FusedSummaryRetriever
+from llama_index.packs.docugami_kg_rag.helpers.fused_summary_retriever import (
+    FusedSummaryRetriever,
+)
 @dataclass

--- a/llama-index-packs/llama-index-packs-docugami-kg-rag/llama_index/packs/docugami_kg_rag/helpers/summaries.py
+++ b/llama-index-packs/llama-index-packs-docugami-kg-rag/llama_index/packs/docugami_kg_rag/helpers/summaries.py
@@ -5,7 +5,7 @@ from tqdm import tqdm
 from llama_index.llms.openai import OpenAI
-from config import (
+from llama_index.packs.docugami_kg_rag.config import (
    LARGE_CONTEXT_INSTRUCT_LLM,
    MAX_CHUNK_TEXT_LENGTH,
    INCLUDE_XML_TAGS,
@@ -15,13 +15,13 @@ from config import (
 )
 from llama_index.core.readers import Document
-from helpers.prompts import (
+from llama_index.packs.docugami_kg_rag.helpers.prompts import (
    CREATE_FULL_DOCUMENT_SUMMARY_QUERY_PROMPT,
    CREATE_FULL_DOCUMENT_SUMMARY_SYSTEM_PROMPT,
    CREATE_CHUNK_SUMMARY_QUERY_PROMPT,
    CREATE_CHUNK_SUMMARY_SYSTEM_PROMPT,
 )
-from config import PARENT_DOC_ID_KEY
+from llama_index.packs.docugami_kg_rag.config import PARENT_DOC_ID_KEY
 from llama_index.core.llms import ChatMessage, MessageRole
 FORMAT = (

--- a/llama-index-packs/llama-index-packs-docugami-kg-rag/llama_index/packs/docugami_kg_rag/helpers/vector_store.py
+++ b/llama-index-packs/llama-index-packs-docugami-kg-rag/llama_index/packs/docugami_kg_rag/helpers/vector_store.py
@@ -3,7 +3,7 @@ from typing import Optional
 from llama_index.core import VectorStoreIndex
 from llama_index.vector_stores.chroma import ChromaVectorStore
-from config import CHROMA_DIRECTORY, EMBEDDINGS
+from llama_index.packs.docugami_kg_rag.config import CHROMA_DIRECTORY, EMBEDDINGS
 import chromadb