Skip to content
Snippets Groups Projects
Unverified commit f54f2bd9, authored by Kenzie Mihardja, committed by GitHub
Browse files

Docugami Bug Fixes (#12154)

parent b730fb67
No related branches found
No related tags found
No related merge requests found
Showing
with 77 additions and 27 deletions
...@@ -80,6 +80,40 @@ class DocugamiReader(BaseReader): ...@@ -80,6 +80,40 @@ class DocugamiReader(BaseReader):
include_project_metadata_in_doc_metadata: bool = True include_project_metadata_in_doc_metadata: bool = True
"""Set to True if you want to include the project metadata in the doc metadata.""" """Set to True if you want to include the project metadata in the doc metadata."""
def __init__(
    self,
    api: str = DEFAULT_API_ENDPOINT,
    # NOTE(review): this default is evaluated once, when the function is
    # defined, so DOCUGAMI_API_KEY must already be set in the environment at
    # that point; later changes to the variable are not picked up.
    access_token: Optional[str] = os.environ.get("DOCUGAMI_API_KEY"),
    max_text_length: int = 4096,
    min_text_length: int = 32,
    max_metadata_length: int = 512,
    include_xml_tags: bool = False,
    parent_hierarchy_levels: int = 0,
    parent_id_key: str = "doc_id",
    sub_chunk_tables: bool = False,
    whitespace_normalize_text: bool = True,
    docset_id: Optional[str] = None,
    document_ids: Optional[Sequence[str]] = None,
    file_paths: Optional[Sequence[Union[Path, str]]] = None,
    include_project_metadata_in_doc_metadata: bool = True,
) -> None:
    """Store the reader configuration on the instance.

    Every argument is copied, unchanged and unvalidated, onto the instance
    attribute of the same name.

    Args:
        api: Stored as ``self.api``; defaults to ``DEFAULT_API_ENDPOINT``.
        access_token: Stored as ``self.access_token``; defaults to the value
            of the ``DOCUGAMI_API_KEY`` environment variable (``None`` when
            it is unset).
        max_text_length: Stored as ``self.max_text_length``.
        min_text_length: Stored as ``self.min_text_length``.
        max_metadata_length: Stored as ``self.max_metadata_length``.
        include_xml_tags: Stored as ``self.include_xml_tags``.
        parent_hierarchy_levels: Stored as ``self.parent_hierarchy_levels``.
        parent_id_key: Stored as ``self.parent_id_key``.
        sub_chunk_tables: Stored as ``self.sub_chunk_tables``.
        whitespace_normalize_text: Stored as
            ``self.whitespace_normalize_text``.
        docset_id: Stored as ``self.docset_id``.
        document_ids: Stored as ``self.document_ids``.
        file_paths: Stored as ``self.file_paths``.
        include_project_metadata_in_doc_metadata: Set to True if you want to
            include the project metadata in the doc metadata.
    """
    # Connection settings.
    self.api = api
    self.access_token = access_token

    # Length limits.
    self.max_text_length = max_text_length
    self.min_text_length = min_text_length
    self.max_metadata_length = max_metadata_length

    # Parsing / chunking switches.
    self.include_xml_tags = include_xml_tags
    self.parent_hierarchy_levels = parent_hierarchy_levels
    self.parent_id_key = parent_id_key
    self.sub_chunk_tables = sub_chunk_tables
    self.whitespace_normalize_text = whitespace_normalize_text

    # What to load.
    self.docset_id = docset_id
    self.document_ids = document_ids
    self.file_paths = file_paths

    self.include_project_metadata_in_doc_metadata = (
        include_project_metadata_in_doc_metadata
    )
def _parse_dgml( def _parse_dgml(
self, self,
content: bytes, content: bytes,
...@@ -169,7 +203,7 @@ class DocugamiReader(BaseReader): ...@@ -169,7 +203,7 @@ class DocugamiReader(BaseReader):
if dg_chunk.parent: if dg_chunk.parent:
framework_parent_chunk = _build_framework_chunk(dg_chunk.parent) framework_parent_chunk = _build_framework_chunk(dg_chunk.parent)
parent_id = framework_parent_chunk.metadata.get(ID_KEY) parent_id = framework_parent_chunk.metadata.get(ID_KEY)
if parent_id and framework_parent_chunk.page_content: if parent_id and framework_parent_chunk.text:
framework_chunk.metadata[self.parent_id_key] = parent_id framework_chunk.metadata[self.parent_id_key] = parent_id
framework_chunks[parent_id] = framework_parent_chunk framework_chunks[parent_id] = framework_parent_chunk
......
...@@ -29,7 +29,7 @@ license = "MIT" ...@@ -29,7 +29,7 @@ license = "MIT"
maintainers = ["tjaffri"] maintainers = ["tjaffri"]
name = "llama-index-readers-docugami" name = "llama-index-readers-docugami"
readme = "README.md" readme = "README.md"
version = "0.1.3" version = "0.1.4"
[tool.poetry.dependencies] [tool.poetry.dependencies]
python = ">=3.8.1,<4.0" python = ">=3.8.1,<4.0"
......
...@@ -5,11 +5,21 @@ from llama_index.core.tools import BaseTool ...@@ -5,11 +5,21 @@ from llama_index.core.tools import BaseTool
from llama_index.core.llama_pack import BaseLlamaPack from llama_index.core.llama_pack import BaseLlamaPack
from llama_index.core.agent import ReActAgent from llama_index.core.agent import ReActAgent
from helpers.prompts import ASSISTANT_SYSTEM_MESSAGE from llama_index.packs.docugami_kg_rag.helpers.prompts import ASSISTANT_SYSTEM_MESSAGE
from config import LARGE_CONTEXT_INSTRUCT_LLM, DEFAULT_USE_REPORTS from llama_index.packs.docugami_kg_rag.config import (
from helpers.indexing import read_all_local_index_state, index_docset LARGE_CONTEXT_INSTRUCT_LLM,
from helpers.reports import get_retrieval_tool_for_report DEFAULT_USE_REPORTS,
from helpers.retrieval import get_retrieval_tool_for_docset )
from llama_index.packs.docugami_kg_rag.helpers.indexing import (
read_all_local_index_state,
index_docset,
)
from llama_index.packs.docugami_kg_rag.helpers.reports import (
get_retrieval_tool_for_report,
)
from llama_index.packs.docugami_kg_rag.helpers.retrieval import (
get_retrieval_tool_for_docset,
)
class DocugamiKgRagPack(BaseLlamaPack): class DocugamiKgRagPack(BaseLlamaPack):
...@@ -36,7 +46,8 @@ class DocugamiKgRagPack(BaseLlamaPack): ...@@ -36,7 +46,8 @@ class DocugamiKgRagPack(BaseLlamaPack):
""" """
docsets_response = self.docugami_client.docsets.list() docsets_response = self.docugami_client.docsets.list()
docset = next( docset = next(
[docset for docset in docsets_response.docsets if docset.id == docset_id] (docset for docset in docsets_response.docsets if docset.id == docset_id),
None,
) )
if not docset: if not docset:
......
...@@ -12,17 +12,17 @@ from llama_index.vector_stores.chroma import ChromaVectorStore ...@@ -12,17 +12,17 @@ from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core.retrievers import BaseRetriever from llama_index.core.retrievers import BaseRetriever
from config import FULL_DOC_SUMMARY_ID_KEY, SOURCE_KEY, PARENT_DOC_ID_KEY, EMBEDDINGS
from llama_index.core.readers import Document from llama_index.core.readers import Document
from dataclasses import dataclass from dataclasses import dataclass
from typing import List from typing import List
from config import ( from llama_index.packs.docugami_kg_rag.config import (
RETRIEVER_K, RETRIEVER_K,
FULL_DOC_SUMMARY_ID_KEY,
SOURCE_KEY,
PARENT_DOC_ID_KEY,
EMBEDDINGS,
) )
from llama_index.core import QueryBundle from llama_index.core import QueryBundle
......
...@@ -7,9 +7,12 @@ from typing import Dict, List ...@@ -7,9 +7,12 @@ from typing import Dict, List
from llama_index.core import StorageContext, VectorStoreIndex from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore from llama_index.vector_stores.chroma import ChromaVectorStore
from helpers.reports import ReportDetails, build_report_details from llama_index.packs.docugami_kg_rag.helpers.reports import (
ReportDetails,
build_report_details,
)
from config import ( from llama_index.packs.docugami_kg_rag.config import (
CHROMA_DIRECTORY, CHROMA_DIRECTORY,
EMBEDDINGS, EMBEDDINGS,
FULL_DOC_SUMMARY_ID_KEY, FULL_DOC_SUMMARY_ID_KEY,
...@@ -26,12 +29,12 @@ import chromadb ...@@ -26,12 +29,12 @@ import chromadb
from llama_index.readers.docugami import DocugamiReader from llama_index.readers.docugami import DocugamiReader
from llama_index.core.readers import Document from llama_index.core.readers import Document
from helpers.summaries import ( from llama_index.packs.docugami_kg_rag.helpers.summaries import (
build_chunk_summary_mappings, build_chunk_summary_mappings,
build_full_doc_summary_mappings, build_full_doc_summary_mappings,
) )
from helpers.retrieval import ( from llama_index.packs.docugami_kg_rag.helpers.retrieval import (
LocalIndexState, LocalIndexState,
docset_name_to_direct_retriever_tool_function_name, docset_name_to_direct_retriever_tool_function_name,
chunks_to_direct_retriever_tool_description, chunks_to_direct_retriever_tool_description,
......
...@@ -7,7 +7,7 @@ import pandas as pd ...@@ -7,7 +7,7 @@ import pandas as pd
import requests import requests
import sqlite3 import sqlite3
import tempfile import tempfile
from config import REPORT_DIRECTORY, DOCUGAMI_API_KEY from llama_index.packs.docugami_kg_rag.config import REPORT_DIRECTORY, DOCUGAMI_API_KEY
from docugami import Docugami from docugami import Docugami
from llama_index.core import SQLDatabase from llama_index.core import SQLDatabase
......
from typing import Dict, List, Optional from typing import Dict, List, Optional
from dataclasses import dataclass from dataclasses import dataclass
from helpers.reports import ReportDetails from llama_index.packs.docugami_kg_rag.helpers.reports import ReportDetails
from llama_index.core.readers import Document from llama_index.core.readers import Document
from config import ( from llama_index.packs.docugami_kg_rag.config import (
MAX_CHUNK_TEXT_LENGTH, MAX_CHUNK_TEXT_LENGTH,
LARGE_CONTEXT_INSTRUCT_LLM, LARGE_CONTEXT_INSTRUCT_LLM,
) )
import re import re
from helpers.prompts import ( from llama_index.packs.docugami_kg_rag.helpers.prompts import (
CREATE_DIRECT_RETRIEVAL_TOOL_DESCRIPTION_QUERY_PROMPT, CREATE_DIRECT_RETRIEVAL_TOOL_DESCRIPTION_QUERY_PROMPT,
CREATE_DIRECT_RETRIEVAL_TOOL_SYSTEM_PROMPT, CREATE_DIRECT_RETRIEVAL_TOOL_SYSTEM_PROMPT,
) )
...@@ -18,8 +18,10 @@ from llama_index.core.llms import ChatMessage, MessageRole ...@@ -18,8 +18,10 @@ from llama_index.core.llms import ChatMessage, MessageRole
from llama_index.core.query_engine import RetrieverQueryEngine from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.tools import BaseTool, ToolMetadata, QueryEngineTool from llama_index.core.tools import BaseTool, ToolMetadata, QueryEngineTool
from helpers.vector_store import get_vector_store from llama_index.packs.docugami_kg_rag.helpers.vector_store import get_vector_store
from helpers.fused_summary_retriever import FusedSummaryRetriever from llama_index.packs.docugami_kg_rag.helpers.fused_summary_retriever import (
FusedSummaryRetriever,
)
@dataclass @dataclass
......
...@@ -5,7 +5,7 @@ from tqdm import tqdm ...@@ -5,7 +5,7 @@ from tqdm import tqdm
from llama_index.llms.openai import OpenAI from llama_index.llms.openai import OpenAI
from config import ( from llama_index.packs.docugami_kg_rag.config import (
LARGE_CONTEXT_INSTRUCT_LLM, LARGE_CONTEXT_INSTRUCT_LLM,
MAX_CHUNK_TEXT_LENGTH, MAX_CHUNK_TEXT_LENGTH,
INCLUDE_XML_TAGS, INCLUDE_XML_TAGS,
...@@ -15,13 +15,13 @@ from config import ( ...@@ -15,13 +15,13 @@ from config import (
) )
from llama_index.core.readers import Document from llama_index.core.readers import Document
from helpers.prompts import ( from llama_index.packs.docugami_kg_rag.helpers.prompts import (
CREATE_FULL_DOCUMENT_SUMMARY_QUERY_PROMPT, CREATE_FULL_DOCUMENT_SUMMARY_QUERY_PROMPT,
CREATE_FULL_DOCUMENT_SUMMARY_SYSTEM_PROMPT, CREATE_FULL_DOCUMENT_SUMMARY_SYSTEM_PROMPT,
CREATE_CHUNK_SUMMARY_QUERY_PROMPT, CREATE_CHUNK_SUMMARY_QUERY_PROMPT,
CREATE_CHUNK_SUMMARY_SYSTEM_PROMPT, CREATE_CHUNK_SUMMARY_SYSTEM_PROMPT,
) )
from config import PARENT_DOC_ID_KEY from llama_index.packs.docugami_kg_rag.config import PARENT_DOC_ID_KEY
from llama_index.core.llms import ChatMessage, MessageRole from llama_index.core.llms import ChatMessage, MessageRole
FORMAT = ( FORMAT = (
......
...@@ -3,7 +3,7 @@ from typing import Optional ...@@ -3,7 +3,7 @@ from typing import Optional
from llama_index.core import VectorStoreIndex from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore from llama_index.vector_stores.chroma import ChromaVectorStore
from config import CHROMA_DIRECTORY, EMBEDDINGS from llama_index.packs.docugami_kg_rag.config import CHROMA_DIRECTORY, EMBEDDINGS
import chromadb import chromadb
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment