Skip to content
Snippets Groups Projects
Unverified Commit f81e8393, authored by raveenplgithub, committed by GitHub
Browse files

Add support for extra partitioning parameters for partition_html used in...

Add support for extra partitioning parameters for partition_html used in UnstructuredElementNodeParser (#11077)

Add support for extra partitioning parameters for partition_html

Co-authored-by: Piotr Kandziora <piotr.kandziora@shijigroup.com>
parent cadf1308
No related branches found
No related tags found
No related merge requests found
"""Unstructured element node parser.""" """Unstructured element node parser."""
from typing import Any, Callable, List, Optional from typing import Any, Callable, List, Optional, Dict
from llama_index.core.bridge.pydantic import Field
import pandas as pd import pandas as pd
from llama_index.core.callbacks.base import CallbackManager from llama_index.core.callbacks.base import CallbackManager
...@@ -45,11 +48,17 @@ class UnstructuredElementNodeParser(BaseElementNodeParser): ...@@ -45,11 +48,17 @@ class UnstructuredElementNodeParser(BaseElementNodeParser):
""" """
partitioning_parameters: Optional[Dict[str, Any]] = Field(
default={},
description="Extra dictionary representing parameters of the partitioning process.",
)
def __init__( def __init__(
self, self,
callback_manager: Optional[CallbackManager] = None, callback_manager: Optional[CallbackManager] = None,
llm: Optional[Any] = None, llm: Optional[Any] = None,
summary_query_str: str = DEFAULT_SUMMARY_QUERY_STR, summary_query_str: str = DEFAULT_SUMMARY_QUERY_STR,
partitioning_parameters: Optional[Dict[str, Any]] = {},
) -> None: ) -> None:
"""Initialize.""" """Initialize."""
try: try:
...@@ -66,6 +75,7 @@ class UnstructuredElementNodeParser(BaseElementNodeParser): ...@@ -66,6 +75,7 @@ class UnstructuredElementNodeParser(BaseElementNodeParser):
callback_manager=callback_manager, callback_manager=callback_manager,
llm=llm, llm=llm,
summary_query_str=summary_query_str, summary_query_str=summary_query_str,
partitioning_parameters=partitioning_parameters,
) )
@classmethod @classmethod
...@@ -91,7 +101,7 @@ class UnstructuredElementNodeParser(BaseElementNodeParser): ...@@ -91,7 +101,7 @@ class UnstructuredElementNodeParser(BaseElementNodeParser):
from unstructured.partition.html import partition_html # pants: no-infer-dep from unstructured.partition.html import partition_html # pants: no-infer-dep
table_filters = table_filters or [] table_filters = table_filters or []
elements = partition_html(text=text) elements = partition_html(text=text, **self.partitioning_parameters)
output_els = [] output_els = []
for idx, element in enumerate(elements): for idx, element in enumerate(elements):
if "unstructured.documents.html.HTMLTable" in str(type(element)): if "unstructured.documents.html.HTMLTable" in str(type(element)):
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment