From f81e8393ae0c5ea29eb935e0f2e024b3e8cb931f Mon Sep 17 00:00:00 2001
From: raveenplgithub <128684516+raveenplgithub@users.noreply.github.com>
Date: Thu, 22 Feb 2024 00:50:40 +0100
Subject: [PATCH] Add support for extra partitioning parameters for
 partition_html used in UnstructuredElementNodeParser (#11077)

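Add support for extra partitioning parameters for partition_html: a new
`partitioning_parameters` field on UnstructuredElementNodeParser whose
entries are forwarded as keyword arguments to `partition_html`.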

Co-authored-by: Piotr Kandziora <piotr.kandziora@shijigroup.com>
---
 .../node_parser/relational/unstructured_element.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/llama-index-core/llama_index/core/node_parser/relational/unstructured_element.py b/llama-index-core/llama_index/core/node_parser/relational/unstructured_element.py
index 4dd2152fc3..e19124addc 100644
--- a/llama-index-core/llama_index/core/node_parser/relational/unstructured_element.py
+++ b/llama-index-core/llama_index/core/node_parser/relational/unstructured_element.py
@@ -1,6 +1,9 @@
 """Unstructured element node parser."""
 
-from typing import Any, Callable, List, Optional
+from typing import Any, Callable, List, Optional, Dict
+
+
+from llama_index.core.bridge.pydantic import Field
 
 import pandas as pd
 from llama_index.core.callbacks.base import CallbackManager
@@ -45,11 +48,17 @@ class UnstructuredElementNodeParser(BaseElementNodeParser):
 
     """
 
+    partitioning_parameters: Optional[Dict[str, Any]] = Field(
+        default={},
+        description="Extra dictionary representing parameters of the partitioning process.",
+    )
+
     def __init__(
         self,
         callback_manager: Optional[CallbackManager] = None,
         llm: Optional[Any] = None,
         summary_query_str: str = DEFAULT_SUMMARY_QUERY_STR,
+        partitioning_parameters: Optional[Dict[str, Any]] = {},
     ) -> None:
         """Initialize."""
         try:
@@ -66,6 +75,7 @@ class UnstructuredElementNodeParser(BaseElementNodeParser):
             callback_manager=callback_manager,
             llm=llm,
             summary_query_str=summary_query_str,
+            partitioning_parameters=partitioning_parameters,
         )
 
     @classmethod
@@ -91,7 +101,7 @@ class UnstructuredElementNodeParser(BaseElementNodeParser):
         from unstructured.partition.html import partition_html  # pants: no-infer-dep
 
         table_filters = table_filters or []
-        elements = partition_html(text=text)
+        elements = partition_html(text=text, **self.partitioning_parameters)
         output_els = []
         for idx, element in enumerate(elements):
             if "unstructured.documents.html.HTMLTable" in str(type(element)):
-- 
GitLab