Skip to content
Snippets Groups Projects
Unverified commit e3a169b9, authored by Haotian Zhang and committed by GitHub
Browse files

Fix MD duplicated Node id from multiple docs (#10564)

* Fix MD duplicated Node id from multiple docs

* cr
parent 60b75cb0
No related branches found
No related tags found
No related merge requests found
...@@ -49,7 +49,9 @@ class MarkdownElementNodeParser(BaseElementNodeParser): ...@@ -49,7 +49,9 @@ class MarkdownElementNodeParser(BaseElementNodeParser):
def get_nodes_from_node(self, node: TextNode) -> List[BaseNode]: def get_nodes_from_node(self, node: TextNode) -> List[BaseNode]:
"""Get nodes from node.""" """Get nodes from node."""
elements = self.extract_elements( elements = self.extract_elements(
node.get_content(), table_filters=[self.filter_table] node.get_content(),
table_filters=[self.filter_table],
node_id=node.id_,
) )
table_elements = self.get_table_elements(elements) table_elements = self.get_table_elements(elements)
# extract summaries over table elements # extract summaries over table elements
...@@ -59,8 +61,13 @@ class MarkdownElementNodeParser(BaseElementNodeParser): ...@@ -59,8 +61,13 @@ class MarkdownElementNodeParser(BaseElementNodeParser):
return self.get_nodes_from_elements(elements) return self.get_nodes_from_elements(elements)
def extract_elements( def extract_elements(
self, text: str, table_filters: Optional[List[Callable]] = None, **kwargs: Any self,
text: str,
node_id: Optional[str] = None,
table_filters: Optional[List[Callable]] = None,
**kwargs: Any,
) -> List[Element]: ) -> List[Element]:
# get node id for each node so that we can avoid using the same id for different nodes
"""Extract elements from text.""" """Extract elements from text."""
lines = text.split("\n") lines = text.split("\n")
currentElement = None currentElement = None
...@@ -168,27 +175,30 @@ class MarkdownElementNodeParser(BaseElementNodeParser): ...@@ -168,27 +175,30 @@ class MarkdownElementNodeParser(BaseElementNodeParser):
table = md_to_df(element.element) table = md_to_df(element.element)
elements[idx] = Element( elements[idx] = Element(
id=f"id_{idx}", type="table", element=element, table=table id=f"id_{node_id}_{idx}" if node_id else f"id_{idx}",
type="table",
element=element,
table=table,
) )
else: else:
# for non-perfect tables, we will store the raw text # for non-perfect tables, we will store the raw text
# and give it a different type to differentiate it from perfect tables # and give it a different type to differentiate it from perfect tables
elements[idx] = Element( elements[idx] = Element(
id=f"id_{idx}", id=f"id_{node_id}_{idx}" if node_id else f"id_{idx}",
type="table_text", type="table_text",
element=element.element, element=element.element,
# table=table # table=table
) )
else: else:
elements[idx] = Element( elements[idx] = Element(
id=f"id_{idx}", id=f"id_{node_id}_{idx}" if node_id else f"id_{idx}",
type="text", type="text",
element=element.element, element=element.element,
) )
else: else:
# if the element is not a table, keep it as text # if the element is not a table, keep it as text
elements[idx] = Element( elements[idx] = Element(
id=f"id_{idx}", id=f"id_{node_id}_{idx}" if node_id else f"id_{idx}",
type="text", type="text",
element=element.element, element=element.element,
) )
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment