diff --git a/llama_index/node_parser/relational/markdown_element.py b/llama_index/node_parser/relational/markdown_element.py index 0be5edb179d721f191f287115fe83b9cf2e89cf2..3ea08a734ef567efdf9a41df14691d6ae0dd8f42 100644 --- a/llama_index/node_parser/relational/markdown_element.py +++ b/llama_index/node_parser/relational/markdown_element.py @@ -49,7 +49,9 @@ class MarkdownElementNodeParser(BaseElementNodeParser): def get_nodes_from_node(self, node: TextNode) -> List[BaseNode]: """Get nodes from node.""" elements = self.extract_elements( - node.get_content(), table_filters=[self.filter_table] + node.get_content(), + table_filters=[self.filter_table], + node_id=node.id_, ) table_elements = self.get_table_elements(elements) # extract summaries over table elements @@ -59,8 +61,13 @@ class MarkdownElementNodeParser(BaseElementNodeParser): return self.get_nodes_from_elements(elements) def extract_elements( - self, text: str, table_filters: Optional[List[Callable]] = None, **kwargs: Any + self, + text: str, + node_id: Optional[str] = None, + table_filters: Optional[List[Callable]] = None, + **kwargs: Any, ) -> List[Element]: + # get node id for each node so that we can avoid using the same id for different nodes """Extract elements from text.""" lines = text.split("\n") currentElement = None @@ -168,27 +175,30 @@ class MarkdownElementNodeParser(BaseElementNodeParser): table = md_to_df(element.element) elements[idx] = Element( - id=f"id_{idx}", type="table", element=element, table=table + id=f"id_{node_id}_{idx}" if node_id else f"id_{idx}", + type="table", + element=element, + table=table, ) else: # for non-perfect tables, we will store the raw text # and give it a different type to differentiate it from perfect tables elements[idx] = Element( - id=f"id_{idx}", + id=f"id_{node_id}_{idx}" if node_id else f"id_{idx}", type="table_text", element=element.element, # table=table ) else: elements[idx] = Element( - id=f"id_{idx}", + id=f"id_{node_id}_{idx}" if node_id else f"id_{idx}", type="text", element=element.element, ) else: # if the element is not a table, keep it as to text elements[idx] = Element( - id=f"id_{idx}", + id=f"id_{node_id}_{idx}" if node_id else f"id_{idx}", type="text", element=element.element, )