# Patch summary (descriptive preamble placed before the first "diff --git" header;
# it is not part of any hunk).
#
# 1. llama-index-core/llama_index/core/node_parser/file/markdown.py, class MarkdownNodeParser:
#    the code-fence test changes from line.startswith("```") to
#    line.lstrip().startswith("```"), so a fence line that has leading whitespace
#    now also flips the code_block flag. The flag is a plain boolean toggle per
#    fence line, and a line matching r"^(#+)\s(.*)" is only treated as a header
#    while code_block is false.
#
# 2. llama-index-core/tests/node_parser/test_markdown.py:
#    adds test_header_splits_with_indented_code_blocks, which feeds a document
#    mixing non-indented and indented fences and asserts that exactly 6 splits
#    come back, checking each split's Header_1 / Header_2 / Header_3 metadata
#    and the start or end of its text.
#
# 3. llama-index-integrations/readers/llama-index-readers-file/pyproject.toml:
#    version goes from 0.1.15 to 0.1.16.
#
# NOTE(review): the version bump is in llama-index-readers-file while the code
# change is in llama-index-core - confirm that this is the intended package.
# NOTE(review): line breaks and indentation appear to have been collapsed in
# this copy of the patch (several hunks share one physical line), so it will
# probably not apply as-is; re-export it from the repository before applying.
diff --git a/llama-index-core/llama_index/core/node_parser/file/markdown.py b/llama-index-core/llama_index/core/node_parser/file/markdown.py index 6f8ecb984051f9538daea5f03f7a881e49d78853..1be045696fd3d236ece0890df453047994bdfd31 100644 --- a/llama-index-core/llama_index/core/node_parser/file/markdown.py +++ b/llama-index-core/llama_index/core/node_parser/file/markdown.py @@ -65,7 +65,7 @@ class MarkdownNodeParser(NodeParser): current_section = "" for line in lines: - if line.startswith("```"): + if line.lstrip().startswith("```"): code_block = not code_block header_match = re.match(r"^(#+)\s(.*)", line) if header_match and not code_block: diff --git a/llama-index-core/tests/node_parser/test_markdown.py b/llama-index-core/tests/node_parser/test_markdown.py index 876aae23e887423203bcbe9102bcd7ad93e2c441..84141feec4a7aea3d4f31141f6f732d105e454b4 100644 --- a/llama-index-core/tests/node_parser/test_markdown.py +++ b/llama-index-core/tests/node_parser/test_markdown.py @@ -25,6 +25,77 @@ Header 2 content assert splits[1].text == "Header 2\nHeader 2 content" +def test_header_splits_with_indented_code_blocks() -> None: + markdown_parser = MarkdownNodeParser() + + splits = markdown_parser.get_nodes_from_documents( + [ + Document( + text="""Some text +# Header 1 +## Header 2 +### Header 3 +```txt +Non indented block code +``` +A list begins here: + +* Element 1 + + ```txt + # has some indented code, but it's not handled as that. + ``` +* Element 2 + +```txt + # also has some code, but unbalanced fences (different number of spaces). Everything after this is considered code block! + ``` + +* Element 3 +* Element 4 +### Another Header 3 + ```txt +# has some wrongly indented fence, and leads to incorrect header detection. 
+``` + +## Another Header 2 + """ + ) + ] + ) + + assert len(splits) == 6 + + assert splits[0].metadata == {} + assert splits[0].text == "Some text" + + assert splits[1].metadata == {"Header_1": "Header 1"} + assert splits[1].text == "Header 1" + + assert splits[2].metadata == {"Header_1": "Header 1", "Header_2": "Header 2"} + assert splits[2].text == "Header 2" + + assert splits[3].metadata == { + "Header_1": "Header 1", + "Header_2": "Header 2", + "Header_3": "Header 3", + } + assert splits[3].text.endswith("* Element 4") + + assert splits[4].metadata == { + "Header_1": "Header 1", + "Header_2": "Header 2", + "Header_3": "Another Header 3", + } + assert splits[4].text.endswith("```") + + assert splits[5].metadata == { + "Header_1": "Header 1", + "Header_2": "Another Header 2", + } + assert splits[5].text == "Another Header 2" + + def test_non_header_splits() -> None: markdown_parser = MarkdownNodeParser() diff --git a/llama-index-integrations/readers/llama-index-readers-file/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-file/pyproject.toml index 411af57fff70d8bc3718228abaae2aa964f94e57..2f9ea3ad101168c7b0ceb8eecd69583a15bcc3c6 100644 --- a/llama-index-integrations/readers/llama-index-readers-file/pyproject.toml +++ b/llama-index-integrations/readers/llama-index-readers-file/pyproject.toml @@ -50,7 +50,7 @@ license = "MIT" maintainers = ["FarisHijazi", "Haowjy", "ephe-meral", "hursh-desai", "iamarunbrahma", "jon-chuang", "mmaatouk", "ravi03071991", "sangwongenip", "thejessezhang"] name = "llama-index-readers-file" readme = "README.md" -version = "0.1.15" +version = "0.1.16" [tool.poetry.dependencies] python = ">=3.8.1,<4.0"