From 08684b509dad928d2f86fc603ae2a3ecb5f13a7f Mon Sep 17 00:00:00 2001 From: Eloy Lafuente <stronk7@moodle.org> Date: Tue, 9 Apr 2024 20:00:56 +0200 Subject: [PATCH] Support indented code block fences (#12393) --- .../core/node_parser/file/markdown.py | 2 +- .../tests/node_parser/test_markdown.py | 71 +++++++++++++++++++ .../llama-index-readers-file/pyproject.toml | 2 +- 3 files changed, 73 insertions(+), 2 deletions(-) diff --git a/llama-index-core/llama_index/core/node_parser/file/markdown.py b/llama-index-core/llama_index/core/node_parser/file/markdown.py index 6f8ecb9840..1be045696f 100644 --- a/llama-index-core/llama_index/core/node_parser/file/markdown.py +++ b/llama-index-core/llama_index/core/node_parser/file/markdown.py @@ -65,7 +65,7 @@ class MarkdownNodeParser(NodeParser): current_section = "" for line in lines: - if line.startswith("```"): + if line.lstrip().startswith("```"): code_block = not code_block header_match = re.match(r"^(#+)\s(.*)", line) if header_match and not code_block: diff --git a/llama-index-core/tests/node_parser/test_markdown.py b/llama-index-core/tests/node_parser/test_markdown.py index 876aae23e8..84141feec4 100644 --- a/llama-index-core/tests/node_parser/test_markdown.py +++ b/llama-index-core/tests/node_parser/test_markdown.py @@ -25,6 +25,77 @@ Header 2 content assert splits[1].text == "Header 2\nHeader 2 content" +def test_header_splits_with_indented_code_blocks() -> None: + markdown_parser = MarkdownNodeParser() + + splits = markdown_parser.get_nodes_from_documents( + [ + Document( + text="""Some text +# Header 1 +## Header 2 +### Header 3 +```txt +Non indented block code +``` +A list begins here: + +* Element 1 + + ```txt + # has some indented code, but it's not handled as that. + ``` +* Element 2 + +```txt + # also has some code, but unbalanced fences (different number of spaces). Everything after this is considered code block! + ``` + +* Element 3 +* Element 4 +### Another Header 3 + ```txt +# has some wrongly indented fence, and leads to incorrect header detection. +``` + +## Another Header 2 + """ + ) + ] + ) + + assert len(splits) == 6 + + assert splits[0].metadata == {} + assert splits[0].text == "Some text" + + assert splits[1].metadata == {"Header_1": "Header 1"} + assert splits[1].text == "Header 1" + + assert splits[2].metadata == {"Header_1": "Header 1", "Header_2": "Header 2"} + assert splits[2].text == "Header 2" + + assert splits[3].metadata == { + "Header_1": "Header 1", + "Header_2": "Header 2", + "Header_3": "Header 3", + } + assert splits[3].text.endswith("* Element 4") + + assert splits[4].metadata == { + "Header_1": "Header 1", + "Header_2": "Header 2", + "Header_3": "Another Header 3", + } + assert splits[4].text.endswith("```") + + assert splits[5].metadata == { + "Header_1": "Header 1", + "Header_2": "Another Header 2", + } + assert splits[5].text == "Another Header 2" + + def test_non_header_splits() -> None: markdown_parser = MarkdownNodeParser() diff --git a/llama-index-integrations/readers/llama-index-readers-file/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-file/pyproject.toml index 411af57fff..2f9ea3ad10 100644 --- a/llama-index-integrations/readers/llama-index-readers-file/pyproject.toml +++ b/llama-index-integrations/readers/llama-index-readers-file/pyproject.toml @@ -50,7 +50,7 @@ license = "MIT" maintainers = ["FarisHijazi", "Haowjy", "ephe-meral", "hursh-desai", "iamarunbrahma", "jon-chuang", "mmaatouk", "ravi03071991", "sangwongenip", "thejessezhang"] name = "llama-index-readers-file" readme = "README.md" -version = "0.1.15" +version = "0.1.16" [tool.poetry.dependencies] python = ">=3.8.1,<4.0" -- GitLab