Support indented code block fences (#12393)

08684b50 · Eloy Lafuente · GitHub · 5148c806 · 08684b50 · 08684b50
Unverified commit 08684b50, authored 11 months ago by Eloy Lafuente and committed by GitHub 11 months ago.
--- a/llama-index-core/llama_index/core/node_parser/file/markdown.py
+++ b/llama-index-core/llama_index/core/node_parser/file/markdown.py
@@ -65,7 +65,7 @@ class MarkdownNodeParser(NodeParser):
        current_section = ""

        for line in lines:
-            if line.startswith("```"):
+            if line.lstrip().startswith("```"):
                code_block = not code_block
            header_match = re.match(r"^(#+)\s(.*)", line)
            if header_match and not code_block:

--- a/llama-index-core/tests/node_parser/test_markdown.py
+++ b/llama-index-core/tests/node_parser/test_markdown.py
@@ -25,6 +25,77 @@ Header 2 content
    assert splits[1].text == "Header 2\nHeader 2 content"


+def test_header_splits_with_indented_code_blocks() -> None:
+    markdown_parser = MarkdownNodeParser()
+
+    splits = markdown_parser.get_nodes_from_documents(
+        [
+            Document(
+                text="""Some text
+# Header 1
+## Header 2
+### Header 3
+```txt
+Non indented block code
+```
+A list begins here:
+
+* Element 1
+
+    ```txt
+    # has some indented code, but it's not handled as that.
+    ```
+* Element 2
+
+```txt
+    # also has some code, but unbalanced fences (different number of spaces). Everything after this is considered code block!
+ ```
+
+* Element 3
+* Element 4
+### Another Header 3
+ ```txt
+# has some wrongly indented fence, and leads to incorrect header detection.
+```
+
+## Another Header 2
+    """
+            )
+        ]
+    )
+
+    assert len(splits) == 6
+
+    assert splits[0].metadata == {}
+    assert splits[0].text == "Some text"
+
+    assert splits[1].metadata == {"Header_1": "Header 1"}
+    assert splits[1].text == "Header 1"
+
+    assert splits[2].metadata == {"Header_1": "Header 1", "Header_2": "Header 2"}
+    assert splits[2].text == "Header 2"
+
+    assert splits[3].metadata == {
+        "Header_1": "Header 1",
+        "Header_2": "Header 2",
+        "Header_3": "Header 3",
+    }
+    assert splits[3].text.endswith("* Element 4")
+
+    assert splits[4].metadata == {
+        "Header_1": "Header 1",
+        "Header_2": "Header 2",
+        "Header_3": "Another Header 3",
+    }
+    assert splits[4].text.endswith("```")
+
+    assert splits[5].metadata == {
+        "Header_1": "Header 1",
+        "Header_2": "Another Header 2",
+    }
+    assert splits[5].text == "Another Header 2"
+
+
 def test_non_header_splits() -> None:
    markdown_parser = MarkdownNodeParser()


--- a/llama-index-integrations/readers/llama-index-readers-file/pyproject.toml
+++ b/llama-index-integrations/readers/llama-index-readers-file/pyproject.toml
@@ -50,7 +50,7 @@ license = "MIT"
 maintainers = ["FarisHijazi", "Haowjy", "ephe-meral", "hursh-desai", "iamarunbrahma", "jon-chuang", "mmaatouk", "ravi03071991", "sangwongenip", "thejessezhang"]
 name = "llama-index-readers-file"
 readme = "README.md"
-version = "0.1.15"
+version = "0.1.16"

 [tool.poetry.dependencies]
 python = ">=3.8.1,<4.0"