From 08684b509dad928d2f86fc603ae2a3ecb5f13a7f Mon Sep 17 00:00:00 2001
From: Eloy Lafuente <stronk7@moodle.org>
Date: Tue, 9 Apr 2024 20:00:56 +0200
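Subject: [PATCH] Support indented code block fences (#12393)

MarkdownNodeParser toggled its code-block state only when a line
started with ``` in the first column. Strip leading whitespace before
the check so that indented fences (for example, inside list items) are
recognized as well; lines beginning with "#" inside such blocks are
then no longer treated as headers.

Add a test covering indented and unevenly indented fences, and bump
llama-index-readers-file to 0.1.16.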
---
 .../core/node_parser/file/markdown.py         |  2 +-
 .../tests/node_parser/test_markdown.py        | 71 +++++++++++++++++++
 .../llama-index-readers-file/pyproject.toml   |  2 +-
 3 files changed, 73 insertions(+), 2 deletions(-)

diff --git a/llama-index-core/llama_index/core/node_parser/file/markdown.py b/llama-index-core/llama_index/core/node_parser/file/markdown.py
index 6f8ecb9840..1be045696f 100644
--- a/llama-index-core/llama_index/core/node_parser/file/markdown.py
+++ b/llama-index-core/llama_index/core/node_parser/file/markdown.py
@@ -65,7 +65,7 @@ class MarkdownNodeParser(NodeParser):
         current_section = ""
 
         for line in lines:
-            if line.startswith("```"):
+            if line.lstrip().startswith("```"):
                 code_block = not code_block
             header_match = re.match(r"^(#+)\s(.*)", line)
             if header_match and not code_block:
diff --git a/llama-index-core/tests/node_parser/test_markdown.py b/llama-index-core/tests/node_parser/test_markdown.py
index 876aae23e8..84141feec4 100644
--- a/llama-index-core/tests/node_parser/test_markdown.py
+++ b/llama-index-core/tests/node_parser/test_markdown.py
@@ -25,6 +25,77 @@ Header 2 content
     assert splits[1].text == "Header 2\nHeader 2 content"
 
 
+def test_header_splits_with_indented_code_blocks() -> None:
+    markdown_parser = MarkdownNodeParser()
+
+    splits = markdown_parser.get_nodes_from_documents(
+        [
+            Document(
+                text="""Some text
+# Header 1
+## Header 2
+### Header 3
+```txt
+Non indented block code
+```
+A list begins here:
+
+* Element 1
+
+    ```txt
+    # has some indented code, but it's not handled as that.
+    ```
+* Element 2
+
+```txt
+    # also has some code, but unbalanced fences (different number of spaces). Everything after this is considered code block!
+ ```
+
+* Element 3
+* Element 4
+### Another Header 3
+ ```txt
+# has some wrongly indented fence, and leads to incorrect header detection.
+```
+
+## Another Header 2
+    """
+            )
+        ]
+    )
+
+    assert len(splits) == 6
+
+    assert splits[0].metadata == {}
+    assert splits[0].text == "Some text"
+
+    assert splits[1].metadata == {"Header_1": "Header 1"}
+    assert splits[1].text == "Header 1"
+
+    assert splits[2].metadata == {"Header_1": "Header 1", "Header_2": "Header 2"}
+    assert splits[2].text == "Header 2"
+
+    assert splits[3].metadata == {
+        "Header_1": "Header 1",
+        "Header_2": "Header 2",
+        "Header_3": "Header 3",
+    }
+    assert splits[3].text.endswith("* Element 4")
+
+    assert splits[4].metadata == {
+        "Header_1": "Header 1",
+        "Header_2": "Header 2",
+        "Header_3": "Another Header 3",
+    }
+    assert splits[4].text.endswith("```")
+
+    assert splits[5].metadata == {
+        "Header_1": "Header 1",
+        "Header_2": "Another Header 2",
+    }
+    assert splits[5].text == "Another Header 2"
+
+
 def test_non_header_splits() -> None:
     markdown_parser = MarkdownNodeParser()
 
diff --git a/llama-index-integrations/readers/llama-index-readers-file/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-file/pyproject.toml
index 411af57fff..2f9ea3ad10 100644
--- a/llama-index-integrations/readers/llama-index-readers-file/pyproject.toml
+++ b/llama-index-integrations/readers/llama-index-readers-file/pyproject.toml
@@ -50,7 +50,7 @@ license = "MIT"
 maintainers = ["FarisHijazi", "Haowjy", "ephe-meral", "hursh-desai", "iamarunbrahma", "jon-chuang", "mmaatouk", "ravi03071991", "sangwongenip", "thejessezhang"]
 name = "llama-index-readers-file"
 readme = "README.md"
-version = "0.1.15"
+version = "0.1.16"
 
 [tool.poetry.dependencies]
 python = ">=3.8.1,<4.0"
-- 
GitLab