Skip to content
Snippets Groups Projects
Unverified commit 08684b50, authored by Eloy Lafuente, committed by GitHub
Browse files

Support indented code block fences (#12393)

parent 5148c806
No related branches found
No related tags found
No related merge requests found
......@@ -65,7 +65,7 @@ class MarkdownNodeParser(NodeParser):
current_section = ""
for line in lines:
if line.startswith("```"):
if line.lstrip().startswith("```"):
code_block = not code_block
header_match = re.match(r"^(#+)\s(.*)", line)
if header_match and not code_block:
......
......@@ -25,6 +25,77 @@ Header 2 content
assert splits[1].text == "Header 2\nHeader 2 content"
def test_header_splits_with_indented_code_blocks() -> None:
    """Check header-based splitting when fenced code sits inside list items.

    The fixture combines a plain fenced block, fenced blocks placed under
    list elements, and a final fence that is followed by one more header.
    Six nodes are expected: lines starting with ``#`` inside the fences must
    not open new sections, while the real headers before and after them must.

    NOTE(review): the fixture's leading whitespace looks like it was lost in
    this view of the file (the fences are meant to be indented) -- confirm
    the exact text against the repository before relying on it.
    """
    markdown_text = """Some text
# Header 1
## Header 2
### Header 3
```txt
Non indented block code
```
A list begins here:
* Element 1
```txt
# has some indented code, but it's not handled as that.
```
* Element 2
```txt
# also has some code, but unbalanced fences (different number of spaces). Everything after this is considered code block!
```
* Element 3
* Element 4
### Another Header 3
```txt
# has some wrongly indented fence, and leads to incorrect header detection.
```
## Another Header 2
"""
    parser = MarkdownNodeParser()
    nodes = parser.get_nodes_from_documents([Document(text=markdown_text)])

    # Expected metadata, built up level by level along the header hierarchy.
    under_h1 = {"Header_1": "Header 1"}
    under_h2 = {**under_h1, "Header_2": "Header 2"}
    under_h3 = {**under_h2, "Header_3": "Header 3"}
    under_another_h3 = {**under_h2, "Header_3": "Another Header 3"}
    under_another_h2 = {**under_h1, "Header_2": "Another Header 2"}

    assert len(nodes) == 6

    # Text before the first header carries no header metadata.
    assert nodes[0].metadata == {}
    assert nodes[0].text == "Some text"

    assert nodes[1].metadata == under_h1
    assert nodes[1].text == "Header 1"

    assert nodes[2].metadata == under_h2
    assert nodes[2].text == "Header 2"

    # The whole list, including its fenced blocks, stays under "Header 3".
    assert nodes[3].metadata == under_h3
    assert nodes[3].text.endswith("* Element 4")

    # The section ends on the closing fence, not on the "#" line inside it.
    assert nodes[4].metadata == under_another_h3
    assert nodes[4].text.endswith("```")

    # A new level-2 header drops the previous level-3 entry.
    assert nodes[5].metadata == under_another_h2
    assert nodes[5].text == "Another Header 2"
def test_non_header_splits() -> None:
markdown_parser = MarkdownNodeParser()
......
......@@ -50,7 +50,7 @@ license = "MIT"
maintainers = ["FarisHijazi", "Haowjy", "ephe-meral", "hursh-desai", "iamarunbrahma", "jon-chuang", "mmaatouk", "ravi03071991", "sangwongenip", "thejessezhang"]
name = "llama-index-readers-file"
readme = "README.md"
version = "0.1.15"
version = "0.1.16"
[tool.poetry.dependencies]
python = ">=3.8.1,<4.0"
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment