Skip to content
Snippets Groups Projects
Commit de02b3a3 authored by Simonas's avatar Simonas
Browse files

feat: Unstructured elements splitting example

parent 599be55f
No related branches found
No related tags found
No related merge requests found
%% Cell type:markdown id: tags:
### Partition elements using the Unstructured library
%% Cell type:code id: tags:
``` python
# Installing the package may take a while
!pip install -q -U "unstructured[pdf]"
```
%% Cell type:code id: tags:
``` python
from unstructured.partition.auto import partition
# Source document: a PDF fetched directly by URL.
article_url = "https://arxiv.org/pdf/2402.05131.pdf"
# Partition the PDF into a list of elements using the "hi_res" strategy.
# NOTE(review): pdf_infer_table_structure=True is presumably what populates
# `metadata.text_as_html` on Table elements (read later when chunking) —
# confirm against the unstructured documentation.
elements = partition(url=article_url, strategy="hi_res", pdf_infer_table_structure=True)
```
%% Output
Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
%% Cell type:markdown id: tags:
#### Define helper functions
%% Cell type:markdown id: tags:
Validate whether a parsed title element is a real title
%% Cell type:code id: tags:
``` python
import re
def is_valid_title(title: str) -> bool:
    """Return True when *title* passes every title heuristic.

    A candidate is rejected when any of the following holds:

    1. it starts with a lowercase ASCII letter;
    2. it contains a character that is not a word character, whitespace,
       ``:``, ``-`` or ``.``;
    3. it ends with a dot.
    """
    starts_lowercase = re.match(r"^[a-z]", title) is not None
    has_special_character = re.search(r"[^\w\s:\-\.]", title) is not None
    ends_with_dot = title.endswith(".")
    return not (starts_lowercase or has_special_character or ends_with_dot)
```
%% Cell type:markdown id: tags:
Group elements by valid titles
%% Cell type:code id: tags:
``` python
from unstructured.documents.elements import Element
from colorama import Fore, Style
def group_elements_by_title(elements: list[Element]) -> dict:
    """Group document elements under the most recent valid title.

    Elements are visited in order. A ``Title`` element whose text passes
    ``is_valid_title`` becomes the current title; a rejected title is
    discarded and the current title is left unchanged. Every non-title
    element is appended to the group of the current title. Elements that
    appear before the first valid title are collected under ``"Untitled"``.

    The accept/reject decision for each candidate title is printed in
    green (accepted) or red (rejected).

    Args:
        elements: Parsed elements. Each must provide ``to_dict()`` returning
            a mapping with a ``"type"`` key and, for titles, a ``"text"`` key.

    Returns:
        A dict mapping title text to the list of elements that follow it.
        A title with no following non-title elements has no entry, and
        elements under a repeated title text are merged into one group.
    """
    grouped_elements = {}
    current_title = "Untitled"  # Default title for initial text without a title
    for element in elements:
        element_dict = element.to_dict()
        if element_dict.get("type") != "Title":
            # Create the group on first use and store the element in the same
            # step, so the first element of each group is never lost.
            grouped_elements.setdefault(current_title, []).append(element)
            continue
        potential_title = element_dict.get("text", "Untitled")
        if is_valid_title(potential_title):
            print(f"{Fore.GREEN}{potential_title}: True{Style.RESET_ALL}")
            current_title = potential_title
        else:
            print(f"{Fore.RED}{potential_title}: False{Style.RESET_ALL}")
    return grouped_elements
```
%% Cell type:markdown id: tags:
Generate chunks from the grouped elements using the semantic RollingWindowSplitter
%% Cell type:code id: tags:
``` python
from semantic_router.splitters import RollingWindowSplitter
def create_title_chunks(
    grouped_elements: dict, splitter: RollingWindowSplitter
) -> list:
    """Turn title-grouped elements into per-title lists of text chunks.

    Within each title, the text of consecutive non-table elements is
    accumulated and passed to ``splitter``; the ``content`` of every split
    it returns becomes a chunk. A ``Table`` element flushes the text
    accumulated so far and is then emitted as its own chunk, so chunk order
    follows document order. Elements with empty text are ignored.

    Args:
        grouped_elements: Mapping of title to a list of elements. Each
            element must expose ``text``, ``to_dict()`` and, for tables,
            ``metadata.text_as_html``.
        splitter: Callable that takes a list of strings and returns an
            iterable of objects with a ``content`` attribute.

    Returns:
        A list of ``{"title": ..., "chunks": [...]}`` dicts, one per title
        that produced at least one chunk, in the iteration order of
        ``grouped_elements``.
    """

    def split_texts(texts: list) -> list:
        """Run the splitter over *texts* and return each split's content."""
        return [split.content for split in splitter(texts)]

    title_with_chunks = []
    for title, elements in grouped_elements.items():
        if not elements:
            continue
        pending_texts = []
        chunks = []
        for element in elements:
            if not element.text:
                continue
            if element.to_dict().get("type") != "Table":
                pending_texts.append(element.text)
                continue
            # Flush the text that precedes the table so order is preserved.
            if pending_texts:
                chunks.extend(split_texts(pending_texts))
                pending_texts = []
            # The HTML rendering can be missing; fall back to the table's
            # plain text rather than emitting a None chunk.
            chunks.append(element.metadata.text_as_html or element.text)
        # Flush whatever text remains after the last table (or all of it
        # when the group contains no table).
        if pending_texts:
            chunks.extend(split_texts(pending_texts))
        if chunks:
            title_with_chunks.append({"title": title, "chunks": chunks})
    return title_with_chunks
```
%% Cell type:markdown id: tags:
Display chunked text in colors
%% Cell type:code id: tags:
``` python
from IPython.display import display, HTML
import itertools
def print_chunks_by_title(chunks_by_title):
    """Render every section's chunks as coloured HTML in the notebook.

    Each section contributes a black ``<h3>`` heading with its title,
    followed by one ``<p>`` per chunk. Paragraph colours rotate through
    red, green, blue and magenta; the rotation continues across sections
    rather than restarting at each heading.

    Args:
        chunks_by_title: Iterable of dicts with ``"title"`` and ``"chunks"``
            keys.
    """
    palette = itertools.cycle(["red", "green", "blue", "magenta"])
    fragments = []
    for section in chunks_by_title:
        heading = section["title"]
        pieces = section["chunks"]
        fragments.append(f"<h3 style='color: black;'>{heading}</h3>")
        # zip pulls from `pieces` first, so no colour is consumed once the
        # chunks run out and the cycle carries over cleanly to the next section.
        fragments.extend(
            f"<p style='color: {shade};'>{piece}</p>"
            for piece, shade in zip(pieces, palette)
        )
    display(HTML("".join(fragments)))
```
%% Cell type:markdown id: tags:
### Process the elements
%% Cell type:code id: tags:
``` python
import os
from semantic_router.encoders import OpenAIEncoder
# Build the encoder with the API key taken from the environment; the lookup
# raises KeyError if OPENAI_API_KEY is not set.
encoder = OpenAIEncoder(openai_api_key=os.environ["OPENAI_API_KEY"])
# Splitter later passed to create_title_chunks to break each title's
# accumulated text into chunks.
# NOTE(review): min_split_tokens / max_split_tokens are presumably the lower
# and upper bounds on chunk size in tokens — confirm against the
# semantic-router documentation.
splitter = RollingWindowSplitter(
encoder=encoder,
window_size=1, # Compares each element with the previous one
min_split_tokens=50,
max_split_tokens=300,
)
```
%% Cell type:code id: tags:
``` python
# Group the partitioned elements by title; the accept/reject decision for
# each candidate title is printed as a side effect.
grouped_elements = group_elements_by_title(elements)
```
%% Output
et! ee: False
b e F 0 1: False
] L C . s c [: False
Financial Report Chunking for Effective Retrieval Augmented Generation: True
Introduction: True
2 Jimeno Yepes et al.: False
1 https://www.sec.gov 2 https://www.sec.gov/files/cf-frm.pdf: False
2 Related work: True
4 Jimeno Yepes et al.: False
3 Methods: True
3.1 RAG setting for the experiments: True
3.2 Indexing and retrieval: True
7 https://weaviate.io/developers/weaviate 8 https://huggingface.co/sentence-transformers/multi-qa-mpnet-base-dot-: False
v1: False
3.3 Generation: True
Question: {query}: False
3.4 Chunking: True
3.5 Dataset: True
4 Results: True
11 https://platform.openai.com/docs/guides/embeddings/limitations-risks: False
10 Jimeno Yepes et al.: False
5 Discussion: True
12 Jimeno Yepes et al.: False
6 Conclusions and Future Work: True
References: True
%% Cell type:code id: tags:
``` python
# Split each title's text with the splitter configured above; tables are
# kept as separate chunks.
chunks_by_title = create_title_chunks(grouped_elements, splitter)
```
%% Cell type:code id: tags:
``` python
# Render the chunks as colour-coded HTML, one heading per title.
print_chunks_by_title(chunks_by_title)
```
%% Output
%% Cell type:code id: tags:
``` python
```
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment