Skip to content
Snippets Groups Projects
Unverified Commit 6b6fbaf1 authored by James Briggs's avatar James Briggs
Browse files

cleanup install to incl semantic router

parent 0445421e
No related branches found
No related tags found
No related merge requests found
%% Cell type:markdown id: tags:
### Partition elements using Unstructured library
%% Cell type:code id: tags:
``` python
# It may take longer to install the package
!pip install -q -U "unstructured[pdf]"
!pip install -qU \
"unstructured[pdf]==0.12.4" \
"semantic-router==0.0.24"
```
%% Cell type:markdown id: tags:
Start by downloading and processing an ArXiv paper.
%% Cell type:code id: tags:
``` python
from unstructured.partition.auto import partition
# URL of the ArXiv paper (PDF) that the rest of the notebook processes.
article_url = "https://arxiv.org/pdf/2402.05131.pdf"
# Partition the PDF at that URL into a list of document elements.
# NOTE(review): strategy="hi_res" with pdf_infer_table_structure=True presumably enables
# model-based layout/table detection (the cell output shows a table-transformer
# checkpoint being loaded) — confirm against the unstructured documentation.
elements = partition(url=article_url, strategy="hi_res", pdf_infer_table_structure=True)
```
%% Output
Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
%% Cell type:markdown id: tags:
#### Define helper functions
%% Cell type:markdown id: tags:
Check whether a parsed title element is a real title
%% Cell type:code id: tags:
``` python
import re
def is_valid_title(title: str) -> bool:
    """Return True when *title* looks like a genuine section heading.

    A candidate is rejected when any of these heuristics fires:

    1. it starts with a lowercase ASCII letter;
    2. it contains a character other than word characters, whitespace,
       ``:``, ``-`` and ``.``;
    3. it ends with a dot.
    """
    starts_lowercase = re.match(r"^[a-z]", title) is not None
    has_special_char = re.search(r"[^\w\s:\-\.]", title) is not None
    ends_with_dot = title.endswith(".")
    return not (starts_lowercase or has_special_char or ends_with_dot)
```
%% Cell type:markdown id: tags:
Group elements by valid titles
%% Cell type:code id: tags:
``` python
from unstructured.documents.elements import Element
from colorama import Fore, Style
def group_elements_by_title(elements: list[Element]) -> dict:
    """Group parsed elements under the most recent valid title.

    Elements are walked in order. A ``Title`` element whose text passes
    ``is_valid_title`` becomes the current section title; a ``Title`` that
    fails validation is discarded. Every other element is appended to the
    list of the current title. Elements seen before any valid title are
    collected under ``"Untitled"``.

    Each title candidate is printed in green (accepted) or red (rejected).

    Args:
        elements: Parsed document elements, in document order.

    Returns:
        A dict mapping each title to the list of non-title elements that
        follow it. A title with no following content has no entry.
    """
    grouped_elements = {}
    current_title = "Untitled"  # Default title for initial text without a title
    for element in elements:
        element_dict = element.to_dict()
        if element_dict.get("type") != "Title":
            # setdefault creates the bucket on first use AND lets us append in
            # the same step; the previous if/else created the list but dropped
            # the first element of every section.
            grouped_elements.setdefault(current_title, []).append(element)
            continue

        potential_title = element_dict.get("text", "Untitled")
        if is_valid_title(potential_title):
            print(f"{Fore.GREEN}{potential_title}: True{Style.RESET_ALL}")
            current_title = potential_title
        else:
            # Rejected titles are dropped entirely; the current section continues.
            print(f"{Fore.RED}{potential_title}: False{Style.RESET_ALL}")
    return grouped_elements
```
%% Cell type:markdown id: tags:
Generate chunks from the grouped elements using the semantic `RollingWindowSplitter`
%% Cell type:code id: tags:
``` python
from semantic_router.splitters import RollingWindowSplitter
def create_title_chunks(
    grouped_elements: dict, splitter: RollingWindowSplitter
) -> list:
    """Turn title-grouped elements into per-title lists of chunks.

    For each title, consecutive non-table element texts are gathered and
    passed to *splitter*; the ``content`` of every resulting split becomes a
    chunk. A ``Table`` element ends the current text run and is added as its
    own chunk, using its ``metadata.text_as_html``. Elements with empty text
    are ignored.

    Args:
        grouped_elements: Mapping of title to the list of elements under it.
        splitter: Callable that takes a list of strings and returns splits
            exposing a ``content`` attribute.

    Returns:
        A list of ``{"title": ..., "chunks": [...]}`` dicts, one per title
        that produced at least one chunk.
    """
    sections = []
    for title, elements in grouped_elements.items():
        if not elements:
            continue

        section_chunks = []
        pending_texts = []
        for element in elements:
            if not element.text:
                continue

            is_table = element.to_dict().get("type") == "Table"
            if not is_table:
                pending_texts.append(element.text)
                continue

            # A table closes the running text: split what was gathered so far
            # before emitting the table itself, so chunk order follows the document.
            if pending_texts:
                text_chunks = [split.content for split in splitter(pending_texts)]
                section_chunks.extend(text_chunks)
                pending_texts = []
            section_chunks.append(element.metadata.text_as_html)

        # Split whatever text is left after the last table (or all of it when
        # the section had no table).
        if pending_texts:
            text_chunks = [split.content for split in splitter(pending_texts)]
            section_chunks.extend(text_chunks)

        if section_chunks:
            sections.append({"title": title, "chunks": section_chunks})
    return sections
```
%% Cell type:markdown id: tags:
Display chunked text in colors
%% Cell type:code id: tags:
``` python
from IPython.display import display, HTML
import itertools
def print_chunks_by_title(chunks_by_title):
    """Render the chunks of every section as colour-coded HTML.

    Each section gets a black ``<h3>`` heading with its title, followed by one
    ``<p>`` per chunk. Paragraph colours rotate through red, green, blue and
    magenta, and the rotation carries on across sections so neighbouring
    chunks are visually distinct. The whole page is displayed in one call.

    Args:
        chunks_by_title: Iterable of dicts with a ``"title"`` entry and a
            ``"chunks"`` entry (an iterable of chunk strings).
    """
    palette = itertools.cycle(["red", "green", "blue", "magenta"])
    fragments = []
    for section in chunks_by_title:
        heading = section["title"]
        paragraphs = section["chunks"]
        fragments.append(f"<h3 style='color: black;'>{heading}</h3>")
        fragments.extend(
            f"<p style='color: {next(palette)};'>{paragraph}</p>"
            for paragraph in paragraphs
        )
    display(HTML("".join(fragments)))
```
%% Cell type:markdown id: tags:
### Process the elements
%% Cell type:code id: tags:
``` python
import os
from semantic_router.encoders import OpenAIEncoder
# Build the OpenAI encoder; the API key is read from the environment, so this
# raises KeyError when OPENAI_API_KEY is not set.
encoder = OpenAIEncoder(openai_api_key=os.environ["OPENAI_API_KEY"])
# Splitter that is later passed to create_title_chunks for every section.
# NOTE(review): min_split_tokens / max_split_tokens presumably bound the size of
# each produced split in tokens — confirm against the semantic-router documentation.
splitter = RollingWindowSplitter(
encoder=encoder,
window_size=1, # Compares each element with the previous one
min_split_tokens=50,
max_split_tokens=300,
)
```
%% Cell type:code id: tags:
``` python
# Bucket the parsed elements under their section titles; each title candidate
# is printed together with its validation verdict (True/False).
grouped_elements = group_elements_by_title(elements)
```
%% Output
et! ee: False
b e F 0 1: False
] L C . s c [: False
Financial Report Chunking for Effective Retrieval Augmented Generation: True
Introduction: True
2 Jimeno Yepes et al.: False
1 https://www.sec.gov 2 https://www.sec.gov/files/cf-frm.pdf: False
2 Related work: True
4 Jimeno Yepes et al.: False
3 Methods: True
3.1 RAG setting for the experiments: True
3.2 Indexing and retrieval: True
7 https://weaviate.io/developers/weaviate 8 https://huggingface.co/sentence-transformers/multi-qa-mpnet-base-dot-: False
v1: False
3.3 Generation: True
Question: {query}: False
3.4 Chunking: True
3.5 Dataset: True
4 Results: True
11 https://platform.openai.com/docs/guides/embeddings/limitations-risks: False
10 Jimeno Yepes et al.: False
5 Discussion: True
12 Jimeno Yepes et al.: False
6 Conclusions and Future Work: True
References: True
%% Cell type:code id: tags:
``` python
# Split each section's text with the splitter; tables become separate HTML chunks.
chunks_by_title = create_title_chunks(grouped_elements, splitter)
```
%% Cell type:code id: tags:
``` python
# Render every section's chunks as colour-coded HTML in the notebook output.
print_chunks_by_title(chunks_by_title)
```
%% Output
%% Cell type:code id: tags:
%% Cell type:markdown id: tags:
``` python
```
---
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment