Unverified Commit 8f0e7f2e authored by Siraj R Aizlewood

Change type hints from the built-in `list` to `typing.List`.

Subscripting the built-in `list` in annotations (e.g. `list[str]`, PEP 585) is not supported on every interpreter we target, and we'd like to support Python 3.9 onwards, so `typing.List` is the safer spelling throughout.
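For illustration, a minimal before/after sketch of the pattern this commit applies (the `flatten` function and its names are hypothetical, not from the codebase):

``` python
from typing import List

# Before: built-in generics (PEP 585), which fail on older interpreters:
# def flatten(rows: list[list[str]]) -> list[str]: ...

# After: typing.List, which works across all supported versions:
def flatten(rows: List[List[str]]) -> List[str]:
    # Flatten a list of lists into a single list.
    return [item for row in rows for item in row]

print(flatten([["a"], ["b", "c"]]))  # ['a', 'b', 'c']
```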
%% Cell type:markdown id: tags:
### Partition elements using Unstructured library
%% Cell type:code id: tags:
``` python
# Installing the packages may take a while
!pip install -qU \
    "unstructured[pdf]==0.12.4" \
    semantic-router
```
%% Cell type:markdown id: tags:
Start by downloading and processing an ArXiv paper.
%% Cell type:code id: tags:
``` python
from unstructured.partition.auto import partition

article_url = "https://arxiv.org/pdf/2402.05131.pdf"
# "hi_res" runs layout detection; pdf_infer_table_structure keeps tables as HTML
elements = partition(url=article_url, strategy="hi_res", pdf_infer_table_structure=True)
```
%% Output
/Users/jakit/customers/aurelio/semantic-router/.venv/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
from .autonotebook import tqdm as notebook_tqdm
Conflict between variables skip_infer_table_types: ['pdf', 'jpg', 'png', 'xls', 'xlsx', 'heic'] and pdf_infer_table_structure: True, please reset skip_infer_table_types to turn on table extraction for PDFs.
This function will be deprecated in a future release and `unstructured` will simply use the DEFAULT_MODEL from `unstructured_inference.model.base` to set default model name
Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
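%% Cell type:markdown id: tags:
As a quick illustrative probe (not part of the original workflow), you can tally the element types that `partition` returned, e.g. Title, NarrativeText, Table:
%% Cell type:code id: tags:
``` python
from collections import Counter

# Count how many elements of each class the partitioner produced
print(Counter(type(el).__name__ for el in elements).most_common())
```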
%% Cell type:markdown id: tags:
#### Define helper functions
%% Cell type:markdown id: tags:
Validate whether a parsed title element is a real title.
%% Cell type:code id: tags:
``` python
import re

def is_valid_title(title: str) -> bool:
    # Rule 1: Title starts with a lowercase letter
    if re.match(r"^[a-z]", title):
        return False
    # Rule 2: Title has a special character (excluding :, -, and .)
    if re.search(r"[^\w\s:\-\.]", title):
        return False
    # Rule 3: Title ends with a dot
    if title.endswith("."):
        return False
    return True
```
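%% Cell type:markdown id: tags:
A few hypothetical strings (not drawn from the paper) show how the three rules fire:
%% Cell type:code id: tags:
``` python
for candidate in ["3 Methods", "introduction", "Results (preliminary)", "Conclusions."]:
    # Only "3 Methods" passes: the others start lowercase,
    # contain parentheses, or end with a dot.
    print(f"{candidate!r}: {is_valid_title(candidate)}")
```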
%% Cell type:markdown id: tags:
Group elements by valid titles.
%% Cell type:code id: tags:
``` python
from unstructured.documents.elements import Element
from colorama import Fore, Style
from typing import List

def group_elements_by_title(elements: List[Element]) -> dict:
    grouped_elements = {}
    current_title = "Untitled"  # Default title for initial text without a title
    for element in elements:
        element_dict = element.to_dict()
        if element_dict.get("type") == "Title":
            potential_title = element_dict.get("text", "Untitled")
            if is_valid_title(potential_title):
                print(f"{Fore.GREEN}{potential_title}: True{Style.RESET_ALL}")
                current_title = potential_title
            else:
                print(f"{Fore.RED}{potential_title}: False{Style.RESET_ALL}")
                continue
        else:
            # Create the bucket on first use, then always append the element
            if current_title not in grouped_elements:
                grouped_elements[current_title] = []
            grouped_elements[current_title].append(element)
    return grouped_elements
```
%% Cell type:markdown id: tags:
Generate chunks from the grouped elements using the semantic RollingWindowSplitter.
%% Cell type:code id: tags:
``` python
from semantic_router.splitters import RollingWindowSplitter

def create_title_chunks(
    grouped_elements: dict, splitter: RollingWindowSplitter
) -> list:
    title_with_chunks = []
    for title, elements in grouped_elements.items():
        if not elements:
            continue
        combined_element_texts = []
        chunks = []
        for element in elements:
            if not element.text:
                continue
            element_dict = element.to_dict()
            if element_dict.get("type") == "Table":
                # Process accumulated text before the table
                if combined_element_texts:
                    splits = splitter(combined_element_texts)
                    print("-" * 80)
                    chunks.extend([split.content for split in splits])
                    combined_element_texts = []  # Reset combined texts after processing
                # Add table as a separate chunk
                table_text_html = element.metadata.text_as_html
                chunks.append(table_text_html)
            else:
                combined_element_texts.append(element.text)
        # Process any remaining accumulated text after the last table
        # or if no table was encountered
        if combined_element_texts:
            splits = splitter(combined_element_texts)
            print("-" * 80)
            chunks.extend([split.content for split in splits])
        if chunks:
            title_with_chunks.append({"title": title, "chunks": chunks})
    return title_with_chunks
```
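%% Cell type:markdown id: tags:
Downstream, these per-title chunks are typically flattened into standalone records before embedding. A minimal sketch, assuming we prepend the section title so each chunk stays self-describing (a convention, not something the notebook prescribes):
%% Cell type:code id: tags:
``` python
from typing import List

def to_records(chunks_by_title: List[dict]) -> List[dict]:
    # One record per chunk; the title is prepended so the chunk
    # keeps its context once embedded in isolation (assumed convention).
    records = []
    for section in chunks_by_title:
        for chunk in section["chunks"]:
            records.append(
                {"title": section["title"], "text": f"{section['title']}\n{chunk}"}
            )
    return records
```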
%% Cell type:markdown id: tags:
Display chunked text in colors.
%% Cell type:code id: tags:
``` python
from IPython.display import display, HTML
import itertools

def print_chunks_by_title(chunks_by_title):
    color_cycle = itertools.cycle(["red", "green", "blue", "magenta"])
    html_output = ""
    for section in chunks_by_title:
        title = section["title"]
        chunks = section["chunks"]
        html_output += f"<h3 style='color: black;'>{title}</h3>"
        for chunk in chunks:
            color = next(color_cycle)
            html_output += f"<p style='color: {color};'>{chunk}</p>"
    display(HTML(html_output))
```
%% Cell type:markdown id: tags:
### Process the elements
%% Cell type:code id: tags:
``` python
import os
from semantic_router.encoders import OpenAIEncoder

encoder = OpenAIEncoder(openai_api_key=os.environ["OPENAI_API_KEY"])
splitter = RollingWindowSplitter(
    encoder=encoder,
    window_size=1,  # Compares each element with the previous one
    min_split_tokens=1,
    max_split_tokens=500,
    plot_splits=False,
)
```
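%% Cell type:markdown id: tags:
To see the splitter in isolation, you can call it directly on a short list of texts (toy strings of our own; this uses the encoder, so it needs the OpenAI key set above). Semantically unrelated neighbours should land in separate splits:
%% Cell type:code id: tags:
``` python
toy_splits = splitter(
    [
        "Transformers process tokens with attention.",
        "Attention scores relate pairs of tokens.",
        "Basil grows best in full sun.",
    ]
)
for split in toy_splits:
    print(split.content)
```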
%% Cell type:code id: tags:
``` python
grouped_elements = group_elements_by_title(elements)
```
%% Output
et! ee: False
b e F 0 1: False
] L C . s c [: False
Financial Report Chunking for Effective Retrieval Augmented Generation: True
Introduction: True
2 Jimeno Yepes et al.: False
1 https://www.sec.gov 2 https://www.sec.gov/files/cf-frm.pdf: False
2 Related work: True
4 Jimeno Yepes et al.: False
3 Methods: True
3.1 RAG setting for the experiments: True
3.2 Indexing and retrieval: True
7 https://weaviate.io/developers/weaviate 8 https://huggingface.co/sentence-transformers/multi-qa-mpnet-base-dot-: False
v1: False
3.3 Generation: True
Question: {query}: False
3.4 Chunking: True
3.5 Dataset: True
4 Results: True
11 https://platform.openai.com/docs/guides/embeddings/limitations-risks: False
10 Jimeno Yepes et al.: False
5 Discussion: True
12 Jimeno Yepes et al.: False
6 Conclusions and Future Work: True
References: True
%% Cell type:code id: tags:
``` python
chunks_by_title = create_title_chunks(grouped_elements, splitter)
```
%% Output
/Users/jakit/customers/aurelio/semantic-router/.venv/lib/python3.9/site-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice.
return _methods._mean(a, axis=axis, dtype=dtype,
/Users/jakit/customers/aurelio/semantic-router/.venv/lib/python3.9/site-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in scalar divide
ret = ret.dtype.type(ret / rcount)
/Users/jakit/customers/aurelio/semantic-router/.venv/lib/python3.9/site-packages/numpy/core/_methods.py:206: RuntimeWarning: Degrees of freedom <= 0 for slice
ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
/Users/jakit/customers/aurelio/semantic-router/.venv/lib/python3.9/site-packages/numpy/core/_methods.py:163: RuntimeWarning: invalid value encountered in divide
arrmean = um.true_divide(arrmean, div, out=arrmean,
/Users/jakit/customers/aurelio/semantic-router/.venv/lib/python3.9/site-packages/numpy/core/_methods.py:198: RuntimeWarning: invalid value encountered in scalar divide
ret = ret.dtype.type(ret / rcount)
2024-02-26 17:07:32 INFO semantic_router.utils.logger Optimal threshold 0.5 found with median tokens (27.0) in target range (1-500).
2024-02-26 17:07:32 INFO semantic_router.utils.logger Splitting Statistics:
- Total Documents: 1
- Total Splits: 1
- Splits by Threshold: 0
- Splits by Max Chunk Size: 0
- Last Split: 1
- Minimum Token Size of Split: 27
- Maximum Token Size of Split: 27
- Similarity Split Ratio: 0.00
--------------------------------------------------------------------------------
2024-02-26 17:07:33 INFO semantic_router.utils.logger Optimal threshold 0.7912974224053915 found with median tokens (136.5) in target range (1-500).
2024-02-26 17:07:33 INFO semantic_router.utils.logger Splitting Statistics:
- Total Documents: 3
- Total Splits: 2
- Splits by Threshold: 1
- Splits by Max Chunk Size: 0
- Last Split: 1
- Minimum Token Size of Split: 19
- Maximum Token Size of Split: 254
- Similarity Split Ratio: 0.50
--------------------------------------------------------------------------------
2024-02-26 17:07:33 INFO semantic_router.utils.logger Optimal threshold 0.8514465425347408 found with median tokens (129.5) in target range (1-500).
2024-02-26 17:07:33 INFO semantic_router.utils.logger Splitting Statistics:
- Total Documents: 7
- Total Splits: 4
- Splits by Threshold: 3
- Splits by Max Chunk Size: 0
- Last Split: 1
- Minimum Token Size of Split: 64
- Maximum Token Size of Split: 400
- Similarity Split Ratio: 0.75
--------------------------------------------------------------------------------
2024-02-26 17:07:34 INFO semantic_router.utils.logger Optimal threshold 0.8371609601655312 found with median tokens (154.0) in target range (1-500).
2024-02-26 17:07:34 INFO semantic_router.utils.logger Splitting Statistics:
- Total Documents: 7
- Total Splits: 4
- Splits by Threshold: 3
- Splits by Max Chunk Size: 0
- Last Split: 1
- Minimum Token Size of Split: 37
- Maximum Token Size of Split: 362
- Similarity Split Ratio: 0.75
--------------------------------------------------------------------------------
2024-02-26 17:07:34 INFO semantic_router.utils.logger Optimal threshold 0.8004127909380481 found with median tokens (46.0) in target range (1-500).
2024-02-26 17:07:34 INFO semantic_router.utils.logger Splitting Statistics:
- Total Documents: 5
- Total Splits: 3
- Splits by Threshold: 2
- Splits by Max Chunk Size: 0
- Last Split: 1
- Minimum Token Size of Split: 15
- Maximum Token Size of Split: 161
- Similarity Split Ratio: 0.67
--------------------------------------------------------------------------------
2024-02-26 17:07:35 INFO semantic_router.utils.logger Optimal threshold 0.7219220831968602 found with median tokens (94.0) in target range (1-500).
2024-02-26 17:07:35 INFO semantic_router.utils.logger Splitting Statistics:
- Total Documents: 6
- Total Splits: 3
- Splits by Threshold: 2
- Splits by Max Chunk Size: 0
- Last Split: 1
- Minimum Token Size of Split: 8
- Maximum Token Size of Split: 100
- Similarity Split Ratio: 0.67
--------------------------------------------------------------------------------
2024-02-26 17:07:35 INFO semantic_router.utils.logger Optimal threshold 0.7865543500746407 found with median tokens (92.5) in target range (1-500).
2024-02-26 17:07:35 INFO semantic_router.utils.logger Splitting Statistics:
- Total Documents: 4
- Total Splits: 2
- Splits by Threshold: 1
- Splits by Max Chunk Size: 0
- Last Split: 1
- Minimum Token Size of Split: 12
- Maximum Token Size of Split: 173
- Similarity Split Ratio: 0.50
--------------------------------------------------------------------------------
2024-02-26 17:07:36 INFO semantic_router.utils.logger Optimal threshold 0.7759885849518695 found with median tokens (73.0) in target range (1-500).
2024-02-26 17:07:36 INFO semantic_router.utils.logger Splitting Statistics:
- Total Documents: 9
- Total Splits: 5
- Splits by Threshold: 4
- Splits by Max Chunk Size: 0
- Last Split: 1
- Minimum Token Size of Split: 15
- Maximum Token Size of Split: 210
- Similarity Split Ratio: 0.80
--------------------------------------------------------------------------------
2024-02-26 17:07:36 INFO semantic_router.utils.logger Optimal threshold 0.7356350410401438 found with median tokens (23.0) in target range (1-500).
2024-02-26 17:07:36 INFO semantic_router.utils.logger Splitting Statistics:
- Total Documents: 5
- Total Splits: 3
- Splits by Threshold: 2
- Splits by Max Chunk Size: 0
- Last Split: 1
- Minimum Token Size of Split: 8
- Maximum Token Size of Split: 198
- Similarity Split Ratio: 0.67
--------------------------------------------------------------------------------
2024-02-26 17:07:37 INFO semantic_router.utils.logger Optimal threshold 0.7993056373716161 found with median tokens (14.0) in target range (1-500).
2024-02-26 17:07:37 INFO semantic_router.utils.logger Splitting Statistics:
- Total Documents: 5
- Total Splits: 3
- Splits by Threshold: 2
- Splits by Max Chunk Size: 0
- Last Split: 1
- Minimum Token Size of Split: 10
- Maximum Token Size of Split: 95
- Similarity Split Ratio: 0.67
--------------------------------------------------------------------------------
2024-02-26 17:07:37 INFO semantic_router.utils.logger Optimal threshold 0.7946781280578719 found with median tokens (104.5) in target range (1-500).
2024-02-26 17:07:37 INFO semantic_router.utils.logger Splitting Statistics:
- Total Documents: 4
- Total Splits: 2
- Splits by Threshold: 1
- Splits by Max Chunk Size: 0
- Last Split: 1
- Minimum Token Size of Split: 87
- Maximum Token Size of Split: 122
- Similarity Split Ratio: 0.50
--------------------------------------------------------------------------------
2024-02-26 17:07:38 INFO semantic_router.utils.logger Optimal threshold 0.7079124801171096 found with median tokens (15.0) in target range (1-500).
2024-02-26 17:07:38 INFO semantic_router.utils.logger Splitting Statistics:
- Total Documents: 2
- Total Splits: 1
- Splits by Threshold: 0
- Splits by Max Chunk Size: 0
- Last Split: 1
- Minimum Token Size of Split: 15
- Maximum Token Size of Split: 15
- Similarity Split Ratio: 0.00
--------------------------------------------------------------------------------
2024-02-26 17:07:38 INFO semantic_router.utils.logger Optimal threshold 0.8324466121743902 found with median tokens (110.5) in target range (1-500).
2024-02-26 17:07:38 INFO semantic_router.utils.logger Splitting Statistics:
- Total Documents: 12
- Total Splits: 6
- Splits by Threshold: 5
- Splits by Max Chunk Size: 0
- Last Split: 1
- Minimum Token Size of Split: 57
- Maximum Token Size of Split: 254
- Similarity Split Ratio: 0.83
--------------------------------------------------------------------------------
2024-02-26 17:07:39 INFO semantic_router.utils.logger Optimal threshold 0.8128022034342155 found with median tokens (16.5) in target range (1-500).
2024-02-26 17:07:39 INFO semantic_router.utils.logger Splitting Statistics:
- Total Documents: 3
- Total Splits: 2
- Splits by Threshold: 1
- Splits by Max Chunk Size: 0
- Last Split: 1
- Minimum Token Size of Split: 4
- Maximum Token Size of Split: 29
- Similarity Split Ratio: 0.50
--------------------------------------------------------------------------------
2024-02-26 17:07:39 INFO semantic_router.utils.logger Optimal threshold 0.786452236757286 found with median tokens (173.5) in target range (1-500).
2024-02-26 17:07:39 INFO semantic_router.utils.logger Splitting Statistics:
- Total Documents: 8
- Total Splits: 4
- Splits by Threshold: 3
- Splits by Max Chunk Size: 0
- Last Split: 1
- Minimum Token Size of Split: 8
- Maximum Token Size of Split: 241
- Similarity Split Ratio: 0.75
--------------------------------------------------------------------------------
2024-02-26 17:07:40 INFO semantic_router.utils.logger Optimal threshold 0.8250029487527775 found with median tokens (41.0) in target range (1-500).
2024-02-26 17:07:40 INFO semantic_router.utils.logger Splitting Statistics:
- Total Documents: 2
- Total Splits: 1
- Splits by Threshold: 0
- Splits by Max Chunk Size: 0
- Last Split: 1
- Minimum Token Size of Split: 41
- Maximum Token Size of Split: 41
- Similarity Split Ratio: 0.00
--------------------------------------------------------------------------------
2024-02-26 17:07:41 INFO semantic_router.utils.logger Optimal threshold 0.8086076732442027 found with median tokens (108.0) in target range (1-500).
2024-02-26 17:07:41 INFO semantic_router.utils.logger Splitting Statistics:
- Total Documents: 45
- Total Splits: 23
- Splits by Threshold: 22
- Splits by Max Chunk Size: 0
- Last Split: 1
- Minimum Token Size of Split: 4
- Maximum Token Size of Split: 513
- Similarity Split Ratio: 0.96
--------------------------------------------------------------------------------
%% Cell type:code id: tags:
``` python
print_chunks_by_title(chunks_by_title)
```
%% Output
The same `list` → `List` change is applied across the rest of the library (`...` marks context collapsed on the page; file names were not shown):

``` diff
 import string
 from collections import Counter
-from typing import Dict
+from typing import Dict, List
 import numpy as np
 from numpy import ndarray
...
@@ -20,7 +20,7 @@ class TfidfEncoder(BaseEncoder):
         self.word_index = {}
         self.idf = np.array([])

-    def __call__(self, docs: list[str]) -> list[list[float]]:
+    def __call__(self, docs: List[str]) -> List[List[float]]:
         if len(self.word_index) == 0 or self.idf.size == 0:
             raise ValueError("Vectorizer is not initialized.")
         if len(docs) == 0:
@@ -31,7 +31,7 @@ class TfidfEncoder(BaseEncoder):
         tfidf = tf * self.idf
         return tfidf.tolist()

-    def fit(self, routes: list[Route]):
+    def fit(self, routes: List[Route]):
         docs = []
         for route in routes:
             for doc in route.utterances:
@@ -39,7 +39,7 @@ class TfidfEncoder(BaseEncoder):
         self.word_index = self._build_word_index(docs)
         self.idf = self._compute_idf(docs)

-    def _build_word_index(self, docs: list[str]) -> dict:
+    def _build_word_index(self, docs: List[str]) -> dict:
         words = set()
         for doc in docs:
             for word in doc.split():
@@ -47,7 +47,7 @@ class TfidfEncoder(BaseEncoder):
         word_index = {word: i for i, word in enumerate(words)}
         return word_index

-    def _compute_tf(self, docs: list[str]) -> np.ndarray:
+    def _compute_tf(self, docs: List[str]) -> np.ndarray:
         if len(self.word_index) == 0:
             raise ValueError("Word index is not initialized.")
         tf = np.zeros((len(docs), len(self.word_index)))
@@ -60,7 +60,7 @@ class TfidfEncoder(BaseEncoder):
         tf = tf / norm(tf, axis=1, keepdims=True)
         return tf

-    def _compute_idf(self, docs: list[str]) -> np.ndarray:
+    def _compute_idf(self, docs: List[str]) -> np.ndarray:
         if len(self.word_index) == 0:
             raise ValueError("Word index is not initialized.")
         idf = np.zeros(len(self.word_index))
...
@@ -196,7 +196,7 @@ class RouteLayer:
         else:
             self.encoder = encoder
         self.llm = llm
-        self.routes: list[Route] = routes if routes is not None else []
+        self.routes: List[Route] = routes if routes is not None else []
         self.score_threshold = self.encoder.score_threshold
         self.top_k = top_k
         if self.top_k < 1:
...
@@ -20,7 +20,7 @@ class BaseLLM(BaseModel):
         raise NotImplementedError("Subclasses must implement this method")

     def _is_valid_inputs(
-        self, inputs: list[dict[str, Any]], function_schemas: list[dict[str, Any]]
+        self, inputs: List[dict[str, Any]], function_schemas: List[dict[str, Any]]
     ) -> bool:
         """Determine if the functions chosen by the LLM exist within the function_schemas,
         and if the input arguments are valid for those functions."""
@@ -68,7 +68,7 @@ class BaseLLM(BaseModel):
             logger.error(f"Single input validation error: {str(e)}")
             return False

-    def _extract_parameter_info(self, signature: str) -> tuple[list[str], list[str]]:
+    def _extract_parameter_info(self, signature: str) -> tuple[List[str], List[str]]:
         """Extract parameter names and types from the function signature."""
         param_info = [param.strip() for param in signature[1:-1].split(",")]
         param_names = [info.split(":")[0].strip() for info in param_info]
@@ -78,7 +78,7 @@ class BaseLLM(BaseModel):
         return param_names, param_types

     def extract_function_inputs(
-        self, query: str, function_schemas: list[dict[str, Any]]
+        self, query: str, function_schemas: List[dict[str, Any]]
     ) -> dict:
         logger.info("Extracting function input...")
...
 from contextlib import contextmanager
 from pathlib import Path
-from typing import Any, Optional
+from typing import Any, Optional, List
 from pydantic.v1 import PrivateAttr
...
@@ -48,7 +48,7 @@ class LlamaCppLLM(BaseLLM):
     def __call__(
         self,
-        messages: list[Message],
+        messages: List[Message],
     ) -> str:
         try:
             completion = self.llm.create_chat_completion(
...
@@ -41,7 +41,7 @@ class OpenAILLM(BaseLLM):
         self.temperature = temperature
         self.max_tokens = max_tokens

-    def _extract_tool_calls_info(self, tool_calls: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    def _extract_tool_calls_info(self, tool_calls: List[dict[str, Any]]) -> List[dict[str, Any]]:
         tool_calls_info = []
         for tool_call in tool_calls:
             if tool_call.function.arguments is None:
@@ -57,7 +57,7 @@ class OpenAILLM(BaseLLM):
     def __call__(
         self,
         messages: List[Message],
-        function_schemas: Optional[list[dict[str, Any]]] = None,
+        function_schemas: Optional[List[dict[str, Any]]] = None,
     ) -> str:
         if self.client is None:
             raise ValueError("OpenAI client is not initialized.")
@@ -99,7 +99,7 @@ class OpenAILLM(BaseLLM):
             raise Exception(f"LLM error: {e}") from e

     def extract_function_inputs(
-        self, query: str, function_schemas: list[dict[str, Any]]
+        self, query: str, function_schemas: List[dict[str, Any]]
     ) -> dict:
         messages = []
         system_prompt = "You are an intelligent AI. Given a command or request from the user, call the function to complete the request."
...
@@ -47,7 +47,7 @@ class Route(BaseModel):
     name: str
     utterances: Union[List[str], List[Union[Any, "Image"]]]
     description: Optional[str] = None
-    function_schemas: Optional[list[Dict[str, Any]]] = None
+    function_schemas: Optional[List[Dict[str, Any]]] = None
     llm: Optional[BaseLLM] = None
     score_threshold: Optional[float] = None
...
@@ -24,7 +24,7 @@ class EncoderInfo(BaseModel):
 class RouteChoice(BaseModel):
     name: Optional[str] = None
-    function_call: Optional[list[dict]] = None
+    function_call: Optional[List[dict]] = None
     similarity_score: Optional[float] = None
...
@@ -340,7 +340,7 @@ class RollingWindowSplitter(BaseSplitter):
         self,
         similarities: List[float],
         split_indices: List[int],
-        splits: list[DocumentSplit],
+        splits: List[DocumentSplit],
     ):
         try:
             from matplotlib import pyplot as plt
...
 import regex
 import tiktoken
+from typing import List

-def split_to_sentences(text: str) -> list[str]:
+def split_to_sentences(text: str) -> List[str]:
     """
     Enhanced regex pattern to split a given text into sentences more accurately.
...
@@ -157,7 +157,7 @@ class Conversation(BaseModel):
         :raises ValueError: If the splitter is not configured before calling this method.
         :return: A tuple containing the updated list of topics and the list of new topics generated in this call.
-        :rtype: tuple[list[tuple[int, str]], list[DocumentSplit]]
+        :rtype: tuple[List[tuple[int, str]], List[DocumentSplit]]
         """
         if self.splitter is None:
...
```