cleanup install to incl semantic router

6b6fbaf1 · James Briggs · 0445421e · 6b6fbaf1
Unverified Commit 6b6fbaf1 authored 1 year ago by James Briggs
--- a/docs/examples/unstructured-element-splitter.ipynb
+++ b/docs/examples/unstructured-element-splitter.ipynb
@@ -14,7 +14,16 @@
   "outputs": [],
   "source": [
    "# It may take longer to install the package\n",
-    "!pip install -q -U \"unstructured[pdf]\""
+    "!pip install -qU \\\n",
+    "    \"unstructured[pdf]==0.12.4\" \\\n",
+    "    \"semantic-router==0.0.24\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Start by downloading and processing an ArXiv paper."
   ]
  },
  {
@@ -300,11 +309,11 @@
   ]
  },
  {
-   "cell_type": "code",
+   "cell_type": "markdown",
-   "execution_count": null,
   "metadata": {},
-   "outputs": [],
+   "source": [
-   "source": []
+    "---"
+   ]
  }
 ],
 "metadata": {

 %% Cell type:markdown id: tags:
 ### Partition elements using Unstructured library
 %% Cell type:code id: tags:
 ``` python
 # It may take longer to install the package
-!pip install -q -U "unstructured[pdf]"
+!pip install -qU \
+    "unstructured[pdf]==0.12.4" \
+    "semantic-router==0.0.24"
 ```
+%% Cell type:markdown id: tags:
+Start by downloading and processing an ArXiv paper.
 %% Cell type:code id: tags:
 ``` python
 from unstructured.partition.auto import partition
 # Source PDF: the ArXiv paper that the rest of the notebook processes.
 article_url = "https://arxiv.org/pdf/2402.05131.pdf"
 # Partition the PDF at that URL into elements.
 # NOTE(review): pdf_infer_table_structure=True is presumably what populates the
 # table HTML (metadata.text_as_html) read later on; confirm against the Unstructured docs.
 elements = partition(url=article_url, strategy="hi_res", pdf_infer_table_structure=True)
 ```
 %% Output
    Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
    - This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
    - This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
 %% Cell type:markdown id: tags:
 #### Define helper functions
 %% Cell type:markdown id: tags:
 Validate whether a parsed title element is a real title
 %% Cell type:code id: tags:
 ``` python
 import re
 def is_valid_title(title: str) -> bool:
     """Decide whether a parsed "Title" element looks like a genuine heading.

     A candidate is rejected when any of the following holds:
       * it begins with an ASCII lowercase letter,
       * it contains a character other than word characters, whitespace,
         ``:``, ``-`` or ``.``,
       * it ends with a dot.

     Args:
         title: Text of the candidate title.

     Returns:
         True when none of the rejection rules fire, False otherwise.
     """
     starts_lowercase = re.match(r"^[a-z]", title) is not None
     has_special_char = re.search(r"[^\w\s:\-\.]", title) is not None
     ends_with_dot = title.endswith(".")
     return not (starts_lowercase or has_special_char or ends_with_dot)
 ```
 %% Cell type:markdown id: tags:
 Group elements by valid titles
 %% Cell type:code id: tags:
 ``` python
 from unstructured.documents.elements import Element
 from colorama import Fore, Style
 def group_elements_by_title(elements: list[Element]) -> dict:
     """Group non-title elements under the most recent valid title.

     Elements are walked in order. Each element whose type is "Title" is
     checked with ``is_valid_title`` and the verdict is printed (green for
     accepted, red for rejected). An accepted title becomes the current
     group key; a rejected title is dropped and the current key is kept.
     Every other element is appended to the list of the current title.

     Args:
         elements: Parsed document elements, in document order.

     Returns:
         A dict mapping each title to the list of elements that follow it.
         Elements seen before the first valid title are stored under
         "Untitled". Titles that are not followed by any element do not
         appear in the result.
     """
     grouped_elements = {}
     current_title = "Untitled"  # Default title for initial text without a title
     for element in elements:
         element_dict = element.to_dict()
         if element_dict.get("type") == "Title":
             potential_title = element_dict.get("text", "Untitled")
             if is_valid_title(potential_title):
                 print(f"{Fore.GREEN}{potential_title}: True{Style.RESET_ALL}")
                 current_title = potential_title
             else:
                 print(f"{Fore.RED}{potential_title}: False{Style.RESET_ALL}")
             # Title elements themselves are never stored as group content.
             continue
         # Create the group on first use AND keep the element that triggered
         # it; creating the list without appending would silently drop the
         # first element of every section.
         grouped_elements.setdefault(current_title, []).append(element)
     return grouped_elements
 ```
 %% Cell type:markdown id: tags:
 Generate chunks from the grouped elements using the semantic RollingWindowSplitter
 %% Cell type:code id: tags:
 ``` python
 from semantic_router.splitters import RollingWindowSplitter
 def create_title_chunks(
     grouped_elements: dict, splitter: RollingWindowSplitter
 ) -> list:
     """Turn title-grouped elements into per-title lists of text chunks.

     For every title, the text of consecutive non-table elements is
     accumulated and passed to ``splitter``; the ``content`` of each returned
     split becomes a chunk. A "Table" element flushes the text accumulated so
     far and is then added as its own chunk, so tables are never merged with
     the surrounding text. Elements with empty text are skipped.

     Args:
         grouped_elements: Mapping of title to the list of elements under it.
         splitter: Callable that takes a list of strings and returns splits
             exposing a ``content`` attribute.

     Returns:
         A list of ``{"title": ..., "chunks": [...]}`` dicts, one per title
         that produced at least one chunk, in the iteration order of
         ``grouped_elements``.
     """

     def _split_texts(texts: list) -> list:
         # Run the splitter over the accumulated texts and keep only the text
         # content of each resulting split.
         return [split.content for split in splitter(texts)]

     title_with_chunks = []
     for title, elements in grouped_elements.items():
         if not elements:
             continue
         pending_texts = []
         chunks = []
         for element in elements:
             if not element.text:
                 continue
             if element.to_dict().get("type") == "Table":
                 # Flush the text that precedes the table so chunk order
                 # follows document order.
                 if pending_texts:
                     chunks.extend(_split_texts(pending_texts))
                     pending_texts = []
                 # Prefer the HTML rendering of the table; when it is missing
                 # fall back to the plain text instead of storing None.
                 chunks.append(element.metadata.text_as_html or element.text)
             else:
                 pending_texts.append(element.text)
         # Flush whatever text is left after the last table (or all of it when
         # the section had no table).
         if pending_texts:
             chunks.extend(_split_texts(pending_texts))
         if chunks:
             title_with_chunks.append({"title": title, "chunks": chunks})
     return title_with_chunks
 ```
 %% Cell type:markdown id: tags:
 Display chunked text in colors
 %% Cell type:code id: tags:
 ``` python
 from IPython.display import display, HTML
 import itertools
 def print_chunks_by_title(chunks_by_title):
     """Render each section's title and chunks as colored HTML.

     Every title is emitted as a black ``<h3>`` heading and every chunk as a
     ``<p>`` paragraph. Paragraph colors rotate through red, green, blue and
     magenta; the rotation continues across sections rather than restarting
     at each title. Titles and chunks are inserted into the markup as-is.

     Args:
         chunks_by_title: Iterable of dicts with a "title" entry and a
             "chunks" entry (an iterable of chunk strings).
     """
     palette = itertools.cycle(["red", "green", "blue", "magenta"])
     fragments = []
     for section in chunks_by_title:
         title = section["title"]
         chunks = section["chunks"]
         fragments.append(f"<h3 style='color: black;'>{title}</h3>")
         fragments.extend(
             f"<p style='color: {next(palette)};'>{chunk}</p>" for chunk in chunks
         )
     # Assemble the markup once and hand it to the notebook for display.
     display(HTML("".join(fragments)))
 ```
 %% Cell type:markdown id: tags:
 ### Process the elements
 %% Cell type:code id: tags:
 ``` python
 import os
 from semantic_router.encoders import OpenAIEncoder
 # Read the OpenAI API key from the environment; os.environ[...] raises KeyError if it is unset.
 encoder = OpenAIEncoder(openai_api_key=os.environ["OPENAI_API_KEY"])
 # Splitter that is later passed to create_title_chunks to split each section's text.
 # NOTE(review): min/max_split_tokens presumably bound the token size of each split;
 # confirm against the semantic-router documentation.
 splitter = RollingWindowSplitter(
    encoder=encoder,
    window_size=1,  # Compares each element with the previous one
    min_split_tokens=50,
    max_split_tokens=300,
 )
 ```
 %% Cell type:code id: tags:
 ``` python
 # Bucket the partitioned elements by section title; each candidate title is printed with its validity.
 grouped_elements = group_elements_by_title(elements)
 ```
 %% Output
    [31met! ee: False[0m
    [31mb e F 0 1: False[0m
    [31m] L C . s c [: False[0m
    [32mFinancial Report Chunking for Eﬀective Retrieval Augmented Generation: True[0m
    [32mIntroduction: True[0m
    [31m2 Jimeno Yepes et al.: False[0m
    [31m1 https://www.sec.gov 2 https://www.sec.gov/files/cf-frm.pdf: False[0m
    [32m2 Related work: True[0m
    [31m4 Jimeno Yepes et al.: False[0m
    [32m3 Methods: True[0m
    [32m3.1 RAG setting for the experiments: True[0m
    [32m3.2 Indexing and retrieval: True[0m
    [31m7 https://weaviate.io/developers/weaviate 8 https://huggingface.co/sentence-transformers/multi-qa-mpnet-base-dot-: False[0m
    [31mv1: False[0m
    [32m3.3 Generation: True[0m
    [31mQuestion: {query}: False[0m
    [32m3.4 Chunking: True[0m
    [32m3.5 Dataset: True[0m
    [32m4 Results: True[0m
    [31m11 https://platform.openai.com/docs/guides/embeddings/limitations-risks: False[0m
    [31m10 Jimeno Yepes et al.: False[0m
    [32m5 Discussion: True[0m
    [31m12 Jimeno Yepes et al.: False[0m
    [32m6 Conclusions and Future Work: True[0m
    [32mReferences: True[0m
 %% Cell type:code id: tags:
 ``` python
 # Split each title's accumulated text with the splitter; table elements become their own chunks.
 chunks_by_title = create_title_chunks(grouped_elements, splitter)
 ```
 %% Cell type:code id: tags:
 ``` python
 # Render the titles and their chunks as colored HTML in the notebook output.
 print_chunks_by_title(chunks_by_title)
 ```
 %% Output
-%% Cell type:code id: tags:
+%% Cell type:markdown id: tags:
-``` python
+---
-```