feat: Unstructured elements splitting example

de02b3a3 · Simonas · 599be55f · de02b3a3
Commit de02b3a3 authored 1 year ago by Simonas
--- a/docs/examples/unstructured-element-splitter.ipynb
+++ b/docs/examples/unstructured-element-splitter.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Partition elements using the Unstructured library"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# It may take longer to install the package\n",
+    "!pip install -q -U \"unstructured[pdf]\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']\n",
+      "- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+      "- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
+     ]
+    }
+   ],
+   "source": [
+    "from unstructured.partition.auto import partition\n",
+    "\n",
+    "article_url = \"https://arxiv.org/pdf/2402.05131.pdf\"\n",
+    "elements = partition(url=article_url, strategy=\"hi_res\", pdf_infer_table_structure=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Define helper functions"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Validate whether a parsed title element is a real title"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import re\n",
+    "\n",
+    "\n",
+    "def is_valid_title(title: str) -> bool:\n",
+    "    # Rule 1: Title starts with a lowercase letter\n",
+    "    if re.match(r\"^[a-z]\", title):\n",
+    "        return False\n",
+    "    # Rule 2: Title has a special character (excluding :, -, and .)\n",
+    "    if re.search(r\"[^\\w\\s:\\-\\.]\", title):\n",
+    "        return False\n",
+    "    # Rule 3: Title ends with a dot\n",
+    "    if title.endswith(\".\"):\n",
+    "        return False\n",
+    "    return True"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Group elements by valid titles"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from unstructured.documents.elements import Element\n",
+    "from colorama import Fore, Style\n",
+    "\n",
+    "\n",
+    "def group_elements_by_title(elements: list[Element]) -> dict:\n",
+    "    \"\"\"Group document elements under the most recent valid title.\n",
+    "\n",
+    "    Args:\n",
+    "        elements: Partitioned elements in document order.\n",
+    "\n",
+    "    Returns:\n",
+    "        A dict mapping each title text to the list of non-title elements that\n",
+    "        follow it. Elements seen before the first valid title are stored under\n",
+    "        \"Untitled\". A title with no following elements gets no entry.\n",
+    "\n",
+    "    Every ``Title`` element is printed in green (accepted) or red (rejected).\n",
+    "    Rejected titles are dropped: they neither start a new group nor become\n",
+    "    part of the current one.\n",
+    "    \"\"\"\n",
+    "    grouped_elements = {}\n",
+    "    current_title = \"Untitled\"  # Default title for initial text without a title\n",
+    "\n",
+    "    for element in elements:\n",
+    "        element_dict = element.to_dict()\n",
+    "\n",
+    "        if element_dict.get(\"type\") == \"Title\":\n",
+    "            potential_title = element_dict.get(\"text\", \"Untitled\")\n",
+    "            if is_valid_title(potential_title):\n",
+    "                print(f\"{Fore.GREEN}{potential_title}: True{Style.RESET_ALL}\")\n",
+    "                current_title = potential_title\n",
+    "            else:\n",
+    "                print(f\"{Fore.RED}{potential_title}: False{Style.RESET_ALL}\")\n",
+    "            continue\n",
+    "\n",
+    "        # setdefault creates the group on first use AND keeps the element that\n",
+    "        # triggered it, so the first element under each title is not lost.\n",
+    "        grouped_elements.setdefault(current_title, []).append(element)\n",
+    "    return grouped_elements"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Generate chunks from the grouped elements using the semantic RollingWindowSplitter"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from semantic_router.splitters import RollingWindowSplitter\n",
+    "\n",
+    "\n",
+    "def create_title_chunks(\n",
+    "    grouped_elements: dict, splitter: RollingWindowSplitter\n",
+    ") -> list:\n",
+    "    \"\"\"Turn each title's elements into text chunks, keeping tables whole.\n",
+    "\n",
+    "    Args:\n",
+    "        grouped_elements: Mapping of title -> list of elements under that title.\n",
+    "        splitter: Callable splitter; it is invoked with a list of strings and\n",
+    "            each returned split is read through its ``content`` attribute.\n",
+    "\n",
+    "    Returns:\n",
+    "        A list of ``{\"title\": ..., \"chunks\": [...]}`` dicts, one per title that\n",
+    "        produced at least one chunk. Chunks keep document order; every table is\n",
+    "        emitted as a single chunk between the text chunks around it.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def split_texts(texts: list) -> list:\n",
+    "        # Run the splitter and keep only the text of each resulting split.\n",
+    "        return [split.content for split in splitter(texts)]\n",
+    "\n",
+    "    title_with_chunks = []\n",
+    "    for title, elements in grouped_elements.items():\n",
+    "        pending_texts = []\n",
+    "        chunks = []\n",
+    "\n",
+    "        for element in elements:\n",
+    "            if not element.text:\n",
+    "                continue\n",
+    "            if element.to_dict().get(\"type\") == \"Table\":\n",
+    "                # Flush the text gathered so far so the table stays in document order.\n",
+    "                if pending_texts:\n",
+    "                    chunks.extend(split_texts(pending_texts))\n",
+    "                    pending_texts = []\n",
+    "                # Tables are never split. Fall back to the plain text when no HTML\n",
+    "                # rendering is available; otherwise a None chunk would be stored.\n",
+    "                chunks.append(element.metadata.text_as_html or element.text)\n",
+    "            else:\n",
+    "                pending_texts.append(element.text)\n",
+    "\n",
+    "        # Split whatever text follows the last table (or all of it if there was none).\n",
+    "        if pending_texts:\n",
+    "            chunks.extend(split_texts(pending_texts))\n",
+    "\n",
+    "        if chunks:\n",
+    "            title_with_chunks.append({\"title\": title, \"chunks\": chunks})\n",
+    "\n",
+    "    return title_with_chunks"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Display chunked text in colors"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from IPython.display import display, HTML\n",
+    "import itertools\n",
+    "\n",
+    "\n",
+    "def print_chunks_by_title(chunks_by_title):\n",
+    "    \"\"\"Render each section as a black heading followed by its chunks.\n",
+    "\n",
+    "    Chunk colors rotate through red, green, blue and magenta; the rotation\n",
+    "    continues across sections rather than restarting at every title.\n",
+    "    \"\"\"\n",
+    "    palette = itertools.cycle([\"red\", \"green\", \"blue\", \"magenta\"])\n",
+    "    fragments = []\n",
+    "    for section in chunks_by_title:\n",
+    "        title, chunks = section[\"title\"], section[\"chunks\"]\n",
+    "        fragments.append(f\"<h3 style='color: black;'>{title}</h3>\")\n",
+    "        # zip stops as soon as `chunks` runs out, so no color is skipped.\n",
+    "        for chunk, color in zip(chunks, palette):\n",
+    "            fragments.append(f\"<p style='color: {color};'>{chunk}</p>\")\n",
+    "    display(HTML(\"\".join(fragments)))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Process the elements"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "from semantic_router.encoders import OpenAIEncoder\n",
+    "\n",
+    "encoder = OpenAIEncoder(openai_api_key=os.environ[\"OPENAI_API_KEY\"])\n",
+    "\n",
+    "splitter = RollingWindowSplitter(\n",
+    "    encoder=encoder,\n",
+    "    window_size=1,  # Compares each element with the previous one\n",
+    "    min_split_tokens=50,\n",
+    "    max_split_tokens=300,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[31met! ee: False\u001b[0m\n",
+      "\u001b[31mb e F 0 1: False\u001b[0m\n",
+      "\u001b[31m] L C . s c [: False\u001b[0m\n",
+      "\u001b[32mFinancial Report Chunking for Eﬀective Retrieval Augmented Generation: True\u001b[0m\n",
+      "\u001b[32mIntroduction: True\u001b[0m\n",
+      "\u001b[31m2 Jimeno Yepes et al.: False\u001b[0m\n",
+      "\u001b[31m1 https://www.sec.gov 2 https://www.sec.gov/files/cf-frm.pdf: False\u001b[0m\n",
+      "\u001b[32m2 Related work: True\u001b[0m\n",
+      "\u001b[31m4 Jimeno Yepes et al.: False\u001b[0m\n",
+      "\u001b[32m3 Methods: True\u001b[0m\n",
+      "\u001b[32m3.1 RAG setting for the experiments: True\u001b[0m\n",
+      "\u001b[32m3.2 Indexing and retrieval: True\u001b[0m\n",
+      "\u001b[31m7 https://weaviate.io/developers/weaviate 8 https://huggingface.co/sentence-transformers/multi-qa-mpnet-base-dot-: False\u001b[0m\n",
+      "\u001b[31mv1: False\u001b[0m\n",
+      "\u001b[32m3.3 Generation: True\u001b[0m\n",
+      "\u001b[31mQuestion: {query}: False\u001b[0m\n",
+      "\u001b[32m3.4 Chunking: True\u001b[0m\n",
+      "\u001b[32m3.5 Dataset: True\u001b[0m\n",
+      "\u001b[32m4 Results: True\u001b[0m\n",
+      "\u001b[31m11 https://platform.openai.com/docs/guides/embeddings/limitations-risks: False\u001b[0m\n",
+      "\u001b[31m10 Jimeno Yepes et al.: False\u001b[0m\n",
+      "\u001b[32m5 Discussion: True\u001b[0m\n",
+      "\u001b[31m12 Jimeno Yepes et al.: False\u001b[0m\n",
+      "\u001b[32m6 Conclusions and Future Work: True\u001b[0m\n",
+      "\u001b[32mReferences: True\u001b[0m\n"
+     ]
+    }
+   ],
+   "source": [
+    "grouped_elements = group_elements_by_title(elements)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "chunks_by_title = create_title_chunks(grouped_elements, splitter)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<h3 style='color: black;'>Untitled</h3><p style='color: red;'>2 v 1 3 1 5 0 . 2 0 4 2 : v i X r a</p><h3 style='color: black;'>Financial Report Chunking for Eﬀective Retrieval Augmented Generation</h3><p style='color: green;'>Unstructured Technologies Sacramento, CA, USA leah@unstructured.io https://unstructured.io Abstract. Chunking information is a key step in Retrieval Augmented Generation (RAG). Current research primarily centers on paragraph- level chunking. This approach treats all texts as equal and neglects the information contained in the structure of documents. We propose an expanded approach to chunk documents by moving beyond mere paragraph-level chunking to chunk primary by structural element com- ponents of documents. Dissecting documents into these constituent ele- ments creates a new way to chunk documents that yields the best chunk size without tuning. We introduce a novel framework that evaluates how chunking based on element types annotated by document understanding models contributes to the overall context and accuracy of the informa- tion retrieved. We also demonstrate how this approach impacts RAG assisted Question & Answer task performance. Our research includes a comprehensive analysis of various element types, their role in eﬀective information retrieval, and the impact they have on the quality of RAG outputs. Findings support that element type based chunking largely im- prove RAG results on ﬁnancial reporting. Through this research, we are also able to answer how to uncover highly accurate RAG. Keywords: Retrieval Augmented Generation · Document Chunking · Document Pre-Processing · Financial Domain · Large Language Models</p><h3 style='color: black;'>Introduction</h3><p style='color: blue;'>contents of extensive documents [25,22,18]. By dissecting large volumes of text into smaller, more focused segments, LLMs can process each part with greater precision, ensuring a thorough understanding of each section. 
This segmented approach allows for meticulous analysis of unstructured data, enabling LLMs to construct a more comprehensive and coherent understanding of the entire docu- ment [41]. There remains a challenge in ensuring factual accuracy and relevance in the generated responses, especially when dealing with complex or extensive information. Recently, Retrieval Augmented Generation (RAG) [21,12] has been devel- oped to address the hallucination problem with LLMs [15,43] when recovering factual information directly from an LLM. In RAG, instead of answering a user query directly using an LLM, the user query is used to retrieve documents or segments from a corpus and the top retrieved documents or segments are used to generate the answer in conjunction with an LLM. In this way, RAG con- straints the answer to the set of retrieved documents. RAGs have been used as well to answer questions from single documents [14]. The documents are split into smaller parts or chunks, indexed by a retrieval system and recovered and processed depending on the user information need. In a sense, this process allows answering questions about information in a single document, thus contributing to the set of techniques available for document understanding.</p><p style='color: magenta;'>Since documents need to be chunked for RAG processing, this raises the question about what is the best practice to chunk documents for eﬀective RAG document understanding. There are several dimensions to consider when decid- ing how to chunk a document, which includes the size of the chunks. The retrieval system in RAG can use traditional retrieval systems using bag- of-words methods or a vector database. If a vector database is used, then an embedding needs to be obtained from each chunk, thus the number of tokens in the chunk is relevant since the neural networks processing the chunks might have constraints on the number of tokens. As well, diﬀerent chunk sizes might have undesirable retrieval results. 
Since the most relevant retrieved chunks need to be processed by an LLM, the number of tokens in retrieved chunks might have an eﬀect in the generation of the answer [25]. As we see, chunking is re- quired for RAG systems and there are several advantages and disadvantages when considering how to chunk a document.</p><p style='color: red;'>In this work, we study speciﬁcally the chunking of U.S. Securities and Ex- change Commission (SEC)1 Financial Reports2, including 10-Ks, 10-Qs, and 8-Ks. This study plays a critical role in oﬀering insights into the ﬁnancial health and operational dynamics of public companies. These documents present unique challenges in terms of document processing and information extraction as they consist of varying sizes and layouts, and contain a variety of tabular informa- tion. Previous work has evaluated the processing of these reports with simple chunking strategies (e.g., tokens), but we believe that a more eﬀective use of these reports might be achieved by a better pre-processing of the documents</p><p style='color: green;'>Financial Report Chunking for Eﬀective Retrieval Augmented Generation and chunking conﬁguration3 [14]. To the best of our knowledge, this is the ﬁrst systematic study on chunking for document understanding and more speciﬁcally for processing ﬁnancial reports.</p><h3 style='color: black;'>2 Related work</h3><p style='color: blue;'>Exploring the structure of ﬁnancial reports is an exceptional area for es- tablishing optimal principles for chunking. The intricate nature of document structures and contents has resulted in most of the work processing ﬁnancial reports focusing on the identiﬁcation of structural elements. Among previous work, we ﬁnd El-Haj et al. [10] and the FinTOC challenges [17,4,11] that have worked at the document structure level for UK and French ﬁnancial reports. 
Ad- 3 https://www.cnbc.com/2023/12/19/gpt-and-other-ai-models-cant-analyze- an-sec-filing-researchers-find.html</p><p style='color: magenta;'>ditionally, there is recent work that considers U.S. SEC reports, which includes DocLayNet [33] and more speciﬁcally with the report tables in FinTabNet [45]. On the side of ﬁnancial models, there is work in sentiment analysis in ﬁ- nance [37], which includes the pre-training of specialised models such as Fin- BERT by Liu et al. [26], which is a BERT based model pre-trained on large corpora including large collections of ﬁnancial news collected from diﬀerent sites and FinBERT by DeSola et al, [9] trained on Wikipedia, BookCorpus and U.S. SEC data. Additional models include BloombergGPT [40], FinGPT [42] and Instruct-FinGPT[44]. More advance datasets in the ﬁnancial domain include FinQA [6], LLMWare [27], ConFIRM [8] and TAT-QA [46] among others [7,38,19] that have been prepared for retrieval and or Questions and Answering (Q&A) tasks over snippets of ﬁ- nancial data that includes tabular data, which has allowed methods on large language models to be tested on them [39]. Most of the previous work has focused on understanding the layout of ﬁ- nancial documents or understanding speciﬁc snippets of existing reports with diﬀerent levels of complexity, but there has not been much research in under- standing ﬁnancial report documents, except some more recent work that includes FinanceBench [14], in which a set of questions about the content of ﬁnancial re- ports are proposed that includes the evidence snippet.</p><p style='color: red;'>More speciﬁcally on document chunking methods for RAG, there are stan- dard approaches being considered such as chunking text into spans of a given token length (e.g. 128 and 256) or chunking based on sentences. Open source projects already allow simple processing of documents (e.g. 
Unstructured4, Lla- maindex5 or Langchain 6), without explicitly considering the table structure on which these chunking strategies are applied. Even though diﬀerent approaches are available, an exhaustive evaluation of chunking applied to RAG and speciﬁcally to ﬁnancial reporting, except for some limited chunking analysis [14,36], is non-existent. In our work, we compare a broad range of chunking approaches in addition to more simple ones and provide an analysis of the outcomes of diﬀerent methods when asking questions about diﬀerent aspects of the reports.</p><h3 style='color: black;'>3.1 RAG setting for the experiments</h3><p style='color: green;'>Financial Report Chunking for Eﬀective Retrieval Augmented Generation document, the document is split into chunks and the chunks are indexed into a vector database (vectordb). When a question is sent to the RAG system, the top-k chunks most similar to the question are retrieved from the vector database and used to generate the answer using a large language model as generator. In order to retrieve chunks from the vector database, the question is encoded into a vector that is compared to the vector previously generated from the chunks. To prompt the generator, the question is converted into a set of instructions that instruct the LLM to ﬁnd the answer within the top-k retrieved chunks. question vectordb top k question vector chunks encoder v | generator —+ answer * question to prompt + rome</p><p style='color: blue;'>Fig. 1. RAG steps to answer a question about a document In our experiments, we modify the way documents are chunked prior to being indexed in the vector database. All other settings remain constant. 
In the following sections, we describe in more detail each one of the components and processes used.</p><h3 style='color: black;'>3.2 Indexing and retrieval</h3><p style='color: magenta;'>As shown in ﬁgure 2, to index a document, ﬁrst the document is split into chunks, then each chunk is processed by an encoder model and then indexed into the vector database. Based on the chunking strategy a document will be split into a larger or smaller set of chunks. chunks vectors Fig. 2. Indexing of document chunks into the vector database ttps://huggingface. co/sentence-transformers/multi-qa-mpnet-base-dot-</p><p style='color: red;'>6 Jimeno Yepes et al. As shown in ﬁgure 1, to retrieve chunks relevant to a question, the question is converted into a vector representation and the vector database returns a ranked list of chunks based on the similarity between question vector and the chunks in the database. Weaviate implements an approximate nearest neighbours algo- rithm [28] as their retrieval approach, which supports fast retrieval with high accuracy. In our experiments, we retrieve the top-10 chunks for each question.</p><h3 style='color: black;'>3.3 Generation</h3><p style='color: green;'>We have used GPT-4 [31] as the generator, which has shown best performance compared to earlier versions. As well, its performance was better compared to existing open source alternatives [22] such as Mixtral [16]. We used the prompt presented in ﬁgure 3 that we designed on another similar RAG implementation with diﬀerent document types. The prompt conditions the answer to the query and the chunks, referred to as source, and if the generator cannot answer it should return No answer. please answer the question below by referencing the list of sources provided after the question; if the question can not be answered just respond ’No answer’. The sources are listed after \"Sources:\".  Question: {query}  Sources: {key} - {source} ...  
Sources: {key} - {source} ...</p><p style='color: blue;'>Fig. 3. Example prompt template used by the generator</p><h3 style='color: black;'>3.4 Chunking</h3><p style='color: magenta;'>In addition to chunking based on the number of tokens, we have processed the documents using computer vision and natural language processing to extract elements identiﬁed in the reports. The list of elements considered are provided by the Unstructured9 open source library. From the set of processing strategies, 9 https://unstructured-io.github.io/unstructured/introduction.html# elements</p><p style='color: red;'>Financial Report Chunking for Eﬀective Retrieval Augmented Generation we use Chipper, a vision encoder decoder10 model inspired by Donut [20] to showcase the performance diﬀerence. The Chipper model outputs results as a JSON representation of the document, listing elements per page characterized by their element type. Additionally, Chipper provides a bounding box enclosing each element on the page and the corresponding element text.</p><p style='color: green;'>These elements are sometimes short to be considered as chunks, so to gen- erate chunks from elements the following steps have been followed. Given the structure of ﬁnance reporting documents, our structural chunking eﬀorts are con- centrated on processing titles, texts, and tables. The steps to generate element- based chunks are:</p><p style='color: blue;'>– if the element text length is smaller than 2,048 characters, a merge with the following element is attempted – iteratively, element texts are merged following the step above till either the desired length is achieved, without breaking the element – if a title element is found, a new chunk is started – if a table element is found, a new chunk is started, preserving the entire table After element-based chunks have been derived, three types of metadata are generated to enrich the content and support eﬃcient indexing. 
The ﬁrst two types, generated via predeﬁned prompt templates with GPT-4, include: 1) up to 6 representative keywords of the composite chunk 2) a summarised paragraph of the composite chunk. The third type is 3) Naive representation using the ﬁrst two sentences from a composite chunk (a kind of preﬁx) and in the case of tables, the description of the table, which is typically identiﬁed in the table caption.</p><h3 style='color: black;'>3.5 Dataset</h3><p style='color: magenta;'>This dataset is made of 150 instances with questions and answers from 84 unique reports. The dataset does not include the source documents, which we have downloaded. We were able to recover only 80 documents, which reduces the number of questions to 141 from the original 150. The distribution of Un- structured elements predictions are shown in table 1. Documents have a varying number of pages, spanning from 4 pages (FOOT- LOCKER 2022 8K dated-2022-05-20) to 549 pages (e.g. PEPSICO 2021 10K), with an average of 147.34 with std 97.78 with a total of 11,787 pages combined. Each instance contains a link to the report, the question, a question type , the answer and supporting evidence, with page number where the evidence is located 10 https://huggingface.co/docs/transformers/model_doc/vision-encoder- decoder 8 Jimeno Yepes et al. Table 1. 
Unstructured element types distribution for Chipper predictions against doc- uments in FinanceBench.</p><p style='color: red;'><table><thead><th>Element Type</th><th>[Chipper Entities</th></thead><tr><td>NarrativeText</td><td>61,780</td></tr><tr><td>Title</td><td>29,664</td></tr><tr><td>ListItem</td><td>33,054</td></tr><tr><td>UncategorizedText</td><td>9,400</td></tr><tr><td>Footer</td><td>1,026</td></tr><tr><td>Table</td><td>7,700</td></tr><tr><td>Header</td><td>3,959</td></tr><tr><td>Image</td><td>26</td></tr><tr><td>FigureCaption</td><td>54</td></tr><tr><td>Formula</td><td>29</td></tr><tr><td>Address</td><td>229</td></tr><tr><td>Total</td><td>146,921</td></tr></table></p><p style='color: green;'>in the document, that allows for a closer evaluation of the results. Based on the page number, evidence contexts are located in diﬀerent areas in the documents, ranging from the ﬁrst page in some cases up to page 304 in one instance. The mean page number to ﬁnd the evidence is 54.58 with a standard deviation of 43.66, which shows that evidence contexts to answer the questions are spread within a document.</p><p style='color: blue;'>These characteristics make FinanceBench a perfect dataset for evaluating RAG. An example instance is available in table 2.</p><h3 style='color: black;'>4 Results</h3><p style='color: magenta;'>We are considering 80 documents and 141 questions from FinanceBench. Using the OpenAI tokenizer from the model text-embedding-ada-002 that uses the tokenizer cl100k base11, there are on average 102,444.35 tokens with std of 61,979.45, which shows the large variability of document lengths as seen by the diﬀerent number of pages per document presented above. Chunking Eﬃciency The ﬁrst thing we analyzed is the total number of chunks, as it impacts indexing time. We would like to observe the relationship between accuracy and total chunk size. Table 3 shows the number of chunks derived from each one of the processing methods. 
Unstructured element-based chunks are closer in size to Base 512, and as the chunk size decreases for the basic chunking strategies, the total number of chunks increases linearly. Financial Report Chunking for Eﬀective Retrieval Augmented Generation Table 2. Example question from the FinanceBench dataset</p><p style='color: red;'><table><thead><th>Field</th><th>Value</th></thead><tr><td></td><td>financebench-id|financebench.id_00859</td></tr><tr><td>doc_name</td><td>VERIZON.2021_10K</td></tr><tr><td>doc_link</td><td>https: //www.verizon.com/about/sites/default /files/2021-Annual- Report-on-Form-10-K.pdf</td></tr><tr><td>question_type</td><td>*novel-generated’</td></tr><tr><td>question</td><td>Among all of the derivative instruments that Verizon used to manage] the exposure to fluctuations of foreign currencies exchange rates or interest rates, which one had the highest notional value in FY 2021?</td></tr><tr><td>answer</td><td>Cross currency swaps. Its notional value was $32,502 million.,</td></tr><tr><td>evidence_text</td><td>Derivative Instruments We enter into derivative transactions primarily to manage our exposure to fluctuations in foreign currency exchange rates and interest rates. We employ risk management strategies, which may include the use of a variety of derivatives including interest rate swaps, cross currency swaps, forward starting interest rate swaps, trea- sury rate locks, interest rate caps, swaptions and foreign exchange for- wards. We do not hold derivatives for trading purposes. The following table sets forth the notional amounts of our outstanding derivative in- struments: (dollars in millions) At December 31, 2021 2020 Interest rate swaps $ 19,779 $ 17,768 Cross currency swaps 32,502 26,288 Forward starting interest rate 1,000 2,000 Foreign exchange forwards 932</td></tr><tr><td>page-number</td><td></td></tr></table></p><p style='color: green;'>Table 3. 
Chunks statistics for basic chunking elements and Unstructured elements</p><p style='color: blue;'><table><thead><th>Processing|total</th><th>chunks|mean</th><th>chunks per document</th><th>(std)|tables mean (std)</th></thead><tr><td>Base 128</td><td>| 64,058</td><td>800.73 (484.11)</td><td></td></tr><tr><td>Base 256</td><td>| 32,051</td><td>400.64 (242.04) (</td><td></td></tr><tr><td>Base 512</td><td>| 16,046</td><td>200.58 (121. 01)</td><td></td></tr><tr><td>Chipper</td><td>20,843</td><td>260.57 (145.80)</td><td>96.20 (57.53)</td></tr></table></p><p style='color: magenta;'>Retrieval Accuracy Secondly, we evaluate the capabilities of each chunking strategy in terms of retrieval accuracy. We use the page numbers in the ground truth to calculate the page-level retrieval accuracy, and we use ROUGE [24] and BLEU [32] scores to evaluate the accuracy of paragraph-level retrieval compared to the ground truth evidence paragraphs. As shown in Table 4, when compared to Unstructured element-based chunk- ing strategies, basic chunking strategies seem to have higher page-level retrieval accuracy but lower paragraph-level accuracy on average. Additionally, basic chunking strategies also lack consistency between page-level and paragraph-level accuracy; higher page-level accuracy doesn’t ensure higher paragraph-level ac- curacy. For example, Base 128 has the second highest page-level accuracy but the lowest paragraph-level scores among all. On the other hand, element-based chunking strategies showed more consistent results. A fascinating discovery is that when various chunking strategies are com- bined, it results in enhanced retrieval scores, achieving superior performance at both the page level (84.4%) and paragraph level (with ROUGE at 0.568% and BLEU at 0.452%). 
This ﬁnding addresses an unresolved question: how to improve the accuracy of RAG.</p><p style='color: red;'>The element based method provides the highest scores and it also provides a mechanism to chunk documents without the need to ﬁne tune hyper-parameters like the number of tokens in a chunk. This suggests the element based method is more generalizable and can be applied to new types of documents.</p><p style='color: green;'>Q&A Accuracy Third, we evaluate the Q&A accuracy for the chunking strate- gies. In addition to manual evaluation, we have investigated an automatic evalua- tion using GPT-4. GPT-4 compares how the answers provided by our method are similar to or diﬀerent from the FinanceBench gold standard, similar approaches have been previously evaluated [13,23,29,30]. The automatic evaluation allows scaling the evaluation eﬀorts for the diﬀerent chunking strategies that we have considered. We used the prompt template in ﬁgure 4.</p><p style='color: blue;'>Begin with True or False. Are the two following answers (Answer 1 and Answer 2) the same with respect to the question between single quotes ’{question}’?  Answer 1: ’{ground_truth_answer}’ Answer 2: ’{generated_answer}’  Fig. 4. Evaluation prompt template. The {question}, {ground truth answer} and {generated answer} ﬁelds are substituted for each question accordingly.</p><p style='color: magenta;'>Results in table 5 show that element-based chunking strategies oﬀer the best question-answering accuracy, which is consistent with page retrieval and para- graph retrieval accuracy. Lastly, our approach stands out for its eﬃciency. Not only is element-based chunking generalizable without the need to select the chunk size, but when com- pared to the aggregation results that yield the highest retrieval scores. Element- based chunking achieves the highest retrieval scores with only half the number of chunks required compared to methods that do not consider the structure of the documents (62,529 v.s. 112,155). 
This can reduce the indexing cost and im- prove query latency because there are only half as many vectors to index for the vectordb that stores the chunks. This underscores the eﬀectiveness of our solu- tion in optimizing the balance between performance and computational resource requirements. Financial Report Chunking for Eﬀective Retrieval Augmented Generation</p><p style='color: red;'>Table 4. Retrieval results. For each chunking strategy, we show the number of chunks for all the documents (Total Chunks), Page Accuracy, and ROUGE and BLEU scores. ROUGE and BLEU are calculated as the maximum score from the list of recovered contexts for a question when compared to the known evidence for that question.</p><p style='color: green;'><table><thead><th>Chunking strategy</th><th>Total Chunks}</th><th>Page Accuracy</th><th>ROUGE|BLEU.</th></thead><tr><td>Base 128</td><td>64,058</td><td>72.34</td><td>0.383</td></tr><tr><td>Base 256</td><td>32,051</td><td>73.05</td><td>0.433</td></tr><tr><td>Base 512</td><td>16,046</td><td>68.09</td><td>0.455</td></tr><tr><td>Base Aggregation</td><td>112,155</td><td>83.69</td><td>0.536</td></tr><tr><td>Keywords Chipper</td><td></td><td>46.10</td><td>0.444</td></tr><tr><td>Summary Chipper</td><td></td><td>62.41</td><td>0.473</td></tr><tr><td>Prefix &amp; Table Description Chipper</td><td></td><td>67.38</td><td>0.514</td></tr><tr><td>Chipper Aggregation</td><td>a</td><td>84.40</td><td>0.568</td></tr></table></p><p style='color: blue;'>Table 5. Q&A results. 
We show the percentage of questions with no answer and as well the accuracy either estimated automatically using GPT-4 or manually.</p><p style='color: magenta;'><table><thead><th>Chunking strategy</th><th>No</th><th></th><th>answer|GPT-4|Manual</th></thead><tr><td>Base 128</td><td>35.46</td><td>29.08</td><td>| 35.46</td></tr><tr><td>Base 256</td><td>5.5¢</td><td>32.62</td><td>| 36.88</td></tr><tr><td>Base 512</td><td>24.82</td><td>41.84</td><td>| 48.23</td></tr><tr><td>Keywords Chipper</td><td>22.70 |</td><td>43.97]</td><td>53.19</td></tr><tr><td>Summary Chipper</td><td>17.73</td><td>|43.97])</td><td>51.77</td></tr><tr><td>Prefix &amp; Table Description Chipper]</td><td>20.57</td><td>41.13</td><td>| 53.19</td></tr></table></p><h3 style='color: black;'>5 Discussion</h3><p style='color: red;'>We have observed that using basic 512 chunking strategies produces results most similar to the Unstructured element-based approach, which may be due to the fact that 512 tokens share a similar length with the token size within our element-based chunks and capture a long context, but fail keep a coherent context in some cases, leaving out relevant information required for Q&A. This is further observed when considering the ROUGE and BLEU scores in table 4, where the chunk contexts for the baseline have lower scores. These ﬁndings support existing research stating that the best basic chunk size varies from data to data [3]. These results show, as well, that our method adapts to diﬀerent documents without tuning. Our method relies on the struc- tural information that is present in the document’s layout to adjust the chunk size automatically.</p><p style='color: green;'>We have evaluated aggregating the output of diﬀerent chunking methods in the retrieval experiments as sown in table 4. Even though the aggregation seems to be eﬀective for retrieval, the Q&A exceeded the GPT-4 token limit, which resulted in a non-eﬀective Q&A solution using the selected model. 
As well, we evaluated variations of the prompt used to generate the answers (see ﬁgure 3). Re-ordering the retrieval context and the question, but results were not statistically diﬀerent. We experimented as well with variations of the verbs using in the prompt, e.g. changing referencing with using, which seemed to lower the quality of the answers generated. This shows that prompt engineering is a relevant factor in RAG. We evaluated using GPT-4 for evaluation instead of relying on manual evalu- ation. In most cases, GPT-4 evaluated correctly but failed when a more elaborate answer is provided. As shown in ﬁgure 5, the answer is 39.7% while the estimated answer is 39.73% but with a detailed explanation of the calculation.</p><p style='color: blue;'>Question: ’What is Coca Cola’s FY2021 COGS % margin? Calculate what was asked by utilizing the line items clearly shown in the income statement.’?  Answer 1: ’39.7%’ Answer 2: ’From the income statement referenced on page 60 of COCACOLA_2021_10K_embedded.json, we can see that Coca Cola’s total revenue in FY2021 was $38,655 million and their cost of goods sold (COGS) was $15,357 million. To calculate the COGS % margin, we divide the COGS by the total revenue and multiply by 100: (15,357 / 38,655) * 100 = 39.73% So, Coca Cola’s FY2021 COGS % margin was approximately 39.73%.’ </p><p style='color: magenta;'>Fig. 5. Evaluation prompt template</p><h3 style='color: black;'>6 Conclusions and Future Work</h3><p style='color: red;'>Financial Report Chunking for Eﬀective Retrieval Augmented Generation Furthermore, we would like to study the impact of RAG conﬁguration and ele- ment type based chunking.</p><h3 style='color: black;'>References</h3><p style='color: green;'>2. Balaguer, A., Benara, V., de Freitas Cunha, R.L., de M. 
Estev˜ao Filho, R., Hendry, T., Holstein, D., Marsman, J., Mecklenburg, N., Malvar, S., Nunes, L.O., Padilha, R., Sharp, M., Silva, B., Sharma, S., Aski, V., Chandra, R.: Rag vs ﬁne-tuning: Pipelines, tradeoﬀs, and a case study on agriculture (2024) 3. Barnett, S., Kurniawan, S., Thudumu, S., Brannelly, Z., Abdelrazek, M.: Seven Failure Points When Engineering a Retrieval Augmented Generation System (2024)</p><p style='color: blue;'>4. Bentabet, N.I., Juge, R., El Maarouf, I., Mouilleron, V., Valsamou-Stanislawski, D., El-Haj, M.: The ﬁnancial document structure extraction shared task (ﬁntoc 2020). In: Proceedings of the 1st Joint Workshop on Financial Narrative Processing and MultiLing Financial Summarisation. pp. 13–22 (2020)</p><p style='color: magenta;'>5. Chen, H., Jiao, F., Li, X., Qin, C., Ravaut, M., Zhao, R., Xiong, C., Joty, S.: Chat- GPT’s One-year Anniversary: Are Open-Source Large Language Models Catching up? arXiv preprint arXiv:2311.16989 (2023) 6. Chen, Z., Chen, W., Smiley, C., Shah, S., Borova, I., Langdon, D., Moussa, R., Beane, M., Huang, T.H., Routledge, B., et al.: Finqa: A dataset of numerical reasoning over ﬁnancial data. arXiv preprint arXiv:2109.00122 (2021) 7. Chen, Z., Li, S., Smiley, C., Ma, Z., Shah, S., Wang, W.Y.: ConvFinQA: Exploring the Chain of Numerical Reasoning in Conversational Finance Question Answering (2022) 8. Choi, S., Gazeley, W., Wong, S.H., Li, T.: Conversational Financial Information Retrieval Model (ConFIRM). arXiv preprint arXiv:2310.13001 (2023)</p><p style='color: red;'>9. DeSola, V., Hanna, K., Nonis, P.: Finbert: pre-trained model on sec ﬁlings for ﬁnancial natural language tasks. University of California (2019) 10. El-Haj, M., Rayson, P., Young, S., Walker, M.: Detecting document structure in a very large corpus of UK ﬁnancial reports. European Language Resources Associa- tion (ELRA) (2014) 11. 
El Maarouf, I., Kang, J., Azzi, A.A., Bellato, S., Gan, M., El-Haj, M.: The ﬁnancial document structure extraction shared task (FinTOC2021). In: Proceedings of the 3rd Financial Narrative Processing Workshop. pp. 111–119 (2021)</p><p style='color: green;'>12. Gao, Y., Xiong, Y., Gao, X., Jia, K., Pan, J., Bi, Y., Dai, Y., Sun, J., Wang, H.: Retrieval-augmented generation for large language models: A survey. arXiv preprint arXiv:2312.10997 (2023) 13. Hada, R., Gumma, V., de Wynter, A., Diddee, H., Ahmed, M., Choudhury, M., Bali, K., Sitaram, S.: Are large language model-based evaluators the solution to scaling up multilingual evaluation? arXiv preprint arXiv:2309.07462 (2023) 14. Islam, P., Kannappan, A., Kiela, D., Qian, R., Scherrer, N., Vidgen, B.: Fi- nanceBench: A New Benchmark for Financial Question Answering. arXiv preprint arXiv:2311.11944 (2023)</p><p style='color: blue;'>15. Ji, Z., Lee, N., Frieske, R., Yu, T., Su, D., Xu, Y., Ishii, E., Bang, Y.J., Madotto, A., Fung, P.: Survey of Hallucination in Natural Language Generation. ACM Comput- ing Surveys 55(12), 1–38 (Mar 2023). https://doi.org/10.1145/3571730, http:// dx.doi.org/10.1145/3571730</p><p style='color: magenta;'>14 Jimeno Yepes et al. 16. Jiang, A.Q., Sablayrolles, A., Roux, A., Mensch, A., Savary, B., Bamford, C., Chaplot, D.S., de las Casas, D., Hanna, E.B., Bressand, F., Lengyel, G., Bour, G., Lample, G., Lavaud, L.R., Saulnier, L., Lachaux, M.A., Stock, P., Subramanian, S., Yang, S., Antoniak, S., Scao, T.L., Gervet, T., Lavril, T., Wang, T., Lacroix, T., Sayed, W.E.: Mixtral of Experts (2024)</p><p style='color: red;'>17. Juge, R., Bentabet, I., Ferradans, S.: The ﬁntoc-2019 shared task: Financial doc- ument structure extraction. In: Proceedings of the Second Financial Narrative Processing Workshop (FNP 2019). pp. 51–57 (2019)</p><p style='color: green;'>18. Kaddour, J., Harris, J., Mozes, M., Bradley, H., Raileanu, R., McHardy, R.: Chal- lenges and applications of large language models. 
arXiv preprint arXiv:2307.10169 (2023)</p><p style='color: blue;'>19. Kaur, S., Smiley, C., Gupta, A., Sain, J., Wang, D., Siddagangappa, S., Aguda, T., Shah, S.: REFinD: Relation Extraction Financial Dataset. In: the 46th International ACM SIGIR Conference on Re- Proceedings of search and Development in Information Retrieval. SIGIR ’23, ACM (Jul 2023). https://doi.org/10.1145/3539618.3591911, http://dx.doi.org/10.1145/ 3539618.3591911</p><p style='color: magenta;'>20. Kim, G., Hong, T., Yim, M., Park, J., Yim, J., Hwang, W., Yun, S., Han, D., Park, S.: Donut: Document understanding transformer without ocr. arXiv preprint arXiv:2111.15664 7, 15 (2021) 21. Lewis, P., Perez, E., Piktus, A., Petroni, F., Karpukhin, V., Goyal, N., K¨uttler, H., Lewis, M., Yih, W.t., Rockt¨aschel, T., et al.: Retrieval-augmented generation for knowledge-intensive NLP tasks. Advances in Neural Information Processing Systems 33, 9459–9474 (2020)</p><p style='color: red;'>22. Li, D., Shao, R., Xie, A., Sheng, Y., Zheng, L., Gonzalez, J.E., Stoica, I., Ma, X., Zhang, H.: How Long Can Open-Source LLMs Truly Promise on Context Length? (June 2023), https://lmsys.org/blog/2023-06-29-longchat</p><p style='color: green;'>23. Li, Y., Duan, Y.: The evaluation of experiments of artiﬁcial general intelligence with gpt-4 based on dikwp. arXiv preprint (2023) 24. Lin, C.Y.: Rouge: A package for automatic evaluation of summaries. In: Text sum- marization branches out. pp. 74–81 (2004) 25. Liu, N.F., Lin, K., Hewitt, J., Paranjape, A., Bevilacqua, M., Petroni, F., Liang, P.: Lost in the middle: How language models use long contexts. arXiv preprint arXiv:2307.03172 (2023) 26. Liu, Z., Huang, D., Huang, K., Li, Z., Zhao, J.: Finbert: A pre-trained ﬁnancial language representation model for ﬁnancial text mining. In: Proceedings of the twenty-ninth international conference on international joint conferences on artiﬁcial intelligence. pp. 4513–4519 (2021) llmware: Rag Instruct Benchmark Tester. 
https://huggingface.co/datasets/ llmware/rag_instruct_benchmark_tester, Accessed: January 15, 2024</p><p style='color: blue;'>28. Malkov, Y.A., Yashunin, D.A.: Eﬃcient and robust approximate nearest neigh- bor search using hierarchical navigable small world graphs. IEEE transactions on pattern analysis and machine intelligence 42(4), 824–836 (2018)</p><p style='color: magenta;'>29. Moore, S., Nguyen, H.A., Chen, T., Stamper, J.: Assessing the quality of multiple- choice questions using gpt-4 and rule-based methods. In: European Conference on Technology Enhanced Learning. pp. 229–245. Springer (2023) 30. Naismith, B., Mulcaire, P., Burstein, J.: Automated evaluation of written discourse coherence using gpt-4. In: Proceedings of the 18th Workshop on Innovative Use of NLP for Building Educational Applications (BEA 2023). pp. 394–403 (2023) 31. OpenAI, :, Achiam, J., Adler, S., Agarwal, S., et al.: GPT-4 Technical Report</p><p style='color: red;'>(2023) Financial Report Chunking for Eﬀective Retrieval Augmented Generation 32. Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: Bleu: a method for automatic evaluation of machine translation. In: Proceedings of the 40th annual meeting of the Association for Computational Linguistics. pp. 311–318 (2002) 33. Pﬁtzmann, B., Auer, C., Dolﬁ, M., Nassar, A.S., Staar, P.: Doclaynet: A large human-annotated dataset for document-layout segmentation. In: Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining. pp. 3743–3751 (2022)</p><p style='color: green;'>34. Pinecone: Chunking strategies for llm applications, https://www.pinecone.io/ learn/chunking-strategies/ 35. Reimers, N., Gurevych, I.: Sentence-bert: Sentence embeddings using siamese bert- networks. In: Proceedings of the 2019 Conference on Empirical Methods in Nat- ural Language Processing. Association for Computational Linguistics (11 2019), https://arxiv.org/abs/1908.10084</p><p style='color: blue;'>36. 
Retteter, J.: Mastering Table Extraction: Revolutionize Your Earnings Re- ports Analysis with AI. https://medium.com/unstructured-io/mastering- table-extraction-revolutionize-your-earnings-reports-analysis-with- ai-1bc32c22720e, Accessed: January 15, 2024 37. Rizinski, M., Peshov, H., Mishev, K., Jovanovik, M., Trajanov, D.: Sentiment Anal- ysis in Finance: From Transformers Back to eXplainable Lexicons (XLex) (2023) 38. Shah, R.S., Chawla, K., Eidnani, D., Shah, A., Du, W., Chava, S., Raman, N., Smiley, C., Chen, J., Yang, D.: WHEN FLUE MEETS FLANG: Benchmarks and Large Pre-trained Language Model for Financial Domain (2022) 39. Singh Phogat, K., Harsha, C., Dasaratha, S., Ramakrishna, S., Akhil Puranam, S.: Zero-Shot Question Answering over Financial Documents using Large Language Models. arXiv e-prints pp. arXiv–2311 (2023)</p><p style='color: magenta;'>40. Wu, S., Irsoy, O., Lu, S., Dabravolski, V., Dredze, M., Gehrmann, S., Kambadur, P., Rosenberg, D., Mann, G.: BloombergGPT: A Large Language Model for Finance (2023) 41. Xu, P., Ping, W., Wu, X., McAfee, L., Zhu, C., Liu, Z., Subramanian, S., Bakhtu- rina, E., Shoeybi, M., Catanzaro, B.: Retrieval meets Long Context Large Language Models (2023) 42. Yang, H., Liu, X.Y., Wang, C.D.: FinGPT: Open-Source Financial Large Language Models (2023) 43. Ye, H., Liu, T., Zhang, A., Hua, W., Jia, W.: Cognitive Mirage: A Review of Hallucinations in Large Language Models (2023) 44. Zhang, B., Yang, H., Liu, X.Y.: Instruct-FinGPT: Financial Sentiment Analysis by Instruction Tuning of General-Purpose Large Language Models (2023)</p><p style='color: red;'>45. Zheng, X., Burdick, D., Popa, L., Zhong, X., Wang, N.X.R.: Global table extractor (gte): A framework for joint table identiﬁcation and cell structure recognition using visual context. In: Proceedings of the IEEE/CVF winter conference on applications of computer vision. pp. 697–706 (2021) 46. 
Zhu, F., Lei, W., Huang, Y., Wang, C., Zhang, S., Lv, J., Feng, F., Chua, T.S.: TAT-QA: A question answering benchmark on a hybrid of tabular and textual content in ﬁnance. arXiv preprint arXiv:2105.07624 (2021)</p>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "print_chunks_by_title(chunks_by_title)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.18"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
+%% Cell type:markdown id: tags:
+
+### Partition elements using Unstructured library
+
+%% Cell type:code id: tags:
+
+``` python
+# It may take longer to install the package
+!pip install -q -U "unstructured[pdf]"
+```
+
+%% Cell type:code id: tags:
+
+``` python
+from unstructured.partition.auto import partition
+
+# PDF that is partitioned and chunked in the rest of this example
+article_url = "https://arxiv.org/pdf/2402.05131.pdf"
+# Fetch the PDF from the URL and partition it into Unstructured elements.
+# NOTE(review): `pdf_infer_table_structure=True` is presumably what populates
+# `metadata.text_as_html` on Table elements (read later when chunking) - confirm
+# against the Unstructured documentation.
+elements = partition(url=article_url, strategy="hi_res", pdf_infer_table_structure=True)
+```
+
+%% Output
+
+    Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
+    - This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
+    - This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
+
+%% Cell type:markdown id: tags:
+
+#### Define helper functions
+
+%% Cell type:markdown id: tags:
+
+Validate whether a parsed title element is a real title
+
+%% Cell type:code id: tags:
+
+``` python
+import re
+
+
+def is_valid_title(title: str) -> bool:
+    # Rule 1: Title starts with a lowercase letter
+    if re.match(r"^[a-z]", title):
+        return False
+    # Rule 2: Title has a special character (excluding :, -, and .)
+    if re.search(r"[^\w\s:\-\.]", title):
+        return False
+    # Rule 3: Title ends with a dot
+    if title.endswith("."):
+        return False
+    return True
+```
+
+%% Cell type:markdown id: tags:
+
+Group elements by valid titles
+
+%% Cell type:code id: tags:
+
+``` python
+from unstructured.documents.elements import Element
+from colorama import Fore, Style
+
+
+def group_elements_by_title(elements: list[Element]) -> dict:
+    grouped_elements = {}
+    current_title = "Untitled"  # Default title for initial text without a title
+
+    for element in elements:
+        element_dict = element.to_dict()
+
+        if element_dict.get("type") == "Title":
+            potential_title = element_dict.get("text", "Untitled")
+            if is_valid_title(potential_title):
+                print(f"{Fore.GREEN}{potential_title}: True{Style.RESET_ALL}")
+                current_title = potential_title
+            else:
+                print(f"{Fore.RED}{potential_title}: False{Style.RESET_ALL}")
+                continue
+        else:
+            if current_title not in grouped_elements:
+                grouped_elements[current_title] = []
+            else:
+                grouped_elements[current_title].append(element)
+    return grouped_elements
+```
+
+%% Cell type:markdown id: tags:
+
+Generate chunks from the grouped elements using the semantic RollingWindowSplitter
+
+%% Cell type:code id: tags:
+
+``` python
+from semantic_router.splitters import RollingWindowSplitter
+
+
+def create_title_chunks(
+    grouped_elements: dict, splitter: RollingWindowSplitter
+) -> list:
+    title_with_chunks = []
+    for title, elements in grouped_elements.items():
+        if not elements:
+            continue
+        combined_element_texts = []
+        chunks = []
+
+        for element in elements:
+            if not element.text:
+                continue
+            element_dict = element.to_dict()
+            if element_dict.get("type") == "Table":
+                # Process accumulated text before the table
+                if combined_element_texts:
+                    splits = splitter(combined_element_texts)
+                    chunks.extend([split.content for split in splits])
+                    combined_element_texts = []  # Reset combined texts after processing
+
+                # Add table as a separate chunk
+                table_text_html = element.metadata.text_as_html
+                chunks.append(table_text_html)
+            else:
+                combined_element_texts.append(element.text)
+
+        # Process any remaining accumulated text after the last table
+        # or if no table was encountered
+
+        if combined_element_texts:
+            splits = splitter(combined_element_texts)
+            chunks.extend([split.content for split in splits])
+
+        if chunks:
+            title_with_chunks.append({"title": title, "chunks": chunks})
+
+    return title_with_chunks
+```
+
+%% Cell type:markdown id: tags:
+
+Display chunked text in colors
+
+%% Cell type:code id: tags:
+
+``` python
+from IPython.display import display, HTML
+import itertools
+
+
+def print_chunks_by_title(chunks_by_title):
+    color_cycle = itertools.cycle(["red", "green", "blue", "magenta"])
+    html_output = ""
+    for section in chunks_by_title:
+        title = section["title"]
+        chunks = section["chunks"]
+        html_output += f"<h3 style='color: black;'>{title}</h3>"
+        for chunk in chunks:
+            color = next(color_cycle)
+            html_output += f"<p style='color: {color};'>{chunk}</p>"
+    display(HTML(html_output))
+```
+
+%% Cell type:markdown id: tags:
+
+### Process the elements
+
+%% Cell type:code id: tags:
+
+``` python
+import os
+from semantic_router.encoders import OpenAIEncoder
+
+# The API key is read from the environment; a missing OPENAI_API_KEY raises KeyError here
+encoder = OpenAIEncoder(openai_api_key=os.environ["OPENAI_API_KEY"])
+
+# Splitter later handed to create_title_chunks.
+# NOTE(review): min_split_tokens / max_split_tokens presumably bound the size of
+# each produced split in tokens - confirm against the semantic-router docs.
+splitter = RollingWindowSplitter(
+    encoder=encoder,
+    window_size=1,  # Compares each element with the previous one
+    min_split_tokens=50,
+    max_split_tokens=300,
+)
+```
+
+%% Cell type:code id: tags:
+
+``` python
+# Group the partitioned elements by title; the accept/reject verdict for every
+# Title element is printed in the cell output
+grouped_elements = group_elements_by_title(elements)
+```
+
+%% Output
+
+    [31met! ee: False[0m
+    [31mb e F 0 1: False[0m
+    [31m] L C . s c [: False[0m
+    [32mFinancial Report Chunking for Eﬀective Retrieval Augmented Generation: True[0m
+    [32mIntroduction: True[0m
+    [31m2 Jimeno Yepes et al.: False[0m
+    [31m1 https://www.sec.gov 2 https://www.sec.gov/files/cf-frm.pdf: False[0m
+    [32m2 Related work: True[0m
+    [31m4 Jimeno Yepes et al.: False[0m
+    [32m3 Methods: True[0m
+    [32m3.1 RAG setting for the experiments: True[0m
+    [32m3.2 Indexing and retrieval: True[0m
+    [31m7 https://weaviate.io/developers/weaviate 8 https://huggingface.co/sentence-transformers/multi-qa-mpnet-base-dot-: False[0m
+    [31mv1: False[0m
+    [32m3.3 Generation: True[0m
+    [31mQuestion: {query}: False[0m
+    [32m3.4 Chunking: True[0m
+    [32m3.5 Dataset: True[0m
+    [32m4 Results: True[0m
+    [31m11 https://platform.openai.com/docs/guides/embeddings/limitations-risks: False[0m
+    [31m10 Jimeno Yepes et al.: False[0m
+    [32m5 Discussion: True[0m
+    [31m12 Jimeno Yepes et al.: False[0m
+    [32m6 Conclusions and Future Work: True[0m
+    [32mReferences: True[0m
+
+%% Cell type:code id: tags:
+
+``` python
+# Split each title group's text with the splitter; tables become standalone chunks
+chunks_by_title = create_title_chunks(grouped_elements, splitter)
+```
+
+%% Cell type:code id: tags:
+
+``` python
+print_chunks_by_title(chunks_by_title)
+```
+
+%% Output
+
+
+%% Cell type:code id: tags:
+
+``` python
+```