diff --git a/coverage.xml b/coverage.xml
index 14b27c40f0c8e94dbd46232dd28b021f8ff9da1b..8a9d849a6698b1dcc3a105b115441a1c938e1953 100644
--- a/coverage.xml
+++ b/coverage.xml
@@ -18,7 +18,7 @@
 						<line number="7" hits="1"/>
 					</lines>
 				</class>
-				<class name="hybrid_layer.py" filename="hybrid_layer.py" complexity="0" line-rate="0.9808" branch-rate="0">
+				<class name="hybrid_layer.py" filename="hybrid_layer.py" complexity="0" line-rate="0.2115" branch-rate="0">
 					<methods/>
 					<lines>
 						<line number="1" hits="1"/>
@@ -33,101 +33,101 @@
 						<line number="18" hits="1"/>
 						<line number="19" hits="1"/>
 						<line number="21" hits="1"/>
-						<line number="28" hits="1"/>
-						<line number="29" hits="1"/>
-						<line number="31" hits="1"/>
+						<line number="28" hits="0"/>
+						<line number="29" hits="0"/>
+						<line number="31" hits="0"/>
 						<line number="32" hits="0"/>
 						<line number="33" hits="0"/>
-						<line number="35" hits="1"/>
-						<line number="37" hits="1"/>
-						<line number="38" hits="1"/>
-						<line number="39" hits="1"/>
-						<line number="42" hits="1"/>
-						<line number="44" hits="1"/>
-						<line number="48" hits="1"/>
+						<line number="35" hits="0"/>
+						<line number="37" hits="0"/>
+						<line number="38" hits="0"/>
+						<line number="39" hits="0"/>
+						<line number="42" hits="0"/>
+						<line number="44" hits="0"/>
+						<line number="48" hits="0"/>
 						<line number="50" hits="1"/>
-						<line number="51" hits="1"/>
-						<line number="52" hits="1"/>
-						<line number="53" hits="1"/>
-						<line number="54" hits="1"/>
-						<line number="55" hits="1"/>
-						<line number="57" hits="1"/>
+						<line number="51" hits="0"/>
+						<line number="52" hits="0"/>
+						<line number="53" hits="0"/>
+						<line number="54" hits="0"/>
+						<line number="55" hits="0"/>
+						<line number="57" hits="0"/>
 						<line number="59" hits="1"/>
-						<line number="60" hits="1"/>
+						<line number="60" hits="0"/>
 						<line number="62" hits="1"/>
-						<line number="63" hits="1"/>
-						<line number="65" hits="1"/>
-						<line number="67" hits="1"/>
-						<line number="70" hits="1"/>
-						<line number="72" hits="1"/>
-						<line number="73" hits="1"/>
-						<line number="76" hits="1"/>
-						<line number="78" hits="1"/>
-						<line number="81" hits="1"/>
-						<line number="82" hits="1"/>
-						<line number="84" hits="1"/>
-						<line number="85" hits="1"/>
-						<line number="86" hits="1"/>
+						<line number="63" hits="0"/>
+						<line number="65" hits="0"/>
+						<line number="67" hits="0"/>
+						<line number="70" hits="0"/>
+						<line number="72" hits="0"/>
+						<line number="73" hits="0"/>
+						<line number="76" hits="0"/>
+						<line number="78" hits="0"/>
+						<line number="81" hits="0"/>
+						<line number="82" hits="0"/>
+						<line number="84" hits="0"/>
+						<line number="85" hits="0"/>
+						<line number="86" hits="0"/>
 						<line number="88" hits="1"/>
-						<line number="90" hits="1"/>
-						<line number="91" hits="1"/>
-						<line number="94" hits="1"/>
-						<line number="95" hits="1"/>
-						<line number="98" hits="1"/>
-						<line number="99" hits="1"/>
-						<line number="100" hits="1"/>
+						<line number="90" hits="0"/>
+						<line number="91" hits="0"/>
+						<line number="94" hits="0"/>
+						<line number="95" hits="0"/>
+						<line number="98" hits="0"/>
+						<line number="99" hits="0"/>
+						<line number="100" hits="0"/>
 						<line number="106" hits="1"/>
-						<line number="107" hits="1"/>
-						<line number="109" hits="1"/>
+						<line number="107" hits="0"/>
+						<line number="109" hits="0"/>
 						<line number="115" hits="1"/>
-						<line number="116" hits="1"/>
-						<line number="118" hits="1"/>
+						<line number="116" hits="0"/>
+						<line number="118" hits="0"/>
 						<line number="124" hits="1"/>
-						<line number="129" hits="1"/>
-						<line number="130" hits="1"/>
-						<line number="132" hits="1"/>
-						<line number="133" hits="1"/>
-						<line number="135" hits="1"/>
-						<line number="137" hits="1"/>
-						<line number="139" hits="1"/>
-						<line number="140" hits="1"/>
-						<line number="141" hits="1"/>
-						<line number="143" hits="1"/>
-						<line number="144" hits="1"/>
-						<line number="145" hits="1"/>
-						<line number="146" hits="1"/>
-						<line number="148" hits="1"/>
-						<line number="149" hits="1"/>
-						<line number="150" hits="1"/>
-						<line number="152" hits="1"/>
-						<line number="153" hits="1"/>
-						<line number="155" hits="1"/>
-						<line number="156" hits="1"/>
+						<line number="129" hits="0"/>
+						<line number="130" hits="0"/>
+						<line number="132" hits="0"/>
+						<line number="133" hits="0"/>
+						<line number="135" hits="0"/>
+						<line number="137" hits="0"/>
+						<line number="139" hits="0"/>
+						<line number="140" hits="0"/>
+						<line number="141" hits="0"/>
+						<line number="143" hits="0"/>
+						<line number="144" hits="0"/>
+						<line number="145" hits="0"/>
+						<line number="146" hits="0"/>
+						<line number="148" hits="0"/>
+						<line number="149" hits="0"/>
+						<line number="150" hits="0"/>
+						<line number="152" hits="0"/>
+						<line number="153" hits="0"/>
+						<line number="155" hits="0"/>
+						<line number="156" hits="0"/>
 						<line number="158" hits="1"/>
-						<line number="160" hits="1"/>
-						<line number="161" hits="1"/>
-						<line number="162" hits="1"/>
+						<line number="160" hits="0"/>
+						<line number="161" hits="0"/>
+						<line number="162" hits="0"/>
 						<line number="164" hits="1"/>
-						<line number="165" hits="1"/>
-						<line number="166" hits="1"/>
-						<line number="167" hits="1"/>
-						<line number="168" hits="1"/>
-						<line number="169" hits="1"/>
-						<line number="170" hits="1"/>
-						<line number="172" hits="1"/>
-						<line number="175" hits="1"/>
-						<line number="176" hits="1"/>
-						<line number="179" hits="1"/>
-						<line number="180" hits="1"/>
-						<line number="182" hits="1"/>
-						<line number="183" hits="1"/>
+						<line number="165" hits="0"/>
+						<line number="166" hits="0"/>
+						<line number="167" hits="0"/>
+						<line number="168" hits="0"/>
+						<line number="169" hits="0"/>
+						<line number="170" hits="0"/>
+						<line number="172" hits="0"/>
+						<line number="175" hits="0"/>
+						<line number="176" hits="0"/>
+						<line number="179" hits="0"/>
+						<line number="180" hits="0"/>
+						<line number="182" hits="0"/>
+						<line number="183" hits="0"/>
 						<line number="185" hits="1"/>
-						<line number="186" hits="1"/>
-						<line number="187" hits="1"/>
-						<line number="189" hits="1"/>
+						<line number="186" hits="0"/>
+						<line number="187" hits="0"/>
+						<line number="189" hits="0"/>
 					</lines>
 				</class>
-				<class name="layer.py" filename="layer.py" complexity="0" line-rate="0.8949" branch-rate="0">
+				<class name="layer.py" filename="layer.py" complexity="0" line-rate="0.8576" branch-rate="0">
 					<methods/>
 					<lines>
 						<line number="1" hits="1"/>
@@ -459,8 +459,6 @@
 						<line number="41" hits="1"/>
 						<line number="42" hits="1"/>
 						<line number="43" hits="1"/>
-						<line number="44" hits="1"/>
-						<line number="45" hits="1"/>
 						<line number="46" hits="1"/>
 						<line number="47" hits="1"/>
 						<line number="49" hits="1"/>
@@ -509,7 +507,6 @@
 						<line number="149" hits="1"/>
 						<line number="151" hits="1"/>
 						<line number="152" hits="1"/>
-						<line number="153" hits="1"/>
 						<line number="154" hits="1"/>
 						<line number="155" hits="0"/>
 					</lines>
@@ -693,7 +690,7 @@
 						<line number="15" hits="1"/>
 					</lines>
 				</class>
-				<class name="bm25.py" filename="encoders/bm25.py" complexity="0" line-rate="0.9574" branch-rate="0">
+				<class name="bm25.py" filename="encoders/bm25.py" complexity="0" line-rate="0.3404" branch-rate="0">
 					<methods/>
 					<lines>
 						<line number="1" hits="1"/>
@@ -707,42 +704,42 @@
 						<line number="18" hits="1"/>
 						<line number="19" hits="1"/>
 						<line number="20" hits="1"/>
-						<line number="21" hits="0"/>
-						<line number="22" hits="0"/>
-						<line number="27" hits="1"/>
-						<line number="29" hits="1"/>
-						<line number="30" hits="1"/>
-						<line number="31" hits="1"/>
-						<line number="32" hits="1"/>
+						<line number="21" hits="1"/>
+						<line number="22" hits="1"/>
+						<line number="27" hits="0"/>
+						<line number="29" hits="0"/>
+						<line number="30" hits="0"/>
+						<line number="31" hits="0"/>
+						<line number="32" hits="0"/>
 						<line number="34" hits="1"/>
-						<line number="35" hits="1"/>
-						<line number="36" hits="1"/>
-						<line number="37" hits="1"/>
-						<line number="38" hits="1"/>
-						<line number="39" hits="1"/>
-						<line number="41" hits="1"/>
+						<line number="35" hits="0"/>
+						<line number="36" hits="0"/>
+						<line number="37" hits="0"/>
+						<line number="38" hits="0"/>
+						<line number="39" hits="0"/>
+						<line number="41" hits="0"/>
 						<line number="43" hits="1"/>
-						<line number="44" hits="1"/>
-						<line number="45" hits="1"/>
-						<line number="46" hits="1"/>
-						<line number="47" hits="1"/>
-						<line number="48" hits="1"/>
-						<line number="49" hits="1"/>
-						<line number="51" hits="1"/>
-						<line number="53" hits="1"/>
-						<line number="54" hits="1"/>
-						<line number="55" hits="1"/>
-						<line number="56" hits="1"/>
-						<line number="57" hits="1"/>
-						<line number="58" hits="1"/>
-						<line number="59" hits="1"/>
-						<line number="60" hits="1"/>
-						<line number="61" hits="1"/>
+						<line number="44" hits="0"/>
+						<line number="45" hits="0"/>
+						<line number="46" hits="0"/>
+						<line number="47" hits="0"/>
+						<line number="48" hits="0"/>
+						<line number="49" hits="0"/>
+						<line number="51" hits="0"/>
+						<line number="53" hits="0"/>
+						<line number="54" hits="0"/>
+						<line number="55" hits="0"/>
+						<line number="56" hits="0"/>
+						<line number="57" hits="0"/>
+						<line number="58" hits="0"/>
+						<line number="59" hits="0"/>
+						<line number="60" hits="0"/>
+						<line number="61" hits="0"/>
 						<line number="63" hits="1"/>
-						<line number="64" hits="1"/>
-						<line number="65" hits="1"/>
-						<line number="66" hits="1"/>
-						<line number="67" hits="1"/>
+						<line number="64" hits="0"/>
+						<line number="65" hits="0"/>
+						<line number="66" hits="0"/>
+						<line number="67" hits="0"/>
 					</lines>
 				</class>
 				<class name="clip.py" filename="encoders/clip.py" complexity="0" line-rate="0.939" branch-rate="0">
@@ -866,7 +863,7 @@
 						<line number="49" hits="1"/>
 					</lines>
 				</class>
-				<class name="fastembed.py" filename="encoders/fastembed.py" complexity="0" line-rate="0.8667" branch-rate="0">
+				<class name="fastembed.py" filename="encoders/fastembed.py" complexity="0" line-rate="0.6667" branch-rate="0">
 					<methods/>
 					<lines>
 						<line number="1" hits="1"/>
@@ -886,17 +883,17 @@
 						<line number="23" hits="1"/>
 						<line number="24" hits="1"/>
 						<line number="25" hits="1"/>
-						<line number="26" hits="0"/>
-						<line number="27" hits="0"/>
-						<line number="33" hits="1"/>
-						<line number="40" hits="1"/>
-						<line number="42" hits="1"/>
-						<line number="43" hits="1"/>
+						<line number="26" hits="1"/>
+						<line number="27" hits="1"/>
+						<line number="33" hits="0"/>
+						<line number="40" hits="0"/>
+						<line number="42" hits="0"/>
+						<line number="43" hits="0"/>
 						<line number="45" hits="1"/>
-						<line number="46" hits="1"/>
-						<line number="47" hits="1"/>
-						<line number="48" hits="1"/>
-						<line number="49" hits="1"/>
+						<line number="46" hits="0"/>
+						<line number="47" hits="0"/>
+						<line number="48" hits="0"/>
+						<line number="49" hits="0"/>
 						<line number="50" hits="0"/>
 						<line number="51" hits="0"/>
 					</lines>
@@ -1144,8 +1141,6 @@
 						<line number="9" hits="1"/>
 						<line number="10" hits="1"/>
 						<line number="11" hits="1"/>
-						<line number="12" hits="1"/>
-						<line number="13" hits="1"/>
 						<line number="14" hits="1"/>
 						<line number="15" hits="1"/>
 						<line number="16" hits="1"/>
diff --git a/docs/examples/rolling-window-splitter.ipynb b/docs/examples/rolling-window-splitter.ipynb
index b94d3a5615ec9b2edda91da90a481be235829bef..6cba2cd33c7b8e73aabcc6349a603280e7c2123c 100644
--- a/docs/examples/rolling-window-splitter.ipynb
+++ b/docs/examples/rolling-window-splitter.ipynb
@@ -39,7 +39,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "/Users/jamesbriggs/opt/anaconda3/envs/decision-layer/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "/Users/jakit/customers/aurelio/semantic-router/.venv/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
       "  from .autonotebook import tqdm as notebook_tqdm\n"
      ]
     }
@@ -49,6 +49,9 @@
     "from getpass import getpass\n",
     "from semantic_router.splitters import RollingWindowSplitter\n",
     "from semantic_router.encoders import OpenAIEncoder\n",
+    "from semantic_router.utils.logger import logger\n",
+    "\n",
+    "logger.setLevel(\"INFO\")\n",
     "\n",
     "os.environ[\"OPENAI_API_KEY\"] = os.getenv(\"OPENAI_API_KEY\") or getpass(\n",
     "    \"Enter your OpenAI API key: \"\n",
@@ -56,9 +59,9 @@
     "\n",
     "splitter = RollingWindowSplitter(\n",
     "    encoder=OpenAIEncoder(),\n",
-    "    min_split_tokens=50,\n",
-    "    max_split_tokens=300,\n",
-    "    window_size=5,  # sentences\n",
+    "    min_split_tokens=30,\n",
+    "    max_split_tokens=200,\n",
+    "    window_size=1,  # sentences\n",
     "    plot_splits=True,\n",
     ")"
    ]
@@ -72,33 +75,34 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "\u001b[33m2024-02-23 08:44:26 WARNING semantic_router.utils.logger Single document exceeds the maximum token limit of 300. Splitting to sentences before semantically splitting.\u001b[0m\n",
-      "\u001b[32m2024-02-23 08:44:26 INFO semantic_router.utils.logger Iteration 0: Trying threshold: 0.8881277932028191\u001b[0m\n",
-      "\u001b[32m2024-02-23 08:44:26 INFO semantic_router.utils.logger Iteration 0: Median tokens per split: 24.0\u001b[0m\n",
-      "\u001b[32m2024-02-23 08:44:26 INFO semantic_router.utils.logger Iteration 0: Adjusting high to 0.8781277932028191\u001b[0m\n",
-      "\u001b[32m2024-02-23 08:44:26 INFO semantic_router.utils.logger Iteration 1: Trying threshold: 0.8687934834140205\u001b[0m\n",
-      "\u001b[32m2024-02-23 08:44:26 INFO semantic_router.utils.logger Iteration 1: Median tokens per split: 34.5\u001b[0m\n",
-      "\u001b[32m2024-02-23 08:44:26 INFO semantic_router.utils.logger Iteration 1: Adjusting high to 0.8587934834140205\u001b[0m\n",
-      "\u001b[32m2024-02-23 08:44:26 INFO semantic_router.utils.logger Final optimal threshold: 0.8687934834140205\u001b[0m\n",
-      "\u001b[32m2024-02-23 08:44:27 INFO semantic_router.utils.logger Split finalized with 218 tokens due to threshold 0.8687934834140205.\u001b[0m\n",
-      "\u001b[32m2024-02-23 08:44:27 INFO semantic_router.utils.logger Split finalized with 262 tokens due to exceeding token limit of 300.\u001b[0m\n",
-      "\u001b[32m2024-02-23 08:44:27 INFO semantic_router.utils.logger Split finalized with 137 tokens due to threshold 0.8687934834140205.\u001b[0m\n",
-      "\u001b[32m2024-02-23 08:44:27 INFO semantic_router.utils.logger Split finalized with 249 tokens due to threshold 0.8687934834140205.\u001b[0m\n",
-      "\u001b[32m2024-02-23 08:44:27 INFO semantic_router.utils.logger Split finalized with 117 tokens due to threshold 0.8687934834140205.\u001b[0m\n",
-      "\u001b[32m2024-02-23 08:44:27 INFO semantic_router.utils.logger Split finalized with 171 tokens due to threshold 0.8687934834140205.\u001b[0m\n",
-      "\u001b[32m2024-02-23 08:44:27 INFO semantic_router.utils.logger Split finalized with 72 tokens due to threshold 0.8687934834140205.\u001b[0m\n",
-      "\u001b[32m2024-02-23 08:44:27 INFO semantic_router.utils.logger Final split added with 23 tokens due to remaining documents.\u001b[0m\n"
+      "\u001b[33m2024-02-26 17:01:00 WARNING semantic_router.utils.logger Single document exceeds the maximum token limit of 200. Splitting to sentences before semantically splitting.\u001b[0m\n",
+      "\u001b[32m2024-02-26 17:01:01 INFO semantic_router.utils.logger Optimal threshold 0.8294063747414075 found with median tokens (49.5) in target range (30-200).\u001b[0m\n"
      ]
     },
     {
      "data": {
-      "image/png": "",
+      "image/png": "",
       "text/plain": [
-       "<Figure size 1200x600 with 1 Axes>"
+       "<Figure size 1200x1200 with 2 Axes>"
       ]
      },
      "metadata": {},
      "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[32m2024-02-26 17:01:02 INFO semantic_router.utils.logger Splitting Statistics:\n",
+      "  - Total Documents: 40\n",
+      "  - Total Splits: 13\n",
+      "  - Splits by Threshold: 12\n",
+      "  - Splits by Max Chunk Size: 0\n",
+      "  - Last Split: 1\n",
+      "  - Minimum Token Size of Split: 23\n",
+      "  - Maximum Token Size of Split: 223\n",
+      "  - Similarity Split Ratio: 0.92\u001b[0m\n"
+     ]
     }
    ],
    "source": [
@@ -114,43 +118,68 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Split 1, tokens 218, triggered by: 0.85\n",
-      "\u001b[31mIn a recent surge of social media discussions on Weibo, Chinese netizens have been engaging in conversations about the struggles and challenges of earning money. The online debate sparked a wave of opinions and perspectives on the relationship between hard work, high pay, and finding contentment. Among the tweets, several users pontificated that one should avoid earning \"too much hard-earned money.\" The tweets and discussions revolve around the idea that working too hard for one's income can have a detrimental effect on one's life, both physically and mentally. Some users advocate for finding opportunities that align with one's strengths and passions, rather than simply focusing on high-paying jobs that may require excessive hours and intense labor. One Weibo user pontificates, \"Don't earn that much hard-earned money,\" a sentiment echoed by others with tweets such as, \"Why is it that when earning money, that process always has to be so tough?\" This question is followed by a comparison between two types of people - those who are used to earning money the hard way and those who seem to effortlessly obtain wealth.\u001b[0m\n",
+      "Split 1, tokens 140, triggered by: 0.83\n",
+      "\u001b[31mIn a recent surge of social media discussions on Weibo, Chinese netizens have been engaging in conversations about the struggles and challenges of earning money. The online debate sparked a wave of opinions and perspectives on the relationship between hard work, high pay, and finding contentment. Among the tweets, several users pontificated that one should avoid earning \"too much hard-earned money.\" The tweets and discussions revolve around the idea that working too hard for one's income can have a detrimental effect on one's life, both physically and mentally. Some users advocate for finding opportunities that align with one's strengths and passions, rather than simply focusing on high-paying jobs that may require excessive hours and intense labor.\u001b[0m\n",
       "----------------------------------------------------------------------------------------\n",
       "\n",
       "\n",
-      "Split 2, tokens 262, triggered by: token limit\n",
-      "\u001b[32mWhile the former group is depicted as having been taught to suffer from a young age, the latter is shown as being able to focus solely on their natural talents and thriving in their niche advantageously. Discussions on the platform draw attention to a variety of issues that those who earn money the hard way might face. For example, they are described as likely having to work overtime, forgo time off for illness or rest, and maintain an unyielding dedication to their occupation, which often results in a never-ending cycle of work without any perceived progression in their lives. Another tweet that captures this sentiment reads, \"Drowning in more work and poverty despite trying harder and harder,\" pointing to a sense of despair and dissatisfaction that comes with work that is both disproportionately demanding and inadequately rewarding. Critics also note how the pursuit of hard-earned money could potentially create physical and mental health risks due to the unrelenting pressure and stress that these jobs might impose. Conversely, those in favor of earning money with less difficulty contend that it's crucial to harness one's strengths and passions to create opportunities that yield financial success without the need for excessive labor. The debate revolves around the concept that people should seek out ways to work smarter, not harder, especially if it means a healthier and more fulfilling lifestyle.\u001b[0m\n",
+      "Split 2, tokens 78, triggered by: 0.81\n",
+      "\u001b[32mOne Weibo user pontificates, \"Don't earn that much hard-earned money,\" a sentiment echoed by others with tweets such as, \"Why is it that when earning money, that process always has to be so tough?\" This question is followed by a comparison between two types of people - those who are used to earning money the hard way and those who seem to effortlessly obtain wealth.\u001b[0m\n",
       "----------------------------------------------------------------------------------------\n",
       "\n",
       "\n",
-      "Split 3, tokens 137, triggered by: 0.85\n",
-      "\u001b[34mIn fact, the notion of a \"vicious cycle,\" often attributed to those chasing hard-earned money, is juxtaposed with an idealized image of someone operating in their zone of excellence. Confidently focused on their strengths, such individuals are depicted as enjoying a more relaxed and less stressful work environment, one in which they can thrive without the need for never-ending overtime or self-sacrifice. Some tweets even extend this sentiment to the broader socio-economic context, observing how wealth is not merely derived from manual labor or high-paying positions requiring extraordinary work hours. The tweets emphasize the importance of cultivating an entrepreneurial spirit and a penchant for innovative thinking, especially in the modern digital age.\u001b[0m\n",
+      "Split 3, tokens 39, triggered by: 0.79\n",
+      "\u001b[34mWhile the former group is depicted as having been taught to suffer from a young age, the latter is shown as being able to focus solely on their natural talents and thriving in their niche advantageously.\u001b[0m\n",
       "----------------------------------------------------------------------------------------\n",
       "\n",
       "\n",
-      "Split 4, tokens 249, triggered by: 0.86\n",
-      "\u001b[35mOne user writes, \"Too hard-earned money isn't worth it. Learn how to make money using your brain, not your body,\" while another suggests, \"Love will flow towards those who are not lacking in love, and money will flow towards those who are not lacking in money!\" While some of the discussions take a somewhat passive-aggressive view, others acknowledge that financial security and comfort might not always be possible for everyone. In a more realistic tone, a user remarks, \"If life were so easy that diligence led to wealth, then the world's richest person would be the best worker bee. But that's not the case.\" This acknowledgment underscores the complexities of the economy and the role that factors like luck, connections, and a rapidly evolving job market can play in financial success. Some users are quick to criticize the notion that earning money the hard way should be avoided, with one tweet expressing, \"The person who advises you to avoid hard-earned money is likely a scammer who profits off providing emotional value in exchange for exploitation.\" Others argue that while it's essential to find enjoyment and fulfillment in one's work, it's crucial not to shun or belittle those who choose to work in physically demanding or high-paying industries.\u001b[0m\n",
+      "Split 4, tokens 223, triggered by: 0.83\n",
+      "\u001b[35mDiscussions on the platform draw attention to a variety of issues that those who earn money the hard way might face. For example, they are described as likely having to work overtime, forgo time off for illness or rest, and maintain an unyielding dedication to their occupation, which often results in a never-ending cycle of work without any perceived progression in their lives. Another tweet that captures this sentiment reads, \"Drowning in more work and poverty despite trying harder and harder,\" pointing to a sense of despair and dissatisfaction that comes with work that is both disproportionately demanding and inadequately rewarding. Critics also note how the pursuit of hard-earned money could potentially create physical and mental health risks due to the unrelenting pressure and stress that these jobs might impose. Conversely, those in favor of earning money with less difficulty contend that it's crucial to harness one's strengths and passions to create opportunities that yield financial success without the need for excessive labor. The debate revolves around the concept that people should seek out ways to work smarter, not harder, especially if it means a healthier and more fulfilling lifestyle.\u001b[0m\n",
       "----------------------------------------------------------------------------------------\n",
       "\n",
       "\n",
-      "Split 5, tokens 117, triggered by: 0.87\n",
-      "\u001b[31mOverall, the Weibo discussions offer a fascinating insight into the complexities of the modern Chinese labor market and the work-life balance that people strive to achieve. As in many countries, striking the right balance between work and play is an ongoing challenge for many Chinese citizens. However, the conversations on Weibo signal an increasing awareness of the importance of finding meaningful, fulfilling, and financially rewarding work that doesn't necessitate excessive sacrifice or sufferance. In the end, as one user succinctly puts it, \"Make sure you're earning your money in a way that brings you joy and satisfaction.\u001b[0m\n",
+      "Split 5, tokens 81, triggered by: 0.82\n",
+      "\u001b[31mIn fact, the notion of a \"vicious cycle,\" often attributed to those chasing hard-earned money, is juxtaposed with an idealized image of someone operating in their zone of excellence. Confidently focused on their strengths, such individuals are depicted as enjoying a more relaxed and less stressful work environment, one in which they can thrive without the need for never-ending overtime or self-sacrifice.\u001b[0m\n",
       "----------------------------------------------------------------------------------------\n",
       "\n",
       "\n",
-      "Split 6, tokens 171, triggered by: 0.82\n",
-      "\u001b[32mThat's the only way to ensure that your life doesn't become a never-ending cycle of hard work without any tangible progress.\" In this context, social media discussions focusing on the trials and tribulations of earning money serve not only as an outlet for venting frustrations but also as a means of promoting dialogue and shared understanding about the challenges faced by workers across all industries. These virtual conversations sparked by tweets and in-depth discussions likely resonate with a wide swath of Chinese citizens struggling to navigate the complexities of balancing a career that pays well with one that brings them joy, fulfillment, and a sense of purpose. As the discussions on Weibo continue to evolve and unfold, it is evident that the discourse around work, money, and life satisfaction holds the potential to inspire meaningful change and shift societal attitudes towards a more holistic, balanced, and humane understanding of success and prosperity.\u001b[0m\n",
+      "Split 6, tokens 56, triggered by: 0.77\n",
+      "\u001b[32mSome tweets even extend this sentiment to the broader socio-economic context, observing how wealth is not merely derived from manual labor or high-paying positions requiring extraordinary work hours. The tweets emphasize the importance of cultivating an entrepreneurial spirit and a penchant for innovative thinking, especially in the modern digital age.\u001b[0m\n",
       "----------------------------------------------------------------------------------------\n",
       "\n",
       "\n",
-      "Split 7, tokens 72, triggered by: 0.80\n",
-      "\u001b[34m--- Note: The translated tweets and user quotes from Chinese to English were used as the foundation for the long-form news article. The author tried to maintain the integrity of the original content in the translation while adapting it to fit a journalistic format. No inaccuracies were introduced during translation, and the opinion-based nature of the original content was preserved while maintaining objectivity.\u001b[0m\n",
+      "Split 7, tokens 58, triggered by: 0.81\n",
+      "\u001b[34mOne user writes, \"Too hard-earned money isn't worth it. Learn how to make money using your brain, not your body,\" while another suggests, \"Love will flow towards those who are not lacking in love, and money will flow towards those who are not lacking in money!\"\u001b[0m\n",
       "----------------------------------------------------------------------------------------\n",
       "\n",
       "\n",
-      "Split 8, tokens 23, triggered by: final split\n",
-      "\u001b[35mHeart count: 0/2 Note: The author did not include any Chinese characters in the final response. Collapse\u001b[0m\n",
+      "Split 8, tokens 100, triggered by: 0.83\n",
+      "\u001b[35mWhile some of the discussions take a somewhat passive-aggressive view, others acknowledge that financial security and comfort might not always be possible for everyone. In a more realistic tone, a user remarks, \"If life were so easy that diligence led to wealth, then the world's richest person would be the best worker bee. But that's not the case.\" This acknowledgment underscores the complexities of the economy and the role that factors like luck, connections, and a rapidly evolving job market can play in financial success.\u001b[0m\n",
+      "----------------------------------------------------------------------------------------\n",
+      "\n",
+      "\n",
+      "Split 9, tokens 91, triggered by: 0.80\n",
+      "\u001b[31mSome users are quick to criticize the notion that earning money the hard way should be avoided, with one tweet expressing, \"The person who advises you to avoid hard-earned money is likely a scammer who profits off providing emotional value in exchange for exploitation.\" Others argue that while it's essential to find enjoyment and fulfillment in one's work, it's crucial not to shun or belittle those who choose to work in physically demanding or high-paying industries.\u001b[0m\n",
+      "----------------------------------------------------------------------------------------\n",
+      "\n",
+      "\n",
+      "Split 10, tokens 142, triggered by: 0.78\n",
+      "\u001b[32mOverall, the Weibo discussions offer a fascinating insight into the complexities of the modern Chinese labor market and the work-life balance that people strive to achieve. As in many countries, striking the right balance between work and play is an ongoing challenge for many Chinese citizens. However, the conversations on Weibo signal an increasing awareness of the importance of finding meaningful, fulfilling, and financially rewarding work that doesn't necessitate excessive sacrifice or sufferance. In the end, as one user succinctly puts it, \"Make sure you're earning your money in a way that brings you joy and satisfaction. That's the only way to ensure that your life doesn't become a never-ending cycle of hard work without any tangible progress.\"\u001b[0m\n",
+      "----------------------------------------------------------------------------------------\n",
+      "\n",
+      "\n",
+      "Split 11, tokens 146, triggered by: 0.77\n",
+      "\u001b[34mIn this context, social media discussions focusing on the trials and tribulations of earning money serve not only as an outlet for venting frustrations but also as a means of promoting dialogue and shared understanding about the challenges faced by workers across all industries. These virtual conversations sparked by tweets and in-depth discussions likely resonate with a wide swath of Chinese citizens struggling to navigate the complexities of balancing a career that pays well with one that brings them joy, fulfillment, and a sense of purpose. As the discussions on Weibo continue to evolve and unfold, it is evident that the discourse around work, money, and life satisfaction holds the potential to inspire meaningful change and shift societal attitudes towards a more holistic, balanced, and humane understanding of success and prosperity.\u001b[0m\n",
+      "----------------------------------------------------------------------------------------\n",
+      "\n",
+      "\n",
+      "Split 12, tokens 72, triggered by: 0.72\n",
+      "\u001b[35m--- Note: The translated tweets and user quotes from Chinese to English were used as the foundation for the long-form news article. The author tried to maintain the integrity of the original content in the translation while adapting it to fit a journalistic format. No inaccuracies were introduced during translation, and the opinion-based nature of the original content was preserved while maintaining objectivity.\u001b[0m\n",
+      "----------------------------------------------------------------------------------------\n",
+      "\n",
+      "\n",
+      "Split 13, tokens 23, triggered by: final split\n",
+      "\u001b[31mHeart count: 0/2 Note: The author did not include any Chinese characters in the final response. Collapse\u001b[0m\n",
       "----------------------------------------------------------------------------------------\n",
       "\n",
       "\n"
@@ -160,13 +189,6 @@
    "source": [
     "splitter.print(splits)"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
@@ -185,7 +207,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.5"
+   "version": "3.9.18"
   }
  },
  "nbformat": 4,
diff --git a/docs/examples/unstructured-element-splitter.ipynb b/docs/examples/unstructured-element-splitter.ipynb
index c13a6a59b1b90c4a9d1cd36016cf007ad5ab10af..5a2bb5778aafb17dc0382d35150145cf4a1187cb 100644
--- a/docs/examples/unstructured-element-splitter.ipynb
+++ b/docs/examples/unstructured-element-splitter.ipynb
@@ -16,7 +16,7 @@
     "# It may take longer to install the package\n",
     "!pip install -qU \\\n",
     "    \"unstructured[pdf]==0.12.4\" \\\n",
-    "    \"semantic-router==0.0.24\""
+    "    \"semantic-router==0.0.26\""
    ]
   },
   {
@@ -28,13 +28,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
+      "/Users/jakit/customers/aurelio/semantic-router/.venv/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n",
+      "Conflict between variables skip_infer_table_types: ['pdf', 'jpg', 'png', 'xls', 'xlsx', 'heic'] and pdf_infer_table_structure: True, please reset skip_infer_table_types to turn on table extraction for PDFs.\n",
+      "This function will be deprecated in a future release and `unstructured` will simply use the DEFAULT_MODEL from `unstructured_inference.model.base` to set default model name\n",
       "Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']\n",
       "- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
       "- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
@@ -70,7 +74,6 @@
    "source": [
     "import re\n",
     "\n",
-    "\n",
     "def is_valid_title(title: str) -> bool:\n",
     "    # Rule 1: Title starts with a lowercase letter\n",
     "    if re.match(r\"^[a-z]\", title):\n",
@@ -133,10 +136,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
+    "\n",
     "from semantic_router.splitters import RollingWindowSplitter\n",
     "\n",
     "\n",
@@ -158,6 +162,7 @@
     "                # Process accumulated text before the table\n",
     "                if combined_element_texts:\n",
     "                    splits = splitter(combined_element_texts)\n",
+    "                    print(\"-\" * 80)\n",
     "                    chunks.extend([split.content for split in splits])\n",
     "                    combined_element_texts = []  # Reset combined texts after processing\n",
     "\n",
@@ -172,6 +177,7 @@
     "\n",
     "        if combined_element_texts:\n",
     "            splits = splitter(combined_element_texts)\n",
+    "            print(\"-\" * 80)\n",
     "            chunks.extend([split.content for split in splits])\n",
     "\n",
     "        if chunks:\n",
@@ -189,7 +195,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -219,7 +225,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 15,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -231,14 +237,15 @@
     "splitter = RollingWindowSplitter(\n",
     "    encoder=encoder,\n",
     "    window_size=1,  # Compares each element with the previous one\n",
-    "    min_split_tokens=50,\n",
-    "    max_split_tokens=300,\n",
+    "    min_split_tokens=1,\n",
+    "    max_split_tokens=500,\n",
+    "    plot_splits=False,\n",
     ")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 16,
    "metadata": {},
    "outputs": [
     {
@@ -279,22 +286,424 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 17,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/jakit/customers/aurelio/semantic-router/.venv/lib/python3.9/site-packages/numpy/core/fromnumeric.py:3504: RuntimeWarning: Mean of empty slice.\n",
+      "  return _methods._mean(a, axis=axis, dtype=dtype,\n",
+      "/Users/jakit/customers/aurelio/semantic-router/.venv/lib/python3.9/site-packages/numpy/core/_methods.py:129: RuntimeWarning: invalid value encountered in scalar divide\n",
+      "  ret = ret.dtype.type(ret / rcount)\n",
+      "/Users/jakit/customers/aurelio/semantic-router/.venv/lib/python3.9/site-packages/numpy/core/_methods.py:206: RuntimeWarning: Degrees of freedom <= 0 for slice\n",
+      "  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,\n",
+      "/Users/jakit/customers/aurelio/semantic-router/.venv/lib/python3.9/site-packages/numpy/core/_methods.py:163: RuntimeWarning: invalid value encountered in divide\n",
+      "  arrmean = um.true_divide(arrmean, div, out=arrmean,\n",
+      "/Users/jakit/customers/aurelio/semantic-router/.venv/lib/python3.9/site-packages/numpy/core/_methods.py:198: RuntimeWarning: invalid value encountered in scalar divide\n",
+      "  ret = ret.dtype.type(ret / rcount)\n",
+      "\u001b[32m2024-02-26 17:07:32 INFO semantic_router.utils.logger Optimal threshold 0.5 found with median tokens (27.0) in target range (1-500).\u001b[0m\n",
+      "\u001b[32m2024-02-26 17:07:32 INFO semantic_router.utils.logger Splitting Statistics:\n",
+      "  - Total Documents: 1\n",
+      "  - Total Splits: 1\n",
+      "  - Splits by Threshold: 0\n",
+      "  - Splits by Max Chunk Size: 0\n",
+      "  - Last Split: 1\n",
+      "  - Minimum Token Size of Split: 27\n",
+      "  - Maximum Token Size of Split: 27\n",
+      "  - Similarity Split Ratio: 0.00\u001b[0m\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--------------------------------------------------------------------------------\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[32m2024-02-26 17:07:33 INFO semantic_router.utils.logger Optimal threshold 0.7912974224053915 found with median tokens (136.5) in target range (1-500).\u001b[0m\n",
+      "\u001b[32m2024-02-26 17:07:33 INFO semantic_router.utils.logger Splitting Statistics:\n",
+      "  - Total Documents: 3\n",
+      "  - Total Splits: 2\n",
+      "  - Splits by Threshold: 1\n",
+      "  - Splits by Max Chunk Size: 0\n",
+      "  - Last Split: 1\n",
+      "  - Minimum Token Size of Split: 19\n",
+      "  - Maximum Token Size of Split: 254\n",
+      "  - Similarity Split Ratio: 0.50\u001b[0m\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--------------------------------------------------------------------------------\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[32m2024-02-26 17:07:33 INFO semantic_router.utils.logger Optimal threshold 0.8514465425347408 found with median tokens (129.5) in target range (1-500).\u001b[0m\n",
+      "\u001b[32m2024-02-26 17:07:33 INFO semantic_router.utils.logger Splitting Statistics:\n",
+      "  - Total Documents: 7\n",
+      "  - Total Splits: 4\n",
+      "  - Splits by Threshold: 3\n",
+      "  - Splits by Max Chunk Size: 0\n",
+      "  - Last Split: 1\n",
+      "  - Minimum Token Size of Split: 64\n",
+      "  - Maximum Token Size of Split: 400\n",
+      "  - Similarity Split Ratio: 0.75\u001b[0m\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--------------------------------------------------------------------------------\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[32m2024-02-26 17:07:34 INFO semantic_router.utils.logger Optimal threshold 0.8371609601655312 found with median tokens (154.0) in target range (1-500).\u001b[0m\n",
+      "\u001b[32m2024-02-26 17:07:34 INFO semantic_router.utils.logger Splitting Statistics:\n",
+      "  - Total Documents: 7\n",
+      "  - Total Splits: 4\n",
+      "  - Splits by Threshold: 3\n",
+      "  - Splits by Max Chunk Size: 0\n",
+      "  - Last Split: 1\n",
+      "  - Minimum Token Size of Split: 37\n",
+      "  - Maximum Token Size of Split: 362\n",
+      "  - Similarity Split Ratio: 0.75\u001b[0m\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--------------------------------------------------------------------------------\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[32m2024-02-26 17:07:34 INFO semantic_router.utils.logger Optimal threshold 0.8004127909380481 found with median tokens (46.0) in target range (1-500).\u001b[0m\n",
+      "\u001b[32m2024-02-26 17:07:34 INFO semantic_router.utils.logger Splitting Statistics:\n",
+      "  - Total Documents: 5\n",
+      "  - Total Splits: 3\n",
+      "  - Splits by Threshold: 2\n",
+      "  - Splits by Max Chunk Size: 0\n",
+      "  - Last Split: 1\n",
+      "  - Minimum Token Size of Split: 15\n",
+      "  - Maximum Token Size of Split: 161\n",
+      "  - Similarity Split Ratio: 0.67\u001b[0m\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--------------------------------------------------------------------------------\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[32m2024-02-26 17:07:35 INFO semantic_router.utils.logger Optimal threshold 0.7219220831968602 found with median tokens (94.0) in target range (1-500).\u001b[0m\n",
+      "\u001b[32m2024-02-26 17:07:35 INFO semantic_router.utils.logger Splitting Statistics:\n",
+      "  - Total Documents: 6\n",
+      "  - Total Splits: 3\n",
+      "  - Splits by Threshold: 2\n",
+      "  - Splits by Max Chunk Size: 0\n",
+      "  - Last Split: 1\n",
+      "  - Minimum Token Size of Split: 8\n",
+      "  - Maximum Token Size of Split: 100\n",
+      "  - Similarity Split Ratio: 0.67\u001b[0m\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--------------------------------------------------------------------------------\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[32m2024-02-26 17:07:35 INFO semantic_router.utils.logger Optimal threshold 0.7865543500746407 found with median tokens (92.5) in target range (1-500).\u001b[0m\n",
+      "\u001b[32m2024-02-26 17:07:35 INFO semantic_router.utils.logger Splitting Statistics:\n",
+      "  - Total Documents: 4\n",
+      "  - Total Splits: 2\n",
+      "  - Splits by Threshold: 1\n",
+      "  - Splits by Max Chunk Size: 0\n",
+      "  - Last Split: 1\n",
+      "  - Minimum Token Size of Split: 12\n",
+      "  - Maximum Token Size of Split: 173\n",
+      "  - Similarity Split Ratio: 0.50\u001b[0m\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--------------------------------------------------------------------------------\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[32m2024-02-26 17:07:36 INFO semantic_router.utils.logger Optimal threshold 0.7759885849518695 found with median tokens (73.0) in target range (1-500).\u001b[0m\n",
+      "\u001b[32m2024-02-26 17:07:36 INFO semantic_router.utils.logger Splitting Statistics:\n",
+      "  - Total Documents: 9\n",
+      "  - Total Splits: 5\n",
+      "  - Splits by Threshold: 4\n",
+      "  - Splits by Max Chunk Size: 0\n",
+      "  - Last Split: 1\n",
+      "  - Minimum Token Size of Split: 15\n",
+      "  - Maximum Token Size of Split: 210\n",
+      "  - Similarity Split Ratio: 0.80\u001b[0m\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--------------------------------------------------------------------------------\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[32m2024-02-26 17:07:36 INFO semantic_router.utils.logger Optimal threshold 0.7356350410401438 found with median tokens (23.0) in target range (1-500).\u001b[0m\n",
+      "\u001b[32m2024-02-26 17:07:36 INFO semantic_router.utils.logger Splitting Statistics:\n",
+      "  - Total Documents: 5\n",
+      "  - Total Splits: 3\n",
+      "  - Splits by Threshold: 2\n",
+      "  - Splits by Max Chunk Size: 0\n",
+      "  - Last Split: 1\n",
+      "  - Minimum Token Size of Split: 8\n",
+      "  - Maximum Token Size of Split: 198\n",
+      "  - Similarity Split Ratio: 0.67\u001b[0m\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--------------------------------------------------------------------------------\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[32m2024-02-26 17:07:37 INFO semantic_router.utils.logger Optimal threshold 0.7993056373716161 found with median tokens (14.0) in target range (1-500).\u001b[0m\n",
+      "\u001b[32m2024-02-26 17:07:37 INFO semantic_router.utils.logger Splitting Statistics:\n",
+      "  - Total Documents: 5\n",
+      "  - Total Splits: 3\n",
+      "  - Splits by Threshold: 2\n",
+      "  - Splits by Max Chunk Size: 0\n",
+      "  - Last Split: 1\n",
+      "  - Minimum Token Size of Split: 10\n",
+      "  - Maximum Token Size of Split: 95\n",
+      "  - Similarity Split Ratio: 0.67\u001b[0m\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--------------------------------------------------------------------------------\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[32m2024-02-26 17:07:37 INFO semantic_router.utils.logger Optimal threshold 0.7946781280578719 found with median tokens (104.5) in target range (1-500).\u001b[0m\n",
+      "\u001b[32m2024-02-26 17:07:37 INFO semantic_router.utils.logger Splitting Statistics:\n",
+      "  - Total Documents: 4\n",
+      "  - Total Splits: 2\n",
+      "  - Splits by Threshold: 1\n",
+      "  - Splits by Max Chunk Size: 0\n",
+      "  - Last Split: 1\n",
+      "  - Minimum Token Size of Split: 87\n",
+      "  - Maximum Token Size of Split: 122\n",
+      "  - Similarity Split Ratio: 0.50\u001b[0m\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--------------------------------------------------------------------------------\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[32m2024-02-26 17:07:38 INFO semantic_router.utils.logger Optimal threshold 0.7079124801171096 found with median tokens (15.0) in target range (1-500).\u001b[0m\n",
+      "\u001b[32m2024-02-26 17:07:38 INFO semantic_router.utils.logger Splitting Statistics:\n",
+      "  - Total Documents: 2\n",
+      "  - Total Splits: 1\n",
+      "  - Splits by Threshold: 0\n",
+      "  - Splits by Max Chunk Size: 0\n",
+      "  - Last Split: 1\n",
+      "  - Minimum Token Size of Split: 15\n",
+      "  - Maximum Token Size of Split: 15\n",
+      "  - Similarity Split Ratio: 0.00\u001b[0m\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--------------------------------------------------------------------------------\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[32m2024-02-26 17:07:38 INFO semantic_router.utils.logger Optimal threshold 0.8324466121743902 found with median tokens (110.5) in target range (1-500).\u001b[0m\n",
+      "\u001b[32m2024-02-26 17:07:38 INFO semantic_router.utils.logger Splitting Statistics:\n",
+      "  - Total Documents: 12\n",
+      "  - Total Splits: 6\n",
+      "  - Splits by Threshold: 5\n",
+      "  - Splits by Max Chunk Size: 0\n",
+      "  - Last Split: 1\n",
+      "  - Minimum Token Size of Split: 57\n",
+      "  - Maximum Token Size of Split: 254\n",
+      "  - Similarity Split Ratio: 0.83\u001b[0m\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--------------------------------------------------------------------------------\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[32m2024-02-26 17:07:39 INFO semantic_router.utils.logger Optimal threshold 0.8128022034342155 found with median tokens (16.5) in target range (1-500).\u001b[0m\n",
+      "\u001b[32m2024-02-26 17:07:39 INFO semantic_router.utils.logger Splitting Statistics:\n",
+      "  - Total Documents: 3\n",
+      "  - Total Splits: 2\n",
+      "  - Splits by Threshold: 1\n",
+      "  - Splits by Max Chunk Size: 0\n",
+      "  - Last Split: 1\n",
+      "  - Minimum Token Size of Split: 4\n",
+      "  - Maximum Token Size of Split: 29\n",
+      "  - Similarity Split Ratio: 0.50\u001b[0m\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--------------------------------------------------------------------------------\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[32m2024-02-26 17:07:39 INFO semantic_router.utils.logger Optimal threshold 0.786452236757286 found with median tokens (173.5) in target range (1-500).\u001b[0m\n",
+      "\u001b[32m2024-02-26 17:07:39 INFO semantic_router.utils.logger Splitting Statistics:\n",
+      "  - Total Documents: 8\n",
+      "  - Total Splits: 4\n",
+      "  - Splits by Threshold: 3\n",
+      "  - Splits by Max Chunk Size: 0\n",
+      "  - Last Split: 1\n",
+      "  - Minimum Token Size of Split: 8\n",
+      "  - Maximum Token Size of Split: 241\n",
+      "  - Similarity Split Ratio: 0.75\u001b[0m\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--------------------------------------------------------------------------------\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[32m2024-02-26 17:07:40 INFO semantic_router.utils.logger Optimal threshold 0.8250029487527775 found with median tokens (41.0) in target range (1-500).\u001b[0m\n",
+      "\u001b[32m2024-02-26 17:07:40 INFO semantic_router.utils.logger Splitting Statistics:\n",
+      "  - Total Documents: 2\n",
+      "  - Total Splits: 1\n",
+      "  - Splits by Threshold: 0\n",
+      "  - Splits by Max Chunk Size: 0\n",
+      "  - Last Split: 1\n",
+      "  - Minimum Token Size of Split: 41\n",
+      "  - Maximum Token Size of Split: 41\n",
+      "  - Similarity Split Ratio: 0.00\u001b[0m\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--------------------------------------------------------------------------------\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[32m2024-02-26 17:07:41 INFO semantic_router.utils.logger Optimal threshold 0.8086076732442027 found with median tokens (108.0) in target range (1-500).\u001b[0m\n",
+      "\u001b[32m2024-02-26 17:07:41 INFO semantic_router.utils.logger Splitting Statistics:\n",
+      "  - Total Documents: 45\n",
+      "  - Total Splits: 23\n",
+      "  - Splits by Threshold: 22\n",
+      "  - Splits by Max Chunk Size: 0\n",
+      "  - Last Split: 1\n",
+      "  - Minimum Token Size of Split: 4\n",
+      "  - Maximum Token Size of Split: 513\n",
+      "  - Similarity Split Ratio: 0.96\u001b[0m\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--------------------------------------------------------------------------------\n"
+     ]
+    }
+   ],
    "source": [
     "chunks_by_title = create_title_chunks(grouped_elements, splitter)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 18,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/html": [
-       "<h3 style='color: black;'>Untitled</h3><p style='color: red;'>2 v 1 3 1 5 0 . 2 0 4 2 : v i X r a</p><h3 style='color: black;'>Financial Report Chunking for Effective Retrieval Augmented Generation</h3><p style='color: green;'>Unstructured Technologies Sacramento, CA, USA leah@unstructured.io https://unstructured.io Abstract. Chunking information is a key step in Retrieval Augmented Generation (RAG). Current research primarily centers on paragraph- level chunking. This approach treats all texts as equal and neglects the information contained in the structure of documents. We propose an expanded approach to chunk documents by moving beyond mere paragraph-level chunking to chunk primary by structural element com- ponents of documents. Dissecting documents into these constituent ele- ments creates a new way to chunk documents that yields the best chunk size without tuning. We introduce a novel framework that evaluates how chunking based on element types annotated by document understanding models contributes to the overall context and accuracy of the informa- tion retrieved. We also demonstrate how this approach impacts RAG assisted Question & Answer task performance. Our research includes a comprehensive analysis of various element types, their role in effective information retrieval, and the impact they have on the quality of RAG outputs. Findings support that element type based chunking largely im- prove RAG results on financial reporting. Through this research, we are also able to answer how to uncover highly accurate RAG. Keywords: Retrieval Augmented Generation · Document Chunking · Document Pre-Processing · Financial Domain · Large Language Models</p><h3 style='color: black;'>Introduction</h3><p style='color: blue;'>contents of extensive documents [25,22,18]. By dissecting large volumes of text into smaller, more focused segments, LLMs can process each part with greater precision, ensuring a thorough understanding of each section. 
This segmented approach allows for meticulous analysis of unstructured data, enabling LLMs to construct a more comprehensive and coherent understanding of the entire docu- ment [41]. There remains a challenge in ensuring factual accuracy and relevance in the generated responses, especially when dealing with complex or extensive information. Recently, Retrieval Augmented Generation (RAG) [21,12] has been devel- oped to address the hallucination problem with LLMs [15,43] when recovering factual information directly from an LLM. In RAG, instead of answering a user query directly using an LLM, the user query is used to retrieve documents or segments from a corpus and the top retrieved documents or segments are used to generate the answer in conjunction with an LLM. In this way, RAG con- straints the answer to the set of retrieved documents. RAGs have been used as well to answer questions from single documents [14]. The documents are split into smaller parts or chunks, indexed by a retrieval system and recovered and processed depending on the user information need. In a sense, this process allows answering questions about information in a single document, thus contributing to the set of techniques available for document understanding.</p><p style='color: magenta;'>Since documents need to be chunked for RAG processing, this raises the question about what is the best practice to chunk documents for effective RAG document understanding. There are several dimensions to consider when decid- ing how to chunk a document, which includes the size of the chunks. The retrieval system in RAG can use traditional retrieval systems using bag- of-words methods or a vector database. If a vector database is used, then an embedding needs to be obtained from each chunk, thus the number of tokens in the chunk is relevant since the neural networks processing the chunks might have constraints on the number of tokens. As well, different chunk sizes might have undesirable retrieval results. 
Since the most relevant retrieved chunks need to be processed by an LLM, the number of tokens in retrieved chunks might have an effect in the generation of the answer [25]. As we see, chunking is re- quired for RAG systems and there are several advantages and disadvantages when considering how to chunk a document.</p><p style='color: red;'>In this work, we study specifically the chunking of U.S. Securities and Ex- change Commission (SEC)1 Financial Reports2, including 10-Ks, 10-Qs, and 8-Ks. This study plays a critical role in offering insights into the financial health and operational dynamics of public companies. These documents present unique challenges in terms of document processing and information extraction as they consist of varying sizes and layouts, and contain a variety of tabular informa- tion. Previous work has evaluated the processing of these reports with simple chunking strategies (e.g., tokens), but we believe that a more effective use of these reports might be achieved by a better pre-processing of the documents</p><p style='color: green;'>Financial Report Chunking for Effective Retrieval Augmented Generation and chunking configuration3 [14]. To the best of our knowledge, this is the first systematic study on chunking for document understanding and more specifically for processing financial reports.</p><h3 style='color: black;'>2 Related work</h3><p style='color: blue;'>Exploring the structure of financial reports is an exceptional area for es- tablishing optimal principles for chunking. The intricate nature of document structures and contents has resulted in most of the work processing financial reports focusing on the identification of structural elements. Among previous work, we find El-Haj et al. [10] and the FinTOC challenges [17,4,11] that have worked at the document structure level for UK and French financial reports. 
Ad- 3 https://www.cnbc.com/2023/12/19/gpt-and-other-ai-models-cant-analyze- an-sec-filing-researchers-find.html</p><p style='color: magenta;'>ditionally, there is recent work that considers U.S. SEC reports, which includes DocLayNet [33] and more specifically with the report tables in FinTabNet [45]. On the side of financial models, there is work in sentiment analysis in fi- nance [37], which includes the pre-training of specialised models such as Fin- BERT by Liu et al. [26], which is a BERT based model pre-trained on large corpora including large collections of financial news collected from different sites and FinBERT by DeSola et al, [9] trained on Wikipedia, BookCorpus and U.S. SEC data. Additional models include BloombergGPT [40], FinGPT [42] and Instruct-FinGPT[44]. More advance datasets in the financial domain include FinQA [6], LLMWare [27], ConFIRM [8] and TAT-QA [46] among others [7,38,19] that have been prepared for retrieval and or Questions and Answering (Q&A) tasks over snippets of fi- nancial data that includes tabular data, which has allowed methods on large language models to be tested on them [39]. Most of the previous work has focused on understanding the layout of fi- nancial documents or understanding specific snippets of existing reports with different levels of complexity, but there has not been much research in under- standing financial report documents, except some more recent work that includes FinanceBench [14], in which a set of questions about the content of financial re- ports are proposed that includes the evidence snippet.</p><p style='color: red;'>More specifically on document chunking methods for RAG, there are stan- dard approaches being considered such as chunking text into spans of a given token length (e.g. 128 and 256) or chunking based on sentences. Open source projects already allow simple processing of documents (e.g. 
Unstructured4, Lla- maindex5 or Langchain 6), without explicitly considering the table structure on which these chunking strategies are applied. Even though different approaches are available, an exhaustive evaluation of chunking applied to RAG and specifically to financial reporting, except for some limited chunking analysis [14,36], is non-existent. In our work, we compare a broad range of chunking approaches in addition to more simple ones and provide an analysis of the outcomes of different methods when asking questions about different aspects of the reports.</p><h3 style='color: black;'>3.1 RAG setting for the experiments</h3><p style='color: green;'>Financial Report Chunking for Effective Retrieval Augmented Generation document, the document is split into chunks and the chunks are indexed into a vector database (vectordb). When a question is sent to the RAG system, the top-k chunks most similar to the question are retrieved from the vector database and used to generate the answer using a large language model as generator. In order to retrieve chunks from the vector database, the question is encoded into a vector that is compared to the vector previously generated from the chunks. To prompt the generator, the question is converted into a set of instructions that instruct the LLM to find the answer within the top-k retrieved chunks. question vectordb top k question vector chunks encoder v | generator —+ answer * question to prompt + rome</p><p style='color: blue;'>Fig. 1. RAG steps to answer a question about a document In our experiments, we modify the way documents are chunked prior to being indexed in the vector database. All other settings remain constant. 
In the following sections, we describe in more detail each one of the components and processes used.</p><h3 style='color: black;'>3.2 Indexing and retrieval</h3><p style='color: magenta;'>As shown in figure 2, to index a document, first the document is split into chunks, then each chunk is processed by an encoder model and then indexed into the vector database. Based on the chunking strategy a document will be split into a larger or smaller set of chunks. chunks vectors Fig. 2. Indexing of document chunks into the vector database ttps://huggingface. co/sentence-transformers/multi-qa-mpnet-base-dot-</p><p style='color: red;'>6 Jimeno Yepes et al. As shown in figure 1, to retrieve chunks relevant to a question, the question is converted into a vector representation and the vector database returns a ranked list of chunks based on the similarity between question vector and the chunks in the database. Weaviate implements an approximate nearest neighbours algo- rithm [28] as their retrieval approach, which supports fast retrieval with high accuracy. In our experiments, we retrieve the top-10 chunks for each question.</p><h3 style='color: black;'>3.3 Generation</h3><p style='color: green;'>We have used GPT-4 [31] as the generator, which has shown best performance compared to earlier versions. As well, its performance was better compared to existing open source alternatives [22] such as Mixtral [16]. We used the prompt presented in figure 3 that we designed on another similar RAG implementation with different document types. The prompt conditions the answer to the query and the chunks, referred to as source, and if the generator cannot answer it should return No answer. please answer the question below by referencing the list of sources provided after the question; if the question can not be answered just respond ’No answer’. The sources are listed after \"Sources:\".  Question: {query}  Sources: {key} - {source} ...  
Sources: {key} - {source} ...</p><p style='color: blue;'>Fig. 3. Example prompt template used by the generator</p><h3 style='color: black;'>3.4 Chunking</h3><p style='color: magenta;'>In addition to chunking based on the number of tokens, we have processed the documents using computer vision and natural language processing to extract elements identified in the reports. The list of elements considered are provided by the Unstructured9 open source library. From the set of processing strategies, 9 https://unstructured-io.github.io/unstructured/introduction.html# elements</p><p style='color: red;'>Financial Report Chunking for Effective Retrieval Augmented Generation we use Chipper, a vision encoder decoder10 model inspired by Donut [20] to showcase the performance difference. The Chipper model outputs results as a JSON representation of the document, listing elements per page characterized by their element type. Additionally, Chipper provides a bounding box enclosing each element on the page and the corresponding element text.</p><p style='color: green;'>These elements are sometimes short to be considered as chunks, so to gen- erate chunks from elements the following steps have been followed. Given the structure of finance reporting documents, our structural chunking efforts are con- centrated on processing titles, texts, and tables. The steps to generate element- based chunks are:</p><p style='color: blue;'>– if the element text length is smaller than 2,048 characters, a merge with the following element is attempted – iteratively, element texts are merged following the step above till either the desired length is achieved, without breaking the element – if a title element is found, a new chunk is started – if a table element is found, a new chunk is started, preserving the entire table After element-based chunks have been derived, three types of metadata are generated to enrich the content and support efficient indexing. 
The first two types, generated via predefined prompt templates with GPT-4, include: 1) up to 6 representative keywords of the composite chunk 2) a summarised paragraph of the composite chunk. The third type is 3) Naive representation using the first two sentences from a composite chunk (a kind of prefix) and in the case of tables, the description of the table, which is typically identified in the table caption.</p><h3 style='color: black;'>3.5 Dataset</h3><p style='color: magenta;'>This dataset is made of 150 instances with questions and answers from 84 unique reports. The dataset does not include the source documents, which we have downloaded. We were able to recover only 80 documents, which reduces the number of questions to 141 from the original 150. The distribution of Un- structured elements predictions are shown in table 1. Documents have a varying number of pages, spanning from 4 pages (FOOT- LOCKER 2022 8K dated-2022-05-20) to 549 pages (e.g. PEPSICO 2021 10K), with an average of 147.34 with std 97.78 with a total of 11,787 pages combined. Each instance contains a link to the report, the question, a question type , the answer and supporting evidence, with page number where the evidence is located 10 https://huggingface.co/docs/transformers/model_doc/vision-encoder- decoder 8 Jimeno Yepes et al. Table 1. 
Unstructured element types distribution for Chipper predictions against doc- uments in FinanceBench.</p><p style='color: red;'><table><thead><th>Element Type</th><th>[Chipper Entities</th></thead><tr><td>NarrativeText</td><td>61,780</td></tr><tr><td>Title</td><td>29,664</td></tr><tr><td>ListItem</td><td>33,054</td></tr><tr><td>UncategorizedText</td><td>9,400</td></tr><tr><td>Footer</td><td>1,026</td></tr><tr><td>Table</td><td>7,700</td></tr><tr><td>Header</td><td>3,959</td></tr><tr><td>Image</td><td>26</td></tr><tr><td>FigureCaption</td><td>54</td></tr><tr><td>Formula</td><td>29</td></tr><tr><td>Address</td><td>229</td></tr><tr><td>Total</td><td>146,921</td></tr></table></p><p style='color: green;'>in the document, that allows for a closer evaluation of the results. Based on the page number, evidence contexts are located in different areas in the documents, ranging from the first page in some cases up to page 304 in one instance. The mean page number to find the evidence is 54.58 with a standard deviation of 43.66, which shows that evidence contexts to answer the questions are spread within a document.</p><p style='color: blue;'>These characteristics make FinanceBench a perfect dataset for evaluating RAG. An example instance is available in table 2.</p><h3 style='color: black;'>4 Results</h3><p style='color: magenta;'>We are considering 80 documents and 141 questions from FinanceBench. Using the OpenAI tokenizer from the model text-embedding-ada-002 that uses the tokenizer cl100k base11, there are on average 102,444.35 tokens with std of 61,979.45, which shows the large variability of document lengths as seen by the different number of pages per document presented above. Chunking Efficiency The first thing we analyzed is the total number of chunks, as it impacts indexing time. We would like to observe the relationship between accuracy and total chunk size. Table 3 shows the number of chunks derived from each one of the processing methods. 
Unstructured element-based chunks are closer in size to Base 512, and as the chunk size decreases for the basic chunking strategies, the total number of chunks increases linearly. Financial Report Chunking for Effective Retrieval Augmented Generation Table 2. Example question from the FinanceBench dataset</p><p style='color: red;'><table><thead><th>Field</th><th>Value</th></thead><tr><td></td><td>financebench-id|financebench.id_00859</td></tr><tr><td>doc_name</td><td>VERIZON.2021_10K</td></tr><tr><td>doc_link</td><td>https: //www.verizon.com/about/sites/default /files/2021-Annual- Report-on-Form-10-K.pdf</td></tr><tr><td>question_type</td><td>*novel-generated’</td></tr><tr><td>question</td><td>Among all of the derivative instruments that Verizon used to manage] the exposure to fluctuations of foreign currencies exchange rates or interest rates, which one had the highest notional value in FY 2021?</td></tr><tr><td>answer</td><td>Cross currency swaps. Its notional value was $32,502 million.,</td></tr><tr><td>evidence_text</td><td>Derivative Instruments We enter into derivative transactions primarily to manage our exposure to fluctuations in foreign currency exchange rates and interest rates. We employ risk management strategies, which may include the use of a variety of derivatives including interest rate swaps, cross currency swaps, forward starting interest rate swaps, trea- sury rate locks, interest rate caps, swaptions and foreign exchange for- wards. We do not hold derivatives for trading purposes. The following table sets forth the notional amounts of our outstanding derivative in- struments: (dollars in millions) At December 31, 2021 2020 Interest rate swaps $ 19,779 $ 17,768 Cross currency swaps 32,502 26,288 Forward starting interest rate 1,000 2,000 Foreign exchange forwards 932</td></tr><tr><td>page-number</td><td></td></tr></table></p><p style='color: green;'>Table 3. 
Chunks statistics for basic chunking elements and Unstructured elements</p><p style='color: blue;'><table><thead><th>Processing|total</th><th>chunks|mean</th><th>chunks per document</th><th>(std)|tables mean (std)</th></thead><tr><td>Base 128</td><td>| 64,058</td><td>800.73 (484.11)</td><td></td></tr><tr><td>Base 256</td><td>| 32,051</td><td>400.64 (242.04) (</td><td></td></tr><tr><td>Base 512</td><td>| 16,046</td><td>200.58 (121. 01)</td><td></td></tr><tr><td>Chipper</td><td>20,843</td><td>260.57 (145.80)</td><td>96.20 (57.53)</td></tr></table></p><p style='color: magenta;'>Retrieval Accuracy Secondly, we evaluate the capabilities of each chunking strategy in terms of retrieval accuracy. We use the page numbers in the ground truth to calculate the page-level retrieval accuracy, and we use ROUGE [24] and BLEU [32] scores to evaluate the accuracy of paragraph-level retrieval compared to the ground truth evidence paragraphs. As shown in Table 4, when compared to Unstructured element-based chunk- ing strategies, basic chunking strategies seem to have higher page-level retrieval accuracy but lower paragraph-level accuracy on average. Additionally, basic chunking strategies also lack consistency between page-level and paragraph-level accuracy; higher page-level accuracy doesn’t ensure higher paragraph-level ac- curacy. For example, Base 128 has the second highest page-level accuracy but the lowest paragraph-level scores among all. On the other hand, element-based chunking strategies showed more consistent results. A fascinating discovery is that when various chunking strategies are com- bined, it results in enhanced retrieval scores, achieving superior performance at both the page level (84.4%) and paragraph level (with ROUGE at 0.568% and BLEU at 0.452%). 
This finding addresses an unresolved question: how to improve the accuracy of RAG.</p><p style='color: red;'>The element based method provides the highest scores and it also provides a mechanism to chunk documents without the need to fine tune hyper-parameters like the number of tokens in a chunk. This suggests the element based method is more generalizable and can be applied to new types of documents.</p><p style='color: green;'>Q&A Accuracy Third, we evaluate the Q&A accuracy for the chunking strate- gies. In addition to manual evaluation, we have investigated an automatic evalua- tion using GPT-4. GPT-4 compares how the answers provided by our method are similar to or different from the FinanceBench gold standard, similar approaches have been previously evaluated [13,23,29,30]. The automatic evaluation allows scaling the evaluation efforts for the different chunking strategies that we have considered. We used the prompt template in figure 4.</p><p style='color: blue;'>Begin with True or False. Are the two following answers (Answer 1 and Answer 2) the same with respect to the question between single quotes ’{question}’?  Answer 1: ’{ground_truth_answer}’ Answer 2: ’{generated_answer}’  Fig. 4. Evaluation prompt template. The {question}, {ground truth answer} and {generated answer} fields are substituted for each question accordingly.</p><p style='color: magenta;'>Results in table 5 show that element-based chunking strategies offer the best question-answering accuracy, which is consistent with page retrieval and para- graph retrieval accuracy. Lastly, our approach stands out for its efficiency. Not only is element-based chunking generalizable without the need to select the chunk size, but when com- pared to the aggregation results that yield the highest retrieval scores. Element- based chunking achieves the highest retrieval scores with only half the number of chunks required compared to methods that do not consider the structure of the documents (62,529 v.s. 
112,155). This can reduce the indexing cost and im- prove query latency because there are only half as many vectors to index for the vectordb that stores the chunks. This underscores the effectiveness of our solu- tion in optimizing the balance between performance and computational resource requirements. Financial Report Chunking for Effective Retrieval Augmented Generation</p><p style='color: red;'>Table 4. Retrieval results. For each chunking strategy, we show the number of chunks for all the documents (Total Chunks), Page Accuracy, and ROUGE and BLEU scores. ROUGE and BLEU are calculated as the maximum score from the list of recovered contexts for a question when compared to the known evidence for that question.</p><p style='color: green;'><table><thead><th>Chunking strategy</th><th>Total Chunks}</th><th>Page Accuracy</th><th>ROUGE|BLEU.</th></thead><tr><td>Base 128</td><td>64,058</td><td>72.34</td><td>0.383</td></tr><tr><td>Base 256</td><td>32,051</td><td>73.05</td><td>0.433</td></tr><tr><td>Base 512</td><td>16,046</td><td>68.09</td><td>0.455</td></tr><tr><td>Base Aggregation</td><td>112,155</td><td>83.69</td><td>0.536</td></tr><tr><td>Keywords Chipper</td><td></td><td>46.10</td><td>0.444</td></tr><tr><td>Summary Chipper</td><td></td><td>62.41</td><td>0.473</td></tr><tr><td>Prefix &amp; Table Description Chipper</td><td></td><td>67.38</td><td>0.514</td></tr><tr><td>Chipper Aggregation</td><td>a</td><td>84.40</td><td>0.568</td></tr></table></p><p style='color: blue;'>Table 5. Q&A results. 
We show the percentage of questions with no answer and as well the accuracy either estimated automatically using GPT-4 or manually.</p><p style='color: magenta;'><table><thead><th>Chunking strategy</th><th>No</th><th></th><th>answer|GPT-4|Manual</th></thead><tr><td>Base 128</td><td>35.46</td><td>29.08</td><td>| 35.46</td></tr><tr><td>Base 256</td><td>5.5¢</td><td>32.62</td><td>| 36.88</td></tr><tr><td>Base 512</td><td>24.82</td><td>41.84</td><td>| 48.23</td></tr><tr><td>Keywords Chipper</td><td>22.70 |</td><td>43.97]</td><td>53.19</td></tr><tr><td>Summary Chipper</td><td>17.73</td><td>|43.97])</td><td>51.77</td></tr><tr><td>Prefix &amp; Table Description Chipper]</td><td>20.57</td><td>41.13</td><td>| 53.19</td></tr></table></p><h3 style='color: black;'>5 Discussion</h3><p style='color: red;'>We have observed that using basic 512 chunking strategies produces results most similar to the Unstructured element-based approach, which may be due to the fact that 512 tokens share a similar length with the token size within our element-based chunks and capture a long context, but fail keep a coherent context in some cases, leaving out relevant information required for Q&A. This is further observed when considering the ROUGE and BLEU scores in table 4, where the chunk contexts for the baseline have lower scores. These findings support existing research stating that the best basic chunk size varies from data to data [3]. These results show, as well, that our method adapts to different documents without tuning. Our method relies on the struc- tural information that is present in the document’s layout to adjust the chunk size automatically.</p><p style='color: green;'>We have evaluated aggregating the output of different chunking methods in the retrieval experiments as sown in table 4. Even though the aggregation seems to be effective for retrieval, the Q&A exceeded the GPT-4 token limit, which resulted in a non-effective Q&A solution using the selected model. 
As well, we evaluated variations of the prompt used to generate the answers (see figure 3). Re-ordering the retrieval context and the question, but results were not statistically different. We experimented as well with variations of the verbs using in the prompt, e.g. changing referencing with using, which seemed to lower the quality of the answers generated. This shows that prompt engineering is a relevant factor in RAG. We evaluated using GPT-4 for evaluation instead of relying on manual evalu- ation. In most cases, GPT-4 evaluated correctly but failed when a more elaborate answer is provided. As shown in figure 5, the answer is 39.7% while the estimated answer is 39.73% but with a detailed explanation of the calculation.</p><p style='color: blue;'>Question: ’What is Coca Cola’s FY2021 COGS % margin? Calculate what was asked by utilizing the line items clearly shown in the income statement.’?  Answer 1: ’39.7%’ Answer 2: ’From the income statement referenced on page 60 of COCACOLA_2021_10K_embedded.json, we can see that Coca Cola’s total revenue in FY2021 was $38,655 million and their cost of goods sold (COGS) was $15,357 million. To calculate the COGS % margin, we divide the COGS by the total revenue and multiply by 100: (15,357 / 38,655) * 100 = 39.73% So, Coca Cola’s FY2021 COGS % margin was approximately 39.73%.’ </p><p style='color: magenta;'>Fig. 5. Evaluation prompt template</p><h3 style='color: black;'>6 Conclusions and Future Work</h3><p style='color: red;'>Financial Report Chunking for Effective Retrieval Augmented Generation Furthermore, we would like to study the impact of RAG configuration and ele- ment type based chunking.</p><h3 style='color: black;'>References</h3><p style='color: green;'>2. Balaguer, A., Benara, V., de Freitas Cunha, R.L., de M. 
Estev˜ao Filho, R., Hendry, T., Holstein, D., Marsman, J., Mecklenburg, N., Malvar, S., Nunes, L.O., Padilha, R., Sharp, M., Silva, B., Sharma, S., Aski, V., Chandra, R.: Rag vs fine-tuning: Pipelines, tradeoffs, and a case study on agriculture (2024) 3. Barnett, S., Kurniawan, S., Thudumu, S., Brannelly, Z., Abdelrazek, M.: Seven Failure Points When Engineering a Retrieval Augmented Generation System (2024)</p><p style='color: blue;'>4. Bentabet, N.I., Juge, R., El Maarouf, I., Mouilleron, V., Valsamou-Stanislawski, D., El-Haj, M.: The financial document structure extraction shared task (fintoc 2020). In: Proceedings of the 1st Joint Workshop on Financial Narrative Processing and MultiLing Financial Summarisation. pp. 13–22 (2020)</p><p style='color: magenta;'>5. Chen, H., Jiao, F., Li, X., Qin, C., Ravaut, M., Zhao, R., Xiong, C., Joty, S.: Chat- GPT’s One-year Anniversary: Are Open-Source Large Language Models Catching up? arXiv preprint arXiv:2311.16989 (2023) 6. Chen, Z., Chen, W., Smiley, C., Shah, S., Borova, I., Langdon, D., Moussa, R., Beane, M., Huang, T.H., Routledge, B., et al.: Finqa: A dataset of numerical reasoning over financial data. arXiv preprint arXiv:2109.00122 (2021) 7. Chen, Z., Li, S., Smiley, C., Ma, Z., Shah, S., Wang, W.Y.: ConvFinQA: Exploring the Chain of Numerical Reasoning in Conversational Finance Question Answering (2022) 8. Choi, S., Gazeley, W., Wong, S.H., Li, T.: Conversational Financial Information Retrieval Model (ConFIRM). arXiv preprint arXiv:2310.13001 (2023)</p><p style='color: red;'>9. DeSola, V., Hanna, K., Nonis, P.: Finbert: pre-trained model on sec filings for financial natural language tasks. University of California (2019) 10. El-Haj, M., Rayson, P., Young, S., Walker, M.: Detecting document structure in a very large corpus of UK financial reports. European Language Resources Associa- tion (ELRA) (2014) 11. 
El Maarouf, I., Kang, J., Azzi, A.A., Bellato, S., Gan, M., El-Haj, M.: The financial document structure extraction shared task (FinTOC2021). In: Proceedings of the 3rd Financial Narrative Processing Workshop. pp. 111–119 (2021)</p><p style='color: green;'>12. Gao, Y., Xiong, Y., Gao, X., Jia, K., Pan, J., Bi, Y., Dai, Y., Sun, J., Wang, H.: Retrieval-augmented generation for large language models: A survey. arXiv preprint arXiv:2312.10997 (2023) 13. Hada, R., Gumma, V., de Wynter, A., Diddee, H., Ahmed, M., Choudhury, M., Bali, K., Sitaram, S.: Are large language model-based evaluators the solution to scaling up multilingual evaluation? arXiv preprint arXiv:2309.07462 (2023) 14. Islam, P., Kannappan, A., Kiela, D., Qian, R., Scherrer, N., Vidgen, B.: Fi- nanceBench: A New Benchmark for Financial Question Answering. arXiv preprint arXiv:2311.11944 (2023)</p><p style='color: blue;'>15. Ji, Z., Lee, N., Frieske, R., Yu, T., Su, D., Xu, Y., Ishii, E., Bang, Y.J., Madotto, A., Fung, P.: Survey of Hallucination in Natural Language Generation. ACM Comput- ing Surveys 55(12), 1–38 (Mar 2023). https://doi.org/10.1145/3571730, http:// dx.doi.org/10.1145/3571730</p><p style='color: magenta;'>14 Jimeno Yepes et al. 16. Jiang, A.Q., Sablayrolles, A., Roux, A., Mensch, A., Savary, B., Bamford, C., Chaplot, D.S., de las Casas, D., Hanna, E.B., Bressand, F., Lengyel, G., Bour, G., Lample, G., Lavaud, L.R., Saulnier, L., Lachaux, M.A., Stock, P., Subramanian, S., Yang, S., Antoniak, S., Scao, T.L., Gervet, T., Lavril, T., Wang, T., Lacroix, T., Sayed, W.E.: Mixtral of Experts (2024)</p><p style='color: red;'>17. Juge, R., Bentabet, I., Ferradans, S.: The fintoc-2019 shared task: Financial doc- ument structure extraction. In: Proceedings of the Second Financial Narrative Processing Workshop (FNP 2019). pp. 51–57 (2019)</p><p style='color: green;'>18. Kaddour, J., Harris, J., Mozes, M., Bradley, H., Raileanu, R., McHardy, R.: Chal- lenges and applications of large language models. 
arXiv preprint arXiv:2307.10169 (2023)</p><p style='color: blue;'>19. Kaur, S., Smiley, C., Gupta, A., Sain, J., Wang, D., Siddagangappa, S., Aguda, T., Shah, S.: REFinD: Relation Extraction Financial Dataset. In: the 46th International ACM SIGIR Conference on Re- Proceedings of search and Development in Information Retrieval. SIGIR ’23, ACM (Jul 2023). https://doi.org/10.1145/3539618.3591911, http://dx.doi.org/10.1145/ 3539618.3591911</p><p style='color: magenta;'>20. Kim, G., Hong, T., Yim, M., Park, J., Yim, J., Hwang, W., Yun, S., Han, D., Park, S.: Donut: Document understanding transformer without ocr. arXiv preprint arXiv:2111.15664 7, 15 (2021) 21. Lewis, P., Perez, E., Piktus, A., Petroni, F., Karpukhin, V., Goyal, N., K¨uttler, H., Lewis, M., Yih, W.t., Rockt¨aschel, T., et al.: Retrieval-augmented generation for knowledge-intensive NLP tasks. Advances in Neural Information Processing Systems 33, 9459–9474 (2020)</p><p style='color: red;'>22. Li, D., Shao, R., Xie, A., Sheng, Y., Zheng, L., Gonzalez, J.E., Stoica, I., Ma, X., Zhang, H.: How Long Can Open-Source LLMs Truly Promise on Context Length? (June 2023), https://lmsys.org/blog/2023-06-29-longchat</p><p style='color: green;'>23. Li, Y., Duan, Y.: The evaluation of experiments of artificial general intelligence with gpt-4 based on dikwp. arXiv preprint (2023) 24. Lin, C.Y.: Rouge: A package for automatic evaluation of summaries. In: Text sum- marization branches out. pp. 74–81 (2004) 25. Liu, N.F., Lin, K., Hewitt, J., Paranjape, A., Bevilacqua, M., Petroni, F., Liang, P.: Lost in the middle: How language models use long contexts. arXiv preprint arXiv:2307.03172 (2023) 26. Liu, Z., Huang, D., Huang, K., Li, Z., Zhao, J.: Finbert: A pre-trained financial language representation model for financial text mining. In: Proceedings of the twenty-ninth international conference on international joint conferences on artificial intelligence. pp. 4513–4519 (2021) llmware: Rag Instruct Benchmark Tester. 
https://huggingface.co/datasets/ llmware/rag_instruct_benchmark_tester, Accessed: January 15, 2024</p><p style='color: blue;'>28. Malkov, Y.A., Yashunin, D.A.: Efficient and robust approximate nearest neigh- bor search using hierarchical navigable small world graphs. IEEE transactions on pattern analysis and machine intelligence 42(4), 824–836 (2018)</p><p style='color: magenta;'>29. Moore, S., Nguyen, H.A., Chen, T., Stamper, J.: Assessing the quality of multiple- choice questions using gpt-4 and rule-based methods. In: European Conference on Technology Enhanced Learning. pp. 229–245. Springer (2023) 30. Naismith, B., Mulcaire, P., Burstein, J.: Automated evaluation of written discourse coherence using gpt-4. In: Proceedings of the 18th Workshop on Innovative Use of NLP for Building Educational Applications (BEA 2023). pp. 394–403 (2023) 31. OpenAI, :, Achiam, J., Adler, S., Agarwal, S., et al.: GPT-4 Technical Report</p><p style='color: red;'>(2023) Financial Report Chunking for Effective Retrieval Augmented Generation 32. Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: Bleu: a method for automatic evaluation of machine translation. In: Proceedings of the 40th annual meeting of the Association for Computational Linguistics. pp. 311–318 (2002) 33. Pfitzmann, B., Auer, C., Dolfi, M., Nassar, A.S., Staar, P.: Doclaynet: A large human-annotated dataset for document-layout segmentation. In: Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining. pp. 3743–3751 (2022)</p><p style='color: green;'>34. Pinecone: Chunking strategies for llm applications, https://www.pinecone.io/ learn/chunking-strategies/ 35. Reimers, N., Gurevych, I.: Sentence-bert: Sentence embeddings using siamese bert- networks. In: Proceedings of the 2019 Conference on Empirical Methods in Nat- ural Language Processing. Association for Computational Linguistics (11 2019), https://arxiv.org/abs/1908.10084</p><p style='color: blue;'>36. 
Retteter, J.: Mastering Table Extraction: Revolutionize Your Earnings Re- ports Analysis with AI. https://medium.com/unstructured-io/mastering- table-extraction-revolutionize-your-earnings-reports-analysis-with- ai-1bc32c22720e, Accessed: January 15, 2024 37. Rizinski, M., Peshov, H., Mishev, K., Jovanovik, M., Trajanov, D.: Sentiment Anal- ysis in Finance: From Transformers Back to eXplainable Lexicons (XLex) (2023) 38. Shah, R.S., Chawla, K., Eidnani, D., Shah, A., Du, W., Chava, S., Raman, N., Smiley, C., Chen, J., Yang, D.: WHEN FLUE MEETS FLANG: Benchmarks and Large Pre-trained Language Model for Financial Domain (2022) 39. Singh Phogat, K., Harsha, C., Dasaratha, S., Ramakrishna, S., Akhil Puranam, S.: Zero-Shot Question Answering over Financial Documents using Large Language Models. arXiv e-prints pp. arXiv–2311 (2023)</p><p style='color: magenta;'>40. Wu, S., Irsoy, O., Lu, S., Dabravolski, V., Dredze, M., Gehrmann, S., Kambadur, P., Rosenberg, D., Mann, G.: BloombergGPT: A Large Language Model for Finance (2023) 41. Xu, P., Ping, W., Wu, X., McAfee, L., Zhu, C., Liu, Z., Subramanian, S., Bakhtu- rina, E., Shoeybi, M., Catanzaro, B.: Retrieval meets Long Context Large Language Models (2023) 42. Yang, H., Liu, X.Y., Wang, C.D.: FinGPT: Open-Source Financial Large Language Models (2023) 43. Ye, H., Liu, T., Zhang, A., Hua, W., Jia, W.: Cognitive Mirage: A Review of Hallucinations in Large Language Models (2023) 44. Zhang, B., Yang, H., Liu, X.Y.: Instruct-FinGPT: Financial Sentiment Analysis by Instruction Tuning of General-Purpose Large Language Models (2023)</p><p style='color: red;'>45. Zheng, X., Burdick, D., Popa, L., Zhong, X., Wang, N.X.R.: Global table extractor (gte): A framework for joint table identification and cell structure recognition using visual context. In: Proceedings of the IEEE/CVF winter conference on applications of computer vision. pp. 697–706 (2021) 46. 
Zhu, F., Lei, W., Huang, Y., Wang, C., Zhang, S., Lv, J., Feng, F., Chua, T.S.: TAT-QA: A question answering benchmark on a hybrid of tabular and textual content in finance. arXiv preprint arXiv:2105.07624 (2021)</p>"
+       "<h3 style='color: black;'>Untitled</h3><p style='color: red;'>arXiv:2402.05131v2</p><h3 style='color: black;'>Financial Report Chunking for Effective Retrieval Augmented Generation</h3><p style='color: green;'>Unstructured Technologies Sacramento, CA, USA leah@unstructured.io https://unstructured.io</p><p style='color: blue;'>Abstract. Chunking information is a key step in Retrieval Augmented Generation (RAG). Current research primarily centers on paragraph-level chunking. This approach treats all texts as equal and neglects the information contained in the structure of documents. We propose an expanded approach to chunk documents by moving beyond mere paragraph-level chunking to chunk primarily by structural element components of documents. Dissecting documents into these constituent elements creates a new way to chunk documents that yields the best chunk size without tuning. We introduce a novel framework that evaluates how chunking based on element types annotated by document understanding models contributes to the overall context and accuracy of the information retrieved. We also demonstrate how this approach impacts RAG assisted Question & Answer task performance. Our research includes a comprehensive analysis of various element types, their role in effective information retrieval, and the impact they have on the quality of RAG outputs. Findings support that element type based chunking largely improves RAG results on financial reporting. Through this research, we are also able to answer how to uncover highly accurate RAG. Keywords: Retrieval Augmented Generation · Document Chunking · Document Pre-Processing · Financial Domain · Large Language Models</p><h3 style='color: black;'>Introduction</h3><p style='color: magenta;'>contents of extensive documents [25,22,18]. 
By dissecting large volumes of text into smaller, more focused segments, LLMs can process each part with greater precision, ensuring a thorough understanding of each section. This segmented approach allows for meticulous analysis of unstructured data, enabling LLMs to construct a more comprehensive and coherent understanding of the entire document [41]. There remains a challenge in ensuring factual accuracy and relevance in the generated responses, especially when dealing with complex or extensive information.</p><p style='color: red;'>Recently, Retrieval Augmented Generation (RAG) [21,12] has been developed to address the hallucination problem with LLMs [15,43] when recovering factual information directly from an LLM. In RAG, instead of answering a user query directly using an LLM, the user query is used to retrieve documents or segments from a corpus and the top retrieved documents or segments are used to generate the answer in conjunction with an LLM. In this way, RAG constrains the answer to the set of retrieved documents. RAGs have been used as well to answer questions from single documents [14]. The documents are split into smaller parts or chunks, indexed by a retrieval system and recovered and processed depending on the user information need. In a sense, this process allows answering questions about information in a single document, thus contributing to the set of techniques available for document understanding. Since documents need to be chunked for RAG processing, this raises the question about what is the best practice to chunk documents for effective RAG document understanding. There are several dimensions to consider when deciding how to chunk a document, which includes the size of the chunks. The retrieval system in RAG can use traditional retrieval systems using bag-of-words methods or a vector database. 
If a vector database is used, then an embedding needs to be obtained from each chunk, thus the number of tokens in the chunk is relevant since the neural networks processing the chunks might have constraints on the number of tokens. As well, different chunk sizes might have undesirable retrieval results. Since the most relevant retrieved chunks need to be processed by an LLM, the number of tokens in retrieved chunks might have an effect on the generation of the answer [25]. As we see, chunking is required for RAG systems and there are several advantages and disadvantages when considering how to chunk a document.</p><p style='color: green;'>In this work, we study specifically the chunking of U.S. Securities and Exchange Commission (SEC)1 Financial Reports2, including 10-Ks, 10-Qs, and 8-Ks. This study plays a critical role in offering insights into the financial health and operational dynamics of public companies. These documents present unique challenges in terms of document processing and information extraction as they consist of varying sizes and layouts, and contain a variety of tabular information. Previous work has evaluated the processing of these reports with simple chunking strategies (e.g., tokens), but we believe that a more effective use of these reports might be achieved by a better pre-processing of the documents</p><p style='color: blue;'>Financial Report Chunking for Effective Retrieval Augmented Generation and chunking configuration3 [14]. To the best of our knowledge, this is the first systematic study on chunking for document understanding and more specifically for processing financial reports.</p><h3 style='color: black;'>2 Related work</h3><p style='color: magenta;'>Exploring the structure of financial reports is an exceptional area for establishing optimal principles for chunking. 
The intricate nature of document structures and contents has resulted in most of the work processing financial reports focusing on the identification of structural elements. Among previous work, we find El-Haj et al. [10] and the FinTOC challenges [17,4,11] that have worked at the document structure level for UK and French financial reports.</p><p style='color: red;'>3 https://www.cnbc.com/2023/12/19/gpt-and-other-ai-models-cant-analyze-an-sec-filing-researchers-find.html</p><p style='color: green;'>Additionally, there is recent work that considers U.S. SEC reports, which includes DocLayNet [33] and more specifically with the report tables in FinTabNet [45]. On the side of financial models, there is work in sentiment analysis in finance [37], which includes the pre-training of specialised models such as FinBERT by Liu et al. [26], which is a BERT based model pre-trained on large corpora including large collections of financial news collected from different sites and FinBERT by DeSola et al. [9] trained on Wikipedia, BookCorpus and U.S. SEC data. Additional models include BloombergGPT [40], FinGPT [42] and Instruct-FinGPT [44]. More advanced datasets in the financial domain include FinQA [6], LLMWare [27], ConFIRM [8] and TAT-QA [46] among others [7,38,19] that have been prepared for retrieval and/or Questions and Answering (Q&A) tasks over snippets of financial data that includes tabular data, which has allowed methods on large language models to be tested on them [39]. 
Most of the previous work has focused on understanding the layout of fi- nancial documents or understanding specific snippets of existing reports with different levels of complexity, but there has not been much research in under- standing financial report documents, except some more recent work that includes FinanceBench [14], in which a set of questions about the content of financial re- ports are proposed that includes the evidence snippet.</p><p style='color: blue;'>More specifically on document chunking methods for RAG, there are stan- dard approaches being considered such as chunking text into spans of a given token length (e.g. 128 and 256) or chunking based on sentences. Open source projects already allow simple processing of documents (e.g. Unstructured4, Lla- maindex5 or Langchain 6), without explicitly considering the table structure on which these chunking strategies are applied. Even though different approaches are available, an exhaustive evaluation of chunking applied to RAG and specifically to financial reporting, except for some limited chunking analysis [14,36], is non-existent. In our work, we compare a broad range of chunking approaches in addition to more simple ones and provide an analysis of the outcomes of different methods when asking questions about different aspects of the reports.</p><h3 style='color: black;'>3.1 RAG setting for the experiments</h3><p style='color: magenta;'>Financial Report Chunking for Effective Retrieval Augmented Generation document, the document is split into chunks and the chunks are indexed into a vector database (vectordb). When a question is sent to the RAG system, the top-k chunks most similar to the question are retrieved from the vector database and used to generate the answer using a large language model as generator. In order to retrieve chunks from the vector database, the question is encoded into a vector that is compared to the vector previously generated from the chunks. 
To prompt the generator, the question is converted into a set of instructions that instruct the LLM to find the answer within the top-k retrieved chunks. question vectordb top k question vector chunks encoder v | generator —+ answer * question to prompt + rome</p><p style='color: red;'>Fig. 1. RAG steps to answer a question about a document</p><p style='color: green;'>In our experiments, we modify the way documents are chunked prior to being indexed in the vector database. All other settings remain constant. In the following sections, we describe in more detail each one of the components and processes used.</p><h3 style='color: black;'>3.2 Indexing and retrieval</h3><p style='color: blue;'>As shown in figure 2, to index a document, first the document is split into chunks, then each chunk is processed by an encoder model and then indexed into the vector database. Based on the chunking strategy a document will be split into a larger or smaller set of chunks. chunks vectors Fig. 2. Indexing of document chunks into the vector database ttps://huggingface. co/sentence-transformers/multi-qa-mpnet-base-dot-</p><p style='color: magenta;'>6 Jimeno Yepes et al.</p><p style='color: red;'>As shown in figure 1, to retrieve chunks relevant to a question, the question is converted into a vector representation and the vector database returns a ranked list of chunks based on the similarity between question vector and the chunks in the database. Weaviate implements an approximate nearest neighbours algo- rithm [28] as their retrieval approach, which supports fast retrieval with high accuracy. In our experiments, we retrieve the top-10 chunks for each question.</p><h3 style='color: black;'>3.3 Generation</h3><p style='color: green;'>We have used GPT-4 [31] as the generator, which has shown best performance compared to earlier versions. As well, its performance was better compared to existing open source alternatives [22] such as Mixtral [16]. 
We used the prompt presented in figure 3 that we designed on another similar RAG implementation with different document types. The prompt conditions the answer to the query and the chunks, referred to as source, and if the generator cannot answer it should return No answer. please answer the question below by referencing the list of sources provided after the question; if the question can not be answered just respond ’No answer’. The sources are listed after \"Sources:\".  Question: {query}  Sources: {key} - {source} ...  Sources: {key} - {source} ...</p><p style='color: blue;'>Fig. 3. Example prompt template used by the generator</p><h3 style='color: black;'>3.4 Chunking</h3><p style='color: magenta;'>In addition to chunking based on the number of tokens, we have processed the documents using computer vision and natural language processing to extract elements identified in the reports. The list of elements considered are provided by the Unstructured9 open source library. From the set of processing strategies, 9 https://unstructured-io.github.io/unstructured/introduction.html# elements</p><p style='color: red;'>Financial Report Chunking for Effective Retrieval Augmented Generation</p><p style='color: green;'>we use Chipper, a vision encoder decoder10 model inspired by Donut [20] to showcase the performance difference. The Chipper model outputs results as a JSON representation of the document, listing elements per page characterized by their element type. Additionally, Chipper provides a bounding box enclosing each element on the page and the corresponding element text.</p><p style='color: blue;'>These elements are sometimes short to be considered as chunks, so to gen- erate chunks from elements the following steps have been followed. Given the structure of finance reporting documents, our structural chunking efforts are con- centrated on processing titles, texts, and tables. 
The steps to generate element- based chunks are:</p><p style='color: magenta;'>– if the element text length is smaller than 2,048 characters, a merge with the following element is attempted – iteratively, element texts are merged following the step above till either the desired length is achieved, without breaking the element – if a title element is found, a new chunk is started – if a table element is found, a new chunk is started, preserving the entire table After element-based chunks have been derived, three types of metadata are generated to enrich the content and support efficient indexing. The first two types, generated via predefined prompt templates with GPT-4, include: 1) up to 6 representative keywords of the composite chunk 2) a summarised paragraph of the composite chunk. The third type is 3) Naive representation using the first two sentences from a composite chunk (a kind of prefix) and in the case of tables, the description of the table, which is typically identified in the table caption.</p><h3 style='color: black;'>3.5 Dataset</h3><p style='color: red;'>This dataset is made of 150 instances with questions and answers from 84 unique reports. The dataset does not include the source documents, which we have downloaded. We were able to recover only 80 documents, which reduces the number of questions to 141 from the original 150. The distribution of Un- structured elements predictions are shown in table 1. Documents have a varying number of pages, spanning from 4 pages (FOOT- LOCKER 2022 8K dated-2022-05-20) to 549 pages (e.g. PEPSICO 2021 10K), with an average of 147.34 with std 97.78 with a total of 11,787 pages combined. Each instance contains a link to the report, the question, a question type , the answer and supporting evidence, with page number where the evidence is located 10 https://huggingface.co/docs/transformers/model_doc/vision-encoder- decoder</p><p style='color: green;'>8 Jimeno Yepes et al.</p><p style='color: blue;'>Table 1. 
Unstructured element types distribution for Chipper predictions against doc- uments in FinanceBench.</p><p style='color: magenta;'><table><thead><th>Element Type</th><th>[Chipper Entities</th></thead><tr><td>NarrativeText</td><td>61,780</td></tr><tr><td>Title</td><td>29,664</td></tr><tr><td>ListItem</td><td>33,054</td></tr><tr><td>UncategorizedText</td><td>9,400</td></tr><tr><td>Footer</td><td>1,026</td></tr><tr><td>Table</td><td>7,700</td></tr><tr><td>Header</td><td>3,959</td></tr><tr><td>Image</td><td>26</td></tr><tr><td>FigureCaption</td><td>54</td></tr><tr><td>Formula</td><td>29</td></tr><tr><td>Address</td><td>229</td></tr><tr><td>Total</td><td>146,921</td></tr></table></p><p style='color: red;'>in the document, that allows for a closer evaluation of the results. Based on the page number, evidence contexts are located in different areas in the documents, ranging from the first page in some cases up to page 304 in one instance. The mean page number to find the evidence is 54.58 with a standard deviation of 43.66, which shows that evidence contexts to answer the questions are spread within a document.</p><p style='color: green;'>These characteristics make FinanceBench a perfect dataset for evaluating RAG.</p><p style='color: blue;'>An example instance is available in table 2.</p><h3 style='color: black;'>4 Results</h3><p style='color: magenta;'>We are considering 80 documents and 141 questions from FinanceBench. Using the OpenAI tokenizer from the model text-embedding-ada-002 that uses the tokenizer cl100k base11, there are on average 102,444.35 tokens with std of 61,979.45, which shows the large variability of document lengths as seen by the different number of pages per document presented above.</p><p style='color: red;'>Chunking Efficiency The first thing we analyzed is the total number of chunks, as it impacts indexing time. We would like to observe the relationship between accuracy and total chunk size. 
Table 3 shows the number of chunks derived from each one of the processing methods. Unstructured element-based chunks are closer in size to Base 512, and as the chunk size decreases for the basic chunking strategies, the total number of chunks increases linearly. Financial Report Chunking for Effective Retrieval Augmented Generation Table 2. Example question from the FinanceBench dataset</p><p style='color: green;'><table><thead><th>Field</th><th>Value</th></thead><tr><td></td><td>financebench-id|financebench.id_00859</td></tr><tr><td>doc_name</td><td>VERIZON.2021_10K</td></tr><tr><td>doc_link</td><td>https: //www.verizon.com/about/sites/default /files/2021-Annual- Report-on-Form-10-K.pdf</td></tr><tr><td>question_type</td><td>*novel-generated’</td></tr><tr><td>question</td><td>Among all of the derivative instruments that Verizon used to manage] the exposure to fluctuations of foreign currencies exchange rates or interest rates, which one had the highest notional value in FY 2021?</td></tr><tr><td>answer</td><td>Cross currency swaps. Its notional value was $32,502 million.,</td></tr><tr><td>evidence_text</td><td>Derivative Instruments We enter into derivative transactions primarily to manage our exposure to fluctuations in foreign currency exchange rates and interest rates. We employ risk management strategies, which may include the use of a variety of derivatives including interest rate swaps, cross currency swaps, forward starting interest rate swaps, trea- sury rate locks, interest rate caps, swaptions and foreign exchange for- wards. We do not hold derivatives for trading purposes. The following table sets forth the notional amounts of our outstanding derivative in- struments: (dollars in millions) At December 31, 2021 2020 Interest rate swaps $ 19,779 $ 17,768 Cross currency swaps 32,502 26,288 Forward starting interest rate 1,000 2,000 Foreign exchange forwards 932</td></tr><tr><td>page-number</td><td></td></tr></table></p><p style='color: blue;'>Table 3. 
Chunks statistics for basic chunking elements and Unstructured elements</p><p style='color: magenta;'><table><thead><th>Processing|total</th><th>chunks|mean</th><th>chunks per document</th><th>(std)|tables mean (std)</th></thead><tr><td>Base 128</td><td>| 64,058</td><td>800.73 (484.11)</td><td></td></tr><tr><td>Base 256</td><td>| 32,051</td><td>400.64 (242.04) (</td><td></td></tr><tr><td>Base 512</td><td>| 16,046</td><td>200.58 (121. 01)</td><td></td></tr><tr><td>Chipper</td><td>20,843</td><td>260.57 (145.80)</td><td>96.20 (57.53)</td></tr></table></p><p style='color: red;'>Retrieval Accuracy Secondly, we evaluate the capabilities of each chunking strategy in terms of retrieval accuracy. We use the page numbers in the ground truth to calculate the page-level retrieval accuracy, and we use ROUGE [24] and BLEU [32] scores to evaluate the accuracy of paragraph-level retrieval compared to the ground truth evidence paragraphs. As shown in Table 4, when compared to Unstructured element-based chunk- ing strategies, basic chunking strategies seem to have higher page-level retrieval accuracy but lower paragraph-level accuracy on average. Additionally, basic chunking strategies also lack consistency between page-level and paragraph-level accuracy; higher page-level accuracy doesn’t ensure higher paragraph-level ac- curacy. For example, Base 128 has the second highest page-level accuracy but the lowest paragraph-level scores among all. On the other hand, element-based chunking strategies showed more consistent results. A fascinating discovery is that when various chunking strategies are com- bined, it results in enhanced retrieval scores, achieving superior performance at both the page level (84.4%) and paragraph level (with ROUGE at 0.568% and BLEU at 0.452%). 
This finding addresses an unresolved question: how to improve the accuracy of RAG.</p><p style='color: green;'>The element based method provides the highest scores and it also provides a mechanism to chunk documents without the need to fine tune hyper-parameters like the number of tokens in a chunk. This suggests the element based method is more generalizable and can be applied to new types of documents.</p><p style='color: blue;'>Q&A Accuracy Third, we evaluate the Q&A accuracy for the chunking strate- gies. In addition to manual evaluation, we have investigated an automatic evalua- tion using GPT-4. GPT-4 compares how the answers provided by our method are similar to or different from the FinanceBench gold standard, similar approaches have been previously evaluated [13,23,29,30]. The automatic evaluation allows scaling the evaluation efforts for the different chunking strategies that we have considered. We used the prompt template in figure 4.</p><p style='color: magenta;'>Begin with True or False. Are the two following answers (Answer 1 and Answer 2) the same with respect to the question between single quotes ’{question}’?  Answer 1: ’{ground_truth_answer}’ Answer 2: ’{generated_answer}’  Fig. 4. Evaluation prompt template. The {question}, {ground truth answer} and {generated answer} fields are substituted for each question accordingly.</p><p style='color: red;'>Results in table 5 show that element-based chunking strategies offer the best question-answering accuracy, which is consistent with page retrieval and para- graph retrieval accuracy. Lastly, our approach stands out for its efficiency. Not only is element-based chunking generalizable without the need to select the chunk size, but when com- pared to the aggregation results that yield the highest retrieval scores. Element- based chunking achieves the highest retrieval scores with only half the number of chunks required compared to methods that do not consider the structure of the documents (62,529 v.s. 
112,155). This can reduce the indexing cost and im- prove query latency because there are only half as many vectors to index for the vectordb that stores the chunks. This underscores the effectiveness of our solu- tion in optimizing the balance between performance and computational resource requirements. Financial Report Chunking for Effective Retrieval Augmented Generation</p><p style='color: green;'>Table 4. Retrieval results. For each chunking strategy, we show the number of chunks for all the documents (Total Chunks), Page Accuracy, and ROUGE and BLEU scores. ROUGE and BLEU are calculated as the maximum score from the list of recovered contexts for a question when compared to the known evidence for that question.</p><p style='color: blue;'><table><thead><th>Chunking strategy</th><th>Total Chunks}</th><th>Page Accuracy</th><th>ROUGE|BLEU.</th></thead><tr><td>Base 128</td><td>64,058</td><td>72.34</td><td>0.383</td></tr><tr><td>Base 256</td><td>32,051</td><td>73.05</td><td>0.433</td></tr><tr><td>Base 512</td><td>16,046</td><td>68.09</td><td>0.455</td></tr><tr><td>Base Aggregation</td><td>112,155</td><td>83.69</td><td>0.536</td></tr><tr><td>Keywords Chipper</td><td></td><td>46.10</td><td>0.444</td></tr><tr><td>Summary Chipper</td><td></td><td>62.41</td><td>0.473</td></tr><tr><td>Prefix &amp; Table Description Chipper</td><td></td><td>67.38</td><td>0.514</td></tr><tr><td>Chipper Aggregation</td><td>a</td><td>84.40</td><td>0.568</td></tr></table></p><p style='color: magenta;'>Table 5.</p><p style='color: red;'>Q&A results. 
We show the percentage of questions with no answer as well as the accuracy either estimated automatically using GPT-4 or manually.</p><p style='color: green;'><table><thead><th>Chunking strategy</th><th>No</th><th></th><th>answer|GPT-4|Manual</th></thead><tr><td>Base 128</td><td>35.46</td><td>29.08</td><td>| 35.46</td></tr><tr><td>Base 256</td><td>5.5¢</td><td>32.62</td><td>| 36.88</td></tr><tr><td>Base 512</td><td>24.82</td><td>41.84</td><td>| 48.23</td></tr><tr><td>Keywords Chipper</td><td>22.70 |</td><td>43.97]</td><td>53.19</td></tr><tr><td>Summary Chipper</td><td>17.73</td><td>|43.97])</td><td>51.77</td></tr><tr><td>Prefix &amp; Table Description Chipper]</td><td>20.57</td><td>41.13</td><td>| 53.19</td></tr></table></p><h3 style='color: black;'>5 Discussion</h3><p style='color: blue;'>We have observed that using basic 512 chunking strategies produces results most similar to the Unstructured element-based approach, which may be due to the fact that 512 tokens share a similar length with the token size within our element-based chunks and capture a long context, but fail to keep a coherent context in some cases, leaving out relevant information required for Q&A. This is further observed when considering the ROUGE and BLEU scores in table 4, where the chunk contexts for the baseline have lower scores. These findings support existing research stating that the best basic chunk size varies from data to data [3]. These results show, as well, that our method adapts to different documents without tuning. Our method relies on the structural information that is present in the document’s layout to adjust the chunk size automatically.</p><p style='color: magenta;'>We have evaluated aggregating the output of different chunking methods in the retrieval experiments as shown in table 4. Even though the aggregation seems to be effective for retrieval, the Q&A exceeded the GPT-4 token limit, which resulted in a non-effective Q&A solution using the selected model. 
As well, we evaluated variations of the prompt used to generate the answers (see figure 3), re-ordering the retrieval context and the question, but the results were not statistically different. We experimented as well with variations of the verbs used in the prompt, e.g. changing referencing with using, which seemed to lower the quality of the answers generated. This shows that prompt engineering is a relevant factor in RAG. We evaluated using GPT-4 for evaluation instead of relying on manual evaluation. In most cases, GPT-4 evaluated correctly but failed when a more elaborate answer is provided. As shown in figure 5, the answer is 39.7% while the estimated answer is 39.73% but with a detailed explanation of the calculation.</p><p style='color: red;'>Question: ’What is Coca Cola’s FY2021 COGS % margin? Calculate what was asked by utilizing the line items clearly shown in the income statement.’?  Answer 1: ’39.7%’ Answer 2: ’From the income statement referenced on page 60 of COCACOLA_2021_10K_embedded.json, we can see that Coca Cola’s total revenue in FY2021 was $38,655 million and their cost of goods sold (COGS) was $15,357 million. To calculate the COGS % margin, we divide the COGS by the total revenue and multiply by 100: (15,357 / 38,655) * 100 = 39.73% So, Coca Cola’s FY2021 COGS % margin was approximately 39.73%.’ </p><p style='color: green;'>Fig. 5. Evaluation prompt template</p><h3 style='color: black;'>6 Conclusions and Future Work</h3><p style='color: blue;'>Financial Report Chunking for Effective Retrieval Augmented Generation Furthermore, we would like to study the impact of RAG configuration and element type based chunking.</p><h3 style='color: black;'>References</h3><p style='color: magenta;'>2. Balaguer, A., Benara, V., de Freitas Cunha, R.L., de M. 
Estev˜ao Filho, R., Hendry, T., Holstein, D., Marsman, J., Mecklenburg, N., Malvar, S., Nunes, L.O., Padilha, R., Sharp, M., Silva, B., Sharma, S., Aski, V., Chandra, R.: Rag vs fine-tuning: Pipelines, tradeoffs, and a case study on agriculture (2024)</p><p style='color: red;'>3. Barnett, S., Kurniawan, S., Thudumu, S., Brannelly, Z., Abdelrazek, M.: Seven Failure Points When Engineering a Retrieval Augmented Generation System (2024)</p><p style='color: green;'>4. Bentabet, N.I., Juge, R., El Maarouf, I., Mouilleron, V., Valsamou-Stanislawski, D., El-Haj, M.: The financial document structure extraction shared task (fintoc 2020). In: Proceedings of the 1st Joint Workshop on Financial Narrative Processing and MultiLing Financial Summarisation. pp. 13–22 (2020)</p><p style='color: blue;'>5. Chen, H., Jiao, F., Li, X., Qin, C., Ravaut, M., Zhao, R., Xiong, C., Joty, S.: Chat- GPT’s One-year Anniversary: Are Open-Source Large Language Models Catching up? arXiv preprint arXiv:2311.16989 (2023) 6. Chen, Z., Chen, W., Smiley, C., Shah, S., Borova, I., Langdon, D., Moussa, R., Beane, M., Huang, T.H., Routledge, B., et al.: Finqa: A dataset of numerical reasoning over financial data. arXiv preprint arXiv:2109.00122 (2021) 7. Chen, Z., Li, S., Smiley, C., Ma, Z., Shah, S., Wang, W.Y.: ConvFinQA: Exploring the Chain of Numerical Reasoning in Conversational Finance Question Answering (2022) 8. Choi, S., Gazeley, W., Wong, S.H., Li, T.: Conversational Financial Information Retrieval Model (ConFIRM). arXiv preprint arXiv:2310.13001 (2023) 9. DeSola, V., Hanna, K., Nonis, P.: Finbert: pre-trained model on sec filings for financial natural language tasks. University of California (2019) 10. El-Haj, M., Rayson, P., Young, S., Walker, M.: Detecting document structure in a very large corpus of UK financial reports. European Language Resources Associa- tion (ELRA) (2014) 11. 
El Maarouf, I., Kang, J., Azzi, A.A., Bellato, S., Gan, M., El-Haj, M.: The financial document structure extraction shared task (FinTOC2021). In: Proceedings of the 3rd Financial Narrative Processing Workshop. pp. 111–119 (2021)</p><p style='color: magenta;'>12. Gao, Y., Xiong, Y., Gao, X., Jia, K., Pan, J., Bi, Y., Dai, Y., Sun, J., Wang, H.: Retrieval-augmented generation for large language models: A survey. arXiv preprint arXiv:2312.10997 (2023) 13. Hada, R., Gumma, V., de Wynter, A., Diddee, H., Ahmed, M., Choudhury, M., Bali, K., Sitaram, S.: Are large language model-based evaluators the solution to scaling up multilingual evaluation? arXiv preprint arXiv:2309.07462 (2023) 14. Islam, P., Kannappan, A., Kiela, D., Qian, R., Scherrer, N., Vidgen, B.: Fi- nanceBench: A New Benchmark for Financial Question Answering. arXiv preprint arXiv:2311.11944 (2023)</p><p style='color: red;'>15. Ji, Z., Lee, N., Frieske, R., Yu, T., Su, D., Xu, Y., Ishii, E., Bang, Y.J., Madotto, A., Fung, P.: Survey of Hallucination in Natural Language Generation. ACM Comput- ing Surveys 55(12), 1–38 (Mar 2023). https://doi.org/10.1145/3571730, http:// dx.doi.org/10.1145/3571730</p><p style='color: green;'>14 Jimeno Yepes et al. 16. Jiang, A.Q., Sablayrolles, A., Roux, A., Mensch, A., Savary, B., Bamford, C., Chaplot, D.S., de las Casas, D., Hanna, E.B., Bressand, F., Lengyel, G., Bour, G., Lample, G., Lavaud, L.R., Saulnier, L., Lachaux, M.A., Stock, P., Subramanian, S., Yang, S., Antoniak, S., Scao, T.L., Gervet, T., Lavril, T., Wang, T., Lacroix, T., Sayed, W.E.: Mixtral of Experts (2024)</p><p style='color: blue;'>17. Juge, R., Bentabet, I., Ferradans, S.: The fintoc-2019 shared task: Financial doc- ument structure extraction. In: Proceedings of the Second Financial Narrative Processing Workshop (FNP 2019). pp. 51–57 (2019)</p><p style='color: magenta;'>18. Kaddour, J., Harris, J., Mozes, M., Bradley, H., Raileanu, R., McHardy, R.: Chal- lenges and applications of large language models. 
arXiv preprint arXiv:2307.10169 (2023)</p><p style='color: red;'>19. Kaur, S., Smiley, C., Gupta, A., Sain, J., Wang, D., Siddagangappa, S., Aguda, T., Shah, S.: REFinD: Relation Extraction Financial Dataset. In: Proceedings of the 46th International ACM SIGIR Conference on Research and Development in Information Retrieval. SIGIR ’23, ACM (Jul 2023). https://doi.org/10.1145/3539618.3591911, http://dx.doi.org/10.1145/3539618.3591911</p><p style='color: green;'>20. Kim, G., Hong, T., Yim, M., Park, J., Yim, J., Hwang, W., Yun, S., Han, D., Park, S.: Donut: Document understanding transformer without ocr. arXiv preprint arXiv:2111.15664 7, 15 (2021) 21. Lewis, P., Perez, E., Piktus, A., Petroni, F., Karpukhin, V., Goyal, N., Küttler, H., Lewis, M., Yih, W.t., Rocktäschel, T., et al.: Retrieval-augmented generation for knowledge-intensive NLP tasks. Advances in Neural Information Processing Systems 33, 9459–9474 (2020)</p><p style='color: blue;'>22. Li, D., Shao, R., Xie, A., Sheng, Y., Zheng, L., Gonzalez, J.E., Stoica, I., Ma, X., Zhang, H.: How Long Can Open-Source LLMs Truly Promise on Context Length? (June 2023), https://lmsys.org/blog/2023-06-29-longchat</p><p style='color: magenta;'>23. Li, Y., Duan, Y.: The evaluation of experiments of artificial general intelligence with gpt-4 based on dikwp. arXiv preprint (2023)</p><p style='color: red;'>24. Lin, C.Y.: Rouge: A package for automatic evaluation of summaries. In: Text summarization branches out. pp. 74–81 (2004) 25. Liu, N.F., Lin, K., Hewitt, J., Paranjape, A., Bevilacqua, M., Petroni, F., Liang, P.: Lost in the middle: How language models use long contexts. arXiv preprint arXiv:2307.03172 (2023) 26. Liu, Z., Huang, D., Huang, K., Li, Z., Zhao, J.: Finbert: A pre-trained financial language representation model for financial text mining. In: Proceedings of the twenty-ninth international conference on international joint conferences on artificial intelligence. pp. 
4513–4519 (2021) 27. llmware: Rag Instruct Benchmark Tester. https://huggingface.co/datasets/llmware/rag_instruct_benchmark_tester, Accessed: January 15, 2024</p><p style='color: green;'>28. Malkov, Y.A., Yashunin, D.A.: Efficient and robust approximate nearest neighbor search using hierarchical navigable small world graphs. IEEE transactions on pattern analysis and machine intelligence 42(4), 824–836 (2018)</p><p style='color: blue;'>29. Moore, S., Nguyen, H.A., Chen, T., Stamper, J.: Assessing the quality of multiple-choice questions using gpt-4 and rule-based methods. In: European Conference on Technology Enhanced Learning. pp. 229–245. Springer (2023) 30. Naismith, B., Mulcaire, P., Burstein, J.: Automated evaluation of written discourse coherence using gpt-4. In: Proceedings of the 18th Workshop on Innovative Use of NLP for Building Educational Applications (BEA 2023). pp. 394–403 (2023) 31. OpenAI, :, Achiam, J., Adler, S., Agarwal, S., et al.: GPT-4 Technical Report</p><p style='color: magenta;'>(2023)</p><p style='color: red;'>Financial Report Chunking for Effective Retrieval Augmented Generation</p><p style='color: green;'>32. Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: Bleu: a method for automatic evaluation of machine translation. In: Proceedings of the 40th annual meeting of the Association for Computational Linguistics. pp. 311–318 (2002) 33. Pfitzmann, B., Auer, C., Dolfi, M., Nassar, A.S., Staar, P.: Doclaynet: A large human-annotated dataset for document-layout segmentation. In: Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining. pp. 3743–3751 (2022)</p><p style='color: blue;'>34. Pinecone: Chunking strategies for llm applications, https://www.pinecone.io/learn/chunking-strategies/</p><p style='color: magenta;'>35. Reimers, N., Gurevych, I.: Sentence-bert: Sentence embeddings using siamese bert-networks. In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing. 
Association for Computational Linguistics (11 2019), https://arxiv.org/abs/1908.10084</p><p style='color: red;'>36. Retteter, J.: Mastering Table Extraction: Revolutionize Your Earnings Re- ports Analysis with AI. https://medium.com/unstructured-io/mastering- table-extraction-revolutionize-your-earnings-reports-analysis-with- ai-1bc32c22720e, Accessed: January 15, 2024 37. Rizinski, M., Peshov, H., Mishev, K., Jovanovik, M., Trajanov, D.: Sentiment Anal- ysis in Finance: From Transformers Back to eXplainable Lexicons (XLex) (2023) 38. Shah, R.S., Chawla, K., Eidnani, D., Shah, A., Du, W., Chava, S., Raman, N., Smiley, C., Chen, J., Yang, D.: WHEN FLUE MEETS FLANG: Benchmarks and Large Pre-trained Language Model for Financial Domain (2022) 39. Singh Phogat, K., Harsha, C., Dasaratha, S., Ramakrishna, S., Akhil Puranam, S.: Zero-Shot Question Answering over Financial Documents using Large Language Models. arXiv e-prints pp. arXiv–2311 (2023) 40. Wu, S., Irsoy, O., Lu, S., Dabravolski, V., Dredze, M., Gehrmann, S., Kambadur, P., Rosenberg, D., Mann, G.: BloombergGPT: A Large Language Model for Finance (2023) 41. Xu, P., Ping, W., Wu, X., McAfee, L., Zhu, C., Liu, Z., Subramanian, S., Bakhtu- rina, E., Shoeybi, M., Catanzaro, B.: Retrieval meets Long Context Large Language Models (2023) 42. Yang, H., Liu, X.Y., Wang, C.D.: FinGPT: Open-Source Financial Large Language Models (2023) 43. Ye, H., Liu, T., Zhang, A., Hua, W., Jia, W.: Cognitive Mirage: A Review of Hallucinations in Large Language Models (2023) 44. Zhang, B., Yang, H., Liu, X.Y.: Instruct-FinGPT: Financial Sentiment Analysis by Instruction Tuning of General-Purpose Large Language Models (2023)</p><p style='color: green;'>45. Zheng, X., Burdick, D., Popa, L., Zhong, X., Wang, N.X.R.: Global table extractor (gte): A framework for joint table identification and cell structure recognition using visual context. In: Proceedings of the IEEE/CVF winter conference on applications of computer vision. pp. 
697–706 (2021) 46. Zhu, F., Lei, W., Huang, Y., Wang, C., Zhang, S., Lv, J., Feng, F., Chua, T.S.: TAT-QA: A question answering benchmark on a hybrid of tabular and textual content in finance. arXiv preprint arXiv:2105.07624 (2021)</p>"
       ],
       "text/plain": [
        "<IPython.core.display.HTML object>"
@@ -307,13 +716,6 @@
    "source": [
     "print_chunks_by_title(chunks_by_title)"
    ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "---"
-   ]
   }
  ],
  "metadata": {
diff --git a/pyproject.toml b/pyproject.toml
index 7c910b9148f8bbc9afaefa92e1097961f3693b24..6d366978ab00d2a13d15abbe35c35546f4c380c0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "semantic-router"
-version = "0.0.24"
+version = "0.0.26"
 description = "Super fast semantic router for AI decision making"
 authors = [
     "James Briggs <james@aurelio.ai>",
diff --git a/semantic_router/__init__.py b/semantic_router/__init__.py
index d810106abbd425f673391ada9a802561532318b3..978313b66f858aae66a8b68a64e1c569beb69cc3 100644
--- a/semantic_router/__init__.py
+++ b/semantic_router/__init__.py
@@ -4,4 +4,4 @@ from semantic_router.route import Route
 
 __all__ = ["RouteLayer", "HybridRouteLayer", "Route", "LayerConfig"]
 
-__version__ = "0.0.24"
+__version__ = "0.0.26"
diff --git a/semantic_router/encoders/vit.py b/semantic_router/encoders/vit.py
index 65de365ac6abca6a941e5e51e6e538f1a8449a34..9ff1369b9701517bc4557ea329ed298c0d14e421 100644
--- a/semantic_router/encoders/vit.py
+++ b/semantic_router/encoders/vit.py
@@ -3,6 +3,12 @@ from typing import Any, List, Optional
 from pydantic.v1 import PrivateAttr
 
 from semantic_router.encoders import BaseEncoder
+from semantic_router.utils.logger import logger
+
+try:
+    from PIL import Image
+except ImportError:
+    logger.warning("Pillow is not installed. Install it with `pip install pillow`")
 
 
 class VitEncoder(BaseEncoder):
diff --git a/semantic_router/route.py b/semantic_router/route.py
index 830dce5d51664f6ab2ff5903c37ee1a1f04db285..3d46a8b4f4578ce90da8984d60a8a85956341ed3 100644
--- a/semantic_router/route.py
+++ b/semantic_router/route.py
@@ -9,6 +9,11 @@ from semantic_router.schema import Message, RouteChoice
 from semantic_router.utils import function_call
 from semantic_router.utils.logger import logger
 
+try:
+    from PIL.Image import Image
+except ImportError:
+    pass
+
 
 def is_valid(route_config: str) -> bool:
     try:
@@ -40,7 +45,7 @@ def is_valid(route_config: str) -> bool:
 
 class Route(BaseModel):
     name: str
-    utterances: List[Any]
+    utterances: Union[List[str], List[Union[Any, "Image"]]]
     description: Optional[str] = None
     function_schema: Optional[Dict[str, Any]] = None
     llm: Optional[BaseLLM] = None
diff --git a/semantic_router/splitters/rolling_window.py b/semantic_router/splitters/rolling_window.py
index 0e7c651de4fbe3e113e6ad0313f7b2d2ce1355d3..dc5110a67efcc7353c80cebe00012aebf75db8e6 100644
--- a/semantic_router/splitters/rolling_window.py
+++ b/semantic_router/splitters/rolling_window.py
@@ -1,3 +1,4 @@
+from dataclasses import dataclass
 from typing import List
 
 import numpy as np
@@ -9,6 +10,31 @@ from semantic_router.splitters.utils import split_to_sentences, tiktoken_length
 from semantic_router.utils.logger import logger
 
 
+@dataclass
+class SplitStatistics:
+    total_documents: int
+    total_splits: int
+    splits_by_threshold: int
+    splits_by_max_chunk_size: int
+    splits_by_last_split: int
+    min_token_size: int
+    max_token_size: int
+    splits_by_similarity_ratio: float
+
+    def __str__(self):
+        return (
+            f"Splitting Statistics:\n"
+            f"  - Total Documents: {self.total_documents}\n"
+            f"  - Total Splits: {self.total_splits}\n"
+            f"  - Splits by Threshold: {self.splits_by_threshold}\n"
+            f"  - Splits by Max Chunk Size: {self.splits_by_max_chunk_size}\n"
+            f"  - Last Split: {self.splits_by_last_split}\n"
+            f"  - Minimum Token Size of Split: {self.min_token_size}\n"
+            f"  - Maximum Token Size of Split: {self.max_token_size}\n"
+            f"  - Similarity Split Ratio: {self.splits_by_similarity_ratio:.2f}"
+        )
+
+
 class RollingWindowSplitter(BaseSplitter):
     def __init__(
         self,
@@ -32,6 +58,7 @@ class RollingWindowSplitter(BaseSplitter):
         self.min_split_tokens = min_split_tokens
         self.max_split_tokens = max_split_tokens
         self.split_tokens_tolerance = split_tokens_tolerance
+        self.statistics: SplitStatistics
 
     def encode_documents(self, docs: List[str]) -> np.ndarray:
         try:
@@ -55,15 +82,20 @@ class RollingWindowSplitter(BaseSplitter):
 
     def find_split_indices(self, similarities: List[float]) -> List[int]:
         split_indices = []
-        for idx in range(1, len(similarities)):
-            if similarities[idx] < self.calculated_threshold:
+        for idx, score in enumerate(similarities):
+            logger.debug(f"Similarity score at index {idx}: {score}")
+            if score < self.calculated_threshold:
+                logger.debug(
+                    f"Adding to split_indices due to score < threshold: "
+                    f"{score} < {self.calculated_threshold}"
+                )
+                # Split after the document at idx
                 split_indices.append(idx + 1)
         return split_indices
 
-    def find_optimal_threshold(self, docs: List[str], encoded_docs: np.ndarray):
+    def find_optimal_threshold(self, docs: List[str], similarity_scores: List[float]):
         token_counts = [tiktoken_length(doc) for doc in docs]
         cumulative_token_counts = np.cumsum([0] + token_counts)
-        similarity_scores = self.calculate_similarity_scores(encoded_docs)
 
         # Analyze the distribution of similarity scores to set initial bounds
         median_score = np.median(similarity_scores)
@@ -74,12 +106,13 @@ class RollingWindowSplitter(BaseSplitter):
         high = min(1.0, float(median_score + std_dev))
 
         iteration = 0
+        median_tokens = 0
         while low <= high:
             self.calculated_threshold = (low + high) / 2
-            logger.info(
+            split_indices = self.find_split_indices(similarity_scores)
+            logger.debug(
                 f"Iteration {iteration}: Trying threshold: {self.calculated_threshold}"
             )
-            split_indices = self.find_split_indices(similarity_scores)
 
             # Calculate the token counts for each split using the cumulative sums
             split_token_counts = [
@@ -91,7 +124,7 @@ class RollingWindowSplitter(BaseSplitter):
 
             # Calculate the median token count for the splits
             median_tokens = np.median(split_token_counts)
-            logger.info(
+            logger.debug(
                 f"Iteration {iteration}: Median tokens per split: {median_tokens}"
             )
             if (
@@ -99,22 +132,22 @@ class RollingWindowSplitter(BaseSplitter):
                 <= median_tokens
                 <= self.max_split_tokens + self.split_tokens_tolerance
             ):
-                logger.info(
-                    f"Iteration {iteration}: "
-                    f"Optimal threshold {self.calculated_threshold} found "
-                    f"with median tokens ({median_tokens}) in target range "
-                    f" {self.min_split_tokens}-{self.max_split_tokens}."
-                )
+                logger.debug("Median tokens in target range. Stopping iteration.")
                 break
             elif median_tokens < self.min_split_tokens:
                 high = self.calculated_threshold - self.threshold_adjustment
-                logger.info(f"Iteration {iteration}: Adjusting high to {high}")
+                logger.debug(f"Iteration {iteration}: Adjusting high to {high}")
             else:
                 low = self.calculated_threshold + self.threshold_adjustment
-                logger.info(f"Iteration {iteration}: Adjusting low to {low}")
+                logger.debug(f"Iteration {iteration}: Adjusting low to {low}")
             iteration += 1
 
-        logger.info(f"Final optimal threshold: {self.calculated_threshold}")
+        logger.info(
+            f"Optimal threshold {self.calculated_threshold} found "
+            f"with median tokens ({median_tokens}) in target range "
+            f"({self.min_split_tokens}-{self.max_split_tokens})."
+        )
+
         return self.calculated_threshold
 
     def split_documents(
@@ -132,9 +165,15 @@ class RollingWindowSplitter(BaseSplitter):
         splits, current_split = [], []
         current_tokens_count = 0
 
+        # Statistics
+        splits_by_threshold = 0
+        splits_by_max_chunk_size = 0
+        splits_by_last_split = 0
+
         for doc_idx, doc in enumerate(docs):
             doc_token_count = token_counts[doc_idx]
-
+            logger.debug(f"Accumulative token count: {current_tokens_count} tokens")
+            logger.debug(f"Document token count: {doc_token_count} tokens")
             # Check if current index is a split point based on similarity
             if doc_idx + 1 in split_indices:
                 if current_tokens_count + doc_token_count >= self.min_split_tokens:
@@ -154,11 +193,12 @@ class RollingWindowSplitter(BaseSplitter):
                             token_count=current_tokens_count,
                         )
                     )
-                    logger.info(
+                    logger.debug(
                         f"Split finalized with {current_tokens_count} tokens due to "
                         f"threshold {self.calculated_threshold}."
                     )
                     current_split, current_tokens_count = [], 0
+                    splits_by_threshold += 1
                     continue  # Move to the next document after splitting
 
             # Check if adding the current document exceeds the max token limit
@@ -172,7 +212,8 @@ class RollingWindowSplitter(BaseSplitter):
                             token_count=current_tokens_count,
                         )
                     )
-                    logger.info(
+                    splits_by_max_chunk_size += 1
+                    logger.debug(
                         f"Split finalized with {current_tokens_count} tokens due to "
                         f"exceeding token limit of {self.max_split_tokens}."
                     )
@@ -191,7 +232,8 @@ class RollingWindowSplitter(BaseSplitter):
                     token_count=current_tokens_count,
                 )
             )
-            logger.info(
+            splits_by_last_split += 1
+            logger.debug(
                 f"Final split added with {current_tokens_count} "
                 "tokens due to remaining documents."
             )
@@ -209,10 +251,38 @@ class RollingWindowSplitter(BaseSplitter):
                 f"Token count mismatch: {original_token_count} != {split_token_count}"
             )
 
+        # Statistics
+        total_splits = len(splits)
+        splits_by_similarity_ratio = (
+            splits_by_threshold / total_splits if total_splits else 0
+        )
+        min_token_size = max_token_size = 0
+        if splits:
+            token_counts = [
+                split.token_count for split in splits if split.token_count is not None
+            ]
+            min_token_size, max_token_size = min(token_counts, default=0), max(
+                token_counts, default=0
+            )
+
+        self.statistics = SplitStatistics(
+            total_documents=len(docs),
+            total_splits=total_splits,
+            splits_by_threshold=splits_by_threshold,
+            splits_by_max_chunk_size=splits_by_max_chunk_size,
+            splits_by_last_split=splits_by_last_split,
+            min_token_size=min_token_size,
+            max_token_size=max_token_size,
+            splits_by_similarity_ratio=splits_by_similarity_ratio,
+        )
+
         return splits
 
     def plot_similarity_scores(
-        self, similarities: List[float], split_indices: List[int]
+        self,
+        similarities: List[float],
+        split_indices: List[int],
+        splits: list[DocumentSplit],
     ):
         try:
             from matplotlib import pyplot as plt
@@ -225,16 +295,18 @@ class RollingWindowSplitter(BaseSplitter):
 
         if not self.plot_splits:
             return
-        plt.figure(figsize=(12, 6))
-        plt.plot(similarities, label="Similarity Scores", marker="o")
+        fig, axs = plt.subplots(2, 1, figsize=(12, 12))  # Adjust for two plots
+
+        # Plot 1: Similarity Scores
+        axs[0].plot(similarities, label="Similarity Scores", marker="o")
         for split_index in split_indices:
-            plt.axvline(
+            axs[0].axvline(
                 x=split_index - 1,
                 color="r",
                 linestyle="--",
                 label="Split" if split_index == split_indices[0] else "",
             )
-        plt.axhline(
+        axs[0].axhline(
             y=self.calculated_threshold,
             color="g",
             linestyle="-.",
@@ -243,7 +315,7 @@ class RollingWindowSplitter(BaseSplitter):
 
         # Annotating each similarity score
         for i, score in enumerate(similarities):
-            plt.annotate(
+            axs[0].annotate(
                 f"{score:.2f}",  # Formatting to two decimal places
                 (i, score),
                 textcoords="offset points",
@@ -251,16 +323,35 @@ class RollingWindowSplitter(BaseSplitter):
                 ha="center",
             )  # Center-align the text
 
-        plt.xlabel("Document Segment Index")
-        plt.ylabel("Similarity Score")
-        plt.title(
+        axs[0].set_xlabel("Document Segment Index")
+        axs[0].set_ylabel("Similarity Score")
+        axs[0].set_title(
             f"Threshold: {self.calculated_threshold} |"
             f" Window Size: {self.window_size}",
             loc="right",
             fontsize=10,
         )
-        plt.suptitle("Document Similarity Scores", fontsize=14)
-        plt.legend()
+        axs[0].legend()
+
+        # Plot 2: Split Token Size Distribution
+        token_counts = [split.token_count for split in splits]
+        axs[1].bar(range(len(token_counts)), token_counts, color="lightblue")
+        axs[1].set_title("Split Token Sizes")
+        axs[1].set_xlabel("Split Index")
+        axs[1].set_ylabel("Token Count")
+        axs[1].set_xticks(range(len(token_counts)))
+        axs[1].set_xticklabels([str(i) for i in range(len(token_counts))])
+        axs[1].grid(True)
+
+        # Annotate each bar with the token size
+        for idx, token_count in enumerate(token_counts):
+            if not token_count:
+                continue
+            axs[1].text(
+                idx, token_count + 0.01, str(token_count), ha="center", va="bottom"
+            )
+
+        plt.tight_layout()
         plt.show()
 
     def plot_sentence_similarity_scores(
@@ -323,12 +414,13 @@ class RollingWindowSplitter(BaseSplitter):
                 )
             docs = split_to_sentences(docs[0])
         encoded_docs = self.encode_documents(docs)
+        similarities = self.calculate_similarity_scores(encoded_docs)
         if self.dynamic_threshold:
-            self.find_optimal_threshold(docs, encoded_docs)
+            self.find_optimal_threshold(docs, similarities)
         else:
             self.calculated_threshold = self.encoder.score_threshold
-        similarities = self.calculate_similarity_scores(encoded_docs)
         split_indices = self.find_split_indices(similarities=similarities)
         splits = self.split_documents(docs, split_indices, similarities)
-        self.plot_similarity_scores(similarities, split_indices)
+        self.plot_similarity_scores(similarities, split_indices, splits)
+        logger.info(self.statistics)
         return splits