From b9cb061958dad3ff0c718d102b1e2013ca3e3eb4 Mon Sep 17 00:00:00 2001
From: Juan Pablo Mesa Lopez <mesax1@gmail.com>
Date: Fri, 26 Apr 2024 13:45:14 -0500
Subject: [PATCH] fix: Split list of documents before embedding them

---
 semantic_router/splitters/rolling_window.py | 27 ++++++++++++++++-----
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/semantic_router/splitters/rolling_window.py b/semantic_router/splitters/rolling_window.py
index a2809ff5..89336a2b 100644
--- a/semantic_router/splitters/rolling_window.py
+++ b/semantic_router/splitters/rolling_window.py
@@ -100,12 +100,27 @@ class RollingWindowSplitter(BaseSplitter):
         return splits
 
     def _encode_documents(self, docs: List[str]) -> np.ndarray:
-        try:
-            embeddings = self.encoder(docs)
-            return np.array(embeddings)
-        except Exception as e:
-            logger.error(f"Error encoding documents {docs}: {e}")
-            raise
+        """
+        Encodes a list of documents into embeddings. If the number of documents exceeds 2000,
+        the documents are split into batches to avoid overloading the encoder. OpenAI has a
+        limit of len(array) < 2048.
+
+        :param docs: List of text documents to be encoded.
+        :return: A numpy array of embeddings for the given documents.
+        """
+        max_docs_per_batch = 2000
+        embeddings = []
+
+        for i in range(0, len(docs), max_docs_per_batch):
+            batch_docs = docs[i : i + max_docs_per_batch]
+            try:
+                batch_embeddings = self.encoder(batch_docs)
+                embeddings.extend(batch_embeddings)
+            except Exception as e:
+                logger.error(f"Error encoding documents {batch_docs}: {e}")
+                raise
+
+        return np.array(embeddings)
 
     def _calculate_similarity_scores(self, encoded_docs: np.ndarray) -> List[float]:
         raw_similarities = []
--
GitLab
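
For reviewers who want to try the batching pattern outside the class, here is a
minimal standalone sketch. The names `fake_encoder` and `encode_in_batches` are
hypothetical, invented for illustration only; the real patch calls `self.encoder`
(e.g. an OpenAI-backed encoder) in place of `fake_encoder`.

    from typing import List

    import numpy as np


    def fake_encoder(docs: List[str]) -> List[List[float]]:
        # Toy stand-in for self.encoder: any callable mapping
        # List[str] -> List[List[float]] fits the same slot.
        return [[float(len(doc)), 0.0, 0.0] for doc in docs]


    def encode_in_batches(docs: List[str], max_docs_per_batch: int = 2000) -> np.ndarray:
        embeddings: List[List[float]] = []
        # Step through the documents in fixed-size slices so no single
        # encoder call receives more than max_docs_per_batch inputs.
        for i in range(0, len(docs), max_docs_per_batch):
            batch = docs[i : i + max_docs_per_batch]
            embeddings.extend(fake_encoder(batch))
        return np.array(embeddings)


    if __name__ == "__main__":
        vectors = encode_in_batches([f"doc {n}" for n in range(4500)])
        print(vectors.shape)  # (4500, 3): three batches of 2000, 2000, and 500 docs

The batch size of 2000 keeps each request safely under OpenAI's 2048-item input
cap, and re-raising inside the loop (in the actual patch) preserves the original
fail-fast behaviour while narrowing the logged context to the failing batch.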