From b9cb061958dad3ff0c718d102b1e2013ca3e3eb4 Mon Sep 17 00:00:00 2001
From: Juan Pablo Mesa Lopez <mesax1@gmail.com>
Date: Fri, 26 Apr 2024 13:45:14 -0500
Subject: [PATCH] fix: Split list of documents before embedding them

---
 semantic_router/splitters/rolling_window.py | 27 ++++++++++++++++-----
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/semantic_router/splitters/rolling_window.py b/semantic_router/splitters/rolling_window.py
index a2809ff5..89336a2b 100644
--- a/semantic_router/splitters/rolling_window.py
+++ b/semantic_router/splitters/rolling_window.py
@@ -100,12 +100,27 @@ class RollingWindowSplitter(BaseSplitter):
         return splits
 
     def _encode_documents(self, docs: List[str]) -> np.ndarray:
-        try:
-            embeddings = self.encoder(docs)
-            return np.array(embeddings)
-        except Exception as e:
-            logger.error(f"Error encoding documents {docs}: {e}")
-            raise
+        """
+        Encodes a list of documents into embeddings. If the number of documents exceeds 2000,
+        the documents are split into batches to avoid overloading the encoder. OpenAI has a
+        limit of len(array) < 2048.
+
+        :param docs: List of text documents to be encoded.
+        :return: A numpy array of embeddings for the given documents.
+        """
+        max_docs_per_batch = 2000
+        embeddings = []
+
+        for i in range(0, len(docs), max_docs_per_batch):
+            batch_docs = docs[i : i + max_docs_per_batch]
+            try:
+                batch_embeddings = self.encoder(batch_docs)
+                embeddings.extend(batch_embeddings)
+            except Exception as e:
+                logger.error(f"Error encoding documents {batch_docs}: {e}")
+                raise
+
+        return np.array(embeddings)
 
     def _calculate_similarity_scores(self, encoded_docs: np.ndarray) -> List[float]:
         raw_similarities = []
--
GitLab
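
For reviewers who want to try the batching pattern outside the class, here is a
minimal standalone sketch. The names `fake_encoder` and `encode_in_batches` are
hypothetical, invented for illustration only; the real patch calls `self.encoder`
(e.g. an OpenAI-backed encoder) in place of `fake_encoder`.

    from typing import List

    import numpy as np


    def fake_encoder(docs: List[str]) -> List[List[float]]:
        # Toy stand-in for self.encoder: any callable mapping
        # List[str] -> List[List[float]] fits the same slot.
        return [[float(len(doc)), 0.0, 0.0] for doc in docs]


    def encode_in_batches(docs: List[str], max_docs_per_batch: int = 2000) -> np.ndarray:
        embeddings: List[List[float]] = []
        # Step through the documents in fixed-size slices so no single
        # encoder call receives more than max_docs_per_batch inputs.
        for i in range(0, len(docs), max_docs_per_batch):
            batch = docs[i : i + max_docs_per_batch]
            embeddings.extend(fake_encoder(batch))
        return np.array(embeddings)


    if __name__ == "__main__":
        vectors = encode_in_batches([f"doc {n}" for n in range(4500)])
        print(vectors.shape)  # (4500, 3): three batches of 2000, 2000, and 500 docs

The batch size of 2000 keeps each request safely under OpenAI's 2048-item input
cap, and re-raising inside the loop (in the actual patch) preserves the original
fail-fast behaviour while narrowing the logged context to the failing batch.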