Skip to content
Snippets Groups Projects
Commit b9cb0619 authored by Juan Pablo Mesa Lopez
Browse files

fix: Split list of documents before embedding them

parent ff1161f2
No related branches found
No related tags found
No related merge requests found
@@ -100,12 +100,27 @@ class RollingWindowSplitter(BaseSplitter):
         return splits
 
     def _encode_documents(self, docs: List[str]) -> np.ndarray:
-        try:
-            embeddings = self.encoder(docs)
-            return np.array(embeddings)
-        except Exception as e:
-            logger.error(f"Error encoding documents {docs}: {e}")
-            raise
+        """
+        Encodes a list of documents into embeddings. If the number of documents exceeds 2000,
+        the documents are split into batches to avoid overloading the encoder. OpenAI has a
+        limit of len(array) < 2048.
+
+        :param docs: List of text documents to be encoded.
+        :return: A numpy array of embeddings for the given documents.
+        """
+        max_docs_per_batch = 2000
+        embeddings = []
+
+        for i in range(0, len(docs), max_docs_per_batch):
+            batch_docs = docs[i : i + max_docs_per_batch]
+            try:
+                batch_embeddings = self.encoder(batch_docs)
+                embeddings.extend(batch_embeddings)
+            except Exception as e:
+                logger.error(f"Error encoding documents {batch_docs}: {e}")
+                raise
+
+        return np.array(embeddings)
 
     def _calculate_similarity_scores(self, encoded_docs: np.ndarray) -> List[float]:
         raw_similarities = []
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment