diff --git a/CHANGELOG.md b/CHANGELOG.md index bfbbca8bad09771ef304e476035303eb991a6fdd..e787511ead06bd1c6c701f80ce978fe1e5deac26 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # ChangeLog +## Unreleased + +### Bug Fixes / Nits +- Only convert newlines to spaces for text 001 embedding models in OpenAI + ## [0.8.14] - 2023-08-30 ### New Features diff --git a/llama_index/embeddings/openai.py b/llama_index/embeddings/openai.py index 35fb1f9f37829978d0ffd7fb3f0ee9d5c3ad6dee..216eaccc4c27bfdbae961e866647ea618a65364a 100644 --- a/llama_index/embeddings/openai.py +++ b/llama_index/embeddings/openai.py @@ -118,7 +118,13 @@ def get_embedding( like matplotlib, plotly, scipy, sklearn. """ - text = text.replace("\n", " ") + if ( + engine is not None + and engine.endswith("001") + and not engine.endswith("code-001") + ): + # replace newlines, which can negatively affect performance on text-001 models. + text = text.replace("\n", " ") return openai.Embedding.create(input=[text], model=engine, **kwargs)["data"][0][ "embedding" ] @@ -140,8 +146,13 @@ async def aget_embedding( like matplotlib, plotly, scipy, sklearn. """ - # replace newlines, which can negatively affect performance. - text = text.replace("\n", " ") + if ( + engine is not None + and engine.endswith("001") + and not engine.endswith("code-001") + ): + # replace newlines, which can negatively affect performance on text-001 models. + text = text.replace("\n", " ") return (await openai.Embedding.acreate(input=[text], model=engine, **kwargs))[ "data" @@ -166,8 +177,13 @@ def get_embeddings( """ assert len(list_of_text) <= 2048, "The batch size should not be larger than 2048." - # replace newlines, which can negatively affect performance. - list_of_text = [text.replace("\n", " ") for text in list_of_text] + if ( + engine is not None + and engine.endswith("001") + and not engine.endswith("code-001") + ): + # replace newlines, which can negatively affect performance on text-001 models. + list_of_text = [text.replace("\n", " ") for text in list_of_text] data = openai.Embedding.create(input=list_of_text, model=engine, **kwargs).data return [d["embedding"] for d in data] @@ -191,8 +207,13 @@ async def aget_embeddings( """ assert len(list_of_text) <= 2048, "The batch size should not be larger than 2048." - # replace newlines, which can negatively affect performance. - list_of_text = [text.replace("\n", " ") for text in list_of_text] + if ( + engine is not None + and engine.endswith("001") + and not engine.endswith("code-001") + ): + # replace newlines, which can negatively affect performance on text-001 models. + list_of_text = [text.replace("\n", " ") for text in list_of_text] data = ( await openai.Embedding.acreate(input=list_of_text, model=engine, **kwargs)