Commit 8bd7cdc9 authored by Yi Ding

built out embedding_utils

Fixed a few issues with the integration of VectorStore, including adding
getTopKMMREmbeddings (still needs testing; mostly ChatGPT-translated) and
adding all of the similarity options. Discovered an issue with how
Euclidean "similarity" was being handled in Python, so opened a PR there as well.
parent 3a6eca09
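
As a quick illustration of the new similarity options added below: cosine is the default, dot product is the raw inner product, and Euclidean is returned as a negated distance so that a larger value always means "more similar". A minimal sketch, not part of this commit; the import path is assumed.

import { similarity, SimilarityType } from "./Embedding"; // path assumed

const a = [1, 0, 0];
const b = [0.5, 0.5, 0];

// Cosine (SimilarityType.DEFAULT): dot product divided by the product of norms.
console.log(similarity(a, b)); // ~0.707

// Raw dot product.
console.log(similarity(a, b, SimilarityType.DOT_PRODUCT)); // 0.5

// Euclidean mode negates the distance, so greater still means "more similar".
console.log(similarity(a, b, SimilarityType.EUCLIDEAN)); // ~-0.707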
@@ -51,6 +51,9 @@ export abstract class BaseDocument {
   }
   getDocHash() {
+    if (this.docHash === undefined) {
+      throw new Error("Doc hash not set");
+    }
     return this.docHash;
   }
...
 import { DEFAULT_SIMILARITY_TOP_K } from "./constants";
 import { OpenAISession, getOpenAISession } from "./openai";
+import { VectorStoreQueryMode } from "./storage/vectorStore/types";
 export enum SimilarityType {
   DEFAULT = "cosine",
@@ -7,8 +8,54 @@ export enum SimilarityType {
   EUCLIDEAN = "euclidean",
 }
+export function similarity(
+  embedding1: number[],
+  embedding2: number[],
+  mode: SimilarityType = SimilarityType.DEFAULT
+): number {
+  if (embedding1.length !== embedding2.length) {
+    throw new Error("Embedding length mismatch");
+  }
+
+  // NOTE: I've taken enough Kahan to know that we should probably leave the
+  // numeric programming to numeric programmers. The naive approach here
+  // will probably cause some avoidable loss of floating point precision.
+  // ml-distance is worth watching, although they currently also use the
+  // naive formulas.
+  function norm(x: number[]): number {
+    let result = 0;
+    for (let i = 0; i < x.length; i++) {
+      result += x[i] * x[i];
+    }
+    return Math.sqrt(result);
+  }
+
+  switch (mode) {
+    case SimilarityType.EUCLIDEAN: {
+      let difference = embedding1.map((x, i) => x - embedding2[i]);
+      return -norm(difference);
+    }
+    case SimilarityType.DOT_PRODUCT: {
+      let result = 0;
+      for (let i = 0; i < embedding1.length; i++) {
+        result += embedding1[i] * embedding2[i];
+      }
+      return result;
+    }
+    case SimilarityType.DEFAULT: {
+      return (
+        similarity(embedding1, embedding2, SimilarityType.DOT_PRODUCT) /
+        (norm(embedding1) * norm(embedding2))
+      );
+    }
+    default:
+      throw new Error("Not implemented yet");
+  }
+}
 export function getTopKEmbeddings(
-  query_embedding: number[],
+  queryEmbedding: number[],
   embeddings: number[][],
   similarityTopK: number = DEFAULT_SIMILARITY_TOP_K,
   embeddingIds: any[] | null = null,
@@ -27,9 +74,9 @@ export function getTopKEmbeddings(
   let similarities: { similarity: number; id: number }[] = [];
   for (let i = 0; i < embeddings.length; i++) {
-    let similarity = BaseEmbedding.similarity(query_embedding, embeddings[i]);
-    if (similarityCutoff == null || similarity > similarityCutoff) {
-      similarities.push({ similarity: similarity, id: embeddingIds[i] });
+    const sim = similarity(queryEmbedding, embeddings[i]);
+    if (similarityCutoff == null || sim > similarityCutoff) {
+      similarities.push({ similarity: sim, id: embeddingIds[i] });
     }
   }
@@ -49,6 +96,83 @@ export function getTopKEmbeddings(
   return [resultSimilarities, resultIds];
 }
+export function getTopKEmbeddingsLearner(
+  queryEmbedding: number[],
+  embeddings: number[][],
+  similarityTopK?: number,
+  embeddingsIds?: any[],
+  queryMode: VectorStoreQueryMode = VectorStoreQueryMode.SVM
+): [number[], any[]] {
+  throw new Error("Not implemented yet");
+  // To support SVM properly we're probably going to have to use something like
+  // https://github.com/mljs/libsvm which itself hasn't been updated in a while
+}
+
+export function getTopKMMREmbeddings(
+  queryEmbedding: number[],
+  embeddings: number[][],
+  similarityFn: ((...args: any[]) => number) | null = null,
+  similarityTopK: number | null = null,
+  embeddingIds: any[] | null = null,
+  _similarityCutoff: number | null = null,
+  mmrThreshold: number | null = null
+): [number[], any[]] {
+  let threshold = mmrThreshold || 0.5;
+  similarityFn = similarityFn || similarity;
+
+  if (embeddingIds === null || embeddingIds.length === 0) {
+    embeddingIds = Array.from({ length: embeddings.length }, (_, i) => i);
+  }
+  let fullEmbedMap = new Map(embeddingIds.map((value, i) => [value, i]));
+  let embedMap = new Map(fullEmbedMap);
+  let embedSimilarity: Map<any, number> = new Map();
+  let score: number = Number.NEGATIVE_INFINITY;
+  let highScoreId: any | null = null;
+
+  for (let i = 0; i < embeddings.length; i++) {
+    let emb = embeddings[i];
+    let similarity = similarityFn(queryEmbedding, emb);
+    embedSimilarity.set(embeddingIds[i], similarity);
+    if (similarity * threshold > score) {
+      highScoreId = embeddingIds[i];
+      score = similarity * threshold;
+    }
+  }
+
+  let results: [number, any][] = [];
+
+  let embeddingLength = embeddings.length;
+  let similarityTopKCount = similarityTopK || embeddingLength;
+  while (results.length < Math.min(similarityTopKCount, embeddingLength)) {
+    results.push([score, highScoreId]);
+    embedMap.delete(highScoreId!);
+    let recentEmbeddingId = highScoreId;
+    score = Number.NEGATIVE_INFINITY;
+    for (let embedId of Array.from(embedMap.keys())) {
+      let overlapWithRecent = similarityFn(
+        embeddings[embedMap.get(embedId)!],
+        embeddings[fullEmbedMap.get(recentEmbeddingId!)!]
+      );
+      if (
+        threshold * embedSimilarity.get(embedId)! -
+          (1 - threshold) * overlapWithRecent >
+        score
+      ) {
+        score =
+          threshold * embedSimilarity.get(embedId)! -
+          (1 - threshold) * overlapWithRecent;
+        highScoreId = embedId;
+      }
+    }
+  }
+
+  let resultSimilarities = results.map(([s, _]) => s);
+  let resultIds = results.map(([_, n]) => n);
+
+  return [resultSimilarities, resultIds];
+}
 export abstract class BaseEmbedding {
   static similarity(
     embedding1: number[],
...
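
The new getTopKMMREmbeddings re-ranks with maximal marginal relevance: mmrThreshold weights similarity to the query against overlap with results already selected, so lower thresholds favor diversity. A hedged usage sketch against the signature above, with made-up vectors and the same assumed import path as earlier:

import { getTopKEmbeddings, getTopKMMREmbeddings } from "./Embedding"; // path assumed

const queryEmbedding = [1, 0];
const embeddings = [
  [0.9, 0.1], // most relevant
  [0.9, 0.1], // exact duplicate of the first
  [0.3, 0.6], // less relevant but diverse
];

// Plain top-k ranks purely by similarity, so it keeps the duplicate.
const [topSims, topIds] = getTopKEmbeddings(queryEmbedding, embeddings, 2);

// MMR with a low threshold penalizes overlap with the first pick, so the
// diverse third vector should displace the duplicate in the second slot.
const [mmrScores, mmrIds] = getTopKMMREmbeddings(
  queryEmbedding,
  embeddings,
  null, // similarityFn: falls back to similarity()
  2,    // similarityTopK
  null, // embeddingIds: falls back to array indices
  null, // _similarityCutoff (currently unused)
  0.3   // mmrThreshold
);
// Note that the returned scores are MMR scores, not raw similarities.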
@@ -31,7 +31,7 @@ export async function storageContextFromDefaults({
   vectorStore,
   persistDir,
   fs,
-}: BuilderParams): StorageContext {
+}: BuilderParams): Promise<StorageContext> {
   persistDir = persistDir || DEFAULT_PERSIST_DIR;
   fs = fs || DEFAULT_FS;
...
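
With the return type corrected to Promise<StorageContext>, callers are expected to await the builder. A minimal usage sketch; the import path and persistDir value are illustrative:

import { storageContextFromDefaults } from "./storage/StorageContext"; // path assumed

async function main() {
  // The builder is async because it may touch the filesystem while loading
  // persisted stores from persistDir.
  const storageContext = await storageContextFromDefaults({ persistDir: "./storage" });
  return storageContext;
}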
-import { IndexStruct } from "llama_index/data_structs/data_structs";
+import { IndexStruct } from "../../dataStructs";
 import { GenericFileSystem } from "../FileSystem";
 import {
   DEFAULT_PERSIST_DIR,
...
@@ -107,6 +107,7 @@ export class SimpleVectorStore implements VectorStore {
       [topSimilarities, topIds] = getTopKMMREmbeddings(
         queryEmbedding,
         embeddings,
+        null,
         query.similarityTopK,
         nodeIds,
         mmrThreshold
...
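
The NOTE inside similarity() flags that the naive accumulation loops can lose floating point precision. One possible hedge, not part of this commit, is compensated (Kahan) summation; a sketch of the idea applied to the dot product:

// Hypothetical helper, not in the diff above: a Kahan-compensated dot product.
function dotProductKahan(a: number[], b: number[]): number {
  if (a.length !== b.length) {
    throw new Error("Embedding length mismatch");
  }
  let sum = 0;
  let compensation = 0; // running estimate of the low-order error
  for (let i = 0; i < a.length; i++) {
    const term = a[i] * b[i] - compensation;
    const next = sum + term;
    compensation = next - sum - term; // error introduced by this addition
    sum = next;
  }
  return sum;
}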