diff --git a/.changeset/proud-zoos-compare.md b/.changeset/proud-zoos-compare.md new file mode 100644 index 0000000000000000000000000000000000000000..31228ed7ae167a2a34d1c1378da5cb845a355e51 --- /dev/null +++ b/.changeset/proud-zoos-compare.md @@ -0,0 +1,5 @@ +--- +"llamaindex": patch +--- + +fix: clean up docstore when generating embedding fail diff --git a/packages/llamaindex/src/indices/vectorStore/index.ts b/packages/llamaindex/src/indices/vectorStore/index.ts index 5c8536f1aecf89887f6a1271cf0c3ca610c6b9d9..52bafb239ce0a1c14a52d4498214185f425f1b91 100644 --- a/packages/llamaindex/src/indices/vectorStore/index.ts +++ b/packages/llamaindex/src/indices/vectorStore/index.ts @@ -237,7 +237,12 @@ export class VectorStoreIndex extends BaseIndex<IndexDict> { if (args.logProgress) { console.log("Finished parsing documents."); } - return await this.init(args); + try { + return await this.init(args); + } catch (error) { + await docStoreStrategy.rollback(args.storageContext.docStore, args.nodes); + throw error; + } } static async fromVectorStores( diff --git a/packages/llamaindex/src/ingestion/strategies/DuplicatesStrategy.ts b/packages/llamaindex/src/ingestion/strategies/DuplicatesStrategy.ts index dc97aa6d07a599e7248911ca24b01f4805c35824..c2f3741b88c024bbe029ea32854cf27f31e0ab04 100644 --- a/packages/llamaindex/src/ingestion/strategies/DuplicatesStrategy.ts +++ b/packages/llamaindex/src/ingestion/strategies/DuplicatesStrategy.ts @@ -1,10 +1,11 @@ -import { BaseNode, TransformComponent } from "@llamaindex/core/schema"; +import { BaseNode } from "@llamaindex/core/schema"; import type { BaseDocumentStore } from "@llamaindex/core/storage/doc-store"; +import { RollbackableTransformComponent } from "./rollback.js"; /** * Handle doc store duplicates by checking all hashes. */ -export class DuplicatesStrategy extends TransformComponent { +export class DuplicatesStrategy extends RollbackableTransformComponent { private docStore: BaseDocumentStore; constructor(docStore: BaseDocumentStore) { diff --git a/packages/llamaindex/src/ingestion/strategies/UpsertsAndDeleteStrategy.ts b/packages/llamaindex/src/ingestion/strategies/UpsertsAndDeleteStrategy.ts index e28a508b691734f12b4b7916829ce2db01da551a..f36dc7cd71087067e6020293292b784e3af58a96 100644 --- a/packages/llamaindex/src/ingestion/strategies/UpsertsAndDeleteStrategy.ts +++ b/packages/llamaindex/src/ingestion/strategies/UpsertsAndDeleteStrategy.ts @@ -1,13 +1,14 @@ -import { BaseNode, TransformComponent } from "@llamaindex/core/schema"; +import { BaseNode } from "@llamaindex/core/schema"; import type { BaseDocumentStore } from "@llamaindex/core/storage/doc-store"; import type { BaseVectorStore } from "../../vector-store/types.js"; import { classify } from "./classify.js"; +import { RollbackableTransformComponent } from "./rollback.js"; /** * Handle docstore upserts by checking hashes and ids. * Identify missing docs and delete them from docstore and vector store */ -export class UpsertsAndDeleteStrategy extends TransformComponent { +export class UpsertsAndDeleteStrategy extends RollbackableTransformComponent { protected docStore: BaseDocumentStore; protected vectorStores: BaseVectorStore[] | undefined; diff --git a/packages/llamaindex/src/ingestion/strategies/UpsertsStrategy.ts b/packages/llamaindex/src/ingestion/strategies/UpsertsStrategy.ts index 3fb158e9828e339b19cdbb28ae288ae60a1b19da..370994cd87840984cc7dc39aaffcd7b00e3d01c7 100644 --- a/packages/llamaindex/src/ingestion/strategies/UpsertsStrategy.ts +++ b/packages/llamaindex/src/ingestion/strategies/UpsertsStrategy.ts @@ -1,12 +1,13 @@ -import { BaseNode, TransformComponent } from "@llamaindex/core/schema"; +import { BaseNode } from "@llamaindex/core/schema"; import type { BaseDocumentStore } from "@llamaindex/core/storage/doc-store"; import type { BaseVectorStore } from "../../vector-store/types.js"; import { classify } from "./classify.js"; +import { RollbackableTransformComponent } from "./rollback.js"; /** * Handles doc store upserts by checking hashes and ids. */ -export class UpsertsStrategy extends TransformComponent { +export class UpsertsStrategy extends RollbackableTransformComponent { protected docStore: BaseDocumentStore; protected vectorStores: BaseVectorStore[] | undefined; diff --git a/packages/llamaindex/src/ingestion/strategies/index.ts b/packages/llamaindex/src/ingestion/strategies/index.ts index e3516fca966005e2b1c4dde10a83ae559ac6ecc0..a6ba3573f6ac9a02cddf7df8f93819ec9f35bb6d 100644 --- a/packages/llamaindex/src/ingestion/strategies/index.ts +++ b/packages/llamaindex/src/ingestion/strategies/index.ts @@ -1,9 +1,9 @@ -import { TransformComponent } from "@llamaindex/core/schema"; import type { BaseDocumentStore } from "@llamaindex/core/storage/doc-store"; import type { BaseVectorStore } from "../../vector-store/types.js"; import { DuplicatesStrategy } from "./DuplicatesStrategy.js"; import { UpsertsAndDeleteStrategy } from "./UpsertsAndDeleteStrategy.js"; import { UpsertsStrategy } from "./UpsertsStrategy.js"; +import { RollbackableTransformComponent } from "./rollback.js"; /** * Document de-deduplication strategies work by comparing the hashes or ids stored in the document store. @@ -19,7 +19,7 @@ export enum DocStoreStrategy { NONE = "none", // no-op strategy } -class NoOpStrategy extends TransformComponent { +class NoOpStrategy extends RollbackableTransformComponent { constructor() { super(async (nodes) => nodes); } @@ -29,7 +29,7 @@ export function createDocStoreStrategy( docStoreStrategy: DocStoreStrategy, docStore?: BaseDocumentStore, vectorStores: BaseVectorStore[] = [], -): TransformComponent { +): RollbackableTransformComponent { if (docStoreStrategy === DocStoreStrategy.NONE) { return new NoOpStrategy(); } diff --git a/packages/llamaindex/src/ingestion/strategies/rollback.ts b/packages/llamaindex/src/ingestion/strategies/rollback.ts new file mode 100644 index 0000000000000000000000000000000000000000..36c3f026c7340b70ba72e86eecbfa25ab59dbebc --- /dev/null +++ b/packages/llamaindex/src/ingestion/strategies/rollback.ts @@ -0,0 +1,19 @@ +import { BaseNode, TransformComponent } from "@llamaindex/core/schema"; +import type { BaseDocumentStore } from "../../index.edge.js"; +import { classify } from "./classify.js"; + +export class RollbackableTransformComponent extends TransformComponent { + // Remove unused docs from the doc store. It is useful in case + // generating embeddings fails and we want to remove the unused docs + // TODO: override this in UpsertsStrategy if we want to revert removed docs also + public async rollback( + docStore: BaseDocumentStore, + nodes: BaseNode[], + ): Promise<void> { + const { unusedDocs } = await classify(docStore, nodes); + for (const docId of unusedDocs) { + await docStore.deleteDocument(docId, false); + } + docStore.persist(); + } +}