diff --git a/apps/simple/simple.txt b/apps/simple/simple.txt new file mode 100644 index 0000000000000000000000000000000000000000..7cd89b8d0d51646af616cc9f784f61fcb1469bf1 --- /dev/null +++ b/apps/simple/simple.txt @@ -0,0 +1,9 @@ +Simple flow: + +Get document list, in this case one document. +Split each document into nodes, in this case sentences or lines. +Embed each of the nodes and get vectors. Store them in memory for now. +Embed query. +Compare query with nodes and get the top n +Put the top n nodes into the prompt. +Execute prompt, get result. diff --git a/packages/core/package.json b/packages/core/package.json index 6df77fdcd9a96187d27c0baffa920f2181072dbe..31b33979faad26d5fd32c87419f2694ca355e016 100644 --- a/packages/core/package.json +++ b/packages/core/package.json @@ -3,10 +3,12 @@ "dependencies": { "axios": "^0.26.1", "js-tiktoken": "^1.0.7", - "openai": "^3.3.0" + "openai": "^3.3.0", + "uuid": "^9.0.0" }, "devDependencies": { - "@types/node": "^18" + "@types/node": "^18", + "@types/uuid": "^9.0.2" }, "main": "src/index.ts", "types": "src/index.ts", diff --git a/packages/core/src/Document.ts b/packages/core/src/Document.ts index 165437eb1a733b6a13d56145e3dcfd5abe625c8a..fcd804086db50cabf37d400972fc0879bcdf5eb3 100644 --- a/packages/core/src/Document.ts +++ b/packages/core/src/Document.ts @@ -1,38 +1,58 @@ -export interface BaseDocument { - getText(): string; - getDocId(): string; - getDocHash(): string; - getEmbedding(): number[]; -} - -export class Document implements BaseDocument { - docId: string; - text: string; - // embedding: number[]; - // docHash: string; +import { v4 as uuidv4 } from "uuid"; +export abstract class BaseDocument { + text?: string; + docId?: string; + embedding?: number[]; + docHash?: string; - constructor(docId: string, text: string) { + constructor( + text?: string, + docId?: string, + embedding?: number[], + docHash?: string + ) { this.docId = docId; this.text = text; + this.embedding = embedding; + this.docHash = docHash; + + if (!docId) { + this.docId = uuidv4(); + } } getText() { - console.log("getText"); - return ""; + if (this.text === undefined) { + throw new Error("Text not set"); + } + return this.text; } getDocId() { - console.log("getDocId"); - return ""; + if (this.docId === undefined) { + throw new Error("doc id not set"); + } + return this.docId; + } + + getEmbedding() { + if (this.embedding === undefined) { + throw new Error("Embedding not set"); + } + return this.embedding; } getDocHash() { - console.log("getDocHash"); - return ""; + return this.docHash; } +} - getEmbedding() { - console.log("getEmbedding"); - return []; +export class Document extends BaseDocument { + static getType() { + return "Document"; } } + +export class ImageDocuemnt extends Document { + image?: string; +} diff --git a/packages/core/src/Embedding.ts b/packages/core/src/Embedding.ts index 1d036e6ef99547febd7a51253d44db8347f89217..58921090ecaf2a286046ba23a1af242fa1183bb7 100644 --- a/packages/core/src/Embedding.ts +++ b/packages/core/src/Embedding.ts @@ -16,9 +16,21 @@ export class BaseEmbedding { similarity( embedding1: number[], embedding2: number[], - mode: SimilarityType + mode: SimilarityType = SimilarityType.DOT_PRODUCT ): number { - return 0; + if (embedding1.length !== embedding2.length) { + throw new Error("Embedding length mismatch"); + } + + if (mode === SimilarityType.DOT_PRODUCT) { + let result = 0; + for (let i = 0; i < embedding1.length; i++) { + result += embedding1[i] * embedding2[i]; + } + return result; + } else { + throw new Error("Not implemented yet"); + } } } @@ -26,4 +38,7 @@ enum OpenAIEmbeddingModelType { TEXT_EMBED_ADA_002 = "text-embedding-ada-002", } -export class OpenAIEmbedding extends BaseEmbedding {} +export class OpenAIEmbedding extends BaseEmbedding { + async aGetTextEmbedding(text: string) {} + async aGetQueryEbmedding(query: string) {} +} diff --git a/packages/core/src/Node.ts b/packages/core/src/Node.ts index 9be3427ae498ecaf757827b670bd747fd31ee5bd..3df77204c60ce1344e0c3284087905f2bb0d2612 100644 --- a/packages/core/src/Node.ts +++ b/packages/core/src/Node.ts @@ -14,8 +14,29 @@ export enum NodeType { INDEX, } -export class Node implements BaseDocument { - relationships: { [key in DocumentRelationship]: string | string[] }; +export class Node extends BaseDocument { + relationships: { [key in DocumentRelationship]: string | string[] | null }; + + constructor( + text: string, // Text is required + docId?: string, + embedding?: number[], + docHash?: string + ) { + if (!text) { + throw new Error("Text is required"); + } + + super(docId, text, embedding, docHash); + + this.relationships = { + source: null, + previous: null, + next: null, + parent: null, + child: [], + }; + } getText(): string { throw new Error("Method not implemented."); diff --git a/packages/core/src/NodeParser.ts b/packages/core/src/NodeParser.ts index 37b37c371c07ff9ccb1a9c4acf86256558a94b87..36dec62db18f0f7903286f85c1f47cf8d6e39bb0 100644 --- a/packages/core/src/NodeParser.ts +++ b/packages/core/src/NodeParser.ts @@ -1,5 +1,29 @@ -interface NodeParser {} +import { Document } from "./Document"; +import { Node } from "./Node"; +import { SentenceSplitter } from "./TextSplitter"; + +export function getTextSplitsFromDocument(document: Document) { + const sentenceSplit = new SentenceSplitter(); + const text = document.getText(); + const splits = sentenceSplit.splitText(text); + return splits; +} + +export function getNodesFromDocument(document: Document) { + const textSplits = getTextSplitsFromDocument(document); + + let nodes: Node[] = []; + + textSplits.forEach((textSplit, index) => { + const node = new Node(textSplit); + node.relationships.source = document.getDocId(); + nodes.push(node); + }); + return nodes; +} + +interface NodeParser {} class SimpleNodeParser implements NodeParser { constructor( textSplitter: any = null, @@ -10,4 +34,12 @@ class SimpleNodeParser implements NodeParser { static fromDefaults(): SimpleNodeParser { return new SimpleNodeParser(); } + + /** + * Generate Node objects from documents + * @param documents + */ + getNodesFromDocuments(documents: Document[]) { + return documents.map((document) => getNodesFromDocument(document)).flat(); + } } diff --git a/packages/core/src/openai.ts b/packages/core/src/openai.ts index 5527bd1ed3e14ee503effa177299be1523a851ea..9dfa81c2a71b568e916e41f7683a7bcfcfc5598e 100644 --- a/packages/core/src/openai.ts +++ b/packages/core/src/openai.ts @@ -66,4 +66,6 @@ export class OpenAIWrapper extends OpenAIApi { } } +// TODO we need to create an openAI singleton + export * from "openai"; diff --git a/packages/core/src/tests/Document.test.ts b/packages/core/src/tests/Document.test.ts index 6ef7e02e70a5fd4c6fad122b3f712eddba58e012..de799d517ce5f34315363f0082c8c0c90ae4e0dc 100644 --- a/packages/core/src/tests/Document.test.ts +++ b/packages/core/src/tests/Document.test.ts @@ -2,7 +2,7 @@ import { Document } from "../Document"; describe("Document", () => { test("initializes", () => { - const doc = new Document("docId", "text"); + const doc = new Document("text", "docId"); expect(doc).toBeDefined(); }); }); diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index af9f950080dba1b67693cb936e6dd25694066d5b..a6e071e7b3017bc62858d239eb872d9ef9b14ec3 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -131,10 +131,16 @@ importers: openai: specifier: ^3.3.0 version: 3.3.0 + uuid: + specifier: ^9.0.0 + version: 9.0.0 devDependencies: '@types/node': specifier: ^18 version: 18.6.0 + '@types/uuid': + specifier: ^9.0.2 + version: 9.0.2 packages/eslint-config-custom: dependencies: @@ -1142,6 +1148,10 @@ packages: '@types/node': 18.6.0 dev: true + /@types/uuid@9.0.2: + resolution: {integrity: sha512-kNnC1GFBLuhImSnV7w4njQkUiJi0ZXUycu1rUaouPqiKlXkh77JKgdRnTAp1x5eBwcIwbtI+3otwzuIDEuDoxQ==} + dev: true + /@types/yargs-parser@21.0.0: resolution: {integrity: sha512-iO9ZQHkZxHn4mSakYV0vFHAVDyEOIJQrV2uZ06HxEPcx+mt8swXoZHIbaaJ2crJYFfErySgktuTZ3BeLz+XmFA==} dev: true @@ -5046,6 +5056,11 @@ packages: resolution: {integrity: sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==} dev: true + /uuid@9.0.0: + resolution: {integrity: sha512-MXcSTerfPa4uqyzStbRoTgt5XIe3x5+42+q1sDuy3R5MDk66URdLMOZe5aPX/SQd+kuYAh0FdP/pO28IkQyTeg==} + hasBin: true + dev: false + /v8-compile-cache-lib@3.0.1: resolution: {integrity: sha512-wa7YjyUGfNZngI/vtK0UHAN+lgDCxBPCylVXGp0zu59Fz5aiGtNXaq3DhIov063MorB+VfufLh3JlF2KdTK3xg==} dev: true