Skip to content
Snippets Groups Projects
Unverified Commit 3cbfa98e authored by Emanuel Ferreira's avatar Emanuel Ferreira Committed by GitHub
Browse files

feat: LlamaCloudIndex from documents (#677)

parent d256cbe0
Branches
Tags
No related merge requests found
---
"llamaindex": patch
---
feat: llamacloud index from documents
import fs from "node:fs/promises";
import { stdin as input, stdout as output } from "node:process";
import readline from "node:readline/promises";
import { Document, LlamaCloudIndex } from "llamaindex";
async function main() {
const path = "node_modules/llamaindex/examples/abramov.txt";
const essay = await fs.readFile(path, "utf-8");
// Create Document object with essay
const document = new Document({ text: essay, id_: path });
const index = await LlamaCloudIndex.fromDocuments({
documents: [document],
name: "test",
projectName: "default",
apiKey: process.env.LLAMA_CLOUD_API_KEY,
baseUrl: process.env.LLAMA_CLOUD_BASE_URL,
});
const queryEngine = index.asQueryEngine({
denseSimilarityTopK: 5,
});
const rl = readline.createInterface({ input, output });
while (true) {
const query = await rl.question("Query: ");
const stream = await queryEngine.query({
query,
stream: true,
});
console.log();
for await (const chunk of stream) {
process.stdout.write(chunk.response);
}
}
}
main().catch(console.error);
......@@ -9,7 +9,7 @@
"@aws-crypto/sha256-js": "^5.2.0",
"@datastax/astra-db-ts": "^0.1.4",
"@grpc/grpc-js": "^1.10.2",
"@llamaindex/cloud": "0.0.4",
"@llamaindex/cloud": "0.0.5",
"@llamaindex/env": "workspace:*",
"@mistralai/mistralai": "^0.0.10",
"@notionhq/client": "^2.2.14",
......
......@@ -66,8 +66,9 @@ export const defaultParagraphSeparator = EOL + EOL + EOL;
* One of the advantages of SentenceSplitter is that even in the fixed length chunks it will try to keep sentences together.
*/
export class SentenceSplitter {
private chunkSize: number;
private chunkOverlap: number;
public chunkSize: number;
public chunkOverlap: number;
private tokenizer: any;
private tokenizerDecoder: any;
private paragraphSeparator: string;
......
import { PlatformApi } from "@llamaindex/cloud";
import type { Document } from "../Node.js";
import type { BaseRetriever } from "../Retriever.js";
import { RetrieverQueryEngine } from "../engines/query/RetrieverQueryEngine.js";
import type { TransformComponent } from "../ingestion/types.js";
import type { BaseNodePostprocessor } from "../postprocessors/types.js";
import type { BaseSynthesizer } from "../synthesizers/types.js";
import type { BaseQueryEngine } from "../types.js";
import type { CloudRetrieveParams } from "./LlamaCloudRetriever.js";
import { LlamaCloudRetriever } from "./LlamaCloudRetriever.js";
import { getPipelineCreate } from "./config.js";
import type { CloudConstructorParams } from "./types.js";
import { getAppBaseUrl, getClient } from "./utils.js";
import { getEnv } from "@llamaindex/env";
import { OpenAIEmbedding } from "../embeddings/OpenAIEmbedding.js";
import { SimpleNodeParser } from "../nodeParsers/SimpleNodeParser.js";
export class LlamaCloudIndex {
params: CloudConstructorParams;
......@@ -14,6 +23,151 @@ export class LlamaCloudIndex {
this.params = params;
}
static async fromDocuments(
params: {
documents: Document[];
transformations?: TransformComponent[];
verbose?: boolean;
} & CloudConstructorParams,
): Promise<LlamaCloudIndex> {
const defaultTransformations: TransformComponent[] = [
new OpenAIEmbedding({
apiKey: getEnv("OPENAI_API_KEY"),
}),
new SimpleNodeParser(),
];
const appUrl = getAppBaseUrl(params.baseUrl);
const client = await getClient({ ...params, baseUrl: appUrl });
const pipelineCreateParams = await getPipelineCreate({
pipelineName: params.name,
pipelineType: "MANAGED",
inputNodes: params.documents,
transformations: params.transformations ?? defaultTransformations,
});
const project = await client.project.upsertProject({
name: params.projectName ?? "default",
});
if (!project.id) {
throw new Error("Project ID should be defined");
}
const pipeline = await client.project.upsertPipelineForProject(
project.id,
pipelineCreateParams,
);
if (!pipeline.id) {
throw new Error("Pipeline ID must be defined");
}
if (params.verbose) {
console.log(`Created pipeline ${pipeline.id} with name ${params.name}`);
}
const executionsIds: {
exectionId: string;
dataSourceId: string;
}[] = [];
for (const dataSource of pipeline.dataSources) {
const dataSourceExection =
await client.dataSource.createDataSourceExecution(dataSource.id);
if (!dataSourceExection.id) {
throw new Error("Data Source Execution ID must be defined");
}
executionsIds.push({
exectionId: dataSourceExection.id,
dataSourceId: dataSource.id,
});
}
let isDone = false;
while (!isDone) {
const statuses = [];
for await (const execution of executionsIds) {
const dataSourceExecution =
await client.dataSource.getDataSourceExecution(
execution.dataSourceId,
execution.exectionId,
);
statuses.push(dataSourceExecution.status);
if (
statuses.every((status) => status === PlatformApi.StatusEnum.Success)
) {
isDone = true;
if (params.verbose) {
console.info("Data Source Execution completed");
}
break;
} else if (
statuses.some((status) => status === PlatformApi.StatusEnum.Error)
) {
throw new Error("Data Source Execution failed");
} else {
await new Promise((resolve) => setTimeout(resolve, 1000));
if (params.verbose) {
process.stdout.write(".");
}
}
}
}
isDone = false;
const execution = await client.pipeline.runManagedPipelineIngestion(
pipeline.id,
);
const ingestionId = execution.id;
if (!ingestionId) {
throw new Error("Ingestion ID must be defined");
}
while (!isDone) {
const pipelineStatus = await client.pipeline.getManagedIngestionExecution(
pipeline.id,
ingestionId,
);
if (pipelineStatus.status === PlatformApi.StatusEnum.Success) {
isDone = true;
if (params.verbose) {
console.info("Ingestion completed");
}
break;
} else if (pipelineStatus.status === PlatformApi.StatusEnum.Error) {
throw new Error("Ingestion failed");
} else {
await new Promise((resolve) => setTimeout(resolve, 1000));
if (params.verbose) {
process.stdout.write(".");
}
}
}
if (params.verbose) {
console.info(
`Ingestion completed, find your index at ${appUrl}/project/${project.id}/deploy/${pipeline.id}`,
);
}
return new LlamaCloudIndex({ ...params });
}
asRetriever(params: CloudRetrieveParams = {}): BaseRetriever {
return new LlamaCloudRetriever({ ...this.params, ...params });
}
......
......@@ -18,11 +18,11 @@ function getTransformationConfig(
return {
configurableTransformationType: "SENTENCE_AWARE_NODE_PARSER",
component: {
// TODO: API returns 422 if these parameters are included
// chunkSize: transformation.textSplitter.chunkSize, // TODO: set to public in SentenceSplitter
// chunkOverlap: transformation.textSplitter.chunkOverlap, // TODO: set to public in SentenceSplitter
// includeMetadata: transformation.includeMetadata,
// includePrevNextRel: transformation.includePrevNextRel,
// TODO: API doesn't accept camelCase
chunk_size: transformation.textSplitter.chunkSize, // TODO: set to public in SentenceSplitter
chunk_overlap: transformation.textSplitter.chunkOverlap, // TODO: set to public in SentenceSplitter
include_metadata: transformation.includeMetadata,
include_prev_next_rel: transformation.includePrevNextRel,
},
};
}
......@@ -30,9 +30,10 @@ function getTransformationConfig(
return {
configurableTransformationType: "OPENAI_EMBEDDING",
component: {
modelName: transformation.model,
apiKey: transformation.apiKey,
embedBatchSize: transformation.embedBatchSize,
// TODO: API doesn't accept camelCase
model: transformation.model,
api_key: transformation.apiKey,
embed_batch_size: transformation.embedBatchSize,
dimensions: transformation.dimensions,
},
};
......@@ -71,10 +72,12 @@ export async function getPipelineCreate(
inputNodes = [],
} = params;
const dataSources = inputNodes.map(getDataSourceConfig);
return {
name: pipelineName,
configuredTransformations: transformations.map(getTransformationConfig),
dataSources: inputNodes.map(getDataSourceConfig),
dataSources,
dataSinks: [],
pipelineType,
};
......
......@@ -8,7 +8,7 @@
"@aws-crypto/sha256-js": "^5.2.0",
"@datastax/astra-db-ts": "^0.1.4",
"@grpc/grpc-js": "^1.10.2",
"@llamaindex/cloud": "0.0.4",
"@llamaindex/cloud": "0.0.5",
"@llamaindex/env": "workspace:*",
"@mistralai/mistralai": "^0.0.10",
"@notionhq/client": "^2.2.14",
......
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment