Skip to content
Snippets Groups Projects
Unverified Commit 620c63cd authored by Alex Yang's avatar Alex Yang Committed by GitHub
Browse files

feat: add `@llamaindex/readers` package (#1404)

parent cb51ad90
No related branches found
No related tags found
No related merge requests found
Showing
with 110 additions and 156 deletions
import { NotionReader } from "@llamaindex/readers/notion";
import { Client } from "@notionhq/client";
import { program } from "commander";
import { VectorStoreIndex } from "llamaindex";
import { NotionReader } from "llamaindex/readers/NotionReader";
import { stdin as input, stdout as output } from "node:process";
import { createInterface } from "node:readline/promises";
......
import { PDFReader } from "@llamaindex/readers/pdf";
import { VectorStoreIndex } from "llamaindex";
import { PDFReader } from "llamaindex/readers/PDFReader";
async function main() {
// Load PDF
......
import { PDFReader } from "@llamaindex/readers/pdf";
import { FireworksEmbedding, FireworksLLM, VectorStoreIndex } from "llamaindex";
import { PDFReader } from "llamaindex/readers/PDFReader";
import { Settings } from "llamaindex";
......
import { PDFReader } from "@llamaindex/readers/pdf";
import { OpenAI, OpenAIEmbedding, VectorStoreIndex } from "llamaindex";
import { PDFReader } from "llamaindex/readers/PDFReader";
import { Settings } from "llamaindex";
......
import {
LlamaParseReader,
SimpleDirectoryReader,
VectorStoreIndex,
} from "llamaindex";
import { SimpleDirectoryReader } from "@llamaindex/readers/directory";
import { LlamaParseReader, VectorStoreIndex } from "llamaindex";
async function main() {
const reader = new SimpleDirectoryReader();
......
import { SimpleDirectoryReader } from "llamaindex/readers/SimpleDirectoryReader";
import { SimpleDirectoryReader } from "@llamaindex/readers/directory";
// or
// import { SimpleDirectoryReader } from 'llamaindex'
......
import {
PapaCSVReader,
CSVReader,
storageContextFromDefaults,
VectorStoreIndex,
WeaviateVectorStore,
......@@ -9,7 +9,7 @@ const indexName = "MovieReviews";
async function main() {
try {
const reader = new PapaCSVReader(false);
const reader = new CSVReader(false);
const docs = await reader.loadData("./data/movie_reviews.csv");
const vectorStore = new WeaviateVectorStore({ indexName });
const storageContext = await storageContextFromDefaults({ vectorStore });
......
......@@ -32,20 +32,22 @@ export class TransformComponent {
/**
* A reader imports data into Document objects.
*/
export interface BaseReader {
loadData(...args: unknown[]): Promise<Document[]>;
export interface BaseReader<T extends BaseNode = Document> {
loadData(...args: unknown[]): Promise<T[]>;
}
/**
* A FileReader takes file paths and imports data into Document objects.
*/
export abstract class FileReader implements BaseReader {
export abstract class FileReader<T extends BaseNode = Document>
implements BaseReader<T>
{
abstract loadDataAsContent(
fileContent: Uint8Array,
filename?: string,
): Promise<Document[]>;
): Promise<T[]>;
async loadData(filePath: string): Promise<Document[]> {
async loadData(filePath: string): Promise<T[]> {
const fileContent = await fs.readFile(filePath);
const filename = path.basename(filePath);
const docs = await this.loadDataAsContent(fileContent, filename);
......@@ -54,7 +56,7 @@ export abstract class FileReader implements BaseReader {
}
static addMetaData(filePath: string) {
return (doc: Document, index: number) => {
return (doc: BaseNode, index: number) => {
// generate id as loadDataAsContent is only responsible for the content
doc.id_ = `${filePath}_${index + 1}`;
doc.metadata["file_path"] = path.resolve(filePath);
......
......@@ -49,7 +49,12 @@ export function randomUUID(): string {
return crypto.randomUUID();
}
export const process: NodeJS.Process = globalThis.process;
export const process: NodeJS.Process = globalThis.process ?? {
platform: "unknown",
arch: "unknown",
version: "unknown",
versions: {},
};
export {
AsyncLocalStorage,
......
// test runtime
import "llamaindex";
import "llamaindex/readers/SimpleDirectoryReader";
// @ts-expect-error
if (typeof EdgeRuntime !== "string") {
......
......@@ -4,10 +4,10 @@ import {
OpenAIAgent,
QueryEngineTool,
Settings,
SimpleDirectoryReader,
VectorStoreIndex,
} from "llamaindex";
import { HuggingFaceEmbedding } from "llamaindex/embeddings/HuggingFaceEmbedding";
import { SimpleDirectoryReader } from "llamaindex/readers/SimpleDirectoryReader";
Settings.llm = new OpenAI({
// eslint-disable-next-line turbo/no-undeclared-env-vars
......
......@@ -22,9 +22,9 @@
"dependencies": {
"@anthropic-ai/sdk": "0.27.1",
"@aws-crypto/sha256-js": "^5.2.0",
"@aws-sdk/client-sso-oidc": "^3.679.0",
"@azure/identity": "^4.4.1",
"@datastax/astra-db-ts": "^1.4.1",
"@discordjs/rest": "^2.3.0",
"@discoveryjs/json-ext": "^0.6.1",
"@google-cloud/vertexai": "1.2.0",
"@google/generative-ai": "0.12.0",
......@@ -40,14 +40,14 @@
"@llamaindex/ollama": "workspace:*",
"@llamaindex/openai": "workspace:*",
"@llamaindex/portkey-ai": "workspace:*",
"@llamaindex/replicate": "workspace:^0.0.8",
"@llamaindex/readers": "workspace:*",
"@llamaindex/replicate": "workspace:*",
"@mistralai/mistralai": "^1.0.4",
"@mixedbread-ai/sdk": "^2.2.11",
"@pinecone-database/pinecone": "^3.0.2",
"@qdrant/js-client-rest": "^1.11.0",
"@types/lodash": "^4.17.7",
"@types/node": "^22.5.1",
"@types/papaparse": "^5.3.14",
"@types/pg": "^8.11.8",
"@upstash/vector": "^1.1.5",
"@zilliz/milvus2-sdk-node": "^2.4.6",
......@@ -56,36 +56,25 @@
"chromadb": "1.9.2",
"chromadb-default-embed": "^2.13.2",
"cohere-ai": "7.13.0",
"discord-api-types": "^0.37.98",
"gpt-tokenizer": "^2.5.0",
"groq-sdk": "^0.6.1",
"js-tiktoken": "^1.0.14",
"lodash": "^4.17.21",
"magic-bytes.js": "^1.10.0",
"mammoth": "^1.7.2",
"md-utils-ts": "^2.0.0",
"mongodb": "^6.7.0",
"notion-md-crawler": "^1.0.0",
"openai": "^4.60.0",
"papaparse": "^5.4.1",
"pathe": "^1.1.2",
"rake-modified": "^1.0.8",
"string-strip-html": "^13.4.8",
"unpdf": "^0.11.0",
"weaviate-client": "^3.1.4",
"wikipedia": "^2.1.2",
"wink-nlp": "^2.3.0",
"zod": "^3.23.8"
},
"peerDependencies": {
"@notionhq/client": "^2.2.15",
"pg": "^8.12.0",
"pgvector": "0.2.0"
},
"peerDependenciesMeta": {
"@notionhq/client": {
"optional": true
},
"pg": {
"optional": true
},
......@@ -94,7 +83,6 @@
}
},
"devDependencies": {
"@notionhq/client": "^2.2.15",
"@swc/cli": "^0.4.0",
"@swc/core": "^1.7.22",
"@vercel/postgres": "^0.10.0",
......@@ -134,6 +122,7 @@
"default": "./dist/cjs/index.js"
}
},
"./register": "./register.js",
"./internal/*": {
"import": "./dist/not-allow.js",
"require": "./dist/cjs/not-allow.js"
......@@ -168,6 +157,7 @@
}
},
"files": [
"./register.js",
"dist",
"CHANGELOG.md",
"examples",
......
/**
* ```shell
* node --import llamaindex/register ./loader.js
* ```
*/
import "@llamaindex/readers/node";
......@@ -3,17 +3,16 @@ export {
type Language,
type ResultType,
} from "@llamaindex/cloud/reader";
export * from "./AssemblyAIReader.js";
export * from "./CSVReader.js";
export * from "./DiscordReader.js";
export * from "./DocxReader.js";
export * from "./HTMLReader.js";
export * from "./ImageReader.js";
export * from "./JSONReader.js";
export * from "./MarkdownReader.js";
export * from "./NotionReader.js";
export * from "./PDFReader.js";
export * from "./SimpleDirectoryReader.js";
export * from "./SimpleMongoReader.js";
export * from "./TextFileReader.js";
export * from "./type.js";
export * from "@llamaindex/readers/assembly-ai";
export * from "@llamaindex/readers/csv";
export * from "@llamaindex/readers/directory";
export * from "@llamaindex/readers/discord";
export * from "@llamaindex/readers/docx";
export * from "@llamaindex/readers/html";
export * from "@llamaindex/readers/image";
export * from "@llamaindex/readers/json";
export * from "@llamaindex/readers/markdown";
export * from "@llamaindex/readers/mongo";
export * from "@llamaindex/readers/notion";
export * from "@llamaindex/readers/pdf";
export * from "@llamaindex/readers/text";
// Note: this code is taken from p-limit 5.0.0 and modified to work in non-Node.js environments by removing AsyncResource, which does not seem to be needed in our case and is no longer recommended. If we need to preserve state between async calls, it is better to use `AsyncLocalStorage`.
// The dependency on yocto-queue was also removed by using a plain Array.
/**
 * Shape of the limiter returned by `pLimit`: a callable that schedules work,
 * plus read-only counters and a way to drop work that has not started yet.
 */
export type LimitFunction = {
  /** How many scheduled functions are running right now. */
  readonly activeCount: number;

  /**
   * How many scheduled functions are still waiting for a free slot
   * (their `fn` has not been invoked yet).
   */
  readonly pendingCount: number;

  /**
   * Drops every function that is still waiting to run.
   *
   * Handy when tearing down at the end of the program's lifecycle, or when
   * queued calls refer to an intermediary application state that is no longer
   * relevant. Functions that are already running are not cancelled.
   */
  clearQueue: () => void;

  /**
   * Schedules `fn` to run once a slot is available.
   *
   * @param fn - Promise-returning/async function (a plain return value is accepted too).
   * @param args - Arguments forwarded to `fn`. Passing them here avoids
   *   creating a closure per call, which only matters when scheduling a very
   *   large number of functions.
   * @returns The promise produced by calling `fn(...args)`.
   */
  <TArgs extends unknown[], TResult>(
    fn: (...args: TArgs) => PromiseLike<TResult> | TResult,
    ...args: TArgs
  ): Promise<TResult>;
};
/**
 * Builds a limiter that lets at most `concurrency` scheduled functions run at
 * the same time; further calls wait in FIFO order until a slot frees up.
 *
 * @param concurrency - Maximum number of simultaneously running functions.
 *   Must be a positive integer or `Infinity`.
 * @returns A function that schedules `fn(...args)` and resolves/rejects with
 *   its outcome. It also exposes `activeCount`, `pendingCount` and `clearQueue`.
 * @throws TypeError if `concurrency` is not a positive integer or `Infinity`.
 */
export default function pLimit(concurrency: number): LimitFunction {
  const isUnbounded = concurrency === Number.POSITIVE_INFINITY;
  const isValid =
    (Number.isInteger(concurrency) || isUnbounded) && concurrency > 0;
  if (!isValid) {
    throw new TypeError("Expected `concurrency` to be a number from 1 and up");
  }

  type Task = (...taskArgs: unknown[]) => unknown;
  type Settle = (value: unknown) => void;

  // FIFO list of jobs that have been scheduled but not started yet.
  const waiting: Array<() => unknown> = [];
  let running = 0;

  // Starts the oldest waiting job, if there is one.
  const startNextWaiting = (): void => {
    const job = waiting.shift();
    if (job !== undefined) {
      job();
    }
  };

  // Frees the slot of a finished job and hands it to the next waiting one.
  const release = (): void => {
    running--;
    startNextWaiting();
  };

  const execute = async (
    task: Task,
    settle: Settle,
    taskArgs: unknown[],
  ): Promise<void> => {
    running++;

    // The async wrapper converts a synchronous throw from `task` into a rejection.
    const outcome = (async () => task(...taskArgs))();

    // Hand the outcome to the caller right away; its promise follows `outcome`.
    settle(outcome);

    try {
      await outcome;
    } catch {
      // Intentionally ignored: the caller already receives the rejection via
      // `settle`; here we only need to know that the task has finished.
    }

    release();
  };

  const schedule = (task: Task, settle: Settle, taskArgs: unknown[]): void => {
    waiting.push(() => execute(task, settle, taskArgs));

    // `running` only changes when a dequeued job actually starts, which
    // happens asynchronously. Deferring the capacity check by one microtask
    // makes sure it sees an up-to-date value.
    const startIfCapacity = async (): Promise<void> => {
      await Promise.resolve();
      if (running < concurrency) {
        startNextWaiting();
      }
    };
    void startIfCapacity();
  };

  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const generator = (task: any, ...taskArgs: any[]) =>
    new Promise((resolve) => {
      schedule(task, resolve, taskArgs);
    });

  Object.defineProperty(generator, "activeCount", {
    get: () => running,
  });
  Object.defineProperty(generator, "pendingCount", {
    get: () => waiting.length,
  });
  Object.defineProperty(generator, "clearQueue", {
    value: () => {
      waiting.length = 0;
    },
  });

  return generator as LimitFunction;
}
# @llamaindex/readers
> Utilities for reading data from various sources
## Usage
```shell
npm i @llamaindex/readers
```
```ts
import { SimpleDirectoryReader } from "@llamaindex/readers/directory";
const reader = new SimpleDirectoryReader();
const documents = await reader.loadData("./directory");
```
## License
MIT
{
"type": "module",
"main": "./dist/index.cjs",
"module": "./dist/index.js",
"types": "./dist/index.d.ts",
"exports": {
".": {
"edge-light": "./dist/index.edge-light.js",
"workerd": "./dist/index.workerd.js",
"default": "./dist/index.js"
}
},
"private": true
}
{
"type": "module",
"main": "./dist/index.cjs",
"module": "./dist/index.js",
"types": "./dist/index.d.ts",
"exports": {
".": {
"edge-light": "./dist/index.edge-light.js",
"workerd": "./dist/index.workerd.js",
"default": "./dist/index.js"
}
},
"private": true
}
{
"type": "module",
"main": "./dist/index.cjs",
"module": "./dist/index.js",
"types": "./dist/index.d.ts",
"exports": {
".": {
"edge-light": "./dist/index.edge-light.js",
"workerd": "./dist/index.workerd.js",
"default": "./dist/index.js"
}
},
"private": true
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment