From 089f1d49c0d5155c2142a84bf3e6222e18548ee2 Mon Sep 17 00:00:00 2001 From: Alex Yang <himself65@outlook.com> Date: Fri, 9 Aug 2024 09:53:50 -0700 Subject: [PATCH] refactor: migrate reader type into core (#1111) --- packages/core/src/schema/index.ts | 2 +- packages/core/src/schema/type.ts | 38 +++++- .../src/ingestion/IngestionPipeline.ts | 4 +- .../src/readers/AssemblyAIReader.ts | 3 +- packages/llamaindex/src/readers/CSVReader.ts | 3 +- .../llamaindex/src/readers/DiscordReader.ts | 4 +- packages/llamaindex/src/readers/DocxReader.ts | 3 +- packages/llamaindex/src/readers/HTMLReader.ts | 4 +- .../llamaindex/src/readers/ImageReader.ts | 3 +- packages/llamaindex/src/readers/JSONReader.ts | 4 +- .../src/readers/LlamaParseReader.ts | 89 ++++++++++++- .../llamaindex/src/readers/MarkdownReader.ts | 3 +- .../llamaindex/src/readers/NotionReader.ts | 2 +- packages/llamaindex/src/readers/PDFReader.ts | 3 +- .../src/readers/SimpleDirectoryReader.edge.ts | 2 +- .../src/readers/SimpleDirectoryReader.ts | 2 +- .../src/readers/SimpleMongoReader.ts | 3 +- .../llamaindex/src/readers/TextFileReader.ts | 4 +- packages/llamaindex/src/readers/type.ts | 124 ------------------ 19 files changed, 141 insertions(+), 159 deletions(-) diff --git a/packages/core/src/schema/index.ts b/packages/core/src/schema/index.ts index da1924211..a38ba9975 100644 --- a/packages/core/src/schema/index.ts +++ b/packages/core/src/schema/index.ts @@ -1,4 +1,4 @@ export * from "./node"; -export { TransformComponent } from "./type"; +export { FileReader, TransformComponent, type BaseReader } from "./type"; export { EngineResponse } from "./type/engine–response"; export * from "./zod"; diff --git a/packages/core/src/schema/type.ts b/packages/core/src/schema/type.ts index 7aa3add8a..4c7271d90 100644 --- a/packages/core/src/schema/type.ts +++ b/packages/core/src/schema/type.ts @@ -1,5 +1,5 @@ -import { randomUUID } from "@llamaindex/env"; -import type { BaseNode } from "./node"; +import { fs, path, randomUUID } 
from "@llamaindex/env"; +import type { BaseNode, Document } from "./node"; interface TransformComponentSignature { <Options extends Record<string, unknown>>( @@ -28,3 +28,37 @@ export class TransformComponent { return transform; } } + +/** + * A reader imports data into Document objects. + */ +export interface BaseReader { + loadData(...args: unknown[]): Promise<Document[]>; +} + +/** + * A FileReader takes file paths and imports data into Document objects. + */ +export abstract class FileReader implements BaseReader { + abstract loadDataAsContent( + fileContent: Uint8Array, + fileName?: string, + ): Promise<Document[]>; + + async loadData(filePath: string): Promise<Document[]> { + const fileContent = await fs.readFile(filePath); + const fileName = path.basename(filePath); + const docs = await this.loadDataAsContent(fileContent, fileName); + docs.forEach(FileReader.addMetaData(filePath)); + return docs; + } + + static addMetaData(filePath: string) { + return (doc: Document, index: number) => { + // generate id as loadDataAsContent is only responsible for the content + doc.id_ = `${filePath}_${index + 1}`; + doc.metadata["file_path"] = path.resolve(filePath); + doc.metadata["file_name"] = path.basename(filePath); + }; + } +} diff --git a/packages/llamaindex/src/ingestion/IngestionPipeline.ts b/packages/llamaindex/src/ingestion/IngestionPipeline.ts index fed97e992..8b7f355aa 100644 --- a/packages/llamaindex/src/ingestion/IngestionPipeline.ts +++ b/packages/llamaindex/src/ingestion/IngestionPipeline.ts @@ -1,4 +1,4 @@ -import type { TransformComponent } from "@llamaindex/core/schema"; +import type { BaseReader, TransformComponent } from "@llamaindex/core/schema"; import { ModalityType, splitNodesByType, @@ -6,7 +6,6 @@ import { type Document, type Metadata, } from "@llamaindex/core/schema"; -import type { BaseReader } from "../readers/type.js"; import type { BaseDocumentStore } from "../storage/docStore/types.js"; import type { VectorStore, @@ -107,6 +106,7 @@ 
export class IngestionPipeline { inputNodes.push(this.documents); } if (this.reader) { + // fixme: empty parameter might cause error inputNodes.push(await this.reader.loadData()); } return inputNodes.flat(); diff --git a/packages/llamaindex/src/readers/AssemblyAIReader.ts b/packages/llamaindex/src/readers/AssemblyAIReader.ts index 74184cb8f..ee368682d 100644 --- a/packages/llamaindex/src/readers/AssemblyAIReader.ts +++ b/packages/llamaindex/src/readers/AssemblyAIReader.ts @@ -1,4 +1,4 @@ -import { Document } from "@llamaindex/core/schema"; +import { type BaseReader, Document } from "@llamaindex/core/schema"; import { getEnv } from "@llamaindex/env"; import type { BaseServiceParams, @@ -8,7 +8,6 @@ import type { TranscriptSentence, } from "assemblyai"; import { AssemblyAI } from "assemblyai"; -import type { BaseReader } from "./type.js"; type AssemblyAIOptions = Partial<BaseServiceParams>; const defaultOptions = { diff --git a/packages/llamaindex/src/readers/CSVReader.ts b/packages/llamaindex/src/readers/CSVReader.ts index 9334574c7..eecb6a5bc 100644 --- a/packages/llamaindex/src/readers/CSVReader.ts +++ b/packages/llamaindex/src/readers/CSVReader.ts @@ -1,7 +1,6 @@ -import { Document } from "@llamaindex/core/schema"; +import { type BaseReader, Document, FileReader } from "@llamaindex/core/schema"; import type { ParseConfig } from "papaparse"; import Papa from "papaparse"; -import { FileReader } from "./type.js"; /** * papaparse-based csv parser diff --git a/packages/llamaindex/src/readers/DiscordReader.ts b/packages/llamaindex/src/readers/DiscordReader.ts index fdabcd091..8cf92981a 100644 --- a/packages/llamaindex/src/readers/DiscordReader.ts +++ b/packages/llamaindex/src/readers/DiscordReader.ts @@ -1,5 +1,5 @@ import { REST, type RESTOptions } from "@discordjs/rest"; -import { Document } from "@llamaindex/core/schema"; +import { Document, type BaseReader } from "@llamaindex/core/schema"; import { getEnv } from "@llamaindex/env"; import { Routes, type APIEmbed, 
type APIMessage } from "discord-api-types/v10"; @@ -7,7 +7,7 @@ import { Routes, type APIEmbed, type APIMessage } from "discord-api-types/v10"; * Represents a reader for Discord messages using @discordjs/rest * See https://github.com/discordjs/discord.js/tree/main/packages/rest */ -export class DiscordReader { +export class DiscordReader implements BaseReader { private client: REST; constructor( diff --git a/packages/llamaindex/src/readers/DocxReader.ts b/packages/llamaindex/src/readers/DocxReader.ts index 765a322df..79ce9857d 100644 --- a/packages/llamaindex/src/readers/DocxReader.ts +++ b/packages/llamaindex/src/readers/DocxReader.ts @@ -1,6 +1,5 @@ -import { Document } from "@llamaindex/core/schema"; +import { Document, FileReader } from "@llamaindex/core/schema"; import mammoth from "mammoth"; -import { FileReader } from "./type.js"; export class DocxReader extends FileReader { /** DocxParser */ diff --git a/packages/llamaindex/src/readers/HTMLReader.ts b/packages/llamaindex/src/readers/HTMLReader.ts index 02e08263d..6246bf8a4 100644 --- a/packages/llamaindex/src/readers/HTMLReader.ts +++ b/packages/llamaindex/src/readers/HTMLReader.ts @@ -1,6 +1,4 @@ -import { Document } from "@llamaindex/core/schema"; -import { FileReader } from "./type.js"; - +import { Document, FileReader } from "@llamaindex/core/schema"; /** * Extract the significant text from an arbitrary HTML document. * The contents of any head, script, style, and xml tags are removed completely. 
diff --git a/packages/llamaindex/src/readers/ImageReader.ts b/packages/llamaindex/src/readers/ImageReader.ts index 937737128..6a681e83f 100644 --- a/packages/llamaindex/src/readers/ImageReader.ts +++ b/packages/llamaindex/src/readers/ImageReader.ts @@ -1,6 +1,5 @@ import type { Document } from "@llamaindex/core/schema"; -import { ImageDocument } from "@llamaindex/core/schema"; -import { FileReader } from "./type.js"; +import { FileReader, ImageDocument } from "@llamaindex/core/schema"; /** * Reads the content of an image file into a Document object (which stores the image file as a Blob). diff --git a/packages/llamaindex/src/readers/JSONReader.ts b/packages/llamaindex/src/readers/JSONReader.ts index 13c54cd2b..2f4f99a9f 100644 --- a/packages/llamaindex/src/readers/JSONReader.ts +++ b/packages/llamaindex/src/readers/JSONReader.ts @@ -1,7 +1,5 @@ import type { JSONValue } from "@llamaindex/core/global"; -import { Document } from "@llamaindex/core/schema"; -import { FileReader } from "./type.js"; - +import { Document, FileReader } from "@llamaindex/core/schema"; export interface JSONReaderOptions { /** * Whether to ensure only ASCII characters. 
diff --git a/packages/llamaindex/src/readers/LlamaParseReader.ts b/packages/llamaindex/src/readers/LlamaParseReader.ts index 6532ac5cc..b45bb9fc7 100644 --- a/packages/llamaindex/src/readers/LlamaParseReader.ts +++ b/packages/llamaindex/src/readers/LlamaParseReader.ts @@ -1,7 +1,92 @@ -import { Document } from "@llamaindex/core/schema"; +import { Document, FileReader } from "@llamaindex/core/schema"; import { fs, getEnv } from "@llamaindex/env"; import { filetypeinfo } from "magic-bytes.js"; -import { FileReader, type Language, type ResultType } from "./type.js"; + +export type ResultType = "text" | "markdown" | "json"; +export type Language = + | "abq" + | "ady" + | "af" + | "ang" + | "ar" + | "as" + | "ava" + | "az" + | "be" + | "bg" + | "bh" + | "bho" + | "bn" + | "bs" + | "ch_sim" + | "ch_tra" + | "che" + | "cs" + | "cy" + | "da" + | "dar" + | "de" + | "en" + | "es" + | "et" + | "fa" + | "fr" + | "ga" + | "gom" + | "hi" + | "hr" + | "hu" + | "id" + | "inh" + | "is" + | "it" + | "ja" + | "kbd" + | "kn" + | "ko" + | "ku" + | "la" + | "lbe" + | "lez" + | "lt" + | "lv" + | "mah" + | "mai" + | "mi" + | "mn" + | "mr" + | "ms" + | "mt" + | "ne" + | "new" + | "nl" + | "no" + | "oc" + | "pi" + | "pl" + | "pt" + | "ro" + | "ru" + | "rs_cyrillic" + | "rs_latin" + | "sck" + | "sk" + | "sl" + | "sq" + | "sv" + | "sw" + | "ta" + | "tab" + | "te" + | "th" + | "tjk" + | "tl" + | "tr" + | "ug" + | "uk" + | "ur" + | "uz" + | "vi"; const SUPPORT_FILE_EXT: string[] = [ ".pdf", diff --git a/packages/llamaindex/src/readers/MarkdownReader.ts b/packages/llamaindex/src/readers/MarkdownReader.ts index 32c734f49..e8e43410c 100644 --- a/packages/llamaindex/src/readers/MarkdownReader.ts +++ b/packages/llamaindex/src/readers/MarkdownReader.ts @@ -1,5 +1,4 @@ -import { Document } from "@llamaindex/core/schema"; -import { FileReader } from "./type.js"; +import { Document, FileReader } from "@llamaindex/core/schema"; type MarkdownTuple = [string | null, string]; diff --git 
a/packages/llamaindex/src/readers/NotionReader.ts b/packages/llamaindex/src/readers/NotionReader.ts index 80168c7f8..8b2db4239 100644 --- a/packages/llamaindex/src/readers/NotionReader.ts +++ b/packages/llamaindex/src/readers/NotionReader.ts @@ -1,7 +1,7 @@ +import type { BaseReader } from "@llamaindex/core/schema"; import { Document } from "@llamaindex/core/schema"; import type { Crawler, CrawlerOptions, Page } from "notion-md-crawler"; import { crawler, pageToString } from "notion-md-crawler"; -import type { BaseReader } from "./type.js"; type NotionReaderOptions = Pick<CrawlerOptions, "client" | "serializers">; diff --git a/packages/llamaindex/src/readers/PDFReader.ts b/packages/llamaindex/src/readers/PDFReader.ts index 3e5e3e448..bd46e3ce3 100644 --- a/packages/llamaindex/src/readers/PDFReader.ts +++ b/packages/llamaindex/src/readers/PDFReader.ts @@ -1,5 +1,4 @@ -import { Document } from "@llamaindex/core/schema"; -import { FileReader } from "./type.js"; +import { Document, FileReader } from "@llamaindex/core/schema"; /** * Read the text of a PDF diff --git a/packages/llamaindex/src/readers/SimpleDirectoryReader.edge.ts b/packages/llamaindex/src/readers/SimpleDirectoryReader.edge.ts index 3e4e63dc8..938eb89e3 100644 --- a/packages/llamaindex/src/readers/SimpleDirectoryReader.edge.ts +++ b/packages/llamaindex/src/readers/SimpleDirectoryReader.edge.ts @@ -1,8 +1,8 @@ +import type { BaseReader, FileReader } from "@llamaindex/core/schema"; import { Document } from "@llamaindex/core/schema"; import { path } from "@llamaindex/env"; import { walk } from "../storage/FileSystem.js"; import { TextFileReader } from "./TextFileReader.js"; -import type { BaseReader, FileReader } from "./type.js"; import pLimit from "./utils.js"; type ReaderCallback = ( diff --git a/packages/llamaindex/src/readers/SimpleDirectoryReader.ts b/packages/llamaindex/src/readers/SimpleDirectoryReader.ts index 1a259eb0f..b4e12523a 100644 --- a/packages/llamaindex/src/readers/SimpleDirectoryReader.ts 
+++ b/packages/llamaindex/src/readers/SimpleDirectoryReader.ts @@ -1,3 +1,4 @@ +import type { FileReader } from "@llamaindex/core/schema"; import { Document } from "@llamaindex/core/schema"; import { PapaCSVReader } from "./CSVReader.js"; import { DocxReader } from "./DocxReader.js"; @@ -10,7 +11,6 @@ import { type SimpleDirectoryReaderLoadDataParams, } from "./SimpleDirectoryReader.edge.js"; import { TextFileReader } from "./TextFileReader.js"; -import type { FileReader } from "./type.js"; export const FILE_EXT_TO_READER: Record<string, FileReader> = { txt: new TextFileReader(), diff --git a/packages/llamaindex/src/readers/SimpleMongoReader.ts b/packages/llamaindex/src/readers/SimpleMongoReader.ts index 58dafc966..d85d1c13f 100644 --- a/packages/llamaindex/src/readers/SimpleMongoReader.ts +++ b/packages/llamaindex/src/readers/SimpleMongoReader.ts @@ -1,7 +1,6 @@ import type { Metadata } from "@llamaindex/core/schema"; -import { Document } from "@llamaindex/core/schema"; +import { type BaseReader, Document } from "@llamaindex/core/schema"; import type { MongoClient } from "mongodb"; -import type { BaseReader } from "./type.js"; /** * Read in from MongoDB diff --git a/packages/llamaindex/src/readers/TextFileReader.ts b/packages/llamaindex/src/readers/TextFileReader.ts index 972d787e4..785a8f01c 100644 --- a/packages/llamaindex/src/readers/TextFileReader.ts +++ b/packages/llamaindex/src/readers/TextFileReader.ts @@ -1,6 +1,4 @@ -import { Document } from "@llamaindex/core/schema"; -import { FileReader } from "./type.js"; - +import { Document, FileReader } from "@llamaindex/core/schema"; /** * Read a .txt file */ diff --git a/packages/llamaindex/src/readers/type.ts b/packages/llamaindex/src/readers/type.ts index 80fe584e9..e69de29bb 100644 --- a/packages/llamaindex/src/readers/type.ts +++ b/packages/llamaindex/src/readers/type.ts @@ -1,124 +0,0 @@ -import type { Document } from "@llamaindex/core/schema"; -import { fs, path } from "@llamaindex/env"; - -/** - * A reader 
takes imports data into Document objects. - */ -export interface BaseReader { - loadData(...args: unknown[]): Promise<Document[]>; -} - -/** - * A FileReader takes file paths and imports data into Document objects. - */ -export abstract class FileReader implements BaseReader { - abstract loadDataAsContent( - fileContent: Uint8Array, - fileName?: string, - ): Promise<Document[]>; - - async loadData(filePath: string): Promise<Document[]> { - const fileContent = await fs.readFile(filePath); - const fileName = path.basename(filePath); - const docs = await this.loadDataAsContent(fileContent, fileName); - docs.forEach(FileReader.addMetaData(filePath)); - return docs; - } - - static addMetaData(filePath: string) { - return (doc: Document, index: number) => { - // generate id as loadDataAsContent is only responsible for the content - doc.id_ = `${filePath}_${index + 1}`; - doc.metadata["file_path"] = path.resolve(filePath); - doc.metadata["file_name"] = path.basename(filePath); - }; - } -} - -// For LlamaParseReader.ts - -export type ResultType = "text" | "markdown" | "json"; -export type Language = - | "abq" - | "ady" - | "af" - | "ang" - | "ar" - | "as" - | "ava" - | "az" - | "be" - | "bg" - | "bh" - | "bho" - | "bn" - | "bs" - | "ch_sim" - | "ch_tra" - | "che" - | "cs" - | "cy" - | "da" - | "dar" - | "de" - | "en" - | "es" - | "et" - | "fa" - | "fr" - | "ga" - | "gom" - | "hi" - | "hr" - | "hu" - | "id" - | "inh" - | "is" - | "it" - | "ja" - | "kbd" - | "kn" - | "ko" - | "ku" - | "la" - | "lbe" - | "lez" - | "lt" - | "lv" - | "mah" - | "mai" - | "mi" - | "mn" - | "mr" - | "ms" - | "mt" - | "ne" - | "new" - | "nl" - | "no" - | "oc" - | "pi" - | "pl" - | "pt" - | "ro" - | "ru" - | "rs_cyrillic" - | "rs_latin" - | "sck" - | "sk" - | "sl" - | "sq" - | "sv" - | "sw" - | "ta" - | "tab" - | "te" - | "th" - | "tjk" - | "tl" - | "tr" - | "ug" - | "uk" - | "ur" - | "uz" - | "vi"; -- GitLab