From 8f8ee28ba0bb426b77cc48432df30cc418044b62 Mon Sep 17 00:00:00 2001 From: Jayanta Samaddar <jayanta@zenius.one> Date: Sat, 23 Sep 2023 06:23:17 +0530 Subject: [PATCH] Added DocxReader, adding support for reading .docx files. Made changes to relevant docs as well. --- apps/docs/docs/introduction.md | 2 +- .../docs/modules/high_level/data_loader.md | 2 +- packages/core/package.json | 1 + packages/core/src/readers/DocxReader.ts | 17 ++++ .../core/src/readers/SimpleDirectoryReader.ts | 6 +- pnpm-lock.yaml | 81 ++++++++++++++++++- 6 files changed, 103 insertions(+), 6 deletions(-) create mode 100644 packages/core/src/readers/DocxReader.ts diff --git a/apps/docs/docs/introduction.md b/apps/docs/docs/introduction.md index 9454de8db..5dc61843c 100644 --- a/apps/docs/docs/introduction.md +++ b/apps/docs/docs/introduction.md @@ -19,7 +19,7 @@ That's where **LlamaIndex.TS** comes in. LlamaIndex.TS provides the following tools: -- **Data loading** ingest your existing `txt` and `pdf` data directly +- **Data loading** ingest your existing `.txt`, `.pdf`, `.csv`, `.md` and `.docx` data directly - **Data indexes** structure your data in intermediate representations that are easy and performant for LLMs to consume. - **Engines** provide natural language access to your data. For example: - Query engines are powerful retrieval interfaces for knowledge-augmented output. diff --git a/apps/docs/docs/modules/high_level/data_loader.md b/apps/docs/docs/modules/high_level/data_loader.md index e2d4ec55b..977f2f57d 100644 --- a/apps/docs/docs/modules/high_level/data_loader.md +++ b/apps/docs/docs/modules/high_level/data_loader.md @@ -4,7 +4,7 @@ sidebar_position: 1 # Reader / Loader -LlamaIndex.TS supports easy loading of files from folders using the `SimpleDirectoryReader` class. Currently, `.txt` and `.pdf` files are supported, with more planned in the future! +LlamaIndex.TS supports easy loading of files from folders using the `SimpleDirectoryReader` class. Currently, `.txt`, `.pdf`, `.csv`, `.md` and `.docx` files are supported, with more planned in the future! ```typescript import { SimpleDirectoryReader } from "llamaindex"; diff --git a/packages/core/package.json b/packages/core/package.json index 11e0a5053..b6681fa77 100644 --- a/packages/core/package.json +++ b/packages/core/package.json @@ -5,6 +5,7 @@ "@anthropic-ai/sdk": "^0.6.2", "@notionhq/client": "^2.2.13", "lodash": "^4.17.21", + "mammoth": "^1.6.0", "md-utils-ts": "^2.0.0", "mongodb": "^6.0.0", "notion-md-crawler": "^0.0.2", diff --git a/packages/core/src/readers/DocxReader.ts b/packages/core/src/readers/DocxReader.ts new file mode 100644 index 000000000..fae5b696d --- /dev/null +++ b/packages/core/src/readers/DocxReader.ts @@ -0,0 +1,17 @@ +import mammoth from "mammoth"; +import { Document } from "../Node"; +import { DEFAULT_FS } from "../storage/constants"; +import { GenericFileSystem } from "../storage/FileSystem"; +import { BaseReader } from "./base"; + +export class DocxReader implements BaseReader { + /** DocxParser */ + async loadData( + file: string, + fs: GenericFileSystem = DEFAULT_FS, + ): Promise<Document[]> { + const dataBuffer = (await fs.readFile(file)) as any; + const { value } = await mammoth.extractRawText({ buffer: dataBuffer }); + return [new Document({ text: value, id_: file })]; + } +} diff --git a/packages/core/src/readers/SimpleDirectoryReader.ts b/packages/core/src/readers/SimpleDirectoryReader.ts index 828ded5ba..8b0df23b0 100644 --- a/packages/core/src/readers/SimpleDirectoryReader.ts +++ b/packages/core/src/readers/SimpleDirectoryReader.ts @@ -1,11 +1,12 @@ import _ from "lodash"; import { Document } from "../Node"; -import { CompleteFileSystem, walk } from "../storage/FileSystem"; import { DEFAULT_FS } from "../storage/constants"; +import { CompleteFileSystem, walk } from "../storage/FileSystem"; +import { BaseReader } from "./base"; import { PapaCSVReader } from "./CSVReader"; +import { DocxReader } from "./DocxReader"; import { MarkdownReader } from "./MarkdownReader"; import { PDFReader } from "./PDFReader"; -import { BaseReader } from "./base"; /** * Read a .txt file @@ -25,6 +26,7 @@ const FILE_EXT_TO_READER: Record<string, BaseReader> = { pdf: new PDFReader(), csv: new PapaCSVReader(), md: new MarkdownReader(), + docx: new DocxReader(), }; export type SimpleDirectoryReaderLoadDataProps = { diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 725611a7f..2966268d6 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -128,6 +128,9 @@ importers: lodash: specifier: ^4.17.21 version: 4.17.21 + mammoth: + specifier: ^1.6.0 + version: 1.6.0 md-utils-ts: specifier: ^2.0.0 version: 2.0.0 @@ -4544,6 +4547,11 @@ packages: '@webassemblyjs/ast': 1.11.6 '@xtuc/long': 4.2.2 + /@xmldom/xmldom@0.8.10: + resolution: {integrity: sha512-2WALfTl4xo2SkGCYRt6rDTFfk9R1czmBvUQy12gK2KuRKIpWEhcbbzy8EZXtz/jkRqHX8bFEc6FC1HjX4TUWYw==} + engines: {node: '>=10.0.0'} + dev: false + /@xtuc/ieee754@1.2.0: resolution: {integrity: sha512-DX8nKgqcGwsc0eJSqYt5lwP4DH5FlHnmuWWBRy7X0NcaGR0ZtuyeESgMwTYVEtxmsNGY+qit4QYT/MIYTOTPeA==} @@ -5095,7 +5103,6 @@ packages: /base64-js@1.5.1: resolution: {integrity: sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==} - dev: true /basic-ftp@5.0.3: resolution: {integrity: sha512-QHX8HLlncOLpy54mh+k/sWIFd0ThmRqwe9ZjELybGZK+tZ8rUb9VO0saKJUROTbE+KhzDUT7xziGpGrW8Kmd+g==} @@ -5134,6 +5141,10 @@ packages: readable-stream: 3.6.2 dev: true + /bluebird@3.4.7: + resolution: {integrity: sha512-iD3898SR7sWVRHbiQv+sHUtHnMvC1o3nW5rAcqnq3uOn07DSAppZYUkIGslDz6gXC7HfunPe7YVBgoEJASPcHA==} + dev: false + /bn.js@4.12.0: resolution: {integrity: sha512-c98Bf3tPniI+scsdk237ku1Dc3ujXQTSgyiPUDEOe7tRkhrqridvh8klBv0HCEso1OLOYcHuCv/cS6DNxKH+ZA==} dev: true @@ -6535,6 +6546,10 @@ packages: md5: 2.3.0 dev: false + /dingbat-to-unicode@1.0.1: + resolution: {integrity: sha512-98l0sW87ZT58pU4i61wa2OHwxbiYSbuxsCBozaVnYX2iCnr3bLM3fIes1/ej7h1YdOKuKt/MLs706TVnALA65w==} + dev: false + /dir-glob@3.0.1: resolution: {integrity: sha512-WkrWp9GR4KXfKGYzOLmTuGVi1UWFfws377n9cc55/tb6DuqyF6pcQ5AbiHEshaDpY9v6oaSr2XCDidGmMwdzIA==} engines: {node: '>=8'} @@ -6656,6 +6671,12 @@ packages: is-obj: 2.0.0 dev: false + /duck@0.1.12: + resolution: {integrity: sha512-wkctla1O6VfP89gQ+J/yDesM0S7B7XLXjKGzXxMDVFg7uEn706niAtyYovKbyq1oT9YwDcly721/iUWoc8MVRg==} + dependencies: + underscore: 1.13.6 + dev: false + /duplexer3@0.1.5: resolution: {integrity: sha512-1A8za6ws41LQgv9HrE/66jyC5yuSjQ3L/KOpFtoBilsAK2iA2wuS5rTt1OCzIvtS2V7nVmedsUU+DGRcjBmOYA==} dev: false @@ -8424,6 +8445,10 @@ packages: queue: 6.0.2 dev: false + /immediate@3.0.6: + resolution: {integrity: sha512-XXOFtyqDjNDAQxVfYxuF7g9Il/IbWmmlQg2MYKOH8ExIT1qg6xc4zyS3HaEEATgs1btfzxq15ciUiY7gjSXRGQ==} + dev: false + /immer@9.0.21: resolution: {integrity: sha512-bc4NBHqOqSfRW7POMkHd51LvClaeMXpm8dx0e8oE2GORbq5aRK7Bxl4FyzVLdGtLmvLKL7BTDBG5ACQm4HWjTA==} dev: false @@ -9588,6 +9613,15 @@ packages: object.assign: 4.1.4 dev: false + /jszip@3.10.1: + resolution: {integrity: sha512-xXDvecyTpGLrqFrvkrUSoxxfJI5AH7U8zxxtVclpsUtMCq4JQ290LY8AW5c7Ggnr/Y/oK+bQMbqK2qmtk3pN4g==} + dependencies: + lie: 3.3.0 + pako: 1.0.11 + readable-stream: 2.3.8 + setimmediate: 1.0.5 + dev: false + /keyv@3.1.0: resolution: {integrity: sha512-9ykJ/46SN/9KPM/sichzQ7OvXyGDYKGTaDlKMGCAlg2UK8KRy4jb0d8sFc+0Tt0YYnThq8X2RZgCg74RPxgcVA==} dependencies: @@ -9647,6 +9681,12 @@ packages: prelude-ls: 1.2.1 type-check: 0.4.0 + /lie@3.3.0: + resolution: {integrity: sha512-UaiMJzeWRlEujzAuw5LokY1L5ecNQYZKfmyZ9L7wDHb/p5etKaxXhohBcrw0EYby+G/NA52vRSN4N39dxHAIwQ==} + dependencies: + immediate: 3.0.6 + dev: false + /lilconfig@2.1.0: resolution: {integrity: sha512-utWOt/GHzuUxnLKxB6dk81RoOeoNeHgbrXiuGk4yyF5qlRz+iIVWu56E2fqGHFrXz0QNUhLB/8nKqvRH66JKGQ==} engines: {node: '>=10'} @@ -9768,6 +9808,14 @@ packages: dependencies: js-tokens: 4.0.0 + /lop@0.4.1: + resolution: {integrity: sha512-9xyho9why2A2tzm5aIcMWKvzqKsnxrf9B5I+8O30olh6lQU8PH978LqZoI4++37RBgS1Em5i54v1TFs/3wnmXQ==} + dependencies: + duck: 0.1.12 + option: 0.2.4 + underscore: 1.13.6 + dev: false + /lower-case-first@1.0.2: resolution: {integrity: sha512-UuxaYakO7XeONbKrZf5FEgkantPf5DUqDayzP5VXZrtRPdH86s4kN47I8B3TW10S4QKiE3ziHNf3kRN//okHjA==} dependencies: @@ -9845,6 +9893,23 @@ packages: tmpl: 1.0.5 dev: true + /mammoth@1.6.0: + resolution: {integrity: sha512-jOwbj6BwJzxCf6jr2l1zmSemniIkLnchvELXnDJCANlJawhzyIKObIq48B8kWEPLgUUh57k7FtEO3DHFQMnjMg==} + engines: {node: '>=12.0.0'} + hasBin: true + dependencies: + '@xmldom/xmldom': 0.8.10 + argparse: 1.0.10 + base64-js: 1.5.1 + bluebird: 3.4.7 + dingbat-to-unicode: 1.0.1 + jszip: 3.10.1 + lop: 0.4.1 + path-is-absolute: 1.0.1 + underscore: 1.13.6 + xmlbuilder: 10.1.1 + dev: false + /map-obj@1.0.1: resolution: {integrity: sha512-7N/q3lyZ+LVCp7PzuxrJr4KMbBE2hW7BT7YNia330OFxIf4d3r5zVpicP2650l7CPN6RM9zOJRl3NGpqSiw3Eg==} engines: {node: '>=0.10.0'} @@ -10526,6 +10591,10 @@ packages: hasBin: true dev: false + /option@0.2.4: + resolution: {integrity: sha512-pkEqbDyl8ou5cpq+VsnQbe/WlEy5qS7xPzMS1U55OCG9KPvwFD46zDbxQIj3egJSFc3D+XhYOPUzz49zQAVy7A==} + dev: false + /optionator@0.9.3: resolution: {integrity: sha512-JjCoypp+jKn1ttEFExxhetCKeJt9zhAgAve5FXHixTvFDW/5aEktX9bufBKLRRMdU7bNtpLfcGu94B3cdEJgjg==} engines: {node: '>= 0.8.0'} @@ -10689,7 +10758,6 @@ packages: /pako@1.0.11: resolution: {integrity: sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw==} - dev: true /papaparse@5.4.1: resolution: {integrity: sha512-HipMsgJkZu8br23pW15uvo6sib6wne/4woLZPlFf3rpDyMe9ywEXUsuD7+6K9PRkJlVT51j/sCOYDKGGS3ZJrw==} @@ -13522,6 +13590,10 @@ packages: which-boxed-primitive: 1.0.2 dev: false + /underscore@1.13.6: + resolution: {integrity: sha512-+A5Sja4HP1M08MaXya7p5LvjuM7K6q/2EaC0+iovj/wOcMsTzMvDFbasi/oSapiwOlt252IqsKqPjCl7huKS0A==} + dev: false + /unherit@1.1.3: resolution: {integrity: sha512-Ft16BJcnapDKp0+J/rqFC3Rrk6Y/Ng4nzsC028k2jdDII/rdZ7Wd3pPT/6+vIIxRagwRc9K0IUX0Ra4fKvw+WQ==} dependencies: @@ -14289,6 +14361,11 @@ packages: sax: 1.2.4 dev: false + /xmlbuilder@10.1.1: + resolution: {integrity: sha512-OyzrcFLL/nb6fMGHbiRDuPup9ljBycsdCypwuyg5AAHvyWzGfChJpCXMG88AGTIMFhGZ9RccFN1e6lhg3hkwKg==} + engines: {node: '>=4.0'} + dev: false + /xtend@4.0.2: resolution: {integrity: sha512-LKYU1iAXJXUgAXn9URjiu+MWhyUXHsvfp7mcuYm9dSUKK0/CjtrUwFAxD82/mCWbtLsGjFIad0wIsod4zrTAEQ==} engines: {node: '>=0.4'} -- GitLab