From 8f8ee28ba0bb426b77cc48432df30cc418044b62 Mon Sep 17 00:00:00 2001
From: Jayanta Samaddar <jayanta@zenius.one>
Date: Sat, 23 Sep 2023 06:23:17 +0530
Subject: [PATCH] Added DocxReader, adding support for reading .docx files.
 Made changes to relevant docs as well.

---
 apps/docs/docs/introduction.md                |  2 +-
 .../docs/modules/high_level/data_loader.md    |  2 +-
 packages/core/package.json                    |  1 +
 packages/core/src/readers/DocxReader.ts       | 17 ++++
 .../core/src/readers/SimpleDirectoryReader.ts |  6 +-
 pnpm-lock.yaml                                | 81 ++++++++++++++++++-
 6 files changed, 103 insertions(+), 6 deletions(-)
 create mode 100644 packages/core/src/readers/DocxReader.ts

diff --git a/apps/docs/docs/introduction.md b/apps/docs/docs/introduction.md
index 9454de8db..5dc61843c 100644
--- a/apps/docs/docs/introduction.md
+++ b/apps/docs/docs/introduction.md
@@ -19,7 +19,7 @@ That's where **LlamaIndex.TS** comes in.
 
 LlamaIndex.TS provides the following tools:
 
-- **Data loading** ingest your existing `txt` and `pdf` data directly
+- **Data loading** ingest your existing `.txt`, `.pdf`, `.csv`, `.md` and `.docx` data directly
 - **Data indexes** structure your data in intermediate representations that are easy and performant for LLMs to consume.
 - **Engines** provide natural language access to your data. For example:
   - Query engines are powerful retrieval interfaces for knowledge-augmented output.
diff --git a/apps/docs/docs/modules/high_level/data_loader.md b/apps/docs/docs/modules/high_level/data_loader.md
index e2d4ec55b..977f2f57d 100644
--- a/apps/docs/docs/modules/high_level/data_loader.md
+++ b/apps/docs/docs/modules/high_level/data_loader.md
@@ -4,7 +4,7 @@ sidebar_position: 1
 
 # Reader / Loader
 
-LlamaIndex.TS supports easy loading of files from folders using the `SimpleDirectoryReader` class. Currently, `.txt` and `.pdf` files are supported, with more planned in the future!
+LlamaIndex.TS supports easy loading of files from folders using the `SimpleDirectoryReader` class. Currently, `.txt`, `.pdf`, `.csv`, `.md` and `.docx` files are supported, with more planned in the future!
 
 ```typescript
 import { SimpleDirectoryReader } from "llamaindex";
diff --git a/packages/core/package.json b/packages/core/package.json
index 11e0a5053..b6681fa77 100644
--- a/packages/core/package.json
+++ b/packages/core/package.json
@@ -5,6 +5,7 @@
     "@anthropic-ai/sdk": "^0.6.2",
     "@notionhq/client": "^2.2.13",
     "lodash": "^4.17.21",
+    "mammoth": "^1.6.0",
     "md-utils-ts": "^2.0.0",
     "mongodb": "^6.0.0",
     "notion-md-crawler": "^0.0.2",
diff --git a/packages/core/src/readers/DocxReader.ts b/packages/core/src/readers/DocxReader.ts
new file mode 100644
index 000000000..fae5b696d
--- /dev/null
+++ b/packages/core/src/readers/DocxReader.ts
@@ -0,0 +1,17 @@
+import mammoth from "mammoth";
+import { Document } from "../Node";
+import { DEFAULT_FS } from "../storage/constants";
+import { GenericFileSystem } from "../storage/FileSystem";
+import { BaseReader } from "./base";
+
+/** Loads a .docx file and returns its mammoth-extracted raw text as one Document. */
+export class DocxReader implements BaseReader {
+  async loadData(
+    file: string,
+    fs: GenericFileSystem = DEFAULT_FS,
+  ): Promise<Document[]> {
+    const buffer = (await fs.readFile(file)) as any;
+    const extracted = await mammoth.extractRawText({ buffer });
+    return [new Document({ text: extracted.value, id_: file })];
+  }
+}
diff --git a/packages/core/src/readers/SimpleDirectoryReader.ts b/packages/core/src/readers/SimpleDirectoryReader.ts
index 828ded5ba..8b0df23b0 100644
--- a/packages/core/src/readers/SimpleDirectoryReader.ts
+++ b/packages/core/src/readers/SimpleDirectoryReader.ts
@@ -1,11 +1,12 @@
 import _ from "lodash";
 import { Document } from "../Node";
-import { CompleteFileSystem, walk } from "../storage/FileSystem";
 import { DEFAULT_FS } from "../storage/constants";
+import { CompleteFileSystem, walk } from "../storage/FileSystem";
+import { BaseReader } from "./base";
 import { PapaCSVReader } from "./CSVReader";
+import { DocxReader } from "./DocxReader";
 import { MarkdownReader } from "./MarkdownReader";
 import { PDFReader } from "./PDFReader";
-import { BaseReader } from "./base";
 
 /**
  * Read a .txt file
@@ -25,6 +26,7 @@ const FILE_EXT_TO_READER: Record<string, BaseReader> = {
   pdf: new PDFReader(),
   csv: new PapaCSVReader(),
   md: new MarkdownReader(),
+  docx: new DocxReader(),
 };
 
 export type SimpleDirectoryReaderLoadDataProps = {
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 725611a7f..2966268d6 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -128,6 +128,9 @@ importers:
       lodash:
         specifier: ^4.17.21
         version: 4.17.21
+      mammoth:
+        specifier: ^1.6.0
+        version: 1.6.0
       md-utils-ts:
         specifier: ^2.0.0
         version: 2.0.0
@@ -4544,6 +4547,11 @@ packages:
       '@webassemblyjs/ast': 1.11.6
       '@xtuc/long': 4.2.2
 
+  /@xmldom/xmldom@0.8.10:
+    resolution: {integrity: sha512-2WALfTl4xo2SkGCYRt6rDTFfk9R1czmBvUQy12gK2KuRKIpWEhcbbzy8EZXtz/jkRqHX8bFEc6FC1HjX4TUWYw==}
+    engines: {node: '>=10.0.0'}
+    dev: false
+
   /@xtuc/ieee754@1.2.0:
     resolution: {integrity: sha512-DX8nKgqcGwsc0eJSqYt5lwP4DH5FlHnmuWWBRy7X0NcaGR0ZtuyeESgMwTYVEtxmsNGY+qit4QYT/MIYTOTPeA==}
 
@@ -5095,7 +5103,6 @@ packages:
 
   /base64-js@1.5.1:
     resolution: {integrity: sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==}
-    dev: true
 
   /basic-ftp@5.0.3:
     resolution: {integrity: sha512-QHX8HLlncOLpy54mh+k/sWIFd0ThmRqwe9ZjELybGZK+tZ8rUb9VO0saKJUROTbE+KhzDUT7xziGpGrW8Kmd+g==}
@@ -5134,6 +5141,10 @@ packages:
       readable-stream: 3.6.2
     dev: true
 
+  /bluebird@3.4.7:
+    resolution: {integrity: sha512-iD3898SR7sWVRHbiQv+sHUtHnMvC1o3nW5rAcqnq3uOn07DSAppZYUkIGslDz6gXC7HfunPe7YVBgoEJASPcHA==}
+    dev: false
+
   /bn.js@4.12.0:
     resolution: {integrity: sha512-c98Bf3tPniI+scsdk237ku1Dc3ujXQTSgyiPUDEOe7tRkhrqridvh8klBv0HCEso1OLOYcHuCv/cS6DNxKH+ZA==}
     dev: true
@@ -6535,6 +6546,10 @@ packages:
       md5: 2.3.0
     dev: false
 
+  /dingbat-to-unicode@1.0.1:
+    resolution: {integrity: sha512-98l0sW87ZT58pU4i61wa2OHwxbiYSbuxsCBozaVnYX2iCnr3bLM3fIes1/ej7h1YdOKuKt/MLs706TVnALA65w==}
+    dev: false
+
   /dir-glob@3.0.1:
     resolution: {integrity: sha512-WkrWp9GR4KXfKGYzOLmTuGVi1UWFfws377n9cc55/tb6DuqyF6pcQ5AbiHEshaDpY9v6oaSr2XCDidGmMwdzIA==}
     engines: {node: '>=8'}
@@ -6656,6 +6671,12 @@ packages:
       is-obj: 2.0.0
     dev: false
 
+  /duck@0.1.12:
+    resolution: {integrity: sha512-wkctla1O6VfP89gQ+J/yDesM0S7B7XLXjKGzXxMDVFg7uEn706niAtyYovKbyq1oT9YwDcly721/iUWoc8MVRg==}
+    dependencies:
+      underscore: 1.13.6
+    dev: false
+
   /duplexer3@0.1.5:
     resolution: {integrity: sha512-1A8za6ws41LQgv9HrE/66jyC5yuSjQ3L/KOpFtoBilsAK2iA2wuS5rTt1OCzIvtS2V7nVmedsUU+DGRcjBmOYA==}
     dev: false
@@ -8424,6 +8445,10 @@ packages:
       queue: 6.0.2
     dev: false
 
+  /immediate@3.0.6:
+    resolution: {integrity: sha512-XXOFtyqDjNDAQxVfYxuF7g9Il/IbWmmlQg2MYKOH8ExIT1qg6xc4zyS3HaEEATgs1btfzxq15ciUiY7gjSXRGQ==}
+    dev: false
+
   /immer@9.0.21:
     resolution: {integrity: sha512-bc4NBHqOqSfRW7POMkHd51LvClaeMXpm8dx0e8oE2GORbq5aRK7Bxl4FyzVLdGtLmvLKL7BTDBG5ACQm4HWjTA==}
     dev: false
@@ -9588,6 +9613,15 @@ packages:
       object.assign: 4.1.4
     dev: false
 
+  /jszip@3.10.1:
+    resolution: {integrity: sha512-xXDvecyTpGLrqFrvkrUSoxxfJI5AH7U8zxxtVclpsUtMCq4JQ290LY8AW5c7Ggnr/Y/oK+bQMbqK2qmtk3pN4g==}
+    dependencies:
+      lie: 3.3.0
+      pako: 1.0.11
+      readable-stream: 2.3.8
+      setimmediate: 1.0.5
+    dev: false
+
   /keyv@3.1.0:
     resolution: {integrity: sha512-9ykJ/46SN/9KPM/sichzQ7OvXyGDYKGTaDlKMGCAlg2UK8KRy4jb0d8sFc+0Tt0YYnThq8X2RZgCg74RPxgcVA==}
     dependencies:
@@ -9647,6 +9681,12 @@ packages:
       prelude-ls: 1.2.1
       type-check: 0.4.0
 
+  /lie@3.3.0:
+    resolution: {integrity: sha512-UaiMJzeWRlEujzAuw5LokY1L5ecNQYZKfmyZ9L7wDHb/p5etKaxXhohBcrw0EYby+G/NA52vRSN4N39dxHAIwQ==}
+    dependencies:
+      immediate: 3.0.6
+    dev: false
+
   /lilconfig@2.1.0:
     resolution: {integrity: sha512-utWOt/GHzuUxnLKxB6dk81RoOeoNeHgbrXiuGk4yyF5qlRz+iIVWu56E2fqGHFrXz0QNUhLB/8nKqvRH66JKGQ==}
     engines: {node: '>=10'}
@@ -9768,6 +9808,14 @@ packages:
     dependencies:
       js-tokens: 4.0.0
 
+  /lop@0.4.1:
+    resolution: {integrity: sha512-9xyho9why2A2tzm5aIcMWKvzqKsnxrf9B5I+8O30olh6lQU8PH978LqZoI4++37RBgS1Em5i54v1TFs/3wnmXQ==}
+    dependencies:
+      duck: 0.1.12
+      option: 0.2.4
+      underscore: 1.13.6
+    dev: false
+
   /lower-case-first@1.0.2:
     resolution: {integrity: sha512-UuxaYakO7XeONbKrZf5FEgkantPf5DUqDayzP5VXZrtRPdH86s4kN47I8B3TW10S4QKiE3ziHNf3kRN//okHjA==}
     dependencies:
@@ -9845,6 +9893,23 @@ packages:
       tmpl: 1.0.5
     dev: true
 
+  /mammoth@1.6.0:
+    resolution: {integrity: sha512-jOwbj6BwJzxCf6jr2l1zmSemniIkLnchvELXnDJCANlJawhzyIKObIq48B8kWEPLgUUh57k7FtEO3DHFQMnjMg==}
+    engines: {node: '>=12.0.0'}
+    hasBin: true
+    dependencies:
+      '@xmldom/xmldom': 0.8.10
+      argparse: 1.0.10
+      base64-js: 1.5.1
+      bluebird: 3.4.7
+      dingbat-to-unicode: 1.0.1
+      jszip: 3.10.1
+      lop: 0.4.1
+      path-is-absolute: 1.0.1
+      underscore: 1.13.6
+      xmlbuilder: 10.1.1
+    dev: false
+
   /map-obj@1.0.1:
     resolution: {integrity: sha512-7N/q3lyZ+LVCp7PzuxrJr4KMbBE2hW7BT7YNia330OFxIf4d3r5zVpicP2650l7CPN6RM9zOJRl3NGpqSiw3Eg==}
     engines: {node: '>=0.10.0'}
@@ -10526,6 +10591,10 @@ packages:
     hasBin: true
     dev: false
 
+  /option@0.2.4:
+    resolution: {integrity: sha512-pkEqbDyl8ou5cpq+VsnQbe/WlEy5qS7xPzMS1U55OCG9KPvwFD46zDbxQIj3egJSFc3D+XhYOPUzz49zQAVy7A==}
+    dev: false
+
   /optionator@0.9.3:
     resolution: {integrity: sha512-JjCoypp+jKn1ttEFExxhetCKeJt9zhAgAve5FXHixTvFDW/5aEktX9bufBKLRRMdU7bNtpLfcGu94B3cdEJgjg==}
     engines: {node: '>= 0.8.0'}
@@ -10689,7 +10758,6 @@ packages:
 
   /pako@1.0.11:
     resolution: {integrity: sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw==}
-    dev: true
 
   /papaparse@5.4.1:
     resolution: {integrity: sha512-HipMsgJkZu8br23pW15uvo6sib6wne/4woLZPlFf3rpDyMe9ywEXUsuD7+6K9PRkJlVT51j/sCOYDKGGS3ZJrw==}
@@ -13522,6 +13590,10 @@ packages:
       which-boxed-primitive: 1.0.2
     dev: false
 
+  /underscore@1.13.6:
+    resolution: {integrity: sha512-+A5Sja4HP1M08MaXya7p5LvjuM7K6q/2EaC0+iovj/wOcMsTzMvDFbasi/oSapiwOlt252IqsKqPjCl7huKS0A==}
+    dev: false
+
   /unherit@1.1.3:
     resolution: {integrity: sha512-Ft16BJcnapDKp0+J/rqFC3Rrk6Y/Ng4nzsC028k2jdDII/rdZ7Wd3pPT/6+vIIxRagwRc9K0IUX0Ra4fKvw+WQ==}
     dependencies:
@@ -14289,6 +14361,11 @@ packages:
       sax: 1.2.4
     dev: false
 
+  /xmlbuilder@10.1.1:
+    resolution: {integrity: sha512-OyzrcFLL/nb6fMGHbiRDuPup9ljBycsdCypwuyg5AAHvyWzGfChJpCXMG88AGTIMFhGZ9RccFN1e6lhg3hkwKg==}
+    engines: {node: '>=4.0'}
+    dev: false
+
   /xtend@4.0.2:
     resolution: {integrity: sha512-LKYU1iAXJXUgAXn9URjiu+MWhyUXHsvfp7mcuYm9dSUKK0/CjtrUwFAxD82/mCWbtLsGjFIad0wIsod4zrTAEQ==}
     engines: {node: '>=0.4'}
-- 
GitLab