From 645fcf6c24e70ce110a803a4006a3b54ae2f74d2 Mon Sep 17 00:00:00 2001
From: ezirmusitua <jferroal@gmail.com>
Date: Tue, 7 May 2024 11:07:39 +0800
Subject: [PATCH] fix: use file path plus index as the `Document.id_` in
 `MarkdownReader` and `PDFReader`, replacing the sha256 hash (#768)

Co-authored-by: Alex Yang <himself65@outlook.com>
---
 packages/core/src/readers/MarkdownReader.ts | 12 ++++++------
 packages/core/src/readers/PDFReader.ts      | 12 ++++--------
 2 files changed, 10 insertions(+), 14 deletions(-)

diff --git a/packages/core/src/readers/MarkdownReader.ts b/packages/core/src/readers/MarkdownReader.ts
index 90b2c7843..a747497b8 100644
--- a/packages/core/src/readers/MarkdownReader.ts
+++ b/packages/core/src/readers/MarkdownReader.ts
@@ -95,16 +95,16 @@ export class MarkdownReader implements FileReader {
     const content = await fs.readFile(file);
     const tups = this.parseTups(content);
     const results: Document[] = [];
+    let counter = 0;
     for (const [header, value] of tups) {
+      const id_ = `${file}_${counter}`;
       if (header) {
-        results.push(
-          new Document({
-            text: `\n\n${header}\n${value}`,
-          }),
-        );
+        const text = `\n\n${header}\n${value}`;
+        results.push(new Document({ text, id_ }));
       } else {
-        results.push(new Document({ text: value }));
+        results.push(new Document({ text: value, id_ }));
       }
+      counter += 1;
     }
     return results;
   }
diff --git a/packages/core/src/readers/PDFReader.ts b/packages/core/src/readers/PDFReader.ts
index 46b3c08f3..ee1b1b2f7 100644
--- a/packages/core/src/readers/PDFReader.ts
+++ b/packages/core/src/readers/PDFReader.ts
@@ -1,5 +1,5 @@
 import type { GenericFileSystem } from "@llamaindex/env";
-import { createSHA256, defaultFS } from "@llamaindex/env";
+import { defaultFS } from "@llamaindex/env";
 import { Document } from "../Node.js";
 import type { BaseReader } from "./type.js";
 
@@ -13,13 +13,9 @@ export class PDFReader implements BaseReader {
   ): Promise<Document[]> {
     const content = await fs.readRawFile(file);
     const text = await readPDF(content);
-    return text.map((text) => {
-      const sha256 = createSHA256();
-      sha256.update(text);
-      return new Document({
-        text,
-        id_: sha256.digest(),
-      });
+    return text.map((text, page) => {
+      const id_ = `${file}_${page}`;
+      return new Document({ text, id_ });
     });
   }
 }
-- 
GitLab