From ce94780b956c35cc5ad51ef137c62243249f56fd Mon Sep 17 00:00:00 2001
From: Marcus Schiesser <mail@marcusschiesser.de>
Date: Tue, 7 May 2024 11:45:55 +0800
Subject: [PATCH] feat: add page number to read PDFs (#815)

---
 .changeset/ninety-doors-impress.md     |  5 +++++
 packages/core/src/readers/PDFReader.ts | 11 +++++++----
 2 files changed, 12 insertions(+), 4 deletions(-)
 create mode 100644 .changeset/ninety-doors-impress.md

diff --git a/.changeset/ninety-doors-impress.md b/.changeset/ninety-doors-impress.md
new file mode 100644
index 000000000..856d529f6
--- /dev/null
+++ b/.changeset/ninety-doors-impress.md
@@ -0,0 +1,5 @@
+---
+"llamaindex": patch
+---
+
+Add page number metadata to documents read from PDFs and use generated IDs for PDF and markdown content
diff --git a/packages/core/src/readers/PDFReader.ts b/packages/core/src/readers/PDFReader.ts
index ee1b1b2f7..659ed5134 100644
--- a/packages/core/src/readers/PDFReader.ts
+++ b/packages/core/src/readers/PDFReader.ts
@@ -12,10 +12,13 @@ export class PDFReader implements BaseReader {
     fs: GenericFileSystem = defaultFS,
   ): Promise<Document[]> {
     const content = await fs.readRawFile(file);
-    const text = await readPDF(content);
-    return text.map((text, page) => {
-      const id_ = `${file}_${page}`;
-      return new Document({ text, id_ });
+    const pages = await readPDF(content);
+    return pages.map((text, page) => {
+      const id_ = `${file}_${page + 1}`;
+      const metadata = {
+        page_number: page + 1,
+      };
+      return new Document({ text, id_, metadata });
     });
   }
 }
-- 
GitLab