diff --git a/.changeset/ninety-doors-impress.md b/.changeset/ninety-doors-impress.md new file mode 100644 index 0000000000000000000000000000000000000000..856d529f6ea4a93fd299ebfa330eb668978f3786 --- /dev/null +++ b/.changeset/ninety-doors-impress.md @@ -0,0 +1,5 @@ +--- +"llamaindex": patch +--- + +Add page number to read PDFs and use generated IDs for PDF and markdown content diff --git a/packages/core/src/readers/PDFReader.ts b/packages/core/src/readers/PDFReader.ts index ee1b1b2f709cf625a0dab2c37638dfbd74d856d9..659ed5134618ba5f179f578cb4fd952616aceac1 100644 --- a/packages/core/src/readers/PDFReader.ts +++ b/packages/core/src/readers/PDFReader.ts @@ -12,10 +12,13 @@ export class PDFReader implements BaseReader { fs: GenericFileSystem = defaultFS, ): Promise<Document[]> { const content = await fs.readRawFile(file); - const text = await readPDF(content); - return text.map((text, page) => { - const id_ = `${file}_${page}`; - return new Document({ text, id_ }); + const pages = await readPDF(content); + return pages.map((text, page) => { + const id_ = `${file}_${page + 1}`; + const metadata = { + page_number: page + 1, + }; + return new Document({ text, id_, metadata }); }); } }