Skip to content
Snippets Groups Projects
Unverified commit ce94780b, authored by Marcus Schiesser and committed by GitHub
Browse files

feat: add page number to read PDFs (#815)

parent 645fcf6c
No related branches found
No related tags found
No related merge requests found
---
"llamaindex": patch
---
Add page number to read PDFs and use generated IDs for PDF and markdown content
...@@ -12,10 +12,13 @@ export class PDFReader implements BaseReader { ...@@ -12,10 +12,13 @@ export class PDFReader implements BaseReader {
fs: GenericFileSystem = defaultFS, fs: GenericFileSystem = defaultFS,
): Promise<Document[]> { ): Promise<Document[]> {
const content = await fs.readRawFile(file); const content = await fs.readRawFile(file);
const text = await readPDF(content); const pages = await readPDF(content);
return text.map((text, page) => { return pages.map((text, page) => {
const id_ = `${file}_${page}`; const id_ = `${file}_${page + 1}`;
return new Document({ text, id_ }); const metadata = {
page_number: page + 1,
};
return new Document({ text, id_, metadata });
}); });
} }
} }
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment