Skip to content
Snippets Groups Projects
Unverified commit 645fcf6c, authored by ezirmusitua and committed by GitHub
Browse files

fix: use sha256 hash value as the `Document.id_` in `MarkdownReader` (#768)


Co-authored-by: Alex Yang <himself65@outlook.com>
parent e37fa5d9
No related branches found
No related tags found
No related merge requests found
......@@ -95,16 +95,16 @@ export class MarkdownReader implements FileReader {
const content = await fs.readFile(file);
const tups = this.parseTups(content);
const results: Document[] = [];
let counter = 0;
for (const [header, value] of tups) {
const id_ = `${file}_${counter}`;
if (header) {
results.push(
new Document({
text: `\n\n${header}\n${value}`,
}),
);
const text = `\n\n${header}\n${value}`;
results.push(new Document({ text, id_ }));
} else {
results.push(new Document({ text: value }));
results.push(new Document({ text: value, id_ }));
}
counter += 1;
}
return results;
}
......
import type { GenericFileSystem } from "@llamaindex/env";
import { createSHA256, defaultFS } from "@llamaindex/env";
import { defaultFS } from "@llamaindex/env";
import { Document } from "../Node.js";
import type { BaseReader } from "./type.js";
......@@ -13,13 +13,9 @@ export class PDFReader implements BaseReader {
): Promise<Document[]> {
const content = await fs.readRawFile(file);
const text = await readPDF(content);
return text.map((text) => {
const sha256 = createSHA256();
sha256.update(text);
return new Document({
text,
id_: sha256.digest(),
});
return text.map((text, page) => {
const id_ = `${file}_${page}`;
return new Document({ text, id_ });
});
}
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment