Skip to content
Snippets Groups Projects
Commit 692e3cc5 authored by Michael Tutty's avatar Michael Tutty
Browse files

Add HTMLReader to core/src/readers, apps/simple example, and apps/simple/data HTML file

parent bcfbccc3
No related branches found
No related tags found
No related merge requests found
...@@ -4,5 +4,11 @@ ...@@ -4,5 +4,11 @@
"editor.defaultFormatter": "esbenp.prettier-vscode", "editor.defaultFormatter": "esbenp.prettier-vscode",
"[xml]": { "[xml]": {
"editor.defaultFormatter": "redhat.vscode-xml" "editor.defaultFormatter": "redhat.vscode-xml"
},
"[typescript]": {
"editor.defaultFormatter": "vscode.typescript-language-features"
},
"[html]": {
"editor.defaultFormatter": "vscode.html-language-features"
} }
} }
This diff is collapsed.
import { VectorStoreIndex } from "llamaindex";
import { HTMLReader } from "llamaindex";
/**
 * Example entry point: loads a local HTML changelog with HTMLReader, indexes
 * the resulting documents in a VectorStoreIndex, asks one question against
 * that index, and prints the answer.
 */
async function main() {
  // Load the sample HTML page into documents.
  const htmlDocs = await new HTMLReader().loadData("data/18-1_Changelog.html");

  // Split text and create embeddings; they are stored in a VectorStoreIndex.
  const vectorIndex = await VectorStoreIndex.fromDocuments(htmlDocs);

  // Query the index through its query engine.
  const engine = vectorIndex.asQueryEngine();
  const answer = await engine.query("What were the notable changes in 18.1?");

  // Print the response text.
  console.log(answer.toString());
}

main().catch(console.error);
...@@ -9,7 +9,8 @@ ...@@ -9,7 +9,8 @@
"llamaindex": "workspace:*" "llamaindex": "workspace:*"
}, },
"devDependencies": { "devDependencies": {
"@types/node": "^18.18.6" "@types/node": "^18.18.6",
"ts-node": "^10.9.1"
}, },
"scripts": { "scripts": {
"lint": "eslint ." "lint": "eslint ."
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
"portkey-ai": "^0.1.13", "portkey-ai": "^0.1.13",
"rake-modified": "^1.0.8", "rake-modified": "^1.0.8",
"replicate": "^0.20.1", "replicate": "^0.20.1",
"string-strip-html": "^8.5.0",
"tiktoken": "^1.0.10", "tiktoken": "^1.0.10",
"uuid": "^9.0.1", "uuid": "^9.0.1",
"wink-nlp": "^1.14.3" "wink-nlp": "^1.14.3"
......
...@@ -18,6 +18,7 @@ export * from "./readers/CSVReader"; ...@@ -18,6 +18,7 @@ export * from "./readers/CSVReader";
export * from "./readers/MarkdownReader"; export * from "./readers/MarkdownReader";
export * from "./readers/NotionReader"; export * from "./readers/NotionReader";
export * from "./readers/PDFReader"; export * from "./readers/PDFReader";
export * from "./readers/HTMLReader";
export * from "./readers/SimpleDirectoryReader"; export * from "./readers/SimpleDirectoryReader";
export * from "./Response"; export * from "./Response";
export * from "./ResponseSynthesizer"; export * from "./ResponseSynthesizer";
......
import { stripHtml } from "string-strip-html";
import { Document } from "../Node";
import { GenericFileSystem } from "../storage/FileSystem";
import { DEFAULT_FS } from "../storage/constants";
import { BaseReader } from "./base";
/**
 * Extract the significant text from an arbitrary HTML document.
 * The contents of any head, script, style, and xml tags are removed completely.
 * All other tags are removed, and the inner text is kept intact.
 * HTML entities (e.g., `&amp;`) are not decoded.
 * The URLs of a[href] tags are NOT currently preserved: the callback that
 * would keep them is disabled in getOptions().
 */
export class HTMLReader implements BaseReader {
  /**
   * Public method for this reader.
   * Required by BaseReader interface.
   * @param file Path/name of the file to be loaded.
   * @param fs fs wrapper interface for getting the file content.
   * @returns A Promise yielding a single-element array holding the Document
   * built from the stripped HTML content of the specified file; the file
   * path is used as the Document id.
   */
  async loadData(
    file: string,
    fs: GenericFileSystem = DEFAULT_FS,
  ): Promise<Document[]> {
    const rawHtml = await fs.readFile(file, "utf-8");
    const htmlOptions = this.getOptions();
    const content = this.parseContent(rawHtml, htmlOptions);
    return [new Document({ text: content, id_: file })];
  }

  /**
   * Wrapper for string-strip-html usage.
   * @param html Raw HTML content to be parsed.
   * @param options An object of options for the underlying library.
   * Defaults to an empty object, i.e. the library's own defaults.
   * @see getOptions
   * @returns The HTML content, stripped of unwanted tags and attributes
   */
  // The parameter type is left as `any` so existing callers and overrides
  // that pass arbitrary option objects keep compiling.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  parseContent(html: string, options: any = {}): string {
    // Forward the options: previously they were accepted but never passed on,
    // so the configuration from getOptions() had no effect.
    return stripHtml(html, options).result;
  }

  /**
   * Wrapper for our configuration options passed to string-strip-html library
   * @see https://codsen.com/os/string-strip-html/examples
   * @returns An object of options for the underlying library
   */
  getOptions() {
    return {
      skipHtmlDecoding: true,
      stripTogetherWithTheirContents: [
        "script", // default
        "style", // default
        "xml", // default
        "head", // <-- custom-added
      ],
      // Keep the URLs for embedded links (currently disabled).
      // NOTE(review): the library's documented callback appears to receive a
      // single object argument rather than positional arguments - confirm
      // against the string-strip-html docs before re-enabling.
      // cb: (tag: any, deleteFrom: number, deleteTo: number, insert: string, rangesArr: any, proposedReturn: string) => {
      //   let temp;
      //   if (
      //     tag.name === "a" &&
      //     tag.attributes &&
      //     tag.attributes.some((attr: any) => {
      //       if (attr.name === "href") {
      //         temp = attr.value;
      //         return true;
      //       }
      //     })
      //   ) {
      //     rangesArr.push([deleteFrom, deleteTo, `${temp} ${insert || ""}`]);
      //   } else {
      //     rangesArr.push(proposedReturn);
      //   }
      // },
    };
  }
}
...@@ -119,6 +119,9 @@ importers: ...@@ -119,6 +119,9 @@ importers:
'@types/node': '@types/node':
specifier: ^18.18.6 specifier: ^18.18.6
version: 18.18.6 version: 18.18.6
ts-node:
specifier: ^10.9.1
version: 10.9.1(@types/node@18.18.6)(typescript@4.9.5)
   
packages/core: packages/core:
dependencies: dependencies:
...@@ -161,6 +164,9 @@ importers: ...@@ -161,6 +164,9 @@ importers:
replicate: replicate:
specifier: ^0.20.1 specifier: ^0.20.1
version: 0.20.1 version: 0.20.1
string-strip-html:
specifier: ^8.5.0
version: 8.5.0
tiktoken: tiktoken:
specifier: ^1.0.10 specifier: ^1.0.10
version: 1.0.10 version: 1.0.10
...@@ -366,7 +372,7 @@ packages: ...@@ -366,7 +372,7 @@ packages:
engines: {node: '>=6.0.0'} engines: {node: '>=6.0.0'}
dependencies: dependencies:
'@jridgewell/gen-mapping': 0.3.3 '@jridgewell/gen-mapping': 0.3.3
'@jridgewell/trace-mapping': 0.3.19 '@jridgewell/trace-mapping': 0.3.20
   
/@anthropic-ai/sdk@0.8.0: /@anthropic-ai/sdk@0.8.0:
resolution: {integrity: sha512-1PmmPjztnl4ioGK5r33HHneSr2Hs4B6PObpvMpIAdeqbvj8t9wq8VBPvxPzmcIS2aC73dTUgAh7z5FTSJeZmOQ==} resolution: {integrity: sha512-1PmmPjztnl4ioGK5r33HHneSr2Hs4B6PObpvMpIAdeqbvj8t9wq8VBPvxPzmcIS2aC73dTUgAh7z5FTSJeZmOQ==}
...@@ -3486,7 +3492,7 @@ packages: ...@@ -3486,7 +3492,7 @@ packages:
resolution: {integrity: sha512-UTYAUj/wviwdsMfzoSJspJxbkH5o1snzwX0//0ENX1u/55kkZZkcTZP6u9bwKGkv+dkk9at4m1Cpt0uY80kcpQ==} resolution: {integrity: sha512-UTYAUj/wviwdsMfzoSJspJxbkH5o1snzwX0//0ENX1u/55kkZZkcTZP6u9bwKGkv+dkk9at4m1Cpt0uY80kcpQ==}
dependencies: dependencies:
'@jridgewell/gen-mapping': 0.3.3 '@jridgewell/gen-mapping': 0.3.3
'@jridgewell/trace-mapping': 0.3.19 '@jridgewell/trace-mapping': 0.3.20
   
/@jridgewell/sourcemap-codec@1.4.15: /@jridgewell/sourcemap-codec@1.4.15:
resolution: {integrity: sha512-eF2rxCRulEKXHTRiDrDy6erMYWqNw4LPdQ8UQA4huuxaQsVeRPFl2oM8oDGxMFhJUWZf9McpLtJasDDZb/Bpeg==} resolution: {integrity: sha512-eF2rxCRulEKXHTRiDrDy6erMYWqNw4LPdQ8UQA4huuxaQsVeRPFl2oM8oDGxMFhJUWZf9McpLtJasDDZb/Bpeg==}
...@@ -13071,6 +13077,11 @@ packages: ...@@ -13071,6 +13077,11 @@ packages:
strip-ansi: 6.0.1 strip-ansi: 6.0.1
dev: true dev: true
   
/string-strip-html@8.5.0:
resolution: {integrity: sha512-5ICsK1B1j0A3AF1d45m0sqQCcmi1Q+t1QpF+b794LO5FTHV+ITkGR5C+UCDJQZgs5LMuRruqr6j48PxQVIurJQ==}
engines: {node: '>=14'}
dev: false
/string-width@4.2.3: /string-width@4.2.3:
resolution: {integrity: sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==} resolution: {integrity: sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==}
engines: {node: '>=8'} engines: {node: '>=8'}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment