From a221cfc11f0291c09700e718e5d40871ee9b1497 Mon Sep 17 00:00:00 2001
From: Huu Le <39040748+leehuwuj@users.noreply.github.com>
Date: Tue, 9 Jul 2024 15:33:11 +0700
Subject: [PATCH] feat: use LlamaParse for all the supported types (#154)

---
 .changeset/olive-knives-cheat.md            |  5 +++++
 templates/components/loaders/python/file.py | 22 ++++++++++++++++-----
 2 files changed, 22 insertions(+), 5 deletions(-)
 create mode 100644 .changeset/olive-knives-cheat.md

diff --git a/.changeset/olive-knives-cheat.md b/.changeset/olive-knives-cheat.md
new file mode 100644
index 00000000..f96091f8
--- /dev/null
+++ b/.changeset/olive-knives-cheat.md
@@ -0,0 +1,5 @@
+---
+"create-llama": patch
+---
+
+Use LlamaParse for all the file types that it supports (if activated)
diff --git a/templates/components/loaders/python/file.py b/templates/components/loaders/python/file.py
index 3baf7001..d8cd4f44 100644
--- a/templates/components/loaders/python/file.py
+++ b/templates/components/loaders/python/file.py
@@ -1,5 +1,6 @@
 import os
 import logging
+from typing import Dict
 from llama_parse import LlamaParse
 from pydantic import BaseModel, validator
 
@@ -32,13 +33,18 @@ def llama_parse_parser():
     return parser
 
 
+def llama_parse_extractor() -> Dict[str, LlamaParse]:
+    from llama_parse.utils import SUPPORTED_FILE_TYPES
+
+    parser = llama_parse_parser()
+    return {file_type: parser for file_type in SUPPORTED_FILE_TYPES}
+
+
 def get_file_documents(config: FileLoaderConfig):
     from llama_index.core.readers import SimpleDirectoryReader
 
     try:
-        reader = SimpleDirectoryReader(
-            config.data_dir, recursive=True, filename_as_id=True, raise_on_error=True
-        )
+        file_extractor = None
         if config.use_llama_parse:
             # LlamaParse is async first,
             # so we need to use nest_asyncio to run it in sync mode
@@ -46,8 +52,14 @@ def get_file_documents(config: FileLoaderConfig):
 
             nest_asyncio.apply()
 
-            parser = llama_parse_parser()
-            reader.file_extractor = {".pdf": parser}
+            file_extractor = llama_parse_extractor()
+        reader = SimpleDirectoryReader(
+            config.data_dir,
+            recursive=True,
+            filename_as_id=True,
+            raise_on_error=True,
+            file_extractor=file_extractor,
+        )
         return reader.load_data()
     except Exception as e:
         import sys, traceback
-- 
GitLab