diff --git a/.changeset/olive-knives-cheat.md b/.changeset/olive-knives-cheat.md new file mode 100644 index 0000000000000000000000000000000000000000..f96091f82418f0ebfbdadc6183cbfc0170df5afa --- /dev/null +++ b/.changeset/olive-knives-cheat.md @@ -0,0 +1,5 @@ +--- +"create-llama": patch +--- + +Use LlamaParse for all the file types that it supports (if activated) diff --git a/templates/components/loaders/python/file.py b/templates/components/loaders/python/file.py index 3baf70012efa6fd5e87d4559563836661a95e52c..d8cd4f4451c5be2c33af265312a6db9e8dad3c57 100644 --- a/templates/components/loaders/python/file.py +++ b/templates/components/loaders/python/file.py @@ -1,5 +1,6 @@ import os import logging +from typing import Dict from llama_parse import LlamaParse from pydantic import BaseModel, validator @@ -32,13 +33,18 @@ def llama_parse_parser(): return parser +def llama_parse_extractor() -> Dict[str, LlamaParse]: + from llama_parse.utils import SUPPORTED_FILE_TYPES + + parser = llama_parse_parser() + return {file_type: parser for file_type in SUPPORTED_FILE_TYPES} + + def get_file_documents(config: FileLoaderConfig): from llama_index.core.readers import SimpleDirectoryReader try: - reader = SimpleDirectoryReader( - config.data_dir, recursive=True, filename_as_id=True, raise_on_error=True - ) + file_extractor = None if config.use_llama_parse: # LlamaParse is async first, # so we need to use nest_asyncio to run it in sync mode @@ -46,8 +52,14 @@ def get_file_documents(config: FileLoaderConfig): nest_asyncio.apply() - parser = llama_parse_parser() - reader.file_extractor = {".pdf": parser} + file_extractor = llama_parse_extractor() + reader = SimpleDirectoryReader( + config.data_dir, + recursive=True, + filename_as_id=True, + raise_on_error=True, + file_extractor=file_extractor, + ) return reader.load_data() except Exception as e: import sys, traceback