From a221cfc11f0291c09700e718e5d40871ee9b1497 Mon Sep 17 00:00:00 2001
From: Huu Le <39040748+leehuwuj@users.noreply.github.com>
Date: Tue, 9 Jul 2024 15:33:11 +0700
Subject: [PATCH] feat: use LlamaParse for all the supported types (#154)

---
 .changeset/olive-knives-cheat.md            |  5 +++++
 templates/components/loaders/python/file.py | 22 ++++++++++++++++-----
 2 files changed, 22 insertions(+), 5 deletions(-)
 create mode 100644 .changeset/olive-knives-cheat.md

diff --git a/.changeset/olive-knives-cheat.md b/.changeset/olive-knives-cheat.md
new file mode 100644
index 00000000..f96091f8
--- /dev/null
+++ b/.changeset/olive-knives-cheat.md
@@ -0,0 +1,5 @@
+---
+"create-llama": patch
+---
+
+Use LlamaParse for all the file types that it supports (if activated)
diff --git a/templates/components/loaders/python/file.py b/templates/components/loaders/python/file.py
index 3baf7001..d8cd4f44 100644
--- a/templates/components/loaders/python/file.py
+++ b/templates/components/loaders/python/file.py
@@ -1,5 +1,6 @@
 import os
 import logging
+from typing import Dict
 from llama_parse import LlamaParse
 from pydantic import BaseModel, validator
 
@@ -32,13 +33,18 @@ def llama_parse_parser():
     return parser
 
 
+def llama_parse_extractor() -> Dict[str, LlamaParse]:
+    from llama_parse.utils import SUPPORTED_FILE_TYPES  # imported lazily, at call time
+
+    parser = llama_parse_parser()  # a single parser instance, shared by every entry
+    return {file_type: parser for file_type in SUPPORTED_FILE_TYPES}
+
+
 def get_file_documents(config: FileLoaderConfig):
     from llama_index.core.readers import SimpleDirectoryReader
 
     try:
-        reader = SimpleDirectoryReader(
-            config.data_dir, recursive=True, filename_as_id=True, raise_on_error=True
-        )
+        file_extractor = None
         if config.use_llama_parse:
             # LlamaParse is async first,
             # so we need to use nest_asyncio to run it in sync mode
@@ -46,8 +52,14 @@ def get_file_documents(config: FileLoaderConfig):
 
             nest_asyncio.apply()
 
-            parser = llama_parse_parser()
-            reader.file_extractor = {".pdf": parser}
+            file_extractor = llama_parse_extractor()
+        reader = SimpleDirectoryReader(
+            config.data_dir,
+            recursive=True,
+            filename_as_id=True,
+            raise_on_error=True,
+            file_extractor=file_extractor,
+        )
         return reader.load_data()
     except Exception as e:
         import sys, traceback
-- 
GitLab