Skip to content
Snippets Groups Projects
Unverified Commit b42493c6, authored by Timothy Carambat, committed by GitHub
Browse files

Split large PDFs into a subfolder in documents (#176)

Append a time value to the folder name so that duplicate uploads of the same file do not collide.
parent 6e8d81c0
No related branches found
No related tags found
No related merge requests found
import os import os, time
from langchain.document_loaders import PyPDFLoader from langchain.document_loaders import PyPDFLoader
from slugify import slugify from slugify import slugify
from ..utils import guid, file_creation_time, write_to_server_documents, move_source from ..utils import guid, file_creation_time, write_to_server_documents, move_source
...@@ -11,6 +11,7 @@ def as_pdf(**kwargs): ...@@ -11,6 +11,7 @@ def as_pdf(**kwargs):
ext = kwargs.get('ext', '.txt') ext = kwargs.get('ext', '.txt')
remove = kwargs.get('remove_on_complete', False) remove = kwargs.get('remove_on_complete', False)
fullpath = f"{parent_dir}/{filename}{ext}" fullpath = f"{parent_dir}/{filename}{ext}"
destination = f"../server/storage/documents/{slugify(filename)}-{int(time.time())}"
loader = PyPDFLoader(fullpath) loader = PyPDFLoader(fullpath)
pages = loader.load_and_split() pages = loader.load_and_split()
...@@ -31,7 +32,7 @@ def as_pdf(**kwargs): ...@@ -31,7 +32,7 @@ def as_pdf(**kwargs):
'pageContent': content, 'pageContent': content,
'token_count_estimate': len(tokenize(content)) 'token_count_estimate': len(tokenize(content))
} }
write_to_server_documents(data, f"{slugify(filename)}-pg{pg_num}-{data.get('id')}") write_to_server_documents(data, f"{slugify(filename)}-pg{pg_num}-{data.get('id')}", destination)
move_source(parent_dir, f"{filename}{ext}", remove=remove) move_source(parent_dir, f"{filename}{ext}", remove=remove)
print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n") print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
\ No newline at end of file
...@@ -28,8 +28,8 @@ def move_source(working_dir='hotdir', new_destination_filename='', failed=False, ...@@ -28,8 +28,8 @@ def move_source(working_dir='hotdir', new_destination_filename='', failed=False,
os.replace(f"{working_dir}/{new_destination_filename}", f"{destination}/{new_destination_filename}") os.replace(f"{working_dir}/{new_destination_filename}", f"{destination}/{new_destination_filename}")
return return
def write_to_server_documents(data, filename): def write_to_server_documents(data, filename, override_destination = None):
destination = f"../server/storage/documents/custom-documents" destination = f"../server/storage/documents/custom-documents" if override_destination == None else override_destination
if os.path.exists(destination) == False: os.makedirs(destination) if os.path.exists(destination) == False: os.makedirs(destination)
with open(f"{destination}/{filename}.json", 'w', encoding='utf-8') as file: with open(f"{destination}/{filename}.json", 'w', encoding='utf-8') as file:
json.dump(data, file, ensure_ascii=True, indent=4) json.dump(data, file, ensure_ascii=True, indent=4)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment