diff --git a/collector/scripts/watch/convert/as_docx.py b/collector/scripts/watch/convert/as_docx.py index 6d16650e3e5ecf85a84914965d6547e06a3b94a7..33aaaaaeb0b9b619a6aee4c02f428f61e604db8e 100644 --- a/collector/scripts/watch/convert/as_docx.py +++ b/collector/scripts/watch/convert/as_docx.py @@ -16,6 +16,10 @@ def as_docx(**kwargs): data = loader.load()[0] content = data.page_content + if len(content) == 0: + print(f"Resulting text content was empty for {filename}{ext}.") + return(False, f"No text content found in {filename}{ext}") + print(f"-- Working {fullpath} --") data = { 'id': guid(), @@ -33,7 +37,9 @@ def as_docx(**kwargs): write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}") move_source(parent_dir, f"{filename}{ext}", remove=remove) + print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n") + return(True, None) def as_odt(**kwargs): parent_dir = kwargs.get('directory', 'hotdir') @@ -46,6 +52,10 @@ def as_odt(**kwargs): data = loader.load()[0] content = data.page_content + if len(content) == 0: + print(f"Resulting text content was empty for {filename}{ext}.") + return(False, f"No text content found in {filename}{ext}") + print(f"-- Working {fullpath} --") data = { 'id': guid(), @@ -63,4 +73,6 @@ def as_odt(**kwargs): write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}") move_source(parent_dir, f"{filename}{ext}", remove=remove) + print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n") + return(True, None) \ No newline at end of file diff --git a/collector/scripts/watch/convert/as_markdown.py b/collector/scripts/watch/convert/as_markdown.py index 3e1a3dba1a700b299544dc6bdcb00402c120675f..b7d9fbf6d274249a7f46a4cabb8b408c7f3be26f 100644 --- a/collector/scripts/watch/convert/as_markdown.py +++ b/collector/scripts/watch/convert/as_markdown.py @@ -16,6 +16,10 @@ def as_markdown(**kwargs): data = loader.load()[0] content = data.page_content + if len(content) == 0: + print(f"Resulting page content was empty - no text could be extracted from {filename}{ext}.") + return(False, f"No text could be extracted from {filename}{ext}.") + print(f"-- Working {fullpath} --") data = { 'id': guid(), @@ -33,4 +37,6 @@ def as_markdown(**kwargs): write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}") move_source(parent_dir, f"{filename}{ext}", remove=remove) + print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n") + return(True, None) diff --git a/collector/scripts/watch/convert/as_mbox.py b/collector/scripts/watch/convert/as_mbox.py index 96c9b25bb551726a6230a7f89334c6c19308a78a..048fd0e49051932a7f2318b7d19362dc4b486139 100644 --- a/collector/scripts/watch/convert/as_mbox.py +++ b/collector/scripts/watch/convert/as_mbox.py @@ -55,5 +55,7 @@ def as_mbox(**kwargs): } write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}") + move_source(parent_dir, f"{filename}{ext}", remove=remove) print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n") + return(True, None) diff --git a/collector/scripts/watch/convert/as_pdf.py b/collector/scripts/watch/convert/as_pdf.py index c5517332344a991fdd40a4e4890eadbee10fe97c..8fc1d1f3d2c80c952edb4db4a562fedfd956b84b 100644 --- a/collector/scripts/watch/convert/as_pdf.py +++ b/collector/scripts/watch/convert/as_pdf.py @@ -19,7 +19,7 @@ def as_pdf(**kwargs): if len(pages) == 0: print(f"{fullpath} parsing resulted in no pages - nothing to do.") - return False + return(False, f"No pages found for {filename}{ext}!") # Set doc to the first page so we can still get the metadata from PyMuPDF but without all the unicode issues. doc = pages[0] @@ -31,6 +31,10 @@ def as_pdf(**kwargs): print(f"-- Parsing content from pg {page.number} --") page_content += unidecode(page.get_text('text')) + if len(page_content) == 0: + print(f"Resulting page content was empty - no text could be extracted from the document.") + return(False, f"No text content could be extracted from {filename}{ext}!") + title = doc.metadata.get('title') author = doc.metadata.get('author') subject = doc.metadata.get('subject') @@ -50,4 +54,6 @@ def as_pdf(**kwargs): write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}") move_source(parent_dir, f"{filename}{ext}", remove=remove) + print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n") + return(True, None) diff --git a/collector/scripts/watch/convert/as_text.py b/collector/scripts/watch/convert/as_text.py index f3389c849c4ed85929668a882df3a3a09689ace8..e6ad85140d88e076ca8f57edccdd64a117c2d6cf 100644 --- a/collector/scripts/watch/convert/as_text.py +++ b/collector/scripts/watch/convert/as_text.py @@ -12,6 +12,10 @@ def as_text(**kwargs): fullpath = f"{parent_dir}/{filename}{ext}" content = open(fullpath).read() + if len(content) == 0: + print(f"Resulting text content was empty for {filename}{ext}.") + return(False, f"No text content found in {filename}{ext}") + print(f"-- Working {fullpath} --") data = { 'id': guid(), @@ -28,4 +32,6 @@ def as_text(**kwargs): write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}") move_source(parent_dir, f"{filename}{ext}", remove=remove) + print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n") + return(True, None) diff --git a/collector/scripts/watch/process_single.py b/collector/scripts/watch/process_single.py index f41219eb3837c077cd6f47464dfae54f72a7c6ea..19fc3978d496642ebd5fc4732c2d58c8e01548cf 100644 --- a/collector/scripts/watch/process_single.py +++ b/collector/scripts/watch/process_single.py @@ -25,11 +25,11 @@ def process_single(directory, target_doc): move_source(new_destination_filename=target_doc, failed=True, remove=True) return (False, f"{fileext} not a supported file type for conversion. It will not be processed.") - FILETYPES[fileext]( + # Returns Tuple of (Boolean, String|None) of success status and possible error message. + # Error message will display to user. + return FILETYPES[fileext]( directory=directory, filename=filename, ext=fileext, remove_on_complete=True # remove source document to save disk space. - ) - - return (True, None) + ) \ No newline at end of file diff --git a/frontend/src/components/Modals/MangeWorkspace/Upload/FileUploadProgress/index.jsx b/frontend/src/components/Modals/MangeWorkspace/Upload/FileUploadProgress/index.jsx index 5883c4b2095e89745d951c877ef72bc7805a974f..8c47581c4f3b9e46f2c8e424a5e2c8dff562c6d9 100644 --- a/frontend/src/components/Modals/MangeWorkspace/Upload/FileUploadProgress/index.jsx +++ b/frontend/src/components/Modals/MangeWorkspace/Upload/FileUploadProgress/index.jsx @@ -14,7 +14,8 @@ function FileUploadProgressComponent({ onUploadError, }) { const [timerMs, setTimerMs] = useState(10); - const [status, setStatus] = useState(file?.rejected ? "uploading" : "failed"); + const [status, setStatus] = useState("pending"); + const [error, setError] = useState(""); useEffect(() => { async function uploadFile() { @@ -31,6 +32,7 @@ function FileUploadProgressComponent({ setStatus("failed"); clearInterval(timer); onUploadError(data.error); + setError(data.error); } else { setStatus("complete"); clearInterval(timer); @@ -58,6 +60,24 @@ function FileUploadProgressComponent({ ); } + if (status === "failed") { + return ( + <div className="w-fit px-2 py-2 flex items-center gap-x-4 rounded-lg bg-blue-100 border-blue-600 dark:bg-stone-800 bg-opacity-50 border dark:border-stone-600"> + <div className="w-6 h-6"> + <XCircle className="w-6 h-6 stroke-white bg-red-500 rounded-full p-1 w-full h-full" /> + </div> + <div className="flex flex-col"> + <p className="text-black dark:text-stone-200 text-sm font-mono overflow-x-scroll"> + {truncate(file.name, 30)} + </p> + <p className="text-red-700 dark:text-red-400 text-xs font-mono"> + {error} + </p> + </div> + </div> + ); + } + return ( <div className="w-fit px-2 py-2 flex items-center gap-x-4 rounded-lg bg-blue-100 border-blue-600 dark:bg-stone-800 bg-opacity-50 border dark:border-stone-600"> <div className="w-6 h-6"> @@ -77,6 +97,8 @@ function FileUploadProgressComponent({ </div> </div> ); + + return null; } export default memo(FileUploadProgressComponent);