Skip to content
Snippets Groups Projects
Unverified Commit 3e784767 authored by Timothy Carambat, committed by GitHub
Browse files

Franzbischoff document improvements (#241)


* cosmetic changes to be compatible with hadolint

* common configuration for most editors until better plugins come up

* Changes on PDF metadata, using PyMuPDF (faster and more compatible)

* small changes on other file ingestions in order to try to keep the fields equal

* Lint, review, and review

* fixed unknown chars

* Use PyMuPDF for pdf loading for 200% speed increase
linting

---------

Co-authored-by: Francisco Bischoff <franzbischoff@gmail.com>
Co-authored-by: Francisco Bischoff <984592+franzbischoff@users.noreply.github.com>
parent bb822a8a
No related branches found
No related tags found
No related merge requests found
# EditorConfig is awesome: https://EditorConfig.org
# top-most EditorConfig file
root = true
[*]
indent_style = space
indent_size = 2
end_of_line = lf
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
...@@ -54,6 +54,7 @@ mypy-extensions==1.0.0 ...@@ -54,6 +54,7 @@ mypy-extensions==1.0.0
nltk==3.8.1 nltk==3.8.1
numexpr==2.8.4 numexpr==2.8.4
numpy==1.23.5 numpy==1.23.5
oauthlib==3.2.2
olefile==0.46 olefile==0.46
openapi-schema-pydantic==1.2.4 openapi-schema-pydantic==1.2.4
openpyxl==3.1.2 openpyxl==3.1.2
...@@ -68,8 +69,8 @@ pycparser==2.21 ...@@ -68,8 +69,8 @@ pycparser==2.21
pydantic==1.10.8 pydantic==1.10.8
pyee==8.2.2 pyee==8.2.2
Pygments==2.15.1 Pygments==2.15.1
PyMuPDF==1.22.5
pypandoc==1.4 pypandoc==1.4
pypdf==3.9.0
pyppeteer==1.0.2 pyppeteer==1.0.2
pyquery==2.0.0 pyquery==2.0.0
python-dateutil==2.8.2 python-dateutil==2.8.2
...@@ -83,6 +84,7 @@ PyYAML==6.0 ...@@ -83,6 +84,7 @@ PyYAML==6.0
regex==2023.5.5 regex==2023.5.5
requests==2.31.0 requests==2.31.0
requests-html==0.10.0 requests-html==0.10.0
requests-oauthlib==1.3.1
rfc3986==1.5.0 rfc3986==1.5.0
rich==13.0.1 rich==13.0.1
six==1.16.0 six==1.16.0
...@@ -94,9 +96,11 @@ tenacity==8.2.2 ...@@ -94,9 +96,11 @@ tenacity==8.2.2
text-unidecode==1.3 text-unidecode==1.3
tiktoken==0.4.0 tiktoken==0.4.0
tqdm==4.65.0 tqdm==4.65.0
tweepy==4.14.0
typer==0.9.0 typer==0.9.0
typing-inspect==0.9.0 typing-inspect==0.9.0
typing_extensions==4.6.3 typing_extensions==4.6.3
Unidecode==1.3.6
unstructured==0.7.1 unstructured==0.7.1
urllib3==1.26.16 urllib3==1.26.16
uuid==1.30 uuid==1.30
...@@ -110,4 +114,3 @@ XlsxWriter==3.1.2 ...@@ -110,4 +114,3 @@ XlsxWriter==3.1.2
yarl==1.9.2 yarl==1.9.2
youtube-transcript-api==0.6.0 youtube-transcript-api==0.6.0
zipp==3.15.0 zipp==3.15.0
tweepy==4.14.0
...@@ -29,10 +29,10 @@ def gitbook(): ...@@ -29,10 +29,10 @@ def gitbook():
data = { data = {
'id': str(uuid4()), 'id': str(uuid4()),
'url': metadata.get('source'), 'url': metadata.get('source'),
"title": metadata.get('title'), 'title': metadata.get('title'),
"description": metadata.get('title'), 'description': metadata.get('title'),
"published": datetime.today().strftime('%Y-%m-%d %H:%M:%S'), 'published': datetime.today().strftime('%Y-%m-%d %H:%M:%S'),
"wordCount": len(content), 'wordCount': len(content),
'pageContent': content, 'pageContent': content,
'token_count_estimate': len(tokenize(content)) 'token_count_estimate': len(tokenize(content))
} }
......
...@@ -18,16 +18,19 @@ def as_docx(**kwargs): ...@@ -18,16 +18,19 @@ def as_docx(**kwargs):
print(f"-- Working {fullpath} --") print(f"-- Working {fullpath} --")
data = { data = {
'id': guid(), 'id': guid(),
'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"), 'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"),
'title': f"{filename}{ext}", 'title': f"{filename}{ext}",
'description': "a custom file uploaded by the user.", 'docAuthor': 'Unknown', # TODO: Find a better author
'description': 'Unknown', # TODO: Find a better description
'docSource': 'Docx Text file uploaded by the user.',
'chunkSource': f"{filename}{ext}",
'published': file_creation_time(fullpath), 'published': file_creation_time(fullpath),
'wordCount': len(content), 'wordCount': len(content),
'pageContent': content, 'pageContent': content,
'token_count_estimate': len(tokenize(content)) 'token_count_estimate': len(tokenize(content))
} }
write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}") write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}")
move_source(parent_dir, f"{filename}{ext}", remove=remove) move_source(parent_dir, f"{filename}{ext}", remove=remove)
print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n") print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
...@@ -45,16 +48,19 @@ def as_odt(**kwargs): ...@@ -45,16 +48,19 @@ def as_odt(**kwargs):
print(f"-- Working {fullpath} --") print(f"-- Working {fullpath} --")
data = { data = {
'id': guid(), 'id': guid(),
'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"), 'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"),
'title': f"{filename}{ext}", 'title': f"{filename}{ext}",
'description': "a custom file uploaded by the user.", 'author': 'Unknown', # TODO: Find a better author
'description': 'Unknown', # TODO: Find a better description
'docSource': 'ODT Text file uploaded by the user.',
'chunkSource': f"{filename}{ext}",
'published': file_creation_time(fullpath), 'published': file_creation_time(fullpath),
'wordCount': len(content), 'wordCount': len(content),
'pageContent': content, 'pageContent': content,
'token_count_estimate': len(tokenize(content)) 'token_count_estimate': len(tokenize(content))
} }
write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}") write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}")
move_source(parent_dir, f"{filename}{ext}", remove=remove) move_source(parent_dir, f"{filename}{ext}", remove=remove)
print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n") print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
\ No newline at end of file
...@@ -18,16 +18,19 @@ def as_markdown(**kwargs): ...@@ -18,16 +18,19 @@ def as_markdown(**kwargs):
print(f"-- Working {fullpath} --") print(f"-- Working {fullpath} --")
data = { data = {
'id': guid(), 'id': guid(),
'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"), 'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"),
'title': f"{filename}{ext}", 'title': f"{filename}", # TODO: find a better metadata
'description': "a custom file uploaded by the user.", 'docAuthor': 'Unknown', # TODO: find a better metadata
'description': 'Unknown', # TODO: find a better metadata
'docSource': 'markdown file uploaded by the user.',
'chunkSource': f"{filename}{ext}",
'published': file_creation_time(fullpath), 'published': file_creation_time(fullpath),
'wordCount': len(content), 'wordCount': len(content),
'pageContent': content, 'pageContent': content,
'token_count_estimate': len(tokenize(content)) 'token_count_estimate': len(tokenize(content))
} }
write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}") write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}")
move_source(parent_dir, f"{filename}{ext}", remove=remove) move_source(parent_dir, f"{filename}{ext}", remove=remove)
print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n") print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
\ No newline at end of file
import os import os
import datetime import datetime
import email.utils import email.utils
from mailbox import mbox from mailbox import mbox
from slugify import slugify from slugify import slugify
...@@ -36,12 +36,14 @@ def as_mbox(**kwargs): ...@@ -36,12 +36,14 @@ def as_mbox(**kwargs):
date_sent = local_date.strftime("%a, %d %b %Y %H:%M:%S") date_sent = local_date.strftime("%a, %d %b %Y %H:%M:%S")
else: else:
date_sent = None date_sent = None
data = { data = {
'id': guid(), 'id': guid(),
'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{slugify(filename)}-{guid()}{ext}"), 'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{slugify(filename)}-{guid()}{ext}"),
'title': f"{filename}{ext}", 'title': message['Subject'],
'description': "a custom file uploaded by the user.", 'docAuthor': message['From'],
'description': f"email {message['From']} to {message['To']}",
'docSource': "mbox file uploaded by the user.",
'published': file_creation_time(fullpath), 'published': file_creation_time(fullpath),
'sender': message['From'], 'sender': message['From'],
'recipient': message['To'], 'recipient': message['To'],
......
import os, time import os, fitz
from langchain.document_loaders import PyPDFLoader from langchain.document_loaders import PyMuPDFLoader # better UTF support and metadata
from slugify import slugify from slugify import slugify
from ..utils import guid, file_creation_time, write_to_server_documents, move_source from ..utils import guid, file_creation_time, write_to_server_documents, move_source
from ...utils import tokenize from ...utils import tokenize
from unidecode import unidecode
# Process all text-related documents. # Process all PDF-related documents.
def as_pdf(**kwargs): def as_pdf(**kwargs):
parent_dir = kwargs.get('directory', 'hotdir') parent_dir = kwargs.get('directory', 'hotdir')
filename = kwargs.get('filename') filename = kwargs.get('filename')
ext = kwargs.get('ext', '.txt') ext = kwargs.get('ext', '.txt')
remove = kwargs.get('remove_on_complete', False) remove = kwargs.get('remove_on_complete', False)
fullpath = f"{parent_dir}/{filename}{ext}" fullpath = f"{parent_dir}/{filename}{ext}"
destination = f"../server/storage/documents/{slugify(filename)}-{int(time.time())}"
loader = PyPDFLoader(fullpath)
pages = loader.load_and_split()
print(f"-- Working {fullpath} --") print(f"-- Working {fullpath} --")
for page in pages: loader = PyMuPDFLoader(fullpath)
pg_num = page.metadata.get('page') pages = loader.load()
print(f"-- Working page {pg_num} --")
if len(pages) == 0:
print(f"{fullpath} parsing resulted in no pages - nothing to do.")
return False
# Set doc to the first page so we can still get the metadata from PyMuPDF but without all the unicode issues.
doc = pages[0]
del loader
del pages
page_content = ''
for page in fitz.open(fullpath):
print(f"-- Parsing content from pg {page.number} --")
page_content += unidecode(page.get_text('text'))
content = page.page_content title = doc.metadata.get('title')
data = { author = doc.metadata.get('author')
'id': guid(), subject = doc.metadata.get('subject')
'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"), data = {
'title': f"{filename}_pg{pg_num}{ext}", 'id': guid(),
'description': "a custom file uploaded by the user.", 'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"),
'published': file_creation_time(fullpath), 'title': title if title else f"{filename}{ext}",
'wordCount': len(content), 'docAuthor': author if author else 'No author found',
'pageContent': content, 'description': subject if subject else 'No description found.',
'token_count_estimate': len(tokenize(content)) 'docSource': 'pdf file uploaded by the user.',
} 'chunkSource': f"{filename}{ext}",
write_to_server_documents(data, f"{slugify(filename)}-pg{pg_num}-{data.get('id')}", destination) 'published': file_creation_time(fullpath),
'wordCount': len(page_content), # Technically a letter count :p
'pageContent': page_content,
'token_count_estimate': len(tokenize(page_content))
}
write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}")
move_source(parent_dir, f"{filename}{ext}", remove=remove) move_source(parent_dir, f"{filename}{ext}", remove=remove)
print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n") print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
\ No newline at end of file
...@@ -14,16 +14,18 @@ def as_text(**kwargs): ...@@ -14,16 +14,18 @@ def as_text(**kwargs):
print(f"-- Working {fullpath} --") print(f"-- Working {fullpath} --")
data = { data = {
'id': guid(), 'id': guid(),
'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"), 'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"),
'title': f"{filename}{ext}", 'title': f"{filename}{ext}",
'description': "a custom file uploaded by the user.", 'docAuthor': 'Unknown', # TODO: Find a better author
'description': 'Unknown', # TODO: Find a better description
'chunkSource': f"{filename}{ext}",
'published': file_creation_time(fullpath), 'published': file_creation_time(fullpath),
'wordCount': len(content), 'wordCount': len(content),
'pageContent': content, 'pageContent': content,
'token_count_estimate': len(tokenize(content)) 'token_count_estimate': len(tokenize(content))
} }
write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}") write_to_server_documents(data, f"{slugify(filename)}-{data.get('id')}")
move_source(parent_dir, f"{filename}{ext}", remove=remove) move_source(parent_dir, f"{filename}{ext}", remove=remove)
print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n") print(f"[SUCCESS]: {filename}{ext} converted & ready for embedding.\n")
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment