Skip to content
Snippets Groups Projects
Unverified Commit 54417172 authored by Timothy Carambat's avatar Timothy Carambat Committed by GitHub
Browse files

normalize parser struct for all file types (#321)

parent 1c5d55c4
No related branches found
No related tags found
No related merge requests found
...@@ -61,7 +61,7 @@ def as_odt(**kwargs): ...@@ -61,7 +61,7 @@ def as_odt(**kwargs):
'id': guid(), 'id': guid(),
'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"), 'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"),
'title': f"{filename}{ext}", 'title': f"{filename}{ext}",
'author': 'Unknown', # TODO: Find a better author 'docAuthor': 'Unknown', # TODO: Find a better author
'description': 'Unknown', # TODO: Find a better description 'description': 'Unknown', # TODO: Find a better description
'docSource': 'ODT Text file uploaded by the user.', 'docSource': 'ODT Text file uploaded by the user.',
'chunkSource': f"{filename}{ext}", 'chunkSource': f"{filename}{ext}",
......
...@@ -110,11 +110,8 @@ def as_mbox(**kwargs): ...@@ -110,11 +110,8 @@ def as_mbox(**kwargs):
"docAuthor": message["From"], "docAuthor": message["From"],
"description": f"email from {message['From']} to {message['To']}", "description": f"email from {message['From']} to {message['To']}",
"docSource": "mbox file uploaded by the user.", "docSource": "mbox file uploaded by the user.",
"chunkSource": subject,
"published": file_creation_time(fullpath), "published": file_creation_time(fullpath),
"sender": message["From"],
"recipient": message["To"],
"subject": subject,
"date_sent": date_sent,
"wordCount": len(content), "wordCount": len(content),
"pageContent": content, "pageContent": content,
"token_count_estimate": len(tokenize(content)), "token_count_estimate": len(tokenize(content)),
......
...@@ -23,6 +23,7 @@ def as_text(**kwargs): ...@@ -23,6 +23,7 @@ def as_text(**kwargs):
'title': f"{filename}{ext}", 'title': f"{filename}{ext}",
'docAuthor': 'Unknown', # TODO: Find a better author 'docAuthor': 'Unknown', # TODO: Find a better author
'description': 'Unknown', # TODO: Find a better description 'description': 'Unknown', # TODO: Find a better description
'docSource': 'a text file uploaded by the user.',
'chunkSource': f"{filename}{ext}", 'chunkSource': f"{filename}{ext}",
'published': file_creation_time(fullpath), 'published': file_creation_time(fullpath),
'wordCount': len(content), 'wordCount': len(content),
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment