From 54417172949cf55ea59aa03a770a8070351b57eb Mon Sep 17 00:00:00 2001
From: Timothy Carambat <rambat1010@gmail.com>
Date: Wed, 1 Nov 2023 16:44:02 -0700
Subject: [PATCH] normalize parser struct for all file types (#321)

---
 collector/scripts/watch/convert/as_docx.py | 2 +-
 collector/scripts/watch/convert/as_mbox.py | 5 +----
 collector/scripts/watch/convert/as_text.py | 1 +
 3 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/collector/scripts/watch/convert/as_docx.py b/collector/scripts/watch/convert/as_docx.py
index 33aaaaaeb..b37786179 100644
--- a/collector/scripts/watch/convert/as_docx.py
+++ b/collector/scripts/watch/convert/as_docx.py
@@ -61,7 +61,7 @@ def as_odt(**kwargs):
     'id': guid(),
     'url': "file://"+os.path.abspath(f"{parent_dir}/processed/{filename}{ext}"),
     'title': f"{filename}{ext}",
-    'author': 'Unknown', # TODO: Find a better author
+    'docAuthor': 'Unknown', # TODO: Find a better author
     'description': 'Unknown', # TODO: Find a better bescription
     'docSource': 'ODT Text file uploaded by the user.',
     'chunkSource': f"{filename}{ext}",
diff --git a/collector/scripts/watch/convert/as_mbox.py b/collector/scripts/watch/convert/as_mbox.py
index f5a645eaa..2d7c08e63 100644
--- a/collector/scripts/watch/convert/as_mbox.py
+++ b/collector/scripts/watch/convert/as_mbox.py
@@ -110,11 +110,8 @@ def as_mbox(**kwargs):
             "docAuthor": message["From"],
             "description": f"email from {message['From']} to {message['To']}",
             "docSource": "mbox file uploaded by the user.",
+            "chunkSource": subject,
             "published": file_creation_time(fullpath),
-            "sender": message["From"],
-            "recipient": message["To"],
-            "subject": subject,
-            "date_sent": date_sent,
             "wordCount": len(content),
             "pageContent": content,
             "token_count_estimate": len(tokenize(content)),
diff --git a/collector/scripts/watch/convert/as_text.py b/collector/scripts/watch/convert/as_text.py
index e6ad85140..1b897874b 100644
--- a/collector/scripts/watch/convert/as_text.py
+++ b/collector/scripts/watch/convert/as_text.py
@@ -23,6 +23,7 @@ def as_text(**kwargs):
     'title': f"{filename}{ext}",
     'docAuthor': 'Unknown', # TODO: Find a better author
     'description': 'Unknown', # TODO: Find a better description
+    'docSource': 'a text file uploaded by the user.',
     'chunkSource': f"{filename}{ext}",
     'published': file_creation_time(fullpath),
     'wordCount': len(content),
-- 
GitLab