From 23f1491dfb9c60bd2f2384aa88942e2f152acd45 Mon Sep 17 00:00:00 2001
From: "Huu Le (Lee)" <39040748+leehuwuj@users.noreply.github.com>
Date: Thu, 29 Feb 2024 23:12:21 +0700
Subject: [PATCH] fix null value in default file document metadata (#11501)

---
 .../llama_index/core/readers/file/base.py     | 40 ++++++++++++-------
 1 file changed, 26 insertions(+), 14 deletions(-)

diff --git a/llama-index-core/llama_index/core/readers/file/base.py b/llama-index-core/llama_index/core/readers/file/base.py
index 585b13f72..efd9ff34b 100644
--- a/llama-index-core/llama_index/core/readers/file/base.py
+++ b/llama-index-core/llama_index/core/readers/file/base.py
@@ -56,6 +56,21 @@ def _try_loading_included_file_formats() -> Dict[str, Type[BaseReader]]:
     return default_file_reader_cls
 
 
+def _format_file_timestamp(timestamp: float) -> Optional[str]:
+    """Format a file timestamp as a %Y-%m-%d date string.
+
+    Args:
+        timestamp (float): value passed to datetime.fromtimestamp
+
+    Returns:
+        Optional[str]: formatted date, or None if the conversion raised
+    """
+    try:
+        return datetime.fromtimestamp(timestamp).strftime("%Y-%m-%d")
+    except Exception:  # e.g. timestamp is None because the stat key is absent
+        return None
+
+
 def default_file_metadata_func(
     file_path: str, fs: Optional[fsspec.AbstractFileSystem] = None
 ) -> Dict:
@@ -66,20 +81,10 @@ def default_file_metadata_func(
     """
     fs = fs or get_default_fs()
     stat_result = fs.stat(file_path)
-    creation_date = stat_result.get("created")
-    last_modified_date = stat_result.get("mtime")
-    last_accessed_date = stat_result.get("atime")
-    try:
-        creation_date = datetime.fromtimestamp(creation_date).strftime("%Y-%m-%d")
-        last_modified_date = datetime.fromtimestamp(last_modified_date).strftime(
-            "%Y-%m-%d"
-        )
-        last_accessed_date = datetime.fromtimestamp(last_accessed_date).strftime(
-            "%Y-%m-%d"
-        )
-    except Exception:
-        pass
-    return {
+    creation_date = _format_file_timestamp(stat_result.get("created"))
+    last_modified_date = _format_file_timestamp(stat_result.get("mtime"))
+    last_accessed_date = _format_file_timestamp(stat_result.get("atime"))
+    default_meta = {
         "file_path": file_path,
         "file_name": stat_result["name"],
         "file_type": mimetypes.guess_type(file_path)[0],
@@ -89,6 +94,13 @@ def default_file_metadata_func(
         "last_accessed_date": last_accessed_date,
     }
 
+    # Return only the entries whose value is not None
+    return {
+        meta_key: meta_value
+        for meta_key, meta_value in default_meta.items()
+        if meta_value is not None
+    }
+
 
 class _DefaultFileMetadataFunc:
     """
-- 
GitLab