From 23f1491dfb9c60bd2f2384aa88942e2f152acd45 Mon Sep 17 00:00:00 2001 From: "Huu Le (Lee)" <39040748+leehuwuj@users.noreply.github.com> Date: Thu, 29 Feb 2024 23:12:21 +0700 Subject: [PATCH] fix null value in default file document metadata (#11501) --- .../llama_index/core/readers/file/base.py | 40 ++++++++++++------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/llama-index-core/llama_index/core/readers/file/base.py b/llama-index-core/llama_index/core/readers/file/base.py index 585b13f72..efd9ff34b 100644 --- a/llama-index-core/llama_index/core/readers/file/base.py +++ b/llama-index-core/llama_index/core/readers/file/base.py @@ -56,6 +56,21 @@ def _try_loading_included_file_formats() -> Dict[str, Type[BaseReader]]: return default_file_reader_cls +def _format_file_timestamp(timestamp: float) -> Optional[str]: + """Format file timestamp to a %Y-%m-%d string. + + Args: + timestamp (float): timestamp in float + + Returns: + str: formatted timestamp + """ + try: + return datetime.fromtimestamp(timestamp).strftime("%Y-%m-%d") + except Exception: + return None + + def default_file_metadata_func( file_path: str, fs: Optional[fsspec.AbstractFileSystem] = None ) -> Dict: @@ -66,20 +81,10 @@ def default_file_metadata_func( """ fs = fs or get_default_fs() stat_result = fs.stat(file_path) - creation_date = stat_result.get("created") - last_modified_date = stat_result.get("mtime") - last_accessed_date = stat_result.get("atime") - try: - creation_date = datetime.fromtimestamp(creation_date).strftime("%Y-%m-%d") - last_modified_date = datetime.fromtimestamp(last_modified_date).strftime( - "%Y-%m-%d" - ) - last_accessed_date = datetime.fromtimestamp(last_accessed_date).strftime( - "%Y-%m-%d" - ) - except Exception: - pass - return { + creation_date = _format_file_timestamp(stat_result.get("created")) + last_modified_date = _format_file_timestamp(stat_result.get("mtime")) + last_accessed_date = _format_file_timestamp(stat_result.get("atime")) + default_meta = { "file_path": file_path, "file_name": stat_result["name"], "file_type": mimetypes.guess_type(file_path)[0], @@ -89,6 +94,13 @@ def default_file_metadata_func( "last_accessed_date": last_accessed_date, } + # Return not null value + return { + meta_key: meta_value + for meta_key, meta_value in default_meta.items() + if meta_value is not None + } + class _DefaultFileMetadataFunc: """ -- GitLab