From 69427f9fddbee738907df188c5798e70b13be184 Mon Sep 17 00:00:00 2001 From: Javier Torres <javierandrestorresreyes@gmail.com> Date: Mon, 8 Apr 2024 15:27:28 -0500 Subject: [PATCH] Add in-memory loading for non-default filesystems in PDFReader (#12659) * Add in-memory loading for non-default filesystems in PDFReader * bump version --- .../llama_index/readers/file/docs/base.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/docs/base.py b/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/docs/base.py index 441ee5e054..71b03c1469 100644 --- a/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/docs/base.py +++ b/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/docs/base.py @@ -10,9 +10,10 @@ from pathlib import Path from typing import Any, Dict, List, Optional from fsspec import AbstractFileSystem import logging +import io from llama_index.core.readers.base import BaseReader -from llama_index.core.readers.file.base import get_default_fs +from llama_index.core.readers.file.base import get_default_fs, is_default_fs from llama_index.core.schema import Document logger = logging.getLogger(__name__) @@ -45,8 +46,12 @@ class PDFReader(BaseReader): ) fs = fs or get_default_fs() with fs.open(file, "rb") as fp: + # Load the file in memory if the filesystem is not the default one to avoid + # issues with pypdf + stream = fp if is_default_fs(fs) else io.BytesIO(fp.read()) + # Create a PDF object - pdf = pypdf.PdfReader(fp) + pdf = pypdf.PdfReader(stream) # Get the number of pages in the PDF document num_pages = len(pdf.pages) -- GitLab