diff --git a/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/docs/base.py b/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/docs/base.py index 441ee5e054a8c1741dcae9627b7a6c1cec373092..71b03c1469b25ab8911ec365d426cc1c9b9a2792 100644 --- a/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/docs/base.py +++ b/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/docs/base.py @@ -10,9 +10,10 @@ from pathlib import Path from typing import Any, Dict, List, Optional from fsspec import AbstractFileSystem import logging +import io from llama_index.core.readers.base import BaseReader -from llama_index.core.readers.file.base import get_default_fs +from llama_index.core.readers.file.base import get_default_fs, is_default_fs from llama_index.core.schema import Document logger = logging.getLogger(__name__) @@ -45,8 +46,12 @@ class PDFReader(BaseReader): ) fs = fs or get_default_fs() with fs.open(file, "rb") as fp: + # Load the file in memory if the filesystem is not the default one to avoid + # issues with pypdf + stream = fp if is_default_fs(fs) else io.BytesIO(fp.read()) + # Create a PDF object - pdf = pypdf.PdfReader(fp) + pdf = pypdf.PdfReader(stream) # Get the number of pages in the PDF document num_pages = len(pdf.pages)