Skip to content
Snippets Groups Projects
Unverified commit 69427f9f, authored by Javier Torres and committed by GitHub
Browse files

Add in-memory loading for non-default filesystems in PDFReader (#12659)

* Add in-memory loading for non-default filesystems in PDFReader

* bump version
parent 447805b7
No related branches found
No related tags found
No related merge requests found
......@@ -10,9 +10,10 @@ from pathlib import Path
from typing import Any, Dict, List, Optional
from fsspec import AbstractFileSystem
import logging
import io
from llama_index.core.readers.base import BaseReader
from llama_index.core.readers.file.base import get_default_fs
from llama_index.core.readers.file.base import get_default_fs, is_default_fs
from llama_index.core.schema import Document
logger = logging.getLogger(__name__)
......@@ -45,8 +46,12 @@ class PDFReader(BaseReader):
)
fs = fs or get_default_fs()
with fs.open(file, "rb") as fp:
# Load the file in memory if the filesystem is not the default one to avoid
# issues with pypdf
stream = fp if is_default_fs(fs) else io.BytesIO(fp.read())
# Create a PDF object
pdf = pypdf.PdfReader(fp)
pdf = pypdf.PdfReader(stream)
# Get the number of pages in the PDF document
num_pages = len(pdf.pages)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment