Skip to content
Snippets Groups Projects
Unverified commit 69427f9f, authored by Javier Torres and committed by GitHub
Browse files

Add in-memory loading for non-default filesystems in PDFReader (#12659)

* Add in-memory loading for non-default filesystems in PDFReader

* bump version
parent 447805b7
Branches
Tags
No related merge requests found
...@@ -10,9 +10,10 @@ from pathlib import Path ...@@ -10,9 +10,10 @@ from pathlib import Path
from typing import Any, Dict, List, Optional from typing import Any, Dict, List, Optional
from fsspec import AbstractFileSystem from fsspec import AbstractFileSystem
import logging import logging
import io
from llama_index.core.readers.base import BaseReader from llama_index.core.readers.base import BaseReader
from llama_index.core.readers.file.base import get_default_fs from llama_index.core.readers.file.base import get_default_fs, is_default_fs
from llama_index.core.schema import Document from llama_index.core.schema import Document
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -45,8 +46,12 @@ class PDFReader(BaseReader): ...@@ -45,8 +46,12 @@ class PDFReader(BaseReader):
) )
fs = fs or get_default_fs() fs = fs or get_default_fs()
with fs.open(file, "rb") as fp: with fs.open(file, "rb") as fp:
# Load the file in memory if the filesystem is not the default one to avoid
# issues with pypdf
stream = fp if is_default_fs(fs) else io.BytesIO(fp.read())
# Create a PDF object # Create a PDF object
pdf = pypdf.PdfReader(fp) pdf = pypdf.PdfReader(stream)
# Get the number of pages in the PDF document # Get the number of pages in the PDF document
num_pages = len(pdf.pages) num_pages = len(pdf.pages)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment