From 46f1e13b63d6735a6fb86fabbf8d6104552b2f49 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicol=C3=B2=20Boschi?= <boschi1997@gmail.com>
Date: Wed, 13 Mar 2024 22:45:45 +0100
Subject: [PATCH] fix: pymupdf must be optional because it is AGPLv3-licensed
 (#11896)

---
 docs/examples/agent/openai_retrieval_benchmark.ipynb   |  8 +++-----
 .../finetuning/gradient/gradient_structured.ipynb      |  2 +-
 .../finetuning/openai_fine_tuning_functions.ipynb      |  5 ++---
 docs/examples/low_level/evaluation.ipynb               |  5 ++---
 docs/examples/low_level/fusion_retriever.ipynb         |  2 +-
 docs/examples/low_level/oss_ingestion_retrieval.ipynb  |  2 +-
 docs/examples/low_level/response_synthesis.ipynb       |  7 ++-----
 docs/examples/low_level/retrieval.ipynb                |  2 +-
 docs/examples/low_level/router.ipynb                   |  5 ++---
 docs/examples/low_level/vector_store.ipynb             | 10 +++-------
 docs/examples/param_optimizer/param_optimizer.ipynb    |  5 ++---
 docs/examples/prompts/emotion_prompt.ipynb             |  5 ++---
 docs/examples/prompts/prompt_optimization.ipynb        |  2 +-
 docs/examples/prompts/prompts_rag.ipynb                |  5 ++---
 .../query_engine/pdf_tables/recursive_retriever.ipynb  |  2 +-
 docs/examples/retrievers/auto_merging_retriever.ipynb  |  8 +++-----
 docs/examples/retrievers/composable_retrievers.ipynb   |  2 +-
 docs/examples/retrievers/ensemble_retrieval.ipynb      |  5 ++---
 .../llama_index/readers/file/pymu_pdf/README.md        |  2 +-
 .../readers/llama-index-readers-file/pyproject.toml    |  5 +++--
 20 files changed, 36 insertions(+), 53 deletions(-)

diff --git a/docs/examples/agent/openai_retrieval_benchmark.ipynb b/docs/examples/agent/openai_retrieval_benchmark.ipynb
index 020f7ee938..3b2e8f8ecf 100644
--- a/docs/examples/agent/openai_retrieval_benchmark.ipynb
+++ b/docs/examples/agent/openai_retrieval_benchmark.ipynb
@@ -18,7 +18,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "%pip install llama-index-readers-file\n",
+    "%pip install llama-index-readers-file pymupdf\n",
     "%pip install llama-index-agent-openai\n",
     "%pip install llama-index-llms-openai"
    ]
@@ -74,8 +74,7 @@
       "\n",
       "data/llama2.pdf     100%[===================>]  13.03M   141KB/s    in 1m 48s  \n",
       "\n",
-      "2023-11-08 21:55:42 (123 KB/s) - ‘data/llama2.pdf’ saved [13661300/13661300]\n",
-      "\n"
+      "2023-11-08 21:55:42 (123 KB/s) - ‘data/llama2.pdf’ saved [13661300/13661300]\n"
      ]
     }
    ],
@@ -198,8 +197,7 @@
       "\n",
       "data/llama2_eval_qr 100%[===================>]  59.23K  --.-KB/s    in 0.02s   \n",
       "\n",
-      "2023-11-08 22:20:12 (2.87 MB/s) - ‘data/llama2_eval_qr_dataset.json’ saved [60656/60656]\n",
-      "\n"
+      "2023-11-08 22:20:12 (2.87 MB/s) - ‘data/llama2_eval_qr_dataset.json’ saved [60656/60656]\n"
      ]
     }
    ],
diff --git a/docs/examples/finetuning/gradient/gradient_structured.ipynb b/docs/examples/finetuning/gradient/gradient_structured.ipynb
index 9478b6fa29..70b585d36e 100644
--- a/docs/examples/finetuning/gradient/gradient_structured.ipynb
+++ b/docs/examples/finetuning/gradient/gradient_structured.ipynb
@@ -31,7 +31,7 @@
    "source": [
     "%pip install llama-index-llms-gradient\n",
     "%pip install llama-index-llms-openai\n",
-    "%pip install llama-index-readers-file\n",
+    "%pip install llama-index-readers-file pymupdf\n",
     "%pip install llama-index-finetuning"
    ]
   },
diff --git a/docs/examples/finetuning/openai_fine_tuning_functions.ipynb b/docs/examples/finetuning/openai_fine_tuning_functions.ipynb
index a57978db21..e8ee6bae8e 100644
--- a/docs/examples/finetuning/openai_fine_tuning_functions.ipynb
+++ b/docs/examples/finetuning/openai_fine_tuning_functions.ipynb
@@ -30,7 +30,7 @@
     "%pip install llama-index-finetuning\n",
     "%pip install llama-index-llms-openai\n",
     "%pip install llama-index-finetuning-callbacks\n",
-    "%pip install llama-index-readers-file\n",
+    "%pip install llama-index-readers-file pymupdf\n",
     "%pip install llama-index-program-openai"
    ]
   },
@@ -407,8 +407,7 @@
       "\n",
       "data/llama2.pdf     100%[===================>]  13.03M   229KB/s    in 45s     \n",
       "\n",
-      "2023-10-04 23:47:25 (298 KB/s) - ‘data/llama2.pdf’ saved [13661300/13661300]\n",
-      "\n"
+      "2023-10-04 23:47:25 (298 KB/s) - ‘data/llama2.pdf’ saved [13661300/13661300]\n"
      ]
     }
    ],
diff --git a/docs/examples/low_level/evaluation.ipynb b/docs/examples/low_level/evaluation.ipynb
index fd17ad92e4..123c6730d5 100644
--- a/docs/examples/low_level/evaluation.ipynb
+++ b/docs/examples/low_level/evaluation.ipynb
@@ -38,7 +38,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "%pip install llama-index-readers-file\n",
+    "%pip install llama-index-readers-file pymupdf\n",
     "%pip install llama-index-llms-openai"
    ]
   },
@@ -62,8 +62,7 @@
       "\n",
       "data/llama2.pdf     100%[===================>]  13.03M  1.56MB/s    in 9.3s    \n",
       "\n",
-      "2023-09-19 00:05:25 (1.40 MB/s) - ‘data/llama2.pdf’ saved [13661300/13661300]\n",
-      "\n"
+      "2023-09-19 00:05:25 (1.40 MB/s) - ‘data/llama2.pdf’ saved [13661300/13661300]\n"
      ]
     }
    ],
diff --git a/docs/examples/low_level/fusion_retriever.ipynb b/docs/examples/low_level/fusion_retriever.ipynb
index f26448b459..0f5d1bf0b2 100644
--- a/docs/examples/low_level/fusion_retriever.ipynb
+++ b/docs/examples/low_level/fusion_retriever.ipynb
@@ -40,7 +40,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "%pip install llama-index-readers-file\n",
+    "%pip install llama-index-readers-file pymupdf\n",
     "%pip install llama-index-llms-openai"
    ]
   },
diff --git a/docs/examples/low_level/oss_ingestion_retrieval.ipynb b/docs/examples/low_level/oss_ingestion_retrieval.ipynb
index 1aeaf0f270..5c09fc9a48 100644
--- a/docs/examples/low_level/oss_ingestion_retrieval.ipynb
+++ b/docs/examples/low_level/oss_ingestion_retrieval.ipynb
@@ -53,7 +53,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "%pip install llama-index-readers-file\n",
+    "%pip install llama-index-readers-file pymupdf\n",
     "%pip install llama-index-vector-stores-postgres\n",
     "%pip install llama-index-embeddings-huggingface\n",
     "%pip install llama-index-llms-llama-cpp"
diff --git a/docs/examples/low_level/response_synthesis.ipynb b/docs/examples/low_level/response_synthesis.ipynb
index b0088704c8..c472cc7b71 100644
--- a/docs/examples/low_level/response_synthesis.ipynb
+++ b/docs/examples/low_level/response_synthesis.ipynb
@@ -51,7 +51,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "%pip install llama-index-readers-file\n",
+    "%pip install llama-index-readers-file pymupdf\n",
     "%pip install llama-index-vector-stores-pinecone\n",
     "%pip install llama-index-llms-openai"
    ]
@@ -386,7 +386,6 @@
       "*****Response******:\n",
       "\n",
       "RLHF used both model-based and human-based evaluation to select the best-performing models among several ablations. Model-based evaluation was used to measure the robustness of the reward model by collecting a test set of prompts for both helpfulness and safety, and asking three annotators to judge the quality of the answers based on a 7-point Likert scale. Human evaluation was used to validate major model versions. Additionally, a more general reward was trained to ensure the measure wouldn't diverge from the human preferences. Results showed that the reward models were well calibrated with the human preference annotations.\n",
-      "\n",
       "\n"
      ]
     }
@@ -485,9 +484,7 @@
       "---------------------\n",
       "Given the context information and not prior knowledge, answer the query.\n",
       "Query: Can you tell me about results from RLHF using both model-based and human-based evaluation?\n",
-      "Answer: \n",
-      "\n",
-      "\n"
+      "Answer: \n"
      ]
     }
    ],
diff --git a/docs/examples/low_level/retrieval.ipynb b/docs/examples/low_level/retrieval.ipynb
index 4f25f8968a..51d5f72842 100644
--- a/docs/examples/low_level/retrieval.ipynb
+++ b/docs/examples/low_level/retrieval.ipynb
@@ -51,7 +51,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "%pip install llama-index-readers-file\n",
+    "%pip install llama-index-readers-file pymupdf\n",
     "%pip install llama-index-vector-stores-pinecone\n",
     "%pip install llama-index-embeddings-openai"
    ]
diff --git a/docs/examples/low_level/router.ipynb b/docs/examples/low_level/router.ipynb
index 761bf1fd99..25428541c0 100644
--- a/docs/examples/low_level/router.ipynb
+++ b/docs/examples/low_level/router.ipynb
@@ -49,7 +49,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "%pip install llama-index-readers-file\n",
+    "%pip install llama-index-readers-file pymupdf\n",
     "%pip install llama-index-program-openai\n",
     "%pip install llama-index-llms-openai"
    ]
@@ -692,8 +692,7 @@
       "\n",
       "data/llama2.pdf     100%[===================>]  13.03M  1.50MB/s    in 9.5s    \n",
       "\n",
-      "2023-09-17 23:37:22 (1.37 MB/s) - ‘data/llama2.pdf’ saved [13661300/13661300]\n",
-      "\n"
+      "2023-09-17 23:37:22 (1.37 MB/s) - ‘data/llama2.pdf’ saved [13661300/13661300]\n"
      ]
     }
    ],
diff --git a/docs/examples/low_level/vector_store.ipynb b/docs/examples/low_level/vector_store.ipynb
index dd9093bf83..fd2a913c8b 100644
--- a/docs/examples/low_level/vector_store.ipynb
+++ b/docs/examples/low_level/vector_store.ipynb
@@ -50,7 +50,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "%pip install llama-index-readers-file\n",
+    "%pip install llama-index-readers-file pymupdf\n",
     "%pip install llama-index-embeddings-openai"
    ]
   },
@@ -690,9 +690,7 @@
       "In this section, we describe our approach to safety fine-tuning, including safety categories, annotation\n",
       "guidelines, and the techniques we use to mitigate safety risks. We employ a process similar to the general\n",
       "fine-tuning methods as described in Section 3, with some notable differences related to safety concerns.\n",
-      "----------------\n",
-      "\n",
-      "\n"
+      "----------------\n"
      ]
     }
    ],
@@ -774,9 +772,7 @@
       "Better Long-Tail Safety Robustness without Hurting Helpfulness\n",
       "Safety is inherently a long-tail problem,\n",
       "where the challenge comes from a small number of very specific cases.\n",
-      "----------------\n",
-      "\n",
-      "\n"
+      "----------------\n"
      ]
     }
    ],
diff --git a/docs/examples/param_optimizer/param_optimizer.ipynb b/docs/examples/param_optimizer/param_optimizer.ipynb
index c7e28398c0..34eb9d74f0 100644
--- a/docs/examples/param_optimizer/param_optimizer.ipynb
+++ b/docs/examples/param_optimizer/param_optimizer.ipynb
@@ -33,7 +33,7 @@
    "source": [
     "%pip install llama-index-llms-openai\n",
     "%pip install llama-index-embeddings-openai\n",
-    "%pip install llama-index-readers-file\n",
+    "%pip install llama-index-readers-file pymupdf\n",
     "%pip install llama-index-experimental-param-tuner"
    ]
   },
@@ -66,8 +66,7 @@
       "\n",
       "data/llama2.pdf     100%[===================>]  13.03M   533KB/s    in 36s     \n",
       "\n",
-      "2023-11-04 00:17:10 (376 KB/s) - ‘data/llama2.pdf’ saved [13661300/13661300]\n",
-      "\n"
+      "2023-11-04 00:17:10 (376 KB/s) - ‘data/llama2.pdf’ saved [13661300/13661300]\n"
      ]
     }
    ],
diff --git a/docs/examples/prompts/emotion_prompt.ipynb b/docs/examples/prompts/emotion_prompt.ipynb
index 7fb2420c53..da2bfac9a6 100644
--- a/docs/examples/prompts/emotion_prompt.ipynb
+++ b/docs/examples/prompts/emotion_prompt.ipynb
@@ -23,7 +23,7 @@
    "outputs": [],
    "source": [
     "%pip install llama-index-llms-openai\n",
-    "%pip install llama-index-readers-file"
+    "%pip install llama-index-readers-file pymupdf"
    ]
   },
   {
@@ -192,8 +192,7 @@
       "\n",
       "data/llama2_eval_qr 100%[===================>]  59.23K  --.-KB/s    in 0.04s   \n",
       "\n",
-      "2023-11-04 00:34:10 (1.48 MB/s) - ‘data/llama2_eval_qr_dataset.json’ saved [60656/60656]\n",
-      "\n"
+      "2023-11-04 00:34:10 (1.48 MB/s) - ‘data/llama2_eval_qr_dataset.json’ saved [60656/60656]\n"
      ]
     }
    ],
diff --git a/docs/examples/prompts/prompt_optimization.ipynb b/docs/examples/prompts/prompt_optimization.ipynb
index 3cf58e9116..ce2fb70d99 100644
--- a/docs/examples/prompts/prompt_optimization.ipynb
+++ b/docs/examples/prompts/prompt_optimization.ipynb
@@ -21,7 +21,7 @@
    "outputs": [],
    "source": [
     "%pip install llama-index-llms-openai\n",
-    "%pip install llama-index-readers-file"
+    "%pip install llama-index-readers-file pymupdf"
    ]
   },
   {
diff --git a/docs/examples/prompts/prompts_rag.ipynb b/docs/examples/prompts/prompts_rag.ipynb
index cb6eacaad4..a9c0dac8c9 100644
--- a/docs/examples/prompts/prompts_rag.ipynb
+++ b/docs/examples/prompts/prompts_rag.ipynb
@@ -30,7 +30,7 @@
    "outputs": [],
    "source": [
     "%pip install llama-index-llms-openai\n",
-    "%pip install llama-index-readers-file"
+    "%pip install llama-index-readers-file pymupdf"
    ]
   },
   {
@@ -130,8 +130,7 @@
       "\n",
       "data/llama2.pdf     100%[===================>]  13.03M  1.50MB/s    in 10s     \n",
       "\n",
-      "2023-10-28 23:19:49 (1.31 MB/s) - ‘data/llama2.pdf’ saved [13661300/13661300]\n",
-      "\n"
+      "2023-10-28 23:19:49 (1.31 MB/s) - ‘data/llama2.pdf’ saved [13661300/13661300]\n"
      ]
     }
    ],
diff --git a/docs/examples/query_engine/pdf_tables/recursive_retriever.ipynb b/docs/examples/query_engine/pdf_tables/recursive_retriever.ipynb
index 0930193d12..05c490d2f7 100644
--- a/docs/examples/query_engine/pdf_tables/recursive_retriever.ipynb
+++ b/docs/examples/query_engine/pdf_tables/recursive_retriever.ipynb
@@ -31,7 +31,7 @@
    "outputs": [],
    "source": [
     "%pip install llama-index-embeddings-openai\n",
-    "%pip install llama-index-readers-file\n",
+    "%pip install llama-index-readers-file pymupdf\n",
     "%pip install llama-index-llms-openai"
    ]
   },
diff --git a/docs/examples/retrievers/auto_merging_retriever.ipynb b/docs/examples/retrievers/auto_merging_retriever.ipynb
index 42e2314230..6082fbc148 100644
--- a/docs/examples/retrievers/auto_merging_retriever.ipynb
+++ b/docs/examples/retrievers/auto_merging_retriever.ipynb
@@ -28,7 +28,7 @@
    "outputs": [],
    "source": [
     "%pip install llama-index-llms-openai\n",
-    "%pip install llama-index-readers-file"
+    "%pip install llama-index-readers-file pymupdf"
    ]
   },
   {
@@ -353,8 +353,7 @@
      "text": [
       "> Merging 4 nodes into parent node.\n",
       "> Parent node id: caf5f81c-842f-46a4-b679-6be584bd6aff.\n",
-      "> Parent node text: We conduct RLHF by first collecting human preference data for safety similar to Section 3.2.2: an...\n",
-      "\n"
+      "> Parent node text: We conduct RLHF by first collecting human preference data for safety similar to Section 3.2.2: an...\n"
      ]
     }
    ],
@@ -684,8 +683,7 @@
      "text": [
       "> Merging 4 nodes into parent node.\n",
       "> Parent node id: 3671b20d-ea5e-4afc-983e-02be6ee8302d.\n",
-      "> Parent node text: We conduct RLHF by first collecting human preference data for safety similar to Section 3.2.2: an...\n",
-      "\n"
+      "> Parent node text: We conduct RLHF by first collecting human preference data for safety similar to Section 3.2.2: an...\n"
      ]
     }
    ],
diff --git a/docs/examples/retrievers/composable_retrievers.ipynb b/docs/examples/retrievers/composable_retrievers.ipynb
index 78d8515001..234dd0015e 100644
--- a/docs/examples/retrievers/composable_retrievers.ipynb
+++ b/docs/examples/retrievers/composable_retrievers.ipynb
@@ -38,7 +38,7 @@
     "%pip install llama-index-retrievers-bm25\n",
     "%pip install llama-index-storage-docstore-redis\n",
     "%pip install llama-index-storage-docstore-dynamodb\n",
-    "%pip install llama-index-readers-file"
+    "%pip install llama-index-readers-file pymupdf"
    ]
   },
   {
diff --git a/docs/examples/retrievers/ensemble_retrieval.ipynb b/docs/examples/retrievers/ensemble_retrieval.ipynb
index 9bb2ff4fef..70e9d7230f 100644
--- a/docs/examples/retrievers/ensemble_retrieval.ipynb
+++ b/docs/examples/retrievers/ensemble_retrieval.ipynb
@@ -38,7 +38,7 @@
    "source": [
     "%pip install llama-index-llms-openai\n",
     "%pip install llama-index-postprocessor-cohere-rerank\n",
-    "%pip install llama-index-readers-file"
+    "%pip install llama-index-readers-file pymupdf"
    ]
   },
   {
@@ -159,8 +159,7 @@
       "\n",
       "data/llama2.pdf     100%[===================>]  13.03M   521KB/s    in 42s     \n",
       "\n",
-      "2023-09-28 12:57:20 (320 KB/s) - ‘data/llama2.pdf’ saved [13661300/13661300]\n",
-      "\n"
+      "2023-09-28 12:57:20 (320 KB/s) - ‘data/llama2.pdf’ saved [13661300/13661300]\n"
      ]
     }
    ],
diff --git a/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/pymu_pdf/README.md b/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/pymu_pdf/README.md
index cfde1701d6..82a20b3205 100644
--- a/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/pymu_pdf/README.md
+++ b/llama-index-integrations/readers/llama-index-readers-file/llama_index/readers/file/pymu_pdf/README.md
@@ -1,7 +1,7 @@
 # PyMuPDF Loader
 
 ```bash
-pip install llama-index-readers-file
+pip install llama-index-readers-file pymupdf
 ```
 
 This loader extracts text from a local PDF file using the `PyMuPDF` Python library. If `metadata` is passed as True while calling `load` function; extracted documents will include basic metadata such as page numbers, file path and total number of pages in pdf.
diff --git a/llama-index-integrations/readers/llama-index-readers-file/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-file/pyproject.toml
index 7ce4da7890..71561c017a 100644
--- a/llama-index-integrations/readers/llama-index-readers-file/pyproject.toml
+++ b/llama-index-integrations/readers/llama-index-readers-file/pyproject.toml
@@ -50,12 +50,13 @@ license = "MIT"
 maintainers = ["FarisHijazi", "Haowjy", "ephe-meral", "hursh-desai", "iamarunbrahma", "jon-chuang", "mmaatouk", "ravi03071991", "sangwongenip", "thejessezhang"]
 name = "llama-index-readers-file"
 readme = "README.md"
-version = "0.1.10"
+version = "0.1.11"
 
 [tool.poetry.dependencies]
 python = ">=3.8.1,<4.0"
 llama-index-core = "^0.10.1"
-pymupdf = "^1.23.21"
+# pymupdf is AGPLv3-licensed, so it's optional
+pymupdf = {optional = true, version = "^1.23.21"}
 bs4 = "^0.0.2"
 beautifulsoup4 = "^4.12.3"
 pypdf = "^4.0.1"
-- 
GitLab