From 684cdbca00f1650ba8a5f80f41ffebcf3e7b535f Mon Sep 17 00:00:00 2001 From: Logan <logan.markewich@live.com> Date: Mon, 3 Apr 2023 21:38:03 -0600 Subject: [PATCH] Add Notebooks to Docs/Sphinx (#1039) --- .github/workflows/dev_docs.yml | 2 + .gitignore | 3 +- docs/Makefile | 1 + docs/build_notebooks.py | 27 + docs/conf.py | 8 +- docs/guides/notebooks.rst | 171 +- .../async/AsyncComposableIndicesSEC.nblink | 1 + .../async/AsyncGPTTreeIndexDemo.nblink | 1 + .../async/AsyncLLMPredictorDemo.nblink | 1 + .../notebooks/async/AsyncQueryDemo.nblink | 1 + .../notebooks/azure_demo/AzureOpenAI.nblink | 1 + .../notebooks/chatbot/Chatbot_SEC.nblink | 1 + .../ChatGPTRetrievalPluginIndexDemo.nblink | 1 + .../ChatGPTRetrievalPluginReaderDemo.nblink | 1 + .../ChatGPT_Retrieval_Plugin_Upload.nblink | 1 + .../ComposableIndices-Prior.nblink | 1 + .../ComposableIndices-Weaviate.nblink | 1 + .../ComposableIndices.nblink | 1 + .../composable_indices/QASummaryGraph.nblink | 1 + .../cost_analysis/TokenPredictor.nblink | 1 + .../data_connectors/ChromaDemo.nblink | 1 + .../data_connectors/DatabaseReaderDemo.nblink | 1 + .../data_connectors/DiscordDemo.nblink | 1 + .../data_connectors/FaissDemo.nblink | 1 + .../GithubRepositoryReaderDemo.nblink | 1 + .../data_connectors/GoogleDocsDemo.nblink | 1 + .../notebooks/data_connectors/MakeDemo.nblink | 1 + .../data_connectors/MboxReaderDemo.nblink | 1 + .../data_connectors/MongoDemo.nblink | 1 + .../data_connectors/NotionDemo.nblink | 1 + .../data_connectors/ObsidianReaderDemo.nblink | 1 + .../data_connectors/PineconeDemo.nblink | 1 + .../data_connectors/QdrantDemo.nblink | 1 + .../data_connectors/SlackDemo.nblink | 1 + .../data_connectors/TwitterDemo.nblink | 1 + .../data_connectors/WeaviateDemo.nblink | 1 + .../data_connectors/WebPageDemo.nblink | 1 + .../notebooks/docstore/DocstoreDemo.nblink | 1 + .../evaluation/GuardrailsDemo.nblink | 1 + .../LangchainOutputParserDemo.nblink | 1 + .../evaluation/TestNYC-Evaluation.nblink | 1 + 
.../guides/notebooks/gatsby/TestGatsby.nblink | 1 + .../knowledge_graph/KnowledgeGraphDemo.nblink | 1 + .../langchain_demo/LangchainDemo.nblink | 1 + .../notebooks/multimodal/Multimodal.nblink | 1 + .../NodePostprocessorDemo.nblink | 1 + .../notebooks/optimizer/OptimizerDemo.nblink | 1 + .../DavinciComparison.nblink | 1 + .../paul_graham_essay/GPT4Comparison.nblink | 1 + .../paul_graham_essay/InsertDemo.nblink | 1 + .../KeywordTableComparison.nblink | 1 + .../SentenceSplittingDemo.nblink | 1 + .../paul_graham_essay/TestEssay.nblink | 1 + .../playground/PlaygroundDemo.nblink | 1 + .../HyDEQueryTransformDemo.nblink | 1 + .../struct_indices/PandasIndexDemo.nblink | 1 + .../SQLIndexDemo-Context.nblink | 1 + .../SQLIndexDemo-ManyTables.nblink | 1 + .../struct_indices/SQLIndexDemo.nblink | 1 + .../test_wiki/TestNYC-Benchmark-GPT4.nblink | 1 + .../test_wiki/TestNYC-Tree-GPT4.nblink | 1 + .../guides/notebooks/test_wiki/TestNYC.nblink | 1 + .../test_wiki/TestNYC_Embeddings.nblink | 1 + .../notebooks/test_wiki/TestWikiReader.nblink | 1 + .../AsyncIndexCreationDemo.nblink | 1 + .../vector_indices/ChromaIndexDemo.nblink | 1 + .../vector_indices/FaissIndexDemo.nblink | 1 + .../vector_indices/OpensearchDemo.nblink | 1 + .../vector_indices/PineconeIndexDemo.nblink | 1 + .../vector_indices/QdrantIndexDemo.nblink | 1 + .../SimpleIndexDemo-ChatGPT.nblink | 1 + .../SimpleIndexDemo-multistep.nblink | 1 + .../SimpleIndexDemo-streaming.nblink | 1 + .../vector_indices/SimpleIndexDemo.nblink | 1 + .../vector_indices/WeaviateIndexDemo.nblink | 1 + docs/requirements.txt | 6 +- .../async/AsyncComposableIndicesSEC.ipynb | 10 +- examples/async/AsyncGPTTreeIndexDemo.ipynb | 270 +- examples/chatbot/Chatbot_SEC.ipynb | 10 +- .../ChatGPTRetrievalPluginIndexDemo.ipynb | 2 +- .../ChatGPT_Retrieval_Plugin_Upload.ipynb | 10 +- .../composable_indices/QASummaryGraph.ipynb | 10 +- examples/cost_analysis/TokenPredictor.ipynb | 706 ++-- .../data_connectors/DatabaseReaderDemo.ipynb | 409 +- 
.../GithubRepositoryReaderDemo.ipynb | 222 +- examples/data_connectors/MboxReaderDemo.ipynb | 208 +- .../data_connectors/ObsidianReaderDemo.ipynb | 275 +- examples/data_connectors/TwitterDemo.ipynb | 216 +- examples/data_connectors/WeaviateDemo.ipynb | 352 +- examples/evaluation/GuardrailsDemo.ipynb | 680 ++-- .../LangchainOutputParserDemo.ipynb | 662 ++-- examples/evaluation/TestNYC-Evaluation.ipynb | 10 +- examples/gatsby/TestGatsby.ipynb | 374 +- .../knowledge_graph/KnowledgeGraphDemo.ipynb | 10 +- examples/langchain_demo/LangchainDemo.ipynb | 2 +- examples/multimodal/Multimodal.ipynb | 1201 +++--- examples/optimizer/OptimizerDemo.ipynb | 398 +- .../paul_graham_essay/DavinciComparison.ipynb | 26 +- .../paul_graham_essay/GPT4Comparison.ipynb | 1294 +++---- examples/paul_graham_essay/InsertDemo.ipynb | 10 +- .../KeywordTableComparison.ipynb | 852 ++--- examples/paul_graham_essay/TestEssay.ipynb | 1322 +++---- examples/playground/PlaygroundDemo.ipynb | 830 ++-- .../HyDEQueryTransformDemo.ipynb | 2 +- examples/struct_indices/PandasIndexDemo.ipynb | 2 +- .../struct_indices/SQLIndexDemo-Context.ipynb | 636 +-- .../SQLIndexDemo-ManyTables.ipynb | 798 ++-- examples/struct_indices/SQLIndexDemo.ipynb | 4 +- .../test_wiki/TestNYC-Benchmark-GPT4.ipynb | 3404 +++++++++-------- examples/test_wiki/TestNYC-Tree-GPT4.ipynb | 10 +- examples/test_wiki/TestNYC.ipynb | 366 +- examples/test_wiki/TestNYC_Embeddings.ipynb | 882 ++--- examples/test_wiki/TestWikiReader.ipynb | 582 +-- .../AsyncIndexCreationDemo.ipynb | 426 +-- .../SimpleIndexDemo-ChatGPT.ipynb | 851 +++-- .../SimpleIndexDemo-multistep.ipynb | 4 +- .../SimpleIndexDemo-streaming.ipynb | 278 +- examples/vector_indices/SimpleIndexDemo.ipynb | 4 +- 118 files changed, 9684 insertions(+), 9223 deletions(-) create mode 100644 docs/build_notebooks.py create mode 100644 docs/guides/notebooks/async/AsyncComposableIndicesSEC.nblink create mode 100644 docs/guides/notebooks/async/AsyncGPTTreeIndexDemo.nblink create mode 100644 
docs/guides/notebooks/async/AsyncLLMPredictorDemo.nblink create mode 100644 docs/guides/notebooks/async/AsyncQueryDemo.nblink create mode 100644 docs/guides/notebooks/azure_demo/AzureOpenAI.nblink create mode 100644 docs/guides/notebooks/chatbot/Chatbot_SEC.nblink create mode 100644 docs/guides/notebooks/chatgpt_plugin/ChatGPTRetrievalPluginIndexDemo.nblink create mode 100644 docs/guides/notebooks/chatgpt_plugin/ChatGPTRetrievalPluginReaderDemo.nblink create mode 100644 docs/guides/notebooks/chatgpt_plugin/ChatGPT_Retrieval_Plugin_Upload.nblink create mode 100644 docs/guides/notebooks/composable_indices/ComposableIndices-Prior.nblink create mode 100644 docs/guides/notebooks/composable_indices/ComposableIndices-Weaviate.nblink create mode 100644 docs/guides/notebooks/composable_indices/ComposableIndices.nblink create mode 100644 docs/guides/notebooks/composable_indices/QASummaryGraph.nblink create mode 100644 docs/guides/notebooks/cost_analysis/TokenPredictor.nblink create mode 100644 docs/guides/notebooks/data_connectors/ChromaDemo.nblink create mode 100644 docs/guides/notebooks/data_connectors/DatabaseReaderDemo.nblink create mode 100644 docs/guides/notebooks/data_connectors/DiscordDemo.nblink create mode 100644 docs/guides/notebooks/data_connectors/FaissDemo.nblink create mode 100644 docs/guides/notebooks/data_connectors/GithubRepositoryReaderDemo.nblink create mode 100644 docs/guides/notebooks/data_connectors/GoogleDocsDemo.nblink create mode 100644 docs/guides/notebooks/data_connectors/MakeDemo.nblink create mode 100644 docs/guides/notebooks/data_connectors/MboxReaderDemo.nblink create mode 100644 docs/guides/notebooks/data_connectors/MongoDemo.nblink create mode 100644 docs/guides/notebooks/data_connectors/NotionDemo.nblink create mode 100644 docs/guides/notebooks/data_connectors/ObsidianReaderDemo.nblink create mode 100644 docs/guides/notebooks/data_connectors/PineconeDemo.nblink create mode 100644 docs/guides/notebooks/data_connectors/QdrantDemo.nblink 
create mode 100644 docs/guides/notebooks/data_connectors/SlackDemo.nblink create mode 100644 docs/guides/notebooks/data_connectors/TwitterDemo.nblink create mode 100644 docs/guides/notebooks/data_connectors/WeaviateDemo.nblink create mode 100644 docs/guides/notebooks/data_connectors/WebPageDemo.nblink create mode 100644 docs/guides/notebooks/docstore/DocstoreDemo.nblink create mode 100644 docs/guides/notebooks/evaluation/GuardrailsDemo.nblink create mode 100644 docs/guides/notebooks/evaluation/LangchainOutputParserDemo.nblink create mode 100644 docs/guides/notebooks/evaluation/TestNYC-Evaluation.nblink create mode 100644 docs/guides/notebooks/gatsby/TestGatsby.nblink create mode 100644 docs/guides/notebooks/knowledge_graph/KnowledgeGraphDemo.nblink create mode 100644 docs/guides/notebooks/langchain_demo/LangchainDemo.nblink create mode 100644 docs/guides/notebooks/multimodal/Multimodal.nblink create mode 100644 docs/guides/notebooks/node_postprocessor/NodePostprocessorDemo.nblink create mode 100644 docs/guides/notebooks/optimizer/OptimizerDemo.nblink create mode 100644 docs/guides/notebooks/paul_graham_essay/DavinciComparison.nblink create mode 100644 docs/guides/notebooks/paul_graham_essay/GPT4Comparison.nblink create mode 100644 docs/guides/notebooks/paul_graham_essay/InsertDemo.nblink create mode 100644 docs/guides/notebooks/paul_graham_essay/KeywordTableComparison.nblink create mode 100644 docs/guides/notebooks/paul_graham_essay/SentenceSplittingDemo.nblink create mode 100644 docs/guides/notebooks/paul_graham_essay/TestEssay.nblink create mode 100644 docs/guides/notebooks/playground/PlaygroundDemo.nblink create mode 100644 docs/guides/notebooks/query_transformations/HyDEQueryTransformDemo.nblink create mode 100644 docs/guides/notebooks/struct_indices/PandasIndexDemo.nblink create mode 100644 docs/guides/notebooks/struct_indices/SQLIndexDemo-Context.nblink create mode 100644 docs/guides/notebooks/struct_indices/SQLIndexDemo-ManyTables.nblink create mode 100644 
docs/guides/notebooks/struct_indices/SQLIndexDemo.nblink create mode 100644 docs/guides/notebooks/test_wiki/TestNYC-Benchmark-GPT4.nblink create mode 100644 docs/guides/notebooks/test_wiki/TestNYC-Tree-GPT4.nblink create mode 100644 docs/guides/notebooks/test_wiki/TestNYC.nblink create mode 100644 docs/guides/notebooks/test_wiki/TestNYC_Embeddings.nblink create mode 100644 docs/guides/notebooks/test_wiki/TestWikiReader.nblink create mode 100644 docs/guides/notebooks/vector_indices/AsyncIndexCreationDemo.nblink create mode 100644 docs/guides/notebooks/vector_indices/ChromaIndexDemo.nblink create mode 100644 docs/guides/notebooks/vector_indices/FaissIndexDemo.nblink create mode 100644 docs/guides/notebooks/vector_indices/OpensearchDemo.nblink create mode 100644 docs/guides/notebooks/vector_indices/PineconeIndexDemo.nblink create mode 100644 docs/guides/notebooks/vector_indices/QdrantIndexDemo.nblink create mode 100644 docs/guides/notebooks/vector_indices/SimpleIndexDemo-ChatGPT.nblink create mode 100644 docs/guides/notebooks/vector_indices/SimpleIndexDemo-multistep.nblink create mode 100644 docs/guides/notebooks/vector_indices/SimpleIndexDemo-streaming.nblink create mode 100644 docs/guides/notebooks/vector_indices/SimpleIndexDemo.nblink create mode 100644 docs/guides/notebooks/vector_indices/WeaviateIndexDemo.nblink diff --git a/.github/workflows/dev_docs.yml b/.github/workflows/dev_docs.yml index f0b2e907ff..e1cbb57ad8 100644 --- a/.github/workflows/dev_docs.yml +++ b/.github/workflows/dev_docs.yml @@ -10,9 +10,11 @@ jobs: steps: - uses: actions/checkout@v2 - uses: cpina/github-action-push-to-another-repository@main + - uses: awalsh128/cache-apt-pkgs-action@latest env: API_TOKEN_GITHUB: ${{ secrets.PAT}} with: + packages: pandoc source-directory: './docs' destination-github-username: 'avb-is-me' destination-repository-name: 'gpt_index' diff --git a/.gitignore b/.gitignore index 0900a937e6..c31c85352c 100644 --- a/.gitignore +++ b/.gitignore @@ -82,7 +82,6 @@ target/ 
# Jupyter Notebook .ipynb_checkpoints -notebooks/ # IPython profile_default/ @@ -138,4 +137,4 @@ dmypy.json # Jetbrains .idea -modules/ \ No newline at end of file +modules/ diff --git a/docs/Makefile b/docs/Makefile index d4bb2cbb9e..d7e778614b 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -17,4 +17,5 @@ help: # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile + python ./build_notebooks.py @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/build_notebooks.py b/docs/build_notebooks.py new file mode 100644 index 0000000000..ed7fb0ca33 --- /dev/null +++ b/docs/build_notebooks.py @@ -0,0 +1,27 @@ +import json +import os + +source_dir = "../examples/" +dest_dir = "./guides/notebooks/" +relative_path = "../../../../examples" + + +for example_dir in os.listdir(source_dir): + example_dir_path = os.path.join(source_dir, example_dir) + + for nb_name in os.listdir(example_dir_path): + if not nb_name.endswith(".ipynb"): + continue + + # make dest folder in docs + os.makedirs(os.path.join(dest_dir, example_dir), exist_ok=True) + + # build link text + relative_nb_path = os.path.join(relative_path, example_dir, nb_name) + nb_link_text = json.dumps({"path": relative_nb_path}) + + # write nbsphinx-link document + nbsphinx_name = nb_name.replace(".ipynb", ".nblink") + nbsphinx_path = os.path.join(dest_dir, example_dir, nbsphinx_name) + with open(nbsphinx_path, "w") as f: + f.write(nb_link_text) diff --git a/docs/conf.py b/docs/conf.py index 4d37d641e5..f989990cf0 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -37,6 +37,8 @@ extensions = [ "sphinx_rtd_theme", "sphinx.ext.mathjax", "myst_parser", + "nbsphinx", + "nbsphinx_link", ] myst_heading_anchors = 4 @@ -45,7 +47,7 @@ myst_heading_anchors = 4 suppress_warnings = ["myst.header"] templates_path = ["_templates"] -exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] +exclude_patterns = 
["_build", "Thumbs.db", ".DS_Store", "**.ipynb_checkpoints"] # -- Options for HTML output ------------------------------------------------- @@ -53,3 +55,7 @@ exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] html_theme = "sphinx_rtd_theme" html_static_path = ["_static"] + +# nbsphinx options +nbsphinx_execute = "never" +nbsphinx_allow_errors = True diff --git a/docs/guides/notebooks.rst b/docs/guides/notebooks.rst index 5fae146bab..5e6773fabc 100644 --- a/docs/guides/notebooks.rst +++ b/docs/guides/notebooks.rst @@ -3,4 +3,173 @@ Notebooks We offer a wide variety of example notebooks. They are referenced throughout the documentation. -Example notebooks are found `here <https://github.com/jerryjliu/gpt_index/tree/main/examples>`_. \ No newline at end of file +All examples can be accessed using the menu links or can be found `directly in the repository`_. + +.. _directly in the repository: https://github.com/jerryjliu/llama_index/tree/main/examples + +.. toctree:: + :glob: + :maxdepth: 1 + :caption: Async + + ./notebooks/async/* + + +.. toctree:: + :glob: + :maxdepth: 1 + :caption: Azure + + notebooks/azure_demo/* + + +.. toctree:: + :glob: + :maxdepth: 1 + :caption: Chatbot + + notebooks/chatbot/* + + +.. toctree:: + :glob: + :maxdepth: 1 + :caption: ChatGPT Plugin + + notebooks/chatgpt_plugin/* + + +.. toctree:: + :glob: + :maxdepth: 1 + :caption: Composable Indices + + notebooks/composable_indices/* + + +.. toctree:: + :glob: + :maxdepth: 1 + :caption: Cost Analysis + + notebooks/cost_analysis/* + + +.. toctree:: + :glob: + :maxdepth: 1 + :caption: Data Connectors + + notebooks/data_connectors/* + + +.. toctree:: + :glob: + :maxdepth: 1 + :caption: Docstore + + notebooks/docstore/* + + +.. toctree:: + :glob: + :maxdepth: 1 + :caption: Evaluation + + notebooks/evaluation/* + + +.. toctree:: + :glob: + :maxdepth: 1 + :caption: Gatsby + + notebooks/gatsby/* + + +.. 
toctree:: + :glob: + :maxdepth: 1 + :caption: Knowledge Graph + + notebooks/knowledge_graph/* + + +.. toctree:: + :glob: + :maxdepth: 1 + :caption: Langchain Integration + + notebooks/langchain_demo/* + + +.. toctree:: + :glob: + :maxdepth: 1 + :caption: Multimodal + + notebooks/multimodal/* + + +.. toctree:: + :glob: + :maxdepth: 1 + :caption: Node Post-Processor + + notebooks/node_postprocessor/* + + +.. toctree:: + :glob: + :maxdepth: 1 + :caption: Optimizer + + notebooks/optimizer/* + + +.. toctree:: + :glob: + :maxdepth: 1 + :caption: Paul Graham Essay + + notebooks/paul_graham_essay/* + + +.. toctree:: + :glob: + :maxdepth: 1 + :caption: Playground + + notebooks/playground/* + + +.. toctree:: + :glob: + :maxdepth: 1 + :caption: Query Transformations + + notebooks/query_transformations/* + + +.. toctree:: + :glob: + :maxdepth: 1 + :caption: Structured Indices + + notebooks/struct_indices/* + + +.. toctree:: + :glob: + :maxdepth: 1 + :caption: Wikipedia + + notebooks/test_wiki/* + + +.. 
toctree:: + :glob: + :maxdepth: 1 + :caption: Vector Indices + + notebooks/vector_indices/* diff --git a/docs/guides/notebooks/async/AsyncComposableIndicesSEC.nblink b/docs/guides/notebooks/async/AsyncComposableIndicesSEC.nblink new file mode 100644 index 0000000000..27f243d52e --- /dev/null +++ b/docs/guides/notebooks/async/AsyncComposableIndicesSEC.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/async/AsyncComposableIndicesSEC.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/async/AsyncGPTTreeIndexDemo.nblink b/docs/guides/notebooks/async/AsyncGPTTreeIndexDemo.nblink new file mode 100644 index 0000000000..a553014c27 --- /dev/null +++ b/docs/guides/notebooks/async/AsyncGPTTreeIndexDemo.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/async/AsyncGPTTreeIndexDemo.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/async/AsyncLLMPredictorDemo.nblink b/docs/guides/notebooks/async/AsyncLLMPredictorDemo.nblink new file mode 100644 index 0000000000..a61033974b --- /dev/null +++ b/docs/guides/notebooks/async/AsyncLLMPredictorDemo.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/async/AsyncLLMPredictorDemo.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/async/AsyncQueryDemo.nblink b/docs/guides/notebooks/async/AsyncQueryDemo.nblink new file mode 100644 index 0000000000..e5a19ec1c4 --- /dev/null +++ b/docs/guides/notebooks/async/AsyncQueryDemo.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/async/AsyncQueryDemo.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/azure_demo/AzureOpenAI.nblink b/docs/guides/notebooks/azure_demo/AzureOpenAI.nblink new file mode 100644 index 0000000000..318e9bf7fe --- /dev/null +++ b/docs/guides/notebooks/azure_demo/AzureOpenAI.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/azure_demo/AzureOpenAI.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/chatbot/Chatbot_SEC.nblink 
b/docs/guides/notebooks/chatbot/Chatbot_SEC.nblink new file mode 100644 index 0000000000..5156a52750 --- /dev/null +++ b/docs/guides/notebooks/chatbot/Chatbot_SEC.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/chatbot/Chatbot_SEC.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/chatgpt_plugin/ChatGPTRetrievalPluginIndexDemo.nblink b/docs/guides/notebooks/chatgpt_plugin/ChatGPTRetrievalPluginIndexDemo.nblink new file mode 100644 index 0000000000..424014f345 --- /dev/null +++ b/docs/guides/notebooks/chatgpt_plugin/ChatGPTRetrievalPluginIndexDemo.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/chatgpt_plugin/ChatGPTRetrievalPluginIndexDemo.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/chatgpt_plugin/ChatGPTRetrievalPluginReaderDemo.nblink b/docs/guides/notebooks/chatgpt_plugin/ChatGPTRetrievalPluginReaderDemo.nblink new file mode 100644 index 0000000000..bcad633aa0 --- /dev/null +++ b/docs/guides/notebooks/chatgpt_plugin/ChatGPTRetrievalPluginReaderDemo.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/chatgpt_plugin/ChatGPTRetrievalPluginReaderDemo.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/chatgpt_plugin/ChatGPT_Retrieval_Plugin_Upload.nblink b/docs/guides/notebooks/chatgpt_plugin/ChatGPT_Retrieval_Plugin_Upload.nblink new file mode 100644 index 0000000000..526d471354 --- /dev/null +++ b/docs/guides/notebooks/chatgpt_plugin/ChatGPT_Retrieval_Plugin_Upload.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/chatgpt_plugin/ChatGPT_Retrieval_Plugin_Upload.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/composable_indices/ComposableIndices-Prior.nblink b/docs/guides/notebooks/composable_indices/ComposableIndices-Prior.nblink new file mode 100644 index 0000000000..c5dec604c4 --- /dev/null +++ b/docs/guides/notebooks/composable_indices/ComposableIndices-Prior.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/composable_indices/ComposableIndices-Prior.ipynb"} 
\ No newline at end of file diff --git a/docs/guides/notebooks/composable_indices/ComposableIndices-Weaviate.nblink b/docs/guides/notebooks/composable_indices/ComposableIndices-Weaviate.nblink new file mode 100644 index 0000000000..bccfa5cc2d --- /dev/null +++ b/docs/guides/notebooks/composable_indices/ComposableIndices-Weaviate.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/composable_indices/ComposableIndices-Weaviate.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/composable_indices/ComposableIndices.nblink b/docs/guides/notebooks/composable_indices/ComposableIndices.nblink new file mode 100644 index 0000000000..edab307b2b --- /dev/null +++ b/docs/guides/notebooks/composable_indices/ComposableIndices.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/composable_indices/ComposableIndices.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/composable_indices/QASummaryGraph.nblink b/docs/guides/notebooks/composable_indices/QASummaryGraph.nblink new file mode 100644 index 0000000000..1648510d8d --- /dev/null +++ b/docs/guides/notebooks/composable_indices/QASummaryGraph.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/composable_indices/QASummaryGraph.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/cost_analysis/TokenPredictor.nblink b/docs/guides/notebooks/cost_analysis/TokenPredictor.nblink new file mode 100644 index 0000000000..915e93b1da --- /dev/null +++ b/docs/guides/notebooks/cost_analysis/TokenPredictor.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/cost_analysis/TokenPredictor.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/data_connectors/ChromaDemo.nblink b/docs/guides/notebooks/data_connectors/ChromaDemo.nblink new file mode 100644 index 0000000000..6d11f643b1 --- /dev/null +++ b/docs/guides/notebooks/data_connectors/ChromaDemo.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/data_connectors/ChromaDemo.ipynb"} \ No newline at end of file diff 
--git a/docs/guides/notebooks/data_connectors/DatabaseReaderDemo.nblink b/docs/guides/notebooks/data_connectors/DatabaseReaderDemo.nblink new file mode 100644 index 0000000000..287ef60779 --- /dev/null +++ b/docs/guides/notebooks/data_connectors/DatabaseReaderDemo.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/data_connectors/DatabaseReaderDemo.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/data_connectors/DiscordDemo.nblink b/docs/guides/notebooks/data_connectors/DiscordDemo.nblink new file mode 100644 index 0000000000..1d4952cb11 --- /dev/null +++ b/docs/guides/notebooks/data_connectors/DiscordDemo.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/data_connectors/DiscordDemo.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/data_connectors/FaissDemo.nblink b/docs/guides/notebooks/data_connectors/FaissDemo.nblink new file mode 100644 index 0000000000..1151a573b4 --- /dev/null +++ b/docs/guides/notebooks/data_connectors/FaissDemo.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/data_connectors/FaissDemo.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/data_connectors/GithubRepositoryReaderDemo.nblink b/docs/guides/notebooks/data_connectors/GithubRepositoryReaderDemo.nblink new file mode 100644 index 0000000000..689936b1da --- /dev/null +++ b/docs/guides/notebooks/data_connectors/GithubRepositoryReaderDemo.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/data_connectors/GithubRepositoryReaderDemo.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/data_connectors/GoogleDocsDemo.nblink b/docs/guides/notebooks/data_connectors/GoogleDocsDemo.nblink new file mode 100644 index 0000000000..60f687b022 --- /dev/null +++ b/docs/guides/notebooks/data_connectors/GoogleDocsDemo.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/data_connectors/GoogleDocsDemo.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/data_connectors/MakeDemo.nblink 
b/docs/guides/notebooks/data_connectors/MakeDemo.nblink new file mode 100644 index 0000000000..edf9cd5a93 --- /dev/null +++ b/docs/guides/notebooks/data_connectors/MakeDemo.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/data_connectors/MakeDemo.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/data_connectors/MboxReaderDemo.nblink b/docs/guides/notebooks/data_connectors/MboxReaderDemo.nblink new file mode 100644 index 0000000000..beeb32654d --- /dev/null +++ b/docs/guides/notebooks/data_connectors/MboxReaderDemo.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/data_connectors/MboxReaderDemo.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/data_connectors/MongoDemo.nblink b/docs/guides/notebooks/data_connectors/MongoDemo.nblink new file mode 100644 index 0000000000..e627ac29f5 --- /dev/null +++ b/docs/guides/notebooks/data_connectors/MongoDemo.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/data_connectors/MongoDemo.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/data_connectors/NotionDemo.nblink b/docs/guides/notebooks/data_connectors/NotionDemo.nblink new file mode 100644 index 0000000000..401c1c4873 --- /dev/null +++ b/docs/guides/notebooks/data_connectors/NotionDemo.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/data_connectors/NotionDemo.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/data_connectors/ObsidianReaderDemo.nblink b/docs/guides/notebooks/data_connectors/ObsidianReaderDemo.nblink new file mode 100644 index 0000000000..6faffd22b6 --- /dev/null +++ b/docs/guides/notebooks/data_connectors/ObsidianReaderDemo.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/data_connectors/ObsidianReaderDemo.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/data_connectors/PineconeDemo.nblink b/docs/guides/notebooks/data_connectors/PineconeDemo.nblink new file mode 100644 index 0000000000..a5a213bb9a --- /dev/null +++ 
b/docs/guides/notebooks/data_connectors/PineconeDemo.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/data_connectors/PineconeDemo.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/data_connectors/QdrantDemo.nblink b/docs/guides/notebooks/data_connectors/QdrantDemo.nblink new file mode 100644 index 0000000000..900ea79354 --- /dev/null +++ b/docs/guides/notebooks/data_connectors/QdrantDemo.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/data_connectors/QdrantDemo.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/data_connectors/SlackDemo.nblink b/docs/guides/notebooks/data_connectors/SlackDemo.nblink new file mode 100644 index 0000000000..231dbf937f --- /dev/null +++ b/docs/guides/notebooks/data_connectors/SlackDemo.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/data_connectors/SlackDemo.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/data_connectors/TwitterDemo.nblink b/docs/guides/notebooks/data_connectors/TwitterDemo.nblink new file mode 100644 index 0000000000..0b208c3f90 --- /dev/null +++ b/docs/guides/notebooks/data_connectors/TwitterDemo.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/data_connectors/TwitterDemo.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/data_connectors/WeaviateDemo.nblink b/docs/guides/notebooks/data_connectors/WeaviateDemo.nblink new file mode 100644 index 0000000000..ab9e2427f6 --- /dev/null +++ b/docs/guides/notebooks/data_connectors/WeaviateDemo.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/data_connectors/WeaviateDemo.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/data_connectors/WebPageDemo.nblink b/docs/guides/notebooks/data_connectors/WebPageDemo.nblink new file mode 100644 index 0000000000..829d3854eb --- /dev/null +++ b/docs/guides/notebooks/data_connectors/WebPageDemo.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/data_connectors/WebPageDemo.ipynb"} \ No newline at end of file diff 
--git a/docs/guides/notebooks/docstore/DocstoreDemo.nblink b/docs/guides/notebooks/docstore/DocstoreDemo.nblink new file mode 100644 index 0000000000..3f379a2f68 --- /dev/null +++ b/docs/guides/notebooks/docstore/DocstoreDemo.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/docstore/DocstoreDemo.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/evaluation/GuardrailsDemo.nblink b/docs/guides/notebooks/evaluation/GuardrailsDemo.nblink new file mode 100644 index 0000000000..c67b10a4b4 --- /dev/null +++ b/docs/guides/notebooks/evaluation/GuardrailsDemo.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/evaluation/GuardrailsDemo.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/evaluation/LangchainOutputParserDemo.nblink b/docs/guides/notebooks/evaluation/LangchainOutputParserDemo.nblink new file mode 100644 index 0000000000..a7a4722097 --- /dev/null +++ b/docs/guides/notebooks/evaluation/LangchainOutputParserDemo.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/evaluation/LangchainOutputParserDemo.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/evaluation/TestNYC-Evaluation.nblink b/docs/guides/notebooks/evaluation/TestNYC-Evaluation.nblink new file mode 100644 index 0000000000..977b921aa1 --- /dev/null +++ b/docs/guides/notebooks/evaluation/TestNYC-Evaluation.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/evaluation/TestNYC-Evaluation.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/gatsby/TestGatsby.nblink b/docs/guides/notebooks/gatsby/TestGatsby.nblink new file mode 100644 index 0000000000..45dbaa9346 --- /dev/null +++ b/docs/guides/notebooks/gatsby/TestGatsby.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/gatsby/TestGatsby.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/knowledge_graph/KnowledgeGraphDemo.nblink b/docs/guides/notebooks/knowledge_graph/KnowledgeGraphDemo.nblink new file mode 100644 index 0000000000..44bcf85d64 --- 
/dev/null +++ b/docs/guides/notebooks/knowledge_graph/KnowledgeGraphDemo.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/knowledge_graph/KnowledgeGraphDemo.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/langchain_demo/LangchainDemo.nblink b/docs/guides/notebooks/langchain_demo/LangchainDemo.nblink new file mode 100644 index 0000000000..251948c9bf --- /dev/null +++ b/docs/guides/notebooks/langchain_demo/LangchainDemo.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/langchain_demo/LangchainDemo.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/multimodal/Multimodal.nblink b/docs/guides/notebooks/multimodal/Multimodal.nblink new file mode 100644 index 0000000000..db221882e8 --- /dev/null +++ b/docs/guides/notebooks/multimodal/Multimodal.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/multimodal/Multimodal.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/node_postprocessor/NodePostprocessorDemo.nblink b/docs/guides/notebooks/node_postprocessor/NodePostprocessorDemo.nblink new file mode 100644 index 0000000000..bb855d8169 --- /dev/null +++ b/docs/guides/notebooks/node_postprocessor/NodePostprocessorDemo.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/node_postprocessor/NodePostprocessorDemo.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/optimizer/OptimizerDemo.nblink b/docs/guides/notebooks/optimizer/OptimizerDemo.nblink new file mode 100644 index 0000000000..c9d893cf65 --- /dev/null +++ b/docs/guides/notebooks/optimizer/OptimizerDemo.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/optimizer/OptimizerDemo.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/paul_graham_essay/DavinciComparison.nblink b/docs/guides/notebooks/paul_graham_essay/DavinciComparison.nblink new file mode 100644 index 0000000000..6d585c00e7 --- /dev/null +++ b/docs/guides/notebooks/paul_graham_essay/DavinciComparison.nblink @@ -0,0 +1 @@ +{"path": 
"../../../../examples/paul_graham_essay/DavinciComparison.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/paul_graham_essay/GPT4Comparison.nblink b/docs/guides/notebooks/paul_graham_essay/GPT4Comparison.nblink new file mode 100644 index 0000000000..63768b3834 --- /dev/null +++ b/docs/guides/notebooks/paul_graham_essay/GPT4Comparison.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/paul_graham_essay/GPT4Comparison.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/paul_graham_essay/InsertDemo.nblink b/docs/guides/notebooks/paul_graham_essay/InsertDemo.nblink new file mode 100644 index 0000000000..94c7766261 --- /dev/null +++ b/docs/guides/notebooks/paul_graham_essay/InsertDemo.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/paul_graham_essay/InsertDemo.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/paul_graham_essay/KeywordTableComparison.nblink b/docs/guides/notebooks/paul_graham_essay/KeywordTableComparison.nblink new file mode 100644 index 0000000000..df536641f0 --- /dev/null +++ b/docs/guides/notebooks/paul_graham_essay/KeywordTableComparison.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/paul_graham_essay/KeywordTableComparison.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/paul_graham_essay/SentenceSplittingDemo.nblink b/docs/guides/notebooks/paul_graham_essay/SentenceSplittingDemo.nblink new file mode 100644 index 0000000000..ad1fb314c2 --- /dev/null +++ b/docs/guides/notebooks/paul_graham_essay/SentenceSplittingDemo.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/paul_graham_essay/SentenceSplittingDemo.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/paul_graham_essay/TestEssay.nblink b/docs/guides/notebooks/paul_graham_essay/TestEssay.nblink new file mode 100644 index 0000000000..e2988e067b --- /dev/null +++ b/docs/guides/notebooks/paul_graham_essay/TestEssay.nblink @@ -0,0 +1 @@ +{"path": 
"../../../../examples/paul_graham_essay/TestEssay.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/playground/PlaygroundDemo.nblink b/docs/guides/notebooks/playground/PlaygroundDemo.nblink new file mode 100644 index 0000000000..b19e6d309e --- /dev/null +++ b/docs/guides/notebooks/playground/PlaygroundDemo.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/playground/PlaygroundDemo.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/query_transformations/HyDEQueryTransformDemo.nblink b/docs/guides/notebooks/query_transformations/HyDEQueryTransformDemo.nblink new file mode 100644 index 0000000000..c7e84b9aa4 --- /dev/null +++ b/docs/guides/notebooks/query_transformations/HyDEQueryTransformDemo.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/query_transformations/HyDEQueryTransformDemo.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/struct_indices/PandasIndexDemo.nblink b/docs/guides/notebooks/struct_indices/PandasIndexDemo.nblink new file mode 100644 index 0000000000..d6ec3f659c --- /dev/null +++ b/docs/guides/notebooks/struct_indices/PandasIndexDemo.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/struct_indices/PandasIndexDemo.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/struct_indices/SQLIndexDemo-Context.nblink b/docs/guides/notebooks/struct_indices/SQLIndexDemo-Context.nblink new file mode 100644 index 0000000000..794811f2b9 --- /dev/null +++ b/docs/guides/notebooks/struct_indices/SQLIndexDemo-Context.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/struct_indices/SQLIndexDemo-Context.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/struct_indices/SQLIndexDemo-ManyTables.nblink b/docs/guides/notebooks/struct_indices/SQLIndexDemo-ManyTables.nblink new file mode 100644 index 0000000000..482a9d0f10 --- /dev/null +++ b/docs/guides/notebooks/struct_indices/SQLIndexDemo-ManyTables.nblink @@ -0,0 +1 @@ +{"path": 
"../../../../examples/struct_indices/SQLIndexDemo-ManyTables.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/struct_indices/SQLIndexDemo.nblink b/docs/guides/notebooks/struct_indices/SQLIndexDemo.nblink new file mode 100644 index 0000000000..e2651cecc7 --- /dev/null +++ b/docs/guides/notebooks/struct_indices/SQLIndexDemo.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/struct_indices/SQLIndexDemo.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/test_wiki/TestNYC-Benchmark-GPT4.nblink b/docs/guides/notebooks/test_wiki/TestNYC-Benchmark-GPT4.nblink new file mode 100644 index 0000000000..e5e3f3eb46 --- /dev/null +++ b/docs/guides/notebooks/test_wiki/TestNYC-Benchmark-GPT4.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/test_wiki/TestNYC-Benchmark-GPT4.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/test_wiki/TestNYC-Tree-GPT4.nblink b/docs/guides/notebooks/test_wiki/TestNYC-Tree-GPT4.nblink new file mode 100644 index 0000000000..bbad368cb3 --- /dev/null +++ b/docs/guides/notebooks/test_wiki/TestNYC-Tree-GPT4.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/test_wiki/TestNYC-Tree-GPT4.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/test_wiki/TestNYC.nblink b/docs/guides/notebooks/test_wiki/TestNYC.nblink new file mode 100644 index 0000000000..9af98cb135 --- /dev/null +++ b/docs/guides/notebooks/test_wiki/TestNYC.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/test_wiki/TestNYC.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/test_wiki/TestNYC_Embeddings.nblink b/docs/guides/notebooks/test_wiki/TestNYC_Embeddings.nblink new file mode 100644 index 0000000000..9a81277e90 --- /dev/null +++ b/docs/guides/notebooks/test_wiki/TestNYC_Embeddings.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/test_wiki/TestNYC_Embeddings.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/test_wiki/TestWikiReader.nblink 
b/docs/guides/notebooks/test_wiki/TestWikiReader.nblink new file mode 100644 index 0000000000..b263c6cf36 --- /dev/null +++ b/docs/guides/notebooks/test_wiki/TestWikiReader.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/test_wiki/TestWikiReader.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/vector_indices/AsyncIndexCreationDemo.nblink b/docs/guides/notebooks/vector_indices/AsyncIndexCreationDemo.nblink new file mode 100644 index 0000000000..3a68f8d664 --- /dev/null +++ b/docs/guides/notebooks/vector_indices/AsyncIndexCreationDemo.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/vector_indices/AsyncIndexCreationDemo.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/vector_indices/ChromaIndexDemo.nblink b/docs/guides/notebooks/vector_indices/ChromaIndexDemo.nblink new file mode 100644 index 0000000000..15ff10564b --- /dev/null +++ b/docs/guides/notebooks/vector_indices/ChromaIndexDemo.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/vector_indices/ChromaIndexDemo.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/vector_indices/FaissIndexDemo.nblink b/docs/guides/notebooks/vector_indices/FaissIndexDemo.nblink new file mode 100644 index 0000000000..4b3d64aa54 --- /dev/null +++ b/docs/guides/notebooks/vector_indices/FaissIndexDemo.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/vector_indices/FaissIndexDemo.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/vector_indices/OpensearchDemo.nblink b/docs/guides/notebooks/vector_indices/OpensearchDemo.nblink new file mode 100644 index 0000000000..088df7a9c4 --- /dev/null +++ b/docs/guides/notebooks/vector_indices/OpensearchDemo.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/vector_indices/OpensearchDemo.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/vector_indices/PineconeIndexDemo.nblink b/docs/guides/notebooks/vector_indices/PineconeIndexDemo.nblink new file mode 100644 index 
0000000000..3a97fa698c --- /dev/null +++ b/docs/guides/notebooks/vector_indices/PineconeIndexDemo.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/vector_indices/PineconeIndexDemo.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/vector_indices/QdrantIndexDemo.nblink b/docs/guides/notebooks/vector_indices/QdrantIndexDemo.nblink new file mode 100644 index 0000000000..672467f404 --- /dev/null +++ b/docs/guides/notebooks/vector_indices/QdrantIndexDemo.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/vector_indices/QdrantIndexDemo.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/vector_indices/SimpleIndexDemo-ChatGPT.nblink b/docs/guides/notebooks/vector_indices/SimpleIndexDemo-ChatGPT.nblink new file mode 100644 index 0000000000..a8dfbb2753 --- /dev/null +++ b/docs/guides/notebooks/vector_indices/SimpleIndexDemo-ChatGPT.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/vector_indices/SimpleIndexDemo-ChatGPT.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/vector_indices/SimpleIndexDemo-multistep.nblink b/docs/guides/notebooks/vector_indices/SimpleIndexDemo-multistep.nblink new file mode 100644 index 0000000000..a854f9e016 --- /dev/null +++ b/docs/guides/notebooks/vector_indices/SimpleIndexDemo-multistep.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/vector_indices/SimpleIndexDemo-multistep.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/vector_indices/SimpleIndexDemo-streaming.nblink b/docs/guides/notebooks/vector_indices/SimpleIndexDemo-streaming.nblink new file mode 100644 index 0000000000..765f138608 --- /dev/null +++ b/docs/guides/notebooks/vector_indices/SimpleIndexDemo-streaming.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/vector_indices/SimpleIndexDemo-streaming.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/vector_indices/SimpleIndexDemo.nblink b/docs/guides/notebooks/vector_indices/SimpleIndexDemo.nblink new file mode 100644 
index 0000000000..8940a12d2d --- /dev/null +++ b/docs/guides/notebooks/vector_indices/SimpleIndexDemo.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/vector_indices/SimpleIndexDemo.ipynb"} \ No newline at end of file diff --git a/docs/guides/notebooks/vector_indices/WeaviateIndexDemo.nblink b/docs/guides/notebooks/vector_indices/WeaviateIndexDemo.nblink new file mode 100644 index 0000000000..dc2ac1abb3 --- /dev/null +++ b/docs/guides/notebooks/vector_indices/WeaviateIndexDemo.nblink @@ -0,0 +1 @@ +{"path": "../../../../examples/vector_indices/WeaviateIndexDemo.ipynb"} \ No newline at end of file diff --git a/docs/requirements.txt b/docs/requirements.txt index 36e59f2fc6..b98b8d7ef8 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -2,4 +2,8 @@ sphinx>=4.3.0 sphinx_rtd_theme>=0.5.1 docutils<0.17 -myst-parser \ No newline at end of file +myst-parser +nbsphinx +nbsphinx-link +pandoc +ipython diff --git a/examples/async/AsyncComposableIndicesSEC.ipynb b/examples/async/AsyncComposableIndicesSEC.ipynb index 94bd2be0eb..3a34b977a1 100644 --- a/examples/async/AsyncComposableIndicesSEC.ipynb +++ b/examples/async/AsyncComposableIndicesSEC.ipynb @@ -1,5 +1,13 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "259b03ca", + "metadata": {}, + "source": [ + "# Async Composable Indices SEC Demo" + ] + }, { "cell_type": "code", "execution_count": 2, @@ -460,7 +468,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.10" + "version": "3.11.0" }, "widgets": { "application/vnd.jupyter.widget-state+json": { diff --git a/examples/async/AsyncGPTTreeIndexDemo.ipynb b/examples/async/AsyncGPTTreeIndexDemo.ipynb index 2c15133bfc..1fd2db130b 100644 --- a/examples/async/AsyncGPTTreeIndexDemo.ipynb +++ b/examples/async/AsyncGPTTreeIndexDemo.ipynb @@ -1,136 +1,136 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "96b2b1e4", - "metadata": {}, - "source": [ - "# Async GPTTreeIndex Demo" - ] - }, - { - "cell_type": 
"code", - "execution_count": null, - "id": "9331cfeb", - "metadata": {}, - "outputs": [], - "source": [ - "# NOTE: This is ONLY necessary in jupyter notebook.\n", - "# Details: Jupyter runs an event-loop behind the scenes. \n", - "# This results in nested event-loops when we start an event-loop to make async queries.\n", - "# This is normally not allowed, we use nest_asyncio to allow it for convenience. \n", - "import nest_asyncio\n", - "nest_asyncio.apply()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8a1d2821", - "metadata": {}, - "outputs": [], - "source": [ - "import time\n", - "from llama_index import GPTTreeIndex, SimpleDirectoryReader" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6948df36", - "metadata": {}, - "outputs": [], - "source": [ - "# load documents\n", - "documents = SimpleDirectoryReader('../paul_graham_essay/data').load_data()" - ] - }, - { - "cell_type": "markdown", - "id": "2d9115d1", - "metadata": {}, - "source": [ - "#### By default, GPTTreeIndex makes blocking LLM calls" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d9ef0fef", - "metadata": {}, - "outputs": [], - "source": [ - "start_time = time.perf_counter()\n", - "index = GPTTreeIndex.from_documents(documents)\n", - "elapsed_time = time.perf_counter() - start_time" - ] - }, - { - "cell_type": "markdown", - "id": "9392d573", - "metadata": {}, - "source": [ - "It takes ~47s to finish building GPTTreeIndex from 5 text chunks." 
- ] - }, - { - "cell_type": "markdown", - "id": "474f82d1", - "metadata": {}, - "source": [ - "#### Pass in `use_async=True` to enable asynchronous LLM calls" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "78a02987", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "start_time = time.perf_counter()\n", - "index = GPTTreeIndex.from_documents(documents, use_async=True)\n", - "elapsed_time = time.perf_counter() - start_time" - ] - }, - { - "cell_type": "markdown", - "id": "23469128", - "metadata": {}, - "source": [ - "It takes ~12s to finish building the GPTTreeIndex from 5 text chunks." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "59c1c27e", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.9" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file + "cells": [ + { + "cell_type": "markdown", + "id": "96b2b1e4", + "metadata": {}, + "source": [ + "# Async GPTTreeIndex Demo" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9331cfeb", + "metadata": {}, + "outputs": [], + "source": [ + "# NOTE: This is ONLY necessary in jupyter notebook.\n", + "# Details: Jupyter runs an event-loop behind the scenes. \n", + "# This results in nested event-loops when we start an event-loop to make async queries.\n", + "# This is normally not allowed, we use nest_asyncio to allow it for convenience. 
\n", + "import nest_asyncio\n", + "nest_asyncio.apply()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8a1d2821", + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "from llama_index import GPTTreeIndex, SimpleDirectoryReader" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6948df36", + "metadata": {}, + "outputs": [], + "source": [ + "# load documents\n", + "documents = SimpleDirectoryReader('../paul_graham_essay/data').load_data()" + ] + }, + { + "cell_type": "markdown", + "id": "2d9115d1", + "metadata": {}, + "source": [ + "#### By default, GPTTreeIndex makes blocking LLM calls" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d9ef0fef", + "metadata": {}, + "outputs": [], + "source": [ + "start_time = time.perf_counter()\n", + "index = GPTTreeIndex.from_documents(documents)\n", + "elapsed_time = time.perf_counter() - start_time" + ] + }, + { + "cell_type": "markdown", + "id": "9392d573", + "metadata": {}, + "source": [ + "It takes ~47s to finish building GPTTreeIndex from 5 text chunks." + ] + }, + { + "cell_type": "markdown", + "id": "474f82d1", + "metadata": {}, + "source": [ + "#### Pass in `use_async=True` to enable asynchronous LLM calls" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "78a02987", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "start_time = time.perf_counter()\n", + "index = GPTTreeIndex.from_documents(documents, use_async=True)\n", + "elapsed_time = time.perf_counter() - start_time" + ] + }, + { + "cell_type": "markdown", + "id": "23469128", + "metadata": {}, + "source": [ + "It takes ~12s to finish building the GPTTreeIndex from 5 text chunks." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "59c1c27e", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/chatbot/Chatbot_SEC.ipynb b/examples/chatbot/Chatbot_SEC.ipynb index 140d0bdf4e..fdc5854c2c 100644 --- a/examples/chatbot/Chatbot_SEC.ipynb +++ b/examples/chatbot/Chatbot_SEC.ipynb @@ -1,5 +1,13 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "c2c4b179", + "metadata": {}, + "source": [ + "# Chatbot SEC Tutorial" + ] + }, { "cell_type": "code", "execution_count": 9, @@ -828,7 +836,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.10" + "version": "3.11.0" }, "widgets": { "application/vnd.jupyter.widget-state+json": { diff --git a/examples/chatgpt_plugin/ChatGPTRetrievalPluginIndexDemo.ipynb b/examples/chatgpt_plugin/ChatGPTRetrievalPluginIndexDemo.ipynb index 8931588b2e..49448c76d4 100644 --- a/examples/chatgpt_plugin/ChatGPTRetrievalPluginIndexDemo.ipynb +++ b/examples/chatgpt_plugin/ChatGPTRetrievalPluginIndexDemo.ipynb @@ -228,7 +228,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.10" + "version": "3.11.0" } }, "nbformat": 4, diff --git a/examples/chatgpt_plugin/ChatGPT_Retrieval_Plugin_Upload.ipynb b/examples/chatgpt_plugin/ChatGPT_Retrieval_Plugin_Upload.ipynb index b2da4bb54d..35546b102f 100644 --- a/examples/chatgpt_plugin/ChatGPT_Retrieval_Plugin_Upload.ipynb +++ b/examples/chatgpt_plugin/ChatGPT_Retrieval_Plugin_Upload.ipynb @@ -1,5 +1,13 @@ { "cells": [ + { + 
"cell_type": "markdown", + "id": "470309bd", + "metadata": {}, + "source": [ + "# ChatGPT Retrieval Plugin Upload Demo" + ] + }, { "cell_type": "markdown", "id": "cfb64210-9c6b-47d7-81f4-67dbdab68e4c", @@ -259,7 +267,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.10" + "version": "3.11.0" }, "vscode": { "interpreter": { diff --git a/examples/composable_indices/QASummaryGraph.ipynb b/examples/composable_indices/QASummaryGraph.ipynb index d9a093a3f3..3c2a5e7347 100644 --- a/examples/composable_indices/QASummaryGraph.ipynb +++ b/examples/composable_indices/QASummaryGraph.ipynb @@ -1,5 +1,13 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "ef5c50f0", + "metadata": {}, + "source": [ + "# QA Summary Graph Example" + ] + }, { "cell_type": "code", "execution_count": 1, @@ -431,7 +439,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.10" + "version": "3.11.0" } }, "nbformat": 4, diff --git a/examples/cost_analysis/TokenPredictor.ipynb b/examples/cost_analysis/TokenPredictor.ipynb index e0a6352c80..db64f1c3c9 100644 --- a/examples/cost_analysis/TokenPredictor.ipynb +++ b/examples/cost_analysis/TokenPredictor.ipynb @@ -1,355 +1,355 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "df19606e-d67e-44d2-bed0-4b804e6fc6c3", - "metadata": {}, - "source": [ - "# Using Token Predictors\n", - "\n", - "Using our token predictors, we can predict the token usage of an operation before actually performing it.\n", - "\n", - "We first show how to predict LLM token usage with the MockLLMPredictor class, see below.\n", - "We then show how to also predict embedding token usage." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f1a9eb90-335c-4214-8bb6-fd1edbe3ccbd", - "metadata": {}, - "outputs": [], - "source": [ - "# My OpenAI Key\n", - "import os\n", - "os.environ['OPENAI_API_KEY'] = \"INSERT OPENAI KEY\"" - ] - }, - { - "cell_type": "markdown", - "id": "8a707fa6-d79e-4343-92fd-d0fadb25c466", - "metadata": {}, - "source": [ - "## Using MockLLMPredictor" - ] - }, - { - "cell_type": "markdown", - "id": "be3f7baa-1c0a-430b-981b-83ddca9e71f2", - "metadata": { - "tags": [] - }, - "source": [ - "#### Predicting Usage of GPT Tree Index\n", - "\n", - "Here we predict usage of GPTTreeIndex during index construction and querying, without making any LLM calls.\n", - "\n", - "NOTE: Predicting query usage before tree is built is only possible with GPTTreeIndex due to the nature of tree traversal. Results will be more accurate if GPTTreeIndex is actually built beforehand." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "c0ef16d1-45ef-43ec-9aad-4e44e9bb8578", - "metadata": {}, - "outputs": [], - "source": [ - "from llama_index import GPTTreeIndex, MockLLMPredictor, SimpleDirectoryReader, ServiceContext" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "b2ecdadc-1403-4bd4-a876-f80e4da911ef", - "metadata": {}, - "outputs": [], - "source": [ - "documents = SimpleDirectoryReader('../paul_graham_essay/data').load_data()" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "11056808-fd7f-4bc6-9348-0605fb4ee668", - "metadata": {}, - "outputs": [], - "source": [ - "llm_predictor = MockLLMPredictor(max_tokens=256)\n", - "service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9ea4ba66-9a09-4478-b0a8-dee8645fa4e3", - "metadata": {}, - "outputs": [], - "source": [ - "index = GPTTreeIndex.from_documents(documents, service_context=service_context)" - ] - }, - { - "cell_type": "code", - 
"execution_count": 10, - "id": "345433c2-5553-4645-a513-0186b771a21f", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "19495\n" - ] - } - ], - "source": [ - "print(llm_predictor.last_token_usage)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f43733ae-af35-46e6-99d9-8ba507acbb0d", - "metadata": {}, - "outputs": [], - "source": [ - "# default query\n", - "response = index.query(\"What did the author do growing up?\", service_context=service_context)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "4ba19751-da2d-46af-9f8f-4f42871e65a0", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "5493\n" - ] - } - ], - "source": [ - "print(llm_predictor.last_token_usage)" - ] - }, - { - "cell_type": "markdown", - "id": "4324d85b-ae80-48ab-baf0-7dc160dfae46", - "metadata": {}, - "source": [ - "#### Predicting Usage of GPT Keyword Table Index Query\n", - "\n", - "Here we build a real keyword table index over the data, but then predict query usage." 
- ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "10447805-38db-41b9-a2c6-b0c95437b276", - "metadata": {}, - "outputs": [], - "source": [ - "from llama_index import GPTKeywordTableIndex, MockLLMPredictor, SimpleDirectoryReader" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "8ca76e72-5f43-47c1-a9a4-c5c5db4f0f21", - "metadata": {}, - "outputs": [], - "source": [ - "documents = SimpleDirectoryReader('../paul_graham_essay/data').load_data()\n", - "index = GPTKeywordTableIndex.load_from_disk('../paul_graham_essay/index_table.json')" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "61f48870-65d2-4b23-b57e-79082ecb4ab2", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "start token ct: 0\n", - "> Starting query: What did the author do after his time at Y Combinator?\n", - "query keywords: ['author', 'did', 'y', 'combinator', 'after', 'his', 'the', 'what', 'time', 'at', 'do']\n", - "Extracted keywords: ['combinator']\n", - "> Querying with idx: 3483810247393006047: of 2016 we moved to England. 
We wanted our kids...\n", - "> Querying with idx: 7597483754542696814: people edit code on our server through the brow...\n", - "> Querying with idx: 7572417251450701751: invited about 20 of the 225 groups to interview...\n", - "end token ct: 11313\n", - "> [query] Total token usage: 11313 tokens\n", - "11313\n" - ] - } - ], - "source": [ - "llm_predictor = MockLLMPredictor(max_tokens=256)\n", - "service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)\n", - "response = index.query(\"What did the author do after his time at Y Combinator?\", service_context=service_context)\n", - "print(llm_predictor.last_token_usage)" - ] - }, - { - "cell_type": "markdown", - "id": "0fee4405-05e0-46c2-87bb-64ec63a4c6c1", - "metadata": {}, - "source": [ - "#### Predicting Usage of GPT List Index Query\n", - "\n", - "Here we build a real list index over the data, but then predict query usage." - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "267f2213-67d1-4241-b73f-f1790661d06b", - "metadata": {}, - "outputs": [], - "source": [ - "from llama_index import GPTListIndex, MockLLMPredictor, SimpleDirectoryReader" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "d553a8b1-7045-4756-9729-df84bd305279", - "metadata": {}, - "outputs": [], - "source": [ - "documents = SimpleDirectoryReader('../paul_graham_essay/data').load_data()\n", - "index = GPTListIndex.load_from_disk('../paul_graham_essay/index_list.json')" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "69c99c68-6a23-48ed-aa41-e7af50fef2f3", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "start token ct: 0\n", - "> Starting query: What did the author do after his time at Y Combinator?\n", - "end token ct: 23941\n", - "> [query] Total token usage: 23941 tokens\n" - ] - } - ], - "source": [ - "llm_predictor = MockLLMPredictor(max_tokens=256)\n", - "service_context = 
ServiceContext.from_defaults(llm_predictor=llm_predictor)\n", - "response = index.query(\"What did the author do after his time at Y Combinator?\", service_context=service_context)" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "e8422c5c-af68-4138-a8dd-f6e8d7208c4c", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "23941\n" - ] - } - ], - "source": [ - "print(llm_predictor.last_token_usage)" - ] - }, - { - "cell_type": "markdown", - "id": "1e19cf61-6d6a-4dfa-af78-1ce184f41c6c", - "metadata": {}, - "source": [ - "## Using MockEmbedding" - ] - }, - { - "cell_type": "markdown", - "id": "106d86bf-7725-40bc-84ba-4f273493d3f6", - "metadata": {}, - "source": [ - "#### Predicting Usage of GPT Simple Vector Index" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "9baf0fe7-2c11-4233-a930-4e593433ba84", - "metadata": {}, - "outputs": [], - "source": [ - "from llama_index import GPTSimpleVectorIndex, MockLLMPredictor, MockEmbedding, SimpleDirectoryReader" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "97023361-fa47-4008-b8d7-e66d60c5b263", - "metadata": {}, - "outputs": [], - "source": [ - "documents = SimpleDirectoryReader('../paul_graham_essay/data').load_data()\n", - "index = GPTSimpleVectorIndex.load_from_disk('../paul_graham_essay/index_simple_vec.json')" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "63ebe021-2b9c-4024-95f8-56cd9e7e7c47", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> [query] Total LLM token usage: 4374 tokens\n", - "> [query] Total embedding token usage: 14 tokens\n" - ] - } - ], - "source": [ - "llm_predictor = MockLLMPredictor(max_tokens=256)\n", - "embed_model = MockEmbedding(embed_dim=1536)\n", - "service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, embed_model=embed_model)\n", - "response = index.query(\n", - " \"What did the author do 
after his time at Y Combinator?\",\n", - " service_context=service_context,\n", - ")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "gpt_retrieve_venv", - "language": "python", - "name": "gpt_retrieve_venv" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.16" - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "cells": [ + { + "cell_type": "markdown", + "id": "df19606e-d67e-44d2-bed0-4b804e6fc6c3", + "metadata": {}, + "source": [ + "# Using Token Predictors\n", + "\n", + "Using our token predictors, we can predict the token usage of an operation before actually performing it.\n", + "\n", + "We first show how to predict LLM token usage with the MockLLMPredictor class, see below.\n", + "We then show how to also predict embedding token usage." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f1a9eb90-335c-4214-8bb6-fd1edbe3ccbd", + "metadata": {}, + "outputs": [], + "source": [ + "# My OpenAI Key\n", + "import os\n", + "os.environ['OPENAI_API_KEY'] = \"INSERT OPENAI KEY\"" + ] + }, + { + "cell_type": "markdown", + "id": "8a707fa6-d79e-4343-92fd-d0fadb25c466", + "metadata": {}, + "source": [ + "## Using MockLLMPredictor" + ] + }, + { + "cell_type": "markdown", + "id": "be3f7baa-1c0a-430b-981b-83ddca9e71f2", + "metadata": { + "tags": [] + }, + "source": [ + "#### Predicting Usage of GPT Tree Index\n", + "\n", + "Here we predict usage of GPTTreeIndex during index construction and querying, without making any LLM calls.\n", + "\n", + "NOTE: Predicting query usage before tree is built is only possible with GPTTreeIndex due to the nature of tree traversal. Results will be more accurate if GPTTreeIndex is actually built beforehand." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "c0ef16d1-45ef-43ec-9aad-4e44e9bb8578", + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index import GPTTreeIndex, MockLLMPredictor, SimpleDirectoryReader, ServiceContext" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "b2ecdadc-1403-4bd4-a876-f80e4da911ef", + "metadata": {}, + "outputs": [], + "source": [ + "documents = SimpleDirectoryReader('../paul_graham_essay/data').load_data()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "11056808-fd7f-4bc6-9348-0605fb4ee668", + "metadata": {}, + "outputs": [], + "source": [ + "llm_predictor = MockLLMPredictor(max_tokens=256)\n", + "service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ea4ba66-9a09-4478-b0a8-dee8645fa4e3", + "metadata": {}, + "outputs": [], + "source": [ + "index = GPTTreeIndex.from_documents(documents, service_context=service_context)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "345433c2-5553-4645-a513-0186b771a21f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "19495\n" + ] + } + ], + "source": [ + "print(llm_predictor.last_token_usage)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f43733ae-af35-46e6-99d9-8ba507acbb0d", + "metadata": {}, + "outputs": [], + "source": [ + "# default query\n", + "response = index.query(\"What did the author do growing up?\", service_context=service_context)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "4ba19751-da2d-46af-9f8f-4f42871e65a0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "5493\n" + ] + } + ], + "source": [ + "print(llm_predictor.last_token_usage)" + ] + }, + { + "cell_type": "markdown", + "id": "4324d85b-ae80-48ab-baf0-7dc160dfae46", + "metadata": {}, + "source": 
[ + "#### Predicting Usage of GPT Keyword Table Index Query\n", + "\n", + "Here we build a real keyword table index over the data, but then predict query usage." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "10447805-38db-41b9-a2c6-b0c95437b276", + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index import GPTKeywordTableIndex, MockLLMPredictor, SimpleDirectoryReader" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "8ca76e72-5f43-47c1-a9a4-c5c5db4f0f21", + "metadata": {}, + "outputs": [], + "source": [ + "documents = SimpleDirectoryReader('../paul_graham_essay/data').load_data()\n", + "index = GPTKeywordTableIndex.load_from_disk('../paul_graham_essay/index_table.json')" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "61f48870-65d2-4b23-b57e-79082ecb4ab2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "start token ct: 0\n", + "> Starting query: What did the author do after his time at Y Combinator?\n", + "query keywords: ['author', 'did', 'y', 'combinator', 'after', 'his', 'the', 'what', 'time', 'at', 'do']\n", + "Extracted keywords: ['combinator']\n", + "> Querying with idx: 3483810247393006047: of 2016 we moved to England. 
We wanted our kids...\n", + "> Querying with idx: 7597483754542696814: people edit code on our server through the brow...\n", + "> Querying with idx: 7572417251450701751: invited about 20 of the 225 groups to interview...\n", + "end token ct: 11313\n", + "> [query] Total token usage: 11313 tokens\n", + "11313\n" + ] + } + ], + "source": [ + "llm_predictor = MockLLMPredictor(max_tokens=256)\n", + "service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)\n", + "response = index.query(\"What did the author do after his time at Y Combinator?\", service_context=service_context)\n", + "print(llm_predictor.last_token_usage)" + ] + }, + { + "cell_type": "markdown", + "id": "0fee4405-05e0-46c2-87bb-64ec63a4c6c1", + "metadata": {}, + "source": [ + "#### Predicting Usage of GPT List Index Query\n", + "\n", + "Here we build a real list index over the data, but then predict query usage." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "267f2213-67d1-4241-b73f-f1790661d06b", + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index import GPTListIndex, MockLLMPredictor, SimpleDirectoryReader" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "d553a8b1-7045-4756-9729-df84bd305279", + "metadata": {}, + "outputs": [], + "source": [ + "documents = SimpleDirectoryReader('../paul_graham_essay/data').load_data()\n", + "index = GPTListIndex.load_from_disk('../paul_graham_essay/index_list.json')" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "69c99c68-6a23-48ed-aa41-e7af50fef2f3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "start token ct: 0\n", + "> Starting query: What did the author do after his time at Y Combinator?\n", + "end token ct: 23941\n", + "> [query] Total token usage: 23941 tokens\n" + ] + } + ], + "source": [ + "llm_predictor = MockLLMPredictor(max_tokens=256)\n", + "service_context = 
ServiceContext.from_defaults(llm_predictor=llm_predictor)\n", + "response = index.query(\"What did the author do after his time at Y Combinator?\", service_context=service_context)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "e8422c5c-af68-4138-a8dd-f6e8d7208c4c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "23941\n" + ] + } + ], + "source": [ + "print(llm_predictor.last_token_usage)" + ] + }, + { + "cell_type": "markdown", + "id": "1e19cf61-6d6a-4dfa-af78-1ce184f41c6c", + "metadata": {}, + "source": [ + "## Using MockEmbedding" + ] + }, + { + "cell_type": "markdown", + "id": "106d86bf-7725-40bc-84ba-4f273493d3f6", + "metadata": {}, + "source": [ + "#### Predicting Usage of GPT Simple Vector Index" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "9baf0fe7-2c11-4233-a930-4e593433ba84", + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index import GPTSimpleVectorIndex, MockLLMPredictor, MockEmbedding, SimpleDirectoryReader" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "97023361-fa47-4008-b8d7-e66d60c5b263", + "metadata": {}, + "outputs": [], + "source": [ + "documents = SimpleDirectoryReader('../paul_graham_essay/data').load_data()\n", + "index = GPTSimpleVectorIndex.load_from_disk('../paul_graham_essay/index_simple_vec.json')" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "63ebe021-2b9c-4024-95f8-56cd9e7e7c47", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> [query] Total LLM token usage: 4374 tokens\n", + "> [query] Total embedding token usage: 14 tokens\n" + ] + } + ], + "source": [ + "llm_predictor = MockLLMPredictor(max_tokens=256)\n", + "embed_model = MockEmbedding(embed_dim=1536)\n", + "service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, embed_model=embed_model)\n", + "response = index.query(\n", + " \"What did the author do 
after his time at Y Combinator?\",\n", + " service_context=service_context,\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/examples/data_connectors/DatabaseReaderDemo.ipynb b/examples/data_connectors/DatabaseReaderDemo.ipynb index 5cb887f0f5..c1816801e6 100644 --- a/examples/data_connectors/DatabaseReaderDemo.ipynb +++ b/examples/data_connectors/DatabaseReaderDemo.ipynb @@ -1,202 +1,209 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import logging\n", - "import sys\n", - "\n", - "logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n", - "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from __future__ import absolute_import\n", - "\n", - "# My OpenAI Key\n", - "import os\n", - "os.environ['OPENAI_API_KEY'] = \"\"\n", - "\n", - "from llama_index.readers.database import DatabaseReader\n", - "from llama_index import GPTSimpleVectorIndex" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Initialize DatabaseReader object with the following parameters:\n", - "\n", - "db = DatabaseReader(\n", - " scheme = \"postgresql\", # Database Scheme\n", - " host = \"localhost\", # Database Host\n", - " port = \"5432\", # Database Port\n", - " user = \"postgres\", # Database User\n", - " password = \"FakeExamplePassword\", # Database Password\n", - " dbname = \"postgres\", # Database 
Name\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "### DatabaseReader class ###\n", - "# db is an instance of DatabaseReader:\n", - "print(type(db))\n", - "# DatabaseReader available method:\n", - "print(type(db.load_data))\n", - "\n", - "### SQLDatabase class ###\n", - "# db.sql is an instance of SQLDatabase:\n", - "print(type(db.sql_database))\n", - "# SQLDatabase available methods:\n", - "print(type(db.sql_database.from_uri))\n", - "print(type(db.sql_database.get_single_table_info))\n", - "print(type(db.sql_database.get_table_columns))\n", - "print(type(db.sql_database.get_table_info))\n", - "print(type(db.sql_database.get_table_names))\n", - "print(type(db.sql_database.insert_into_table))\n", - "print(type(db.sql_database.run))\n", - "print(type(db.sql_database.run_sql))\n", - "# SQLDatabase available properties:\n", - "print(type(db.sql_database.dialect))\n", - "print(type(db.sql_database.engine))\n", - "print(type(db.sql_database.table_info))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "### Testing DatabaseReader\n", - "### from SQLDatabase, SQLAlchemy engine and Database URI:\n", - "\n", - "# From SQLDatabase instance:\n", - "print(type(db.sql_database))\n", - "db_from_sql_database = DatabaseReader(sql_database = db.sql_database)\n", - "print(type(db_from_sql_database))\n", - "\n", - "# From SQLAlchemy engine:\n", - "print(type(db.sql_database.engine))\n", - "db_from_engine = DatabaseReader(engine = db.sql_database.engine)\n", - "print(type(db_from_engine))\n", - "\n", - "# From Database URI:\n", - "print(type(db.uri))\n", - "db_from_uri = DatabaseReader(uri = db.uri)\n", - "print(type(db_from_uri))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# The below SQL Query example returns a list values of each row\n", - "# with concatenated text 
from the name and age columns\n", - "# from the users table where the age is greater than or equal to 18\n", - "\n", - "query = f\"\"\"\n", - " SELECT\n", - " CONCAT(name, ' is ', age, ' years old.') AS text\n", - " FROM public.users\n", - " WHERE age >= 18\n", - " \"\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Please refer to llama_index.langchain_helpers.sql_wrapper\n", - "# SQLDatabase.run_sql method\n", - "texts = db.sql_database.run_sql(command = query)\n", - "\n", - "# Display type(texts) and texts\n", - "# type(texts) must return <class 'list'>\n", - "print(type(texts))\n", - "\n", - "# Documents must return a list of Tuple objects\n", - "print(texts)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Please refer to llama_index.readers.database.DatabaseReader.load_data\n", - "# DatabaseReader.load_data method\n", - "documents = db.load_data(query = query)\n", - "\n", - "# Display type(documents) and documents\n", - "# type(documents) must return <class 'list'>\n", - "print(type(documents))\n", - "\n", - "# Documents must return a list of Document objects\n", - "print(documents)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "try:\n", - " # Try to load existing Index from disk\n", - " index = GPTSimpleVectorIndex.load_from_disk('index.json')\n", - "except:\n", - " index = GPTSimpleVectorIndex.from_documents(documents)\n", - "\n", - " # Save newly created Index to disk\n", - " index.save_to_disk('index.json')" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - 
"pygments_lexer": "ipython3", - "version": "3.11.1" - }, - "vscode": { - "interpreter": { - "hash": "bd5508c2ffc7f17f7d31cf4086cc872f89e96996a08987e995649e5fbe85a3a4" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Database Reader Demo" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "import sys\n", + "\n", + "logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n", + "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from __future__ import absolute_import\n", + "\n", + "# My OpenAI Key\n", + "import os\n", + "os.environ['OPENAI_API_KEY'] = \"\"\n", + "\n", + "from llama_index.readers.database import DatabaseReader\n", + "from llama_index import GPTSimpleVectorIndex" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize DatabaseReader object with the following parameters:\n", + "\n", + "db = DatabaseReader(\n", + " scheme = \"postgresql\", # Database Scheme\n", + " host = \"localhost\", # Database Host\n", + " port = \"5432\", # Database Port\n", + " user = \"postgres\", # Database User\n", + " password = \"FakeExamplePassword\", # Database Password\n", + " dbname = \"postgres\", # Database Name\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "### DatabaseReader class ###\n", + "# db is an instance of DatabaseReader:\n", + "print(type(db))\n", + "# DatabaseReader available method:\n", + "print(type(db.load_data))\n", + "\n", + "### SQLDatabase class ###\n", + "# db.sql is an instance of SQLDatabase:\n", + "print(type(db.sql_database))\n", + "# SQLDatabase available 
methods:\n", + "print(type(db.sql_database.from_uri))\n", + "print(type(db.sql_database.get_single_table_info))\n", + "print(type(db.sql_database.get_table_columns))\n", + "print(type(db.sql_database.get_table_info))\n", + "print(type(db.sql_database.get_table_names))\n", + "print(type(db.sql_database.insert_into_table))\n", + "print(type(db.sql_database.run))\n", + "print(type(db.sql_database.run_sql))\n", + "# SQLDatabase available properties:\n", + "print(type(db.sql_database.dialect))\n", + "print(type(db.sql_database.engine))\n", + "print(type(db.sql_database.table_info))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "### Testing DatabaseReader\n", + "### from SQLDatabase, SQLAlchemy engine and Database URI:\n", + "\n", + "# From SQLDatabase instance:\n", + "print(type(db.sql_database))\n", + "db_from_sql_database = DatabaseReader(sql_database = db.sql_database)\n", + "print(type(db_from_sql_database))\n", + "\n", + "# From SQLAlchemy engine:\n", + "print(type(db.sql_database.engine))\n", + "db_from_engine = DatabaseReader(engine = db.sql_database.engine)\n", + "print(type(db_from_engine))\n", + "\n", + "# From Database URI:\n", + "print(type(db.uri))\n", + "db_from_uri = DatabaseReader(uri = db.uri)\n", + "print(type(db_from_uri))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# The below SQL Query example returns a list of values, one per row,\n", + "# with concatenated text from the name and age columns\n", + "# from the users table where the age is greater than or equal to 18\n", + "\n", + "query = f\"\"\"\n", + " SELECT\n", + " CONCAT(name, ' is ', age, ' years old.') AS text\n", + " FROM public.users\n", + " WHERE age >= 18\n", + " \"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Please refer to llama_index.langchain_helpers.sql_wrapper\n", + "# 
SQLDatabase.run_sql method\n", + "texts = db.sql_database.run_sql(command = query)\n", + "\n", + "# Display type(texts) and texts\n", + "# type(texts) must return <class 'list'>\n", + "print(type(texts))\n", + "\n", + "# texts must be a list of Tuple objects\n", + "print(texts)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Please refer to llama_index.readers.database.DatabaseReader.load_data\n", + "# DatabaseReader.load_data method\n", + "documents = db.load_data(query = query)\n", + "\n", + "# Display type(documents) and documents\n", + "# type(documents) must return <class 'list'>\n", + "print(type(documents))\n", + "\n", + "# Documents must return a list of Document objects\n", + "print(documents)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " # Try to load existing Index from disk\n", + " index = GPTSimpleVectorIndex.load_from_disk('index.json')\n", + "except:\n", + " index = GPTSimpleVectorIndex.from_documents(documents)\n", + "\n", + " # Save newly created Index to disk\n", + " index.save_to_disk('index.json')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + }, + "vscode": { + "interpreter": { + "hash": "bd5508c2ffc7f17f7d31cf4086cc872f89e96996a08987e995649e5fbe85a3a4" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/data_connectors/GithubRepositoryReaderDemo.ipynb b/examples/data_connectors/GithubRepositoryReaderDemo.ipynb index 389becf839..0424499940 100644 --- a/examples/data_connectors/GithubRepositoryReaderDemo.ipynb +++ 
b/examples/data_connectors/GithubRepositoryReaderDemo.ipynb @@ -1,110 +1,116 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# This is due to the fact that we use asyncio.loop_until_complete in\n", - "# the DiscordReader. Since the Jupyter kernel itself runs on\n", - "# an event loop, we need to add some help with nesting\n", - "!pip install nest_asyncio httpx\n", - "import nest_asyncio\n", - "nest_asyncio.apply()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%env OPENAI_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\n", - "from llama_index import GPTSimpleVectorIndex, GithubRepositoryReader\n", - "from IPython.display import Markdown, display\n", - "import os" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%env GITHUB_TOKEN=github_pat_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\n", - "github_token = os.environ.get(\"GITHUB_TOKEN\")\n", - "owner = \"jerryjliu\"\n", - "repo = \"llama_index\"\n", - "branch = \"main\"\n", - "\n", - "documents = GithubRepositoryReader(\n", - " github_token=github_token,\n", - " owner=owner,\n", - " repo=repo,\n", - " use_parser=False,\n", - " verbose=False,\n", - ").load_data(branch=branch)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "index = GPTSimpleVectorIndex.from_documents(documents)\n", - "index.save_to_disk(\"github_index.json\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# import time\n", - "# for document in documents:\n", - "# print(document.extra_info)\n", - "# time.sleep(.25) \n", - "response = index.query(\"What is the difference between GPTSimpleVectorIndex and GPTListIndex?\", verbose=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - 
"metadata": {}, - "outputs": [], - "source": [ - "display(Markdown(f\"<b>{response}</b>\"))" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "gpt_index-github-reader", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.0" - }, - "orig_nbformat": 4, - "vscode": { - "interpreter": { - "hash": "5bc2ab08ee48b6366504a28e3231c27a37c154a347ee8ac6184b716eff7bdbcd" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Github Repository Reader Demo" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# This is due to the fact that we use asyncio.loop_until_complete in\n", + "# the DiscordReader. Since the Jupyter kernel itself runs on\n", + "# an event loop, we need to add some help with nesting\n", + "!pip install nest_asyncio\n", + "import nest_asyncio\n", + "nest_asyncio.apply()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%env OPENAI_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\n", + "from llama_index import GPTSimpleVectorIndex, GithubRepositoryReader\n", + "from IPython.display import Markdown, display\n", + "import os" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%env GITHUB_TOKEN=github_pat_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\n", + "github_token = os.environ.get(\"GITHUB_TOKEN\")\n", + "owner = \"jerryjliu\"\n", + "repo = \"gpt_index\"\n", + "branch = \"main\"\n", + "\n", + "documents = GithubRepositoryReader(\n", + " github_token=github_token,\n", + " owner=owner,\n", + " repo=repo,\n", + " 
use_parser=False,\n", + " verbose=False,\n", + ").load_data(branch=branch)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "index = GPTSimpleVectorIndex.from_documents(documents)\n", + "index.save_to_disk(\"github_index.json\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# import time\n", + "# for document in documents:\n", + "# print(document.extra_info)\n", + "# time.sleep(.25) \n", + "response = index.query(\"What is the difference between GPTSimpleVectorIndex and GPTListIndex?\", verbose=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "display(Markdown(f\"<b>{response}</b>\"))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + }, + "vscode": { + "interpreter": { + "hash": "5bc2ab08ee48b6366504a28e3231c27a37c154a347ee8ac6184b716eff7bdbcd" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 } diff --git a/examples/data_connectors/MboxReaderDemo.ipynb b/examples/data_connectors/MboxReaderDemo.ipynb index 5285057c08..33ebc68f54 100644 --- a/examples/data_connectors/MboxReaderDemo.ipynb +++ b/examples/data_connectors/MboxReaderDemo.ipynb @@ -1,102 +1,108 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%env OPENAI_API_KEY=sk-************" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "from llama_index import MboxReader, GPTSimpleVectorIndex" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "documents = MboxReader().load_data('mbox_data_dir', max_count=1000) # Returns list of documents " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "index = GPTSimpleVectorIndex.from_documents(documents) # Initialize index with documents" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> [query] Total LLM token usage: 100 tokens\n", - "> [query] Total embedding token usage: 10 tokens\n" - ] - } - ], - "source": [ - "res = index.query('When did i have that call with the London office?')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> There is a call scheduled with the London office at 12am GMT on the 10th of February." - ] - } - ], - "source": [ - "res.response" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.8 (main, Oct 13 2022, 09:48:40) [Clang 14.0.0 (clang-1400.0.29.102)]" - }, - "orig_nbformat": 4, - "vscode": { - "interpreter": { - "hash": "7dd9b00487715d9ffc85f7f860a0013e7a0542b27fc53d2b1d33405d7679eac1" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Mbox Reader Demo" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%env OPENAI_API_KEY=sk-************" + ] + }, + { + "cell_type": "code", + 
"execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index import MboxReader, GPTSimpleVectorIndex" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "documents = MboxReader().load_data('mbox_data_dir', max_count=1000) # Returns list of documents " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "index = GPTSimpleVectorIndex.from_documents(documents) # Initialize index with documents" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> [query] Total LLM token usage: 100 tokens\n", + "> [query] Total embedding token usage: 10 tokens\n" + ] + } + ], + "source": [ + "res = index.query('When did i have that call with the London office?')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> There is a call scheduled with the London office at 12am GMT on the 10th of February." 
+ ] + } + ], + "source": [ + "res.response" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + }, + "vscode": { + "interpreter": { + "hash": "7dd9b00487715d9ffc85f7f860a0013e7a0542b27fc53d2b1d33405d7679eac1" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/data_connectors/ObsidianReaderDemo.ipynb b/examples/data_connectors/ObsidianReaderDemo.ipynb index 951a918f75..449427eb41 100644 --- a/examples/data_connectors/ObsidianReaderDemo.ipynb +++ b/examples/data_connectors/ObsidianReaderDemo.ipynb @@ -1,135 +1,142 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%env OPENAI_API_KEY=sk-************" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import logging\n", - "import sys\n", - "\n", - "logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n", - "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "from llama_index import ObsidianReader, GPTSimpleVectorIndex" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "documents = ObsidianReader('/Users/hursh/vault').load_data() # Returns list of documents " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "index = GPTSimpleVectorIndex.from_documents(documents) # Initialize index with documents" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - 
"metadata": {}, - "outputs": [], - "source": [ - "# index.save_to_disk('index.json')\n", - "index = GPTSimpleVectorIndex.load_from_disk('index.json')" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> [query] Total LLM token usage: 920 tokens\n", - "> [query] Total embedding token usage: 7 tokens\n" - ] - } - ], - "source": [ - "# set Logging to DEBUG for more detailed outputs\n", - "res = index.query('What is the meaning of life?')" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'\\nThe meaning of life is subjective and can vary from person to person. It is ultimately up to each individual to decide what they believe is the purpose and value of life. Some may find meaning in their faith, while others may find it in their relationships, work, or hobbies. Ultimately, it is up to each individual to decide what brings them joy and fulfillment and to pursue that path.'" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "res.response" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.1" - }, - "vscode": { - "interpreter": { - "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Obsidian Reader Demo" + 
] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%env OPENAI_API_KEY=sk-************" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "import sys\n", + "\n", + "logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n", + "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index import ObsidianReader, GPTSimpleVectorIndex" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "documents = ObsidianReader('/Users/hursh/vault').load_data() # Returns list of documents " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "index = GPTSimpleVectorIndex.from_documents(documents) # Initialize index with documents" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# index.save_to_disk('index.json')\n", + "index = GPTSimpleVectorIndex.load_from_disk('index.json')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> [query] Total LLM token usage: 920 tokens\n", + "> [query] Total embedding token usage: 7 tokens\n" + ] + } + ], + "source": [ + "# set Logging to DEBUG for more detailed outputs\n", + "res = index.query('What is the meaning of life?')" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'\\nThe meaning of life is subjective and can vary from person to person. It is ultimately up to each individual to decide what they believe is the purpose and value of life. 
Some may find meaning in their faith, while others may find it in their relationships, work, or hobbies. Ultimately, it is up to each individual to decide what brings them joy and fulfillment and to pursue that path.'" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "res.response" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + }, + "vscode": { + "interpreter": { + "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/data_connectors/TwitterDemo.ipynb b/examples/data_connectors/TwitterDemo.ipynb index 06a9d41384..2828e6ee3f 100644 --- a/examples/data_connectors/TwitterDemo.ipynb +++ b/examples/data_connectors/TwitterDemo.ipynb @@ -1,105 +1,113 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "367a6eae", - "metadata": {}, - "outputs": [], - "source": [ - "import logging\n", - "import sys\n", - "\n", - "logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n", - "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "21d03e9b-8a47-45b2-ab27-295b7397ecad", - "metadata": {}, - "outputs": [], - "source": [ - "from llama_index import GPTSimpleVectorIndex, TwitterTweetReader\n", - "from IPython.display import Markdown, display\n", - "import os" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": 
"ef5d2334-9661-4648-a823-a335ea277826", - "metadata": {}, - "outputs": [], - "source": [ - "# create an app in https://developer.twitter.com/en/apps\n", - "BEARER_TOKEN = \"<bearer_token>\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1d6a1153-1383-4aaf-b39d-72c1fc9cc428", - "metadata": {}, - "outputs": [], - "source": [ - "# create reader, specify twitter handles\n", - "reader = TwitterTweetReader(BEARER_TOKEN)\n", - "documents = reader.load_data([\"@twitter_handle1\"])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ca319024-88e7-424f-b1d8-4daa06c6bc6a", - "metadata": {}, - "outputs": [], - "source": [ - "index = GPTSimpleVectorIndex.from_documents(documents)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "78680a17-9088-419e-97cf-ac3d5783a709", - "metadata": {}, - "outputs": [], - "source": [ - "# set Logging to DEBUG for more detailed outputs\n", - "response = index.query(\"<query_text>\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2f0f92a7-cdd9-478f-9765-0a122d6e8508", - "metadata": {}, - "outputs": [], - "source": [ - "display(Markdown(f\"<b>{response}</b>\"))" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.1" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file + "cells": [ + { + "cell_type": "markdown", + "id": "f5f652cc", + "metadata": {}, + "source": [ + "# Twitter Reader Demo" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "367a6eae", + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "import sys\n", + "\n", + 
"logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n", + "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "21d03e9b-8a47-45b2-ab27-295b7397ecad", + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index import GPTSimpleVectorIndex, TwitterTweetReader\n", + "from IPython.display import Markdown, display\n", + "import os" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ef5d2334-9661-4648-a823-a335ea277826", + "metadata": {}, + "outputs": [], + "source": [ + "# create an app in https://developer.twitter.com/en/apps\n", + "BEARER_TOKEN = \"<bearer_token>\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1d6a1153-1383-4aaf-b39d-72c1fc9cc428", + "metadata": {}, + "outputs": [], + "source": [ + "# create reader, specify twitter handles\n", + "reader = TwitterTweetReader(BEARER_TOKEN)\n", + "documents = reader.load_data([\"@twitter_handle1\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ca319024-88e7-424f-b1d8-4daa06c6bc6a", + "metadata": {}, + "outputs": [], + "source": [ + "index = GPTSimpleVectorIndex.from_documents(documents)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "78680a17-9088-419e-97cf-ac3d5783a709", + "metadata": {}, + "outputs": [], + "source": [ + "# set Logging to DEBUG for more detailed outputs\n", + "response = index.query(\"<query_text>\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2f0f92a7-cdd9-478f-9765-0a122d6e8508", + "metadata": {}, + "outputs": [], + "source": [ + "display(Markdown(f\"<b>{response}</b>\"))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": 
"python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/data_connectors/WeaviateDemo.ipynb b/examples/data_connectors/WeaviateDemo.ipynb index b36751549b..1c163471d6 100644 --- a/examples/data_connectors/WeaviateDemo.ipynb +++ b/examples/data_connectors/WeaviateDemo.ipynb @@ -1,177 +1,177 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "36e7bb96-0c27-47e9-a525-c11f40be3b86", - "metadata": {}, - "source": [ - "# Weaviate" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "38ca1434", - "metadata": {}, - "outputs": [], - "source": [ - "import logging\n", - "import sys\n", - "\n", - "logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n", - "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "d99bc57b-85df-46ac-8262-2409344af428", - "metadata": {}, - "outputs": [], - "source": [ - "import weaviate\n", - "from llama_index.readers.weaviate import WeaviateReader" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fec36c7a-3766-4167-890e-b93adb831a64", - "metadata": {}, - "outputs": [], - "source": [ - "# See https://weaviate.io/developers/weaviate/current/client-libraries/python.html\n", - "# for more details on authentication\n", - "resource_owner_config = weaviate.AuthClientPassword(\n", - " username = \"<username>\", \n", - " password = \"<password>\", \n", - ")\n", - "\n", - "# initialize reader\n", - "reader = WeaviateReader(\"https://<cluster-id>.semi.network/\", auth_client_secret=resource_owner_config)" - ] - }, - { - "cell_type": "markdown", - "id": "ce9f299c-4f0a-4bca-bc90-79848f02b381", - "metadata": {}, - "source": [ - "You have two options for the Weaviate reader: 1) directly specify the class_name and properties, or 2) input the raw graphql_query. Examples are shown below." 
- ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "b92d69a1-d39f-45cf-a136-cb9c2f2f5cdf", - "metadata": {}, - "outputs": [], - "source": [ - "# 1) load data using class_name and properties\n", - "# docs = reader.load_data(\n", - "# class_name=\"Author\", properties=[\"name\", \"description\"], separate_documents=True\n", - "# )\n", - "\n", - "documents = reader.load_data(\n", - " class_name=\"<class_name>\", \n", - " properties=[\"property1\", \"property2\", \"...\"], \n", - " separate_documents=True\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "722b5d47-9897-4c54-9734-259ab0c1634c", - "metadata": {}, - "outputs": [], - "source": [ - "# 2) example GraphQL query\n", - "# query = \"\"\"\n", - "# {\n", - "# Get {\n", - "# Author {\n", - "# name\n", - "# description\n", - "# }\n", - "# }\n", - "# }\n", - "# \"\"\"\n", - "# docs = reader.load_data(graphql_query=query, separate_documents=True)\n", - "\n", - "query = \"\"\"\n", - "{\n", - " Get {\n", - " <class_name> {\n", - " <property1>\n", - " <property2>\n", - " ...\n", - " }\n", - " }\n", - "}\n", - "\"\"\"\n", - "\n", - "documents = reader.load_data(graphql_query=query, separate_documents=True)" - ] - }, - { - "cell_type": "markdown", - "id": "169b4273-eb20-4d06-9ffe-71320f4570f6", - "metadata": {}, - "source": [ - "### Create index" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "92599a0a-93ba-4c93-80f1-9acae0663c34", - "metadata": {}, - "outputs": [], - "source": [ - "index = GPTListIndex.from_documents(documents)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "52d93c3f-a08d-4637-98bc-0c3cc693c563", - "metadata": {}, - "outputs": [], - "source": [ - "# set Logging to DEBUG for more detailed outputs\n", - "response = index.query(\"<query_text>\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "771b42be-4108-43a0-a1b4-b259a7819936", - "metadata": {}, - "outputs": [], - "source": [ - 
"display(Markdown(f\"<b>{response}</b>\"))" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.1" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file + "cells": [ + { + "cell_type": "markdown", + "id": "36e7bb96-0c27-47e9-a525-c11f40be3b86", + "metadata": {}, + "source": [ + "# Weaviate Demo" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "38ca1434", + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "import sys\n", + "\n", + "logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n", + "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "d99bc57b-85df-46ac-8262-2409344af428", + "metadata": {}, + "outputs": [], + "source": [ + "import weaviate\n", + "from llama_index.readers.weaviate import WeaviateReader" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fec36c7a-3766-4167-890e-b93adb831a64", + "metadata": {}, + "outputs": [], + "source": [ + "# See https://weaviate.io/developers/weaviate/current/client-libraries/python.html\n", + "# for more details on authentication\n", + "resource_owner_config = weaviate.AuthClientPassword(\n", + " username = \"<username>\", \n", + " password = \"<password>\", \n", + ")\n", + "\n", + "# initialize reader\n", + "reader = WeaviateReader(\"https://<cluster-id>.semi.network/\", auth_client_secret=resource_owner_config)" + ] + }, + { + "cell_type": "markdown", + "id": "ce9f299c-4f0a-4bca-bc90-79848f02b381", + "metadata": {}, + "source": [ + "You have two options for the Weaviate reader: 1) directly specify the 
class_name and properties, or 2) input the raw graphql_query. Examples are shown below." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "b92d69a1-d39f-45cf-a136-cb9c2f2f5cdf", + "metadata": {}, + "outputs": [], + "source": [ + "# 1) load data using class_name and properties\n", + "# docs = reader.load_data(\n", + "# class_name=\"Author\", properties=[\"name\", \"description\"], separate_documents=True\n", + "# )\n", + "\n", + "documents = reader.load_data(\n", + " class_name=\"<class_name>\", \n", + " properties=[\"property1\", \"property2\", \"...\"], \n", + " separate_documents=True\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "722b5d47-9897-4c54-9734-259ab0c1634c", + "metadata": {}, + "outputs": [], + "source": [ + "# 2) example GraphQL query\n", + "# query = \"\"\"\n", + "# {\n", + "# Get {\n", + "# Author {\n", + "# name\n", + "# description\n", + "# }\n", + "# }\n", + "# }\n", + "# \"\"\"\n", + "# docs = reader.load_data(graphql_query=query, separate_documents=True)\n", + "\n", + "query = \"\"\"\n", + "{\n", + " Get {\n", + " <class_name> {\n", + " <property1>\n", + " <property2>\n", + " ...\n", + " }\n", + " }\n", + "}\n", + "\"\"\"\n", + "\n", + "documents = reader.load_data(graphql_query=query, separate_documents=True)" + ] + }, + { + "cell_type": "markdown", + "id": "169b4273-eb20-4d06-9ffe-71320f4570f6", + "metadata": {}, + "source": [ + "### Create index" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92599a0a-93ba-4c93-80f1-9acae0663c34", + "metadata": {}, + "outputs": [], + "source": [ + "index = GPTListIndex.from_documents(documents)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "52d93c3f-a08d-4637-98bc-0c3cc693c563", + "metadata": {}, + "outputs": [], + "source": [ + "# set Logging to DEBUG for more detailed outputs\n", + "response = index.query(\"<query_text>\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"771b42be-4108-43a0-a1b4-b259a7819936", + "metadata": {}, + "outputs": [], + "source": [ + "display(Markdown(f\"<b>{response}</b>\"))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/evaluation/GuardrailsDemo.ipynb b/examples/evaluation/GuardrailsDemo.ipynb index 78b177a61d..b99a045174 100644 --- a/examples/evaluation/GuardrailsDemo.ipynb +++ b/examples/evaluation/GuardrailsDemo.ipynb @@ -1,341 +1,341 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "9c48213d-6e6a-4c10-838a-2a7c710c3a05", - "metadata": {}, - "source": [ - "# Simple Index Demo" - ] - }, - { - "cell_type": "markdown", - "id": "50d3b817-b70e-4667-be4f-d3a0fe4bd119", - "metadata": {}, - "source": [ - "#### Load documents, build the GPTSimpleVectorIndex" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "690a6918-7c75-4f95-9ccc-d2c4a1fe00d7", - "metadata": {}, - "outputs": [], - "source": [ - "import logging\n", - "import sys\n", - "\n", - "logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n", - "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))\n", - "\n", - "from gpt_index import GPTSimpleVectorIndex, SimpleDirectoryReader\n", - "from IPython.display import Markdown, display" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "03d1691e-544b-454f-825b-5ee12f7faa8a", - "metadata": {}, - "outputs": [], - "source": [ - "# load documents\n", - "documents = SimpleDirectoryReader('../paul_graham_essay/data').load_data()" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "ad144ee7-96da-4dd6-be00-fd6cf0c78e58", - 
"metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "INFO:gpt_index.token_counter.token_counter:> [build_index_from_documents] Total LLM token usage: 0 tokens\n", - "> [build_index_from_documents] Total LLM token usage: 0 tokens\n", - "INFO:gpt_index.token_counter.token_counter:> [build_index_from_documents] Total embedding token usage: 18579 tokens\n", - "> [build_index_from_documents] Total embedding token usage: 18579 tokens\n" - ] - } - ], - "source": [ - "index = GPTSimpleVectorIndex.from_documents(documents, chunk_size_limit=512)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "2bbccf1d-ac39-427c-b3a3-f8e9d1d12348", - "metadata": {}, - "outputs": [], - "source": [ - "# save index to disk\n", - "index.save_to_disk('index_simple.json')" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "197ca78e-1310-474d-91e3-877c3636b901", - "metadata": {}, - "outputs": [], - "source": [ - "# load index from disk\n", - "index = GPTSimpleVectorIndex.load_from_disk('index_simple.json')" - ] - }, - { - "cell_type": "markdown", - "id": "8b7d7c61-b5d7-4b8f-b90b-3ebee1103f27", - "metadata": {}, - "source": [ - "#### Define Query + Guardrails Spec" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "6fb88295-0840-4e2d-b79b-def0b0a63a7f", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from gpt_index.output_parsers import GuardrailsOutputParser\n", - "from gpt_index.llm_predictor import StructuredLLMPredictor" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "057139d2-09e8-4b8d-83a1-a2356a1475a8", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "llm_predictor = StructuredLLMPredictor()" - ] - }, - { - "cell_type": "markdown", - "id": "bc25edf7-9343-4e82-a3f1-eec4281a9371", - "metadata": {}, - "source": [ - "**Define custom QA and Refine Prompts**" - ] - }, - { - "cell_type": "code", - "execution_count": 7, 
- "id": "2833d086-d240-4798-b3c5-a83ac4593b0e", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from gpt_index.prompts.prompts import QuestionAnswerPrompt, RefinePrompt\n", - "from gpt_index.prompts.default_prompts import DEFAULT_TEXT_QA_PROMPT_TMPL, DEFAULT_REFINE_PROMPT_TMPL" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "a4b9201d-fe16-4cc0-8135-a08d9928625d", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# NOTE: we don't need to define the query_str in the rail spec, we can define during query-time.\n", - "rail_spec = (\"\"\"\n", - "<rail version=\"0.1\">\n", - "\n", - "<output>\n", - " <list name=\"points\" description=\"Bullet points regarding events in the author's life.\">\n", - " <object>\n", - " <string name=\"explanation\" format=\"one-line\" on-fail-one-line=\"noop\" />\n", - " <string name=\"explanation2\" format=\"one-line\" on-fail-one-line=\"noop\" />\n", - " <string name=\"explanation3\" format=\"one-line\" on-fail-one-line=\"noop\" />\n", - " </object>\n", - " </list>\n", - "</output>\n", - "\n", - "<prompt>\n", - "\n", - "Query string here.\n", - "\n", - "@xml_prefix_prompt\n", - "\n", - "{output_schema}\n", - "\n", - "@json_suffix_prompt_v2_wo_none\n", - "</prompt>\n", - "</rail>\n", - "\"\"\")" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "f7af4ebf-1dff-48ec-9fb7-8926af45b6a0", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "output_parser = GuardrailsOutputParser.from_rail_string(rail_spec, llm=llm_predictor.llm)" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "a9b440d4-6fb4-46e6-973f-44207b432d3f", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# NOTE: we use the same output parser for both prompts, though you can choose to use different parsers\n", - "# NOTE: here we add formatting instructions to the prompts.\n", - "\n", - "fmt_qa_tmpl = output_parser.format(DEFAULT_TEXT_QA_PROMPT_TMPL)\n", - 
"fmt_refine_tmpl = output_parser.format(DEFAULT_REFINE_PROMPT_TMPL)\n", - "\n", - "qa_prompt = QuestionAnswerPrompt(fmt_qa_tmpl, output_parser=output_parser)\n", - "refine_prompt = RefinePrompt(fmt_refine_tmpl, output_parser=output_parser)" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "1ba18a80-35f4-4fd4-9b13-9f13f84db4fe", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Context information is below. \n", - "---------------------\n", - "{context_str}\n", - "---------------------\n", - "Given the context information and not prior knowledge, answer the question: {query_str}\n", - "\n", - "\n", - "\n", - "Given below is XML that describes the information to extract from this document and the tags to extract it into.\n", - "\n", - "\n", - "<output>\n", - " <list name=\"points\" description=\"Bullet points regarding events in the author's life.\">\n", - " <object>\n", - " <string name=\"explanation\" format=\"one-line\"/>\n", - " <string name=\"explanation2\" format=\"one-line\"/>\n", - " <string name=\"explanation3\" format=\"one-line\"/>\n", - " </object>\n", - " </list>\n", - "</output>\n", - "\n", - "\n", - "\n", - "\n", - "ONLY return a valid JSON object (no other text is necessary). The JSON MUST conform to the XML format, including any types and format requests e.g. requests for lists, objects and specific types. Be correct and concise.\n", - "\n", - "JSON Output:\n", - "\n", - "\n" - ] - } - ], - "source": [ - "# take a look at the new QA template! 
\n", - "print(fmt_qa_tmpl)" - ] - }, - { - "cell_type": "markdown", - "id": "b6caf93b-6345-4c65-a346-a95b0f1746c4", - "metadata": {}, - "source": [ - "#### Query Index" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "fb9cdf43-0f31-4c36-869b-df9fa50aebdb", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "INFO:gpt_index.token_counter.token_counter:> [query] Total LLM token usage: 754 tokens\n", - "> [query] Total LLM token usage: 754 tokens\n", - "INFO:gpt_index.token_counter.token_counter:> [query] Total embedding token usage: 11 tokens\n", - "> [query] Total embedding token usage: 11 tokens\n" - ] - } - ], - "source": [ - "response = index.query(\n", - " \"What are the three items the author did growing up?\", \n", - " text_qa_template=qa_prompt, \n", - " refine_template=refine_prompt, \n", - " llm_predictor=llm_predictor\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "bc7760b6-5be3-4303-b97e-3f5edacf674b", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'points': [{'explanation': 'Writing short stories', 'explanation2': 'Programming on an IBM 1401', 'explanation3': 'Using microcomputers'}]}\n" - ] - } - ], - "source": [ - "print(response)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "llama_index_shreyar", - "language": "python", - "name": "llama_index_shreyar" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.10" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file + "cells": [ + { + "cell_type": "markdown", + "id": "9c48213d-6e6a-4c10-838a-2a7c710c3a05", + "metadata": {}, + "source": [ + "# Simple Index Demo Using Guardrails" + ] + 
}, + { + "cell_type": "markdown", + "id": "50d3b817-b70e-4667-be4f-d3a0fe4bd119", + "metadata": {}, + "source": [ + "#### Load documents, build the GPTSimpleVectorIndex" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "690a6918-7c75-4f95-9ccc-d2c4a1fe00d7", + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "import sys\n", + "\n", + "logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n", + "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))\n", + "\n", + "from gpt_index import GPTSimpleVectorIndex, SimpleDirectoryReader\n", + "from IPython.display import Markdown, display" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "03d1691e-544b-454f-825b-5ee12f7faa8a", + "metadata": {}, + "outputs": [], + "source": [ + "# load documents\n", + "documents = SimpleDirectoryReader('../paul_graham_essay/data').load_data()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "ad144ee7-96da-4dd6-be00-fd6cf0c78e58", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:gpt_index.token_counter.token_counter:> [build_index_from_documents] Total LLM token usage: 0 tokens\n", + "> [build_index_from_documents] Total LLM token usage: 0 tokens\n", + "INFO:gpt_index.token_counter.token_counter:> [build_index_from_documents] Total embedding token usage: 18579 tokens\n", + "> [build_index_from_documents] Total embedding token usage: 18579 tokens\n" + ] + } + ], + "source": [ + "index = GPTSimpleVectorIndex.from_documents(documents, chunk_size_limit=512)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "2bbccf1d-ac39-427c-b3a3-f8e9d1d12348", + "metadata": {}, + "outputs": [], + "source": [ + "# save index to disk\n", + "index.save_to_disk('index_simple.json')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "197ca78e-1310-474d-91e3-877c3636b901", + "metadata": {}, + "outputs": 
[], + "source": [ + "# load index from disk\n", + "index = GPTSimpleVectorIndex.load_from_disk('index_simple.json')" + ] + }, + { + "cell_type": "markdown", + "id": "8b7d7c61-b5d7-4b8f-b90b-3ebee1103f27", + "metadata": {}, + "source": [ + "#### Define Query + Guardrails Spec" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "6fb88295-0840-4e2d-b79b-def0b0a63a7f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from gpt_index.output_parsers import GuardrailsOutputParser\n", + "from gpt_index.llm_predictor import StructuredLLMPredictor" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "057139d2-09e8-4b8d-83a1-a2356a1475a8", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "llm_predictor = StructuredLLMPredictor()" + ] + }, + { + "cell_type": "markdown", + "id": "bc25edf7-9343-4e82-a3f1-eec4281a9371", + "metadata": {}, + "source": [ + "**Define custom QA and Refine Prompts**" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "2833d086-d240-4798-b3c5-a83ac4593b0e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from gpt_index.prompts.prompts import QuestionAnswerPrompt, RefinePrompt\n", + "from gpt_index.prompts.default_prompts import DEFAULT_TEXT_QA_PROMPT_TMPL, DEFAULT_REFINE_PROMPT_TMPL" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "a4b9201d-fe16-4cc0-8135-a08d9928625d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# NOTE: we don't need to define the query_str in the rail spec, we can define during query-time.\n", + "rail_spec = (\"\"\"\n", + "<rail version=\"0.1\">\n", + "\n", + "<output>\n", + " <list name=\"points\" description=\"Bullet points regarding events in the author's life.\">\n", + " <object>\n", + " <string name=\"explanation\" format=\"one-line\" on-fail-one-line=\"noop\" />\n", + " <string name=\"explanation2\" format=\"one-line\" on-fail-one-line=\"noop\" />\n", + " <string 
name=\"explanation3\" format=\"one-line\" on-fail-one-line=\"noop\" />\n", + " </object>\n", + " </list>\n", + "</output>\n", + "\n", + "<prompt>\n", + "\n", + "Query string here.\n", + "\n", + "@xml_prefix_prompt\n", + "\n", + "{output_schema}\n", + "\n", + "@json_suffix_prompt_v2_wo_none\n", + "</prompt>\n", + "</rail>\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "f7af4ebf-1dff-48ec-9fb7-8926af45b6a0", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "output_parser = GuardrailsOutputParser.from_rail_string(rail_spec, llm=llm_predictor.llm)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "a9b440d4-6fb4-46e6-973f-44207b432d3f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# NOTE: we use the same output parser for both prompts, though you can choose to use different parsers\n", + "# NOTE: here we add formatting instructions to the prompts.\n", + "\n", + "fmt_qa_tmpl = output_parser.format(DEFAULT_TEXT_QA_PROMPT_TMPL)\n", + "fmt_refine_tmpl = output_parser.format(DEFAULT_REFINE_PROMPT_TMPL)\n", + "\n", + "qa_prompt = QuestionAnswerPrompt(fmt_qa_tmpl, output_parser=output_parser)\n", + "refine_prompt = RefinePrompt(fmt_refine_tmpl, output_parser=output_parser)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "1ba18a80-35f4-4fd4-9b13-9f13f84db4fe", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Context information is below. 
\n", + "---------------------\n", + "{context_str}\n", + "---------------------\n", + "Given the context information and not prior knowledge, answer the question: {query_str}\n", + "\n", + "\n", + "\n", + "Given below is XML that describes the information to extract from this document and the tags to extract it into.\n", + "\n", + "\n", + "<output>\n", + " <list name=\"points\" description=\"Bullet points regarding events in the author's life.\">\n", + " <object>\n", + " <string name=\"explanation\" format=\"one-line\"/>\n", + " <string name=\"explanation2\" format=\"one-line\"/>\n", + " <string name=\"explanation3\" format=\"one-line\"/>\n", + " </object>\n", + " </list>\n", + "</output>\n", + "\n", + "\n", + "\n", + "\n", + "ONLY return a valid JSON object (no other text is necessary). The JSON MUST conform to the XML format, including any types and format requests e.g. requests for lists, objects and specific types. Be correct and concise.\n", + "\n", + "JSON Output:\n", + "\n", + "\n" + ] + } + ], + "source": [ + "# take a look at the new QA template! 
\n", + "print(fmt_qa_tmpl)" + ] + }, + { + "cell_type": "markdown", + "id": "b6caf93b-6345-4c65-a346-a95b0f1746c4", + "metadata": {}, + "source": [ + "#### Query Index" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "fb9cdf43-0f31-4c36-869b-df9fa50aebdb", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:gpt_index.token_counter.token_counter:> [query] Total LLM token usage: 754 tokens\n", + "> [query] Total LLM token usage: 754 tokens\n", + "INFO:gpt_index.token_counter.token_counter:> [query] Total embedding token usage: 11 tokens\n", + "> [query] Total embedding token usage: 11 tokens\n" + ] + } + ], + "source": [ + "response = index.query(\n", + " \"What are the three items the author did growing up?\", \n", + " text_qa_template=qa_prompt, \n", + " refine_template=refine_prompt, \n", + " llm_predictor=llm_predictor\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "bc7760b6-5be3-4303-b97e-3f5edacf674b", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'points': [{'explanation': 'Writing short stories', 'explanation2': 'Programming on an IBM 1401', 'explanation3': 'Using microcomputers'}]}\n" + ] + } + ], + "source": [ + "print(response)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/evaluation/LangchainOutputParserDemo.ipynb b/examples/evaluation/LangchainOutputParserDemo.ipynb index 91f6f5a2ff..541de8a864 100644 --- a/examples/evaluation/LangchainOutputParserDemo.ipynb 
+++ b/examples/evaluation/LangchainOutputParserDemo.ipynb @@ -1,333 +1,333 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "9c48213d-6e6a-4c10-838a-2a7c710c3a05", - "metadata": {}, - "source": [ - "# Simple Index Demo" - ] - }, - { - "cell_type": "markdown", - "id": "50d3b817-b70e-4667-be4f-d3a0fe4bd119", - "metadata": {}, - "source": [ - "#### Load documents, build the GPTSimpleVectorIndex" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "690a6918-7c75-4f95-9ccc-d2c4a1fe00d7", - "metadata": {}, - "outputs": [], - "source": [ - "import logging\n", - "import sys\n", - "\n", - "logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n", - "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))\n", - "\n", - "from gpt_index import GPTSimpleVectorIndex, SimpleDirectoryReader\n", - "from IPython.display import Markdown, display" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "03d1691e-544b-454f-825b-5ee12f7faa8a", - "metadata": {}, - "outputs": [], - "source": [ - "# load documents\n", - "documents = SimpleDirectoryReader('../paul_graham_essay/data').load_data()" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "ad144ee7-96da-4dd6-be00-fd6cf0c78e58", - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "INFO:gpt_index.token_counter.token_counter:> [build_index_from_documents] Total LLM token usage: 0 tokens\n", - "> [build_index_from_documents] Total LLM token usage: 0 tokens\n", - "INFO:gpt_index.token_counter.token_counter:> [build_index_from_documents] Total embedding token usage: 18579 tokens\n", - "> [build_index_from_documents] Total embedding token usage: 18579 tokens\n" - ] - } - ], - "source": [ - "index = GPTSimpleVectorIndex.from_documents(documents, chunk_size_limit=512)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "2bbccf1d-ac39-427c-b3a3-f8e9d1d12348", - "metadata": {}, - 
"outputs": [], - "source": [ - "# save index to disk\n", - "index.save_to_disk('index_simple.json')" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "197ca78e-1310-474d-91e3-877c3636b901", - "metadata": {}, - "outputs": [], - "source": [ - "# load index from disk\n", - "index = GPTSimpleVectorIndex.load_from_disk('index_simple.json')" - ] - }, - { - "cell_type": "markdown", - "id": "8b7d7c61-b5d7-4b8f-b90b-3ebee1103f27", - "metadata": {}, - "source": [ - "#### Define Query + Langchain Output Parser" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "6fb88295-0840-4e2d-b79b-def0b0a63a7f", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from gpt_index.output_parsers import LangchainOutputParser\n", - "from gpt_index.llm_predictor import StructuredLLMPredictor\n", - "from langchain.output_parsers import StructuredOutputParser, ResponseSchema" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "057139d2-09e8-4b8d-83a1-a2356a1475a8", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "llm_predictor = StructuredLLMPredictor()" - ] - }, - { - "cell_type": "markdown", - "id": "bc25edf7-9343-4e82-a3f1-eec4281a9371", - "metadata": {}, - "source": [ - "**Define custom QA and Refine Prompts**" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "2833d086-d240-4798-b3c5-a83ac4593b0e", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from gpt_index.prompts.prompts import QuestionAnswerPrompt, RefinePrompt\n", - "from gpt_index.prompts.default_prompts import DEFAULT_TEXT_QA_PROMPT_TMPL, DEFAULT_REFINE_PROMPT_TMPL" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "a4b9201d-fe16-4cc0-8135-a08d9928625d", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "response_schemas = [\n", - " ResponseSchema(name=\"Education\", description=\"Describes the author's educational experience/background.\"),\n", - " 
ResponseSchema(name=\"Work\", description=\"Describes the author's work experience/background.\")\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "e73b87b8-90da-4ab8-9ff7-e40880277d9b", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "lc_output_parser = StructuredOutputParser.from_response_schemas(response_schemas)\n", - "output_parser = LangchainOutputParser(lc_output_parser)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "a9b440d4-6fb4-46e6-973f-44207b432d3f", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# NOTE: we use the same output parser for both prompts, though you can choose to use different parsers\n", - "# NOTE: here we add formatting instructions to the prompts.\n", - "\n", - "fmt_qa_tmpl = output_parser.format(DEFAULT_TEXT_QA_PROMPT_TMPL)\n", - "fmt_refine_tmpl = output_parser.format(DEFAULT_REFINE_PROMPT_TMPL)\n", - "\n", - "qa_prompt = QuestionAnswerPrompt(fmt_qa_tmpl, output_parser=output_parser)\n", - "refine_prompt = RefinePrompt(fmt_refine_tmpl, output_parser=output_parser)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "1ba18a80-35f4-4fd4-9b13-9f13f84db4fe", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Context information is below. \n", - "---------------------\n", - "{context_str}\n", - "---------------------\n", - "Given the context information and not prior knowledge, answer the question: {query_str}\n", - "\n", - "\n", - "The output should be a markdown code snippet formatted in the following schema:\n", - "\n", - "```json\n", - "{{\n", - "\t\"Education\": string // Describes the author's educational experience/background.\n", - "\t\"Work\": string // Describes the author's work experience/background.\n", - "}}\n", - "```\n" - ] - } - ], - "source": [ - "# take a look at the new QA template! 
\n", - "print(fmt_qa_tmpl)" - ] - }, - { - "cell_type": "markdown", - "id": "b6caf93b-6345-4c65-a346-a95b0f1746c4", - "metadata": {}, - "source": [ - "#### Query Index" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "fb9cdf43-0f31-4c36-869b-df9fa50aebdb", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:gpt_index.token_counter.token_counter:> [query] Total LLM token usage: 609 tokens\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> [query] Total LLM token usage: 609 tokens\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:gpt_index.token_counter.token_counter:> [query] Total embedding token usage: 11 tokens\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> [query] Total embedding token usage: 11 tokens\n" - ] - } - ], - "source": [ - "response = index.query(\n", - " \"What are a few things the author did growing up?\", \n", - " text_qa_template=qa_prompt, \n", - " refine_template=refine_prompt, \n", - " llm_predictor=llm_predictor\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "bc7760b6-5be3-4303-b97e-3f5edacf674b", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'Education': 'Before college, the author wrote short stories and experimented with programming on an IBM 1401.', 'Work': 'The author worked on writing and programming outside of school.'}\n" - ] - } - ], - "source": [ - "print(response)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "002a4b5f-51ac-437a-afe7-94e2687737a9", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "llama_index", - "language": "python", - "name": "llama_index" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": 
".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.10" - } + "cells": [ + { + "cell_type": "markdown", + "id": "9c48213d-6e6a-4c10-838a-2a7c710c3a05", + "metadata": {}, + "source": [ + "# Simple Index Demo with Langchain Output Parsers" + ] + }, + { + "cell_type": "markdown", + "id": "50d3b817-b70e-4667-be4f-d3a0fe4bd119", + "metadata": {}, + "source": [ + "#### Load documents, build the GPTSimpleVectorIndex" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "690a6918-7c75-4f95-9ccc-d2c4a1fe00d7", + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "import sys\n", + "\n", + "logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n", + "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))\n", + "\n", + "from gpt_index import GPTSimpleVectorIndex, SimpleDirectoryReader\n", + "from IPython.display import Markdown, display" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "03d1691e-544b-454f-825b-5ee12f7faa8a", + "metadata": {}, + "outputs": [], + "source": [ + "# load documents\n", + "documents = SimpleDirectoryReader('../paul_graham_essay/data').load_data()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "ad144ee7-96da-4dd6-be00-fd6cf0c78e58", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:gpt_index.token_counter.token_counter:> [build_index_from_documents] Total LLM token usage: 0 tokens\n", + "> [build_index_from_documents] Total LLM token usage: 0 tokens\n", + "INFO:gpt_index.token_counter.token_counter:> [build_index_from_documents] Total embedding token usage: 18579 tokens\n", + "> [build_index_from_documents] Total embedding token usage: 18579 tokens\n" + ] + } + ], + "source": [ + "index = GPTSimpleVectorIndex.from_documents(documents, chunk_size_limit=512)" + ] + }, + { + "cell_type": 
"code", + "execution_count": 4, + "id": "2bbccf1d-ac39-427c-b3a3-f8e9d1d12348", + "metadata": {}, + "outputs": [], + "source": [ + "# save index to disk\n", + "index.save_to_disk('index_simple.json')" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "197ca78e-1310-474d-91e3-877c3636b901", + "metadata": {}, + "outputs": [], + "source": [ + "# load index from disk\n", + "index = GPTSimpleVectorIndex.load_from_disk('index_simple.json')" + ] + }, + { + "cell_type": "markdown", + "id": "8b7d7c61-b5d7-4b8f-b90b-3ebee1103f27", + "metadata": {}, + "source": [ + "#### Define Query + Langchain Output Parser" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "6fb88295-0840-4e2d-b79b-def0b0a63a7f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from gpt_index.output_parsers import LangchainOutputParser\n", + "from gpt_index.llm_predictor import StructuredLLMPredictor\n", + "from langchain.output_parsers import StructuredOutputParser, ResponseSchema" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "057139d2-09e8-4b8d-83a1-a2356a1475a8", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "llm_predictor = StructuredLLMPredictor()" + ] + }, + { + "cell_type": "markdown", + "id": "bc25edf7-9343-4e82-a3f1-eec4281a9371", + "metadata": {}, + "source": [ + "**Define custom QA and Refine Prompts**" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "2833d086-d240-4798-b3c5-a83ac4593b0e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from gpt_index.prompts.prompts import QuestionAnswerPrompt, RefinePrompt\n", + "from gpt_index.prompts.default_prompts import DEFAULT_TEXT_QA_PROMPT_TMPL, DEFAULT_REFINE_PROMPT_TMPL" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "a4b9201d-fe16-4cc0-8135-a08d9928625d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "response_schemas = [\n", + " ResponseSchema(name=\"Education\", 
description=\"Describes the author's educational experience/background.\"),\n", + " ResponseSchema(name=\"Work\", description=\"Describes the author's work experience/background.\")\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "e73b87b8-90da-4ab8-9ff7-e40880277d9b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "lc_output_parser = StructuredOutputParser.from_response_schemas(response_schemas)\n", + "output_parser = LangchainOutputParser(lc_output_parser)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "a9b440d4-6fb4-46e6-973f-44207b432d3f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# NOTE: we use the same output parser for both prompts, though you can choose to use different parsers\n", + "# NOTE: here we add formatting instructions to the prompts.\n", + "\n", + "fmt_qa_tmpl = output_parser.format(DEFAULT_TEXT_QA_PROMPT_TMPL)\n", + "fmt_refine_tmpl = output_parser.format(DEFAULT_REFINE_PROMPT_TMPL)\n", + "\n", + "qa_prompt = QuestionAnswerPrompt(fmt_qa_tmpl, output_parser=output_parser)\n", + "refine_prompt = RefinePrompt(fmt_refine_tmpl, output_parser=output_parser)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "1ba18a80-35f4-4fd4-9b13-9f13f84db4fe", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Context information is below. 
\n", + "---------------------\n", + "{context_str}\n", + "---------------------\n", + "Given the context information and not prior knowledge, answer the question: {query_str}\n", + "\n", + "\n", + "The output should be a markdown code snippet formatted in the following schema:\n", + "\n", + "```json\n", + "{{\n", + "\t\"Education\": string // Describes the author's educational experience/background.\n", + "\t\"Work\": string // Describes the author's work experience/background.\n", + "}}\n", + "```\n" + ] + } + ], + "source": [ + "# take a look at the new QA template! \n", + "print(fmt_qa_tmpl)" + ] + }, + { + "cell_type": "markdown", + "id": "b6caf93b-6345-4c65-a346-a95b0f1746c4", + "metadata": {}, + "source": [ + "#### Query Index" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "fb9cdf43-0f31-4c36-869b-df9fa50aebdb", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:gpt_index.token_counter.token_counter:> [query] Total LLM token usage: 609 tokens\n" + ] }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> [query] Total LLM token usage: 609 tokens\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:gpt_index.token_counter.token_counter:> [query] Total embedding token usage: 11 tokens\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> [query] Total embedding token usage: 11 tokens\n" + ] + } + ], + "source": [ + "response = index.query(\n", + " \"What are a few things the author did growing up?\", \n", + " text_qa_template=qa_prompt, \n", + " refine_template=refine_prompt, \n", + " llm_predictor=llm_predictor\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "bc7760b6-5be3-4303-b97e-3f5edacf674b", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + 
"text": [ + "{'Education': 'Before college, the author wrote short stories and experimented with programming on an IBM 1401.', 'Work': 'The author worked on writing and programming outside of school.'}\n" + ] + } + ], + "source": [ + "print(response)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "002a4b5f-51ac-437a-afe7-94e2687737a9", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "llama_index", + "language": "python", + "name": "llama_index" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/evaluation/TestNYC-Evaluation.ipynb b/examples/evaluation/TestNYC-Evaluation.ipynb index 1a15885d7f..f7c3ffa074 100644 --- a/examples/evaluation/TestNYC-Evaluation.ipynb +++ b/examples/evaluation/TestNYC-Evaluation.ipynb @@ -1,5 +1,13 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "040fd65f", + "metadata": {}, + "source": [ + "# NYC Wiki Evaluation" + ] + }, { "cell_type": "code", "execution_count": 1, @@ -501,7 +509,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.10" + "version": "3.11.0" } }, "nbformat": 4, diff --git a/examples/gatsby/TestGatsby.ipynb b/examples/gatsby/TestGatsby.ipynb index b7c5f3be54..1c2b45f8cd 100644 --- a/examples/gatsby/TestGatsby.ipynb +++ b/examples/gatsby/TestGatsby.ipynb @@ -1,184 +1,192 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "ffeb4eee", - "metadata": {}, - "outputs": [], - "source": [ - "import logging\n", - "import sys\n", - "\n", - "logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n", - "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))" - ] - }, - 
{ - "cell_type": "code", - "execution_count": null, - "id": "f1a9eb90-335c-4214-8bb6-fd1edbe3ccbd", - "metadata": {}, - "outputs": [], - "source": [ - "# My OpenAI Key\n", - "import os\n", - "os.environ['OPENAI_API_KEY'] = \"INSERT OPENAI KEY\"" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "8d0b2364-4806-4656-81e7-3f6e4b910b5b", - "metadata": {}, - "outputs": [], - "source": [ - "from llama_index import GPTTreeIndex, SimpleDirectoryReader\n", - "from IPython.display import Markdown, display" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "1298bbb4-c99e-431e-93ef-eb32c0a2fc2a", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> Building index from nodes: 9 chunks\n", - "0/95\n", - "10/95\n", - "20/95\n", - "30/95\n", - "40/95\n", - "50/95\n", - "60/95\n", - "70/95\n", - "80/95\n", - "90/95\n", - "> [build_index_from_documents] Total token usage: 34226 tokens\n" - ] - } - ], - "source": [ - "documents = SimpleDirectoryReader('data').load_data()\n", - "index = GPTTreeIndex.from_documents(documents)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "0b4fe9b6-5762-4e86-b51e-aac45d3ecdb1", - "metadata": {}, - "outputs": [], - "source": [ - "index.save_to_disk('index_gatsby.json')" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "5eec265d-211b-4f26-b05b-5b4e7072bc6e", - "metadata": {}, - "outputs": [], - "source": [ - "# try loading\n", - "new_index = GPTTreeIndex.load_from_disk('index_gatsby.json')" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "68c9ebfe-b1b6-4f4e-9278-174346de8c90", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> Starting query: What did the narrator do after getting back to Chicago?\n", - ">[Level 0] Selected node: [8]/[8]\n", - ">[Level 1] Selected node: [8]/[8]\n", - "> [query] Total token usage: 6058 tokens\n" - ] 
- } - ], - "source": [ - "# set Logging to DEBUG for more detailed outputs\n", - "\n", - "response = new_index.query(\"What did the narrator do after getting back to Chicago?\")" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "91581e60-6051-40ae-bba6-8fa08ffbb728", - "metadata": {}, - "outputs": [ - { - "data": { - "text/markdown": [ - "<b>The narrator returned to his home in Chicago and began calling people to inform them of Gatsby's funeral. He was worried that the funeral would draw a sightseeing crowd and wanted to keep it private. He was relieved when Klipspringer called and promised to tell anyone who might be interested about the funeral. He then asked Klipspringer to commit to attending the funeral, but Klipspringer hesitated.</b>" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "display(Markdown(f\"<b>{response}</b>\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ca10a9c1-9dff-476d-b218-3208a1b8e7f6", - "metadata": {}, - "outputs": [], - "source": [ - "# GPT is confused by the text evidence\n", - "response = new_index.query(\"What did Gatsby do before he met Daisy?\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4fc3f18a-0ef9-453c-acf8-7aedd784cdcf", - "metadata": {}, - "outputs": [], - "source": [ - "display(Markdown(f\"<b>{response}</b>\"))" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.1" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file + "cells": [ + { + "cell_type": "markdown", + "id": "abf838d6", + "metadata": 
{}, + "source": [ + "# Gatsby Test" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ffeb4eee", + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "import sys\n", + "\n", + "logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n", + "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f1a9eb90-335c-4214-8bb6-fd1edbe3ccbd", + "metadata": {}, + "outputs": [], + "source": [ + "# My OpenAI Key\n", + "import os\n", + "os.environ['OPENAI_API_KEY'] = \"INSERT OPENAI KEY\"" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "8d0b2364-4806-4656-81e7-3f6e4b910b5b", + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index import GPTTreeIndex, SimpleDirectoryReader\n", + "from IPython.display import Markdown, display" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "1298bbb4-c99e-431e-93ef-eb32c0a2fc2a", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> Building index from nodes: 9 chunks\n", + "0/95\n", + "10/95\n", + "20/95\n", + "30/95\n", + "40/95\n", + "50/95\n", + "60/95\n", + "70/95\n", + "80/95\n", + "90/95\n", + "> [build_index_from_documents] Total token usage: 34226 tokens\n" + ] + } + ], + "source": [ + "documents = SimpleDirectoryReader('data').load_data()\n", + "index = GPTTreeIndex.from_documents(documents)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "0b4fe9b6-5762-4e86-b51e-aac45d3ecdb1", + "metadata": {}, + "outputs": [], + "source": [ + "index.save_to_disk('index_gatsby.json')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "5eec265d-211b-4f26-b05b-5b4e7072bc6e", + "metadata": {}, + "outputs": [], + "source": [ + "# try loading\n", + "new_index = GPTTreeIndex.load_from_disk('index_gatsby.json')" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + 
"id": "68c9ebfe-b1b6-4f4e-9278-174346de8c90", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> Starting query: What did the narrator do after getting back to Chicago?\n", + ">[Level 0] Selected node: [8]/[8]\n", + ">[Level 1] Selected node: [8]/[8]\n", + "> [query] Total token usage: 6058 tokens\n" + ] + } + ], + "source": [ + "# set Logging to DEBUG for more detailed outputs\n", + "\n", + "response = new_index.query(\"What did the narrator do after getting back to Chicago?\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "91581e60-6051-40ae-bba6-8fa08ffbb728", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "<b>The narrator returned to his home in Chicago and began calling people to inform them of Gatsby's funeral. He was worried that the funeral would draw a sightseeing crowd and wanted to keep it private. He was relieved when Klipspringer called and promised to tell anyone who might be interested about the funeral. 
He then asked Klipspringer to commit to attending the funeral, but Klipspringer hesitated.</b>" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(Markdown(f\"<b>{response}</b>\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ca10a9c1-9dff-476d-b218-3208a1b8e7f6", + "metadata": {}, + "outputs": [], + "source": [ + "# GPT is confused by the text evidence\n", + "response = new_index.query(\"What did Gatsby do before he met Daisy?\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4fc3f18a-0ef9-453c-acf8-7aedd784cdcf", + "metadata": {}, + "outputs": [], + "source": [ + "display(Markdown(f\"<b>{response}</b>\"))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/knowledge_graph/KnowledgeGraphDemo.ipynb b/examples/knowledge_graph/KnowledgeGraphDemo.ipynb index 70ade12dd8..1a7a2c2033 100644 --- a/examples/knowledge_graph/KnowledgeGraphDemo.ipynb +++ b/examples/knowledge_graph/KnowledgeGraphDemo.ipynb @@ -1,5 +1,13 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "45043ecd", + "metadata": {}, + "source": [ + "# Knowledge Graph Demo" + ] + }, { "cell_type": "code", "execution_count": 1, @@ -546,7 +554,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.10" + "version": "3.11.0" } }, "nbformat": 4, diff --git a/examples/langchain_demo/LangchainDemo.ipynb b/examples/langchain_demo/LangchainDemo.ipynb index 2e74d40700..781d922448 100644 --- 
a/examples/langchain_demo/LangchainDemo.ipynb +++ b/examples/langchain_demo/LangchainDemo.ipynb @@ -5,7 +5,7 @@ "id": "3bf01a75-a01b-472e-bc0c-9fe97658eb46", "metadata": {}, "source": [ - "## GPT Index <> Langchain Integrations\n", + "# GPT Index <> Langchain Integrations\n", "\n", "This demo notebook shows how you can provide integrations between GPT Index and Langchain. It provides the following examples:\n", "- Using GPT Index as a callable tool with a Langchain agent\n", diff --git a/examples/multimodal/Multimodal.ipynb b/examples/multimodal/Multimodal.ipynb index 8a42426e2c..8d65da0bc3 100644 --- a/examples/multimodal/Multimodal.ipynb +++ b/examples/multimodal/Multimodal.ipynb @@ -1,594 +1,611 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": 13, - "id": "d4073749", - "metadata": {}, - "outputs": [], - "source": [ - "from gpt_index import SimpleDirectoryReader, GPTSimpleVectorIndex\n", - "from gpt_index.readers.file.base import (\n", - " DEFAULT_FILE_EXTRACTOR, \n", - " ImageParser,\n", - ")\n", - "from gpt_index.response.notebook_utils import (\n", - " display_response, \n", - " display_image,\n", - ")\n", - "from gpt_index.indices.query.query_transform.base import (\n", - " ImageOutputQueryTransform,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "a4b74e87", - "metadata": {}, - "outputs": [], - "source": [ - "# NOTE: By default, image parser converts image into text and discard the original image. 
\n", - "# Here, we explicitly keep both the original image and parsed text in an image document\n", - "image_parser = ImageParser(keep_image=True, parse_text=True)\n", - "file_extractor = DEFAULT_FILE_EXTRACTOR\n", - "file_extractor.update(\n", - "{\n", - " \".jpg\": image_parser,\n", - " \".png\": image_parser,\n", - " \".jpeg\": image_parser,\n", - "})\n", - "\n", - "# NOTE: we add filename as metadata for all documents\n", - "filename_fn = lambda filename: {'file_name': filename}" - ] - }, - { - "cell_type": "markdown", - "id": "ca801c8c", - "metadata": {}, - "source": [ - "# Q&A over Receipt Images" - ] - }, - { - "cell_type": "markdown", - "id": "80cce8e4", - "metadata": {}, - "source": [ - "We first ingest our receipt images with the *custom* `image parser` and `metadata function` defined above. \n", - "This gives us `image documents` instead of only text documents." - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "dbc28f2d", - "metadata": {}, - "outputs": [], - "source": [ - "receipt_reader = SimpleDirectoryReader(\n", - " input_dir='data/receipts', \n", - " file_extractor=file_extractor, \n", - " file_metadata=filename_fn,\n", - ")\n", - "receipt_documents = receipt_reader.load_data()" - ] - }, - { - "cell_type": "markdown", - "id": "12fd6f45", - "metadata": {}, - "source": [ - "We build a simple vector index as usual, but unlike before, our index holds images in addition to text." 
- ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "629cab63", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:root:> [build_index_from_documents] Total LLM token usage: 0 tokens\n", - "INFO:root:> [build_index_from_documents] Total embedding token usage: 2180 tokens\n" - ] - } - ], - "source": [ - "receipts_index = GPTSimpleVectorIndex.from_documents(receipt_documents)" - ] - }, - { - "cell_type": "markdown", - "id": "8fef454f", - "metadata": {}, - "source": [ - "We can now ask a question that prompts for response with both text and image. \n", - "We use a custom query transform `ImageOutputQueryTransform` to add instruction on how to display the image nicely in the notebook." - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "id": "7c078dc0", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:root:> [query] Total LLM token usage: 1005 tokens\n", - "INFO:root:> [query] Total embedding token usage: 30 tokens\n" - ] - } - ], - "source": [ - "receipts_response = receipts_index.query(\n", - " 'When was the last time I went to McDonald\\'s and how much did I spend. \\\n", - " Also show me the receipt from my visit.',\n", - " query_transform=ImageOutputQueryTransform(width=400)\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "05c180ae", - "metadata": {}, - "source": [ - "We now have rich multimodal response with inline text and image! \n", - "\n", - "The source nodes section gives additional details on retrieved data used for synthesizing the final response. \n", - "In this case, we can verify that the receipt for McDonald's is correctly retrieved. " - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "id": "810ad2e9", - "metadata": {}, - "outputs": [ - { - "data": { - "text/markdown": [ - "**`Final Response:`** The last time you went to McDonald's was on 03/10/2018 at 07:39:12 PM and you spent $26.15. 
Here is the receipt from your visit: <img src=\"data/receipts/1100-receipt.jpg\" width=\"400\" />" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "---" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "**`Source Node 1/1`**" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "**Document ID:** 3949d7d1-96bc-4d46-beb6-79d2acbb1fd6<br>**Similarity:** 0.7981321083637717<br>**Text:** file_name: data/receipts/1100-receipt.jpg\n", - "\n", - "<s_menu><s_nm> Story</s_nm><s_num> 16725 Stony Platin ...<br>**Image:**" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=384x512>" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "display_response(receipts_response)" - ] - }, - { - "cell_type": "markdown", - "id": "fa834925", - "metadata": {}, - "source": [ - "# Q & A over LlamaIndex Documentation" - ] - }, - { - "cell_type": "markdown", - "id": "f1a82a54", - "metadata": {}, - "source": [ - "We now demo the same for Q&A over LlamaIndex documentations. \n", - "This demo higlights the ability to synthesize multimodal output with a mixture of text and image documents" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "d5f04295", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Could not find image processor class in the image processor config or the model config. 
Loading based on pattern matching with the model's feature extractor configuration.\n" - ] - } - ], - "source": [ - "llama_reader = SimpleDirectoryReader(\n", - " input_dir='data/llama',\n", - " file_extractor=file_extractor, \n", - " file_metadata=filename_fn,\n", - ")\n", - "llama_documents = llama_reader.load_data(concatenate=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "46db4191", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:root:> [build_index_from_documents] Total LLM token usage: 0 tokens\n", - "INFO:root:> [build_index_from_documents] Total embedding token usage: 965 tokens\n" - ] - } - ], - "source": [ - "llama_index = GPTSimpleVectorIndex.from_documents(llama_documents)" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "4a4cc090", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:root:> [query] Total LLM token usage: 1592 tokens\n", - "INFO:root:> [query] Total embedding token usage: 13 tokens\n" - ] - } - ], - "source": [ - "llama_response = llama_index.query(\n", - " 'Show an image to illustrate how tree index works and explain briefly.', \n", - " query_transform=ImageOutputQueryTransform(width=400),\n", - " similarity_top_k=2\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "559624a6", - "metadata": {}, - "source": [ - "By inspecting the 2 source nodes, we see relevant text and image describing the tree index are retrieved for synthesizing the final multimodal response." - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "5c5721d6", - "metadata": {}, - "outputs": [ - { - "data": { - "text/markdown": [ - "**`Final Response:`** Tree index is a data structure that organizes data in a hierarchical structure. It is often used to store and retrieve data quickly. The image below illustrates how tree index works. 
\n", - "\n", - "<img src=\"data/llama/tree_index.png\" width=\"400\" />\n", - "\n", - "At the top of the tree is the root node, which contains the main data. From the root node, the data is divided into smaller nodes, which are called child nodes. Each child node can have its own child nodes, and so on. To retrieve data, the tree index is traversed from the root node to the desired node. This allows for quick retrieval of data.\n", - "\n", - "In addition, LlamaIndex offers different methods of synthesizing a response from the tree index. The way to toggle this can be found in our Usage Pattern Guide. For example, the \"Create and Refine\" mode is an iterative way of generating a response. We first use the context in the first node, along with the query, to generate an initial answer. We then pass this answer, the query, and the context of the second node as input into a \"refine prompt\" to generate a refined answer. We refine through N-1 nodes, where" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "---" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "**`Source Node 1/2`**" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "**Document ID:** 589f9b15-f3b4-4bfc-b50e-28fb1a2d0173<br>**Similarity:** 0.8151716742235475<br>**Text:** file_name: data/llama/tree_index.png\n", - "\n", - "<s_menu><s_nm> Root Node</s_nm><s_unitprice> Parent</s_nm><...<br>**Image:**" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=512x431>" - ] - 
}, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "---" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "**`Source Node 2/2`**" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "**Document ID:** d60e5289-64c9-4446-91be-afca6c2be723<br>**Similarity:** 0.8133374944584655<br>**Text:** How Each Index Works\n", - "\n", - "This guide describes how each index works with diagrams. We also visually h...<br>" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "display_response(llama_response)" - ] - }, - { - "cell_type": "markdown", - "id": "dbd7376e", - "metadata": {}, - "source": [ - "We show another example asking about vector store index instead." - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "92569825", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:root:> [query] Total LLM token usage: 1567 tokens\n", - "INFO:root:> [query] Total embedding token usage: 14 tokens\n" - ] - } - ], - "source": [ - "llama_response = llama_index.query(\n", - " 'Show an image to illustrate how vector store index works and explain briefly.', \n", - " query_transform=ImageOutputQueryTransform(width=400),\n", - " similarity_top_k=2\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "7cfdd68d", - "metadata": {}, - "outputs": [ - { - "data": { - "text/markdown": [ - "**`Final Response:`** Vector store index is a data structure used to store and retrieve data efficiently. It is a type of hash table that uses a hash function to map keys to their associated values. 
The image below illustrates how vector store index works. \n", - "\n", - "<img src=\"data/llama/vector_store_index.png\" width=\"400\" />\n", - "\n", - "In the image, the keys are represented by the numbers on the left side of the table, and the values are represented by the numbers on the right side. The hash function is used to map the keys to their associated values. The hash function takes the key as input and produces an index, which is used to locate the value in the table.\n", - "\n", - "Vector store index is one of the four index types used by LlamaIndex, a search engine for natural language processing. The other three index types are list index, tree index, and keyword table index. Each index type has its own way of storing and retrieving data. \n", - "\n", - "List index stores Nodes as a sequential chain. During query time, if no other query parameters are specified, LlamaIndex simply loads all Nodes in the list into our Reponse Synthesis module. The list index also offers numerous ways of querying a list" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "---" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "**`Source Node 1/2`**" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "**Document ID:** 80d8d2e9-412f-4b22-bc60-55dd131b73a9<br>**Similarity:** 0.8164705784668993<br>**Text:** file_name: data/llama/vector_store_index.png\n", - "\n", - "<s_menu><s_nm> Nodel</s_nm><s_unitprice> Node2</s_u...<br>**Image:**" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - 
"<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=512x313>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "---" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "**`Source Node 2/2`**" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "**Document ID:** d60e5289-64c9-4446-91be-afca6c2be723<br>**Similarity:** 0.7878578850577496<br>**Text:** How Each Index Works\n", - "\n", - "This guide describes how each index works with diagrams. We also visually h...<br>" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "display_response(llama_response)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.9" - } + "cells": [ + { + "cell_type": "markdown", + "id": "493dd86c", + "metadata": {}, + "source": [ + "# Multimodal Demo" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "d4073749", + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index import SimpleDirectoryReader, GPTSimpleVectorIndex\n", + "from llama_index.readers.file.base import (\n", + " DEFAULT_FILE_EXTRACTOR, \n", + " ImageParser,\n", + ")\n", + "from gpt_index.response.notebook_utils import (\n", + " display_response, \n", + " display_image,\n", + ")\n", + "from gpt_index.indices.query.query_transform.base import (\n", + " ImageOutputQueryTransform,\n", + ")" + 
] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "a4b74e87", + "metadata": {}, + "outputs": [], + "source": [ + "# NOTE: By default, image parser converts image into text and discard the original image. \n", + "# Here, we explicitly keep both the original image and parsed text in an image document\n", + "image_parser = ImageParser(keep_image=True, parse_text=True)\n", + "file_extractor = DEFAULT_FILE_EXTRACTOR\n", + "file_extractor.update(\n", + "{\n", + " \".jpg\": image_parser,\n", + " \".png\": image_parser,\n", + " \".jpeg\": image_parser,\n", + "})\n", + "\n", + "# NOTE: we add filename as metadata for all documents\n", + "filename_fn = lambda filename: {'file_name': filename}" + ] + }, + { + "cell_type": "markdown", + "id": "ca801c8c", + "metadata": {}, + "source": [ + "## Q&A over Receipt Images" + ] + }, + { + "cell_type": "markdown", + "id": "80cce8e4", + "metadata": {}, + "source": [ + "We first ingest our receipt images with the *custom* `image parser` and `metadata function` defined above. \n", + "This gives us `image documents` instead of only text documents." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "dbc28f2d", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.\n" + ] + } + ], + "source": [ + "receipt_reader = SimpleDirectoryReader(\n", + " input_dir='data/receipts', \n", + " file_extractor=file_extractor, \n", + " file_metadata=filename_fn,\n", + ")\n", + "receipt_documents = receipt_reader.load_data()" + ] + }, + { + "cell_type": "markdown", + "id": "12fd6f45", + "metadata": {}, + "source": [ + "We build a simple vector index as usual, but unlike before, our index holds images in addition to text." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "629cab63", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens\n", + "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 2180 tokens\n" + ] + } + ], + "source": [ + "receipts_index = GPTSimpleVectorIndex.from_documents(receipt_documents)" + ] + }, + { + "cell_type": "markdown", + "id": "8fef454f", + "metadata": {}, + "source": [ + "We can now ask a question that prompts for response with both text and image. \n", + "We use a custom query transform `ImageOutputQueryTransform` to add instruction on how to display the image nicely in the notebook." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "7c078dc0", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 1004 tokens\n", + "INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 30 tokens\n" + ] + } + ], + "source": [ + "receipts_response = receipts_index.query(\n", + " 'When was the last time I went to McDonald\\'s and how much did I spend. \\\n", + " Also show me the receipt from my visit.',\n", + " query_transform=ImageOutputQueryTransform(width=400)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "05c180ae", + "metadata": {}, + "source": [ + "We now have rich multimodal response with inline text and image! \n", + "\n", + "The source nodes section gives additional details on retrieved data used for synthesizing the final response. \n", + "In this case, we can verify that the receipt for McDonald's is correctly retrieved. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "810ad2e9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "**`Final Response:`** The last time you went to McDonald's was on 03/10/2018 at 07:39:12 PM and you spent $26.15. Here is the receipt from your visit: <img src=\"data/receipts/1100-receipt.jpg\" width=\"400\" />" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file + { + "data": { + "text/markdown": [ + "--" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**`Source Node 1/1`**" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/loganm/miniconda3/envs/llama_index/lib/python3.11/site-packages/llama_index/data_structs/node_v2.py:144: UserWarning: .source_text is deprecated, use .node.get_text() instead\n", + " warnings.warn(\".source_text is deprecated, use .node.get_text() instead\")\n", + "/home/loganm/miniconda3/envs/llama_index/lib/python3.11/site-packages/llama_index/data_structs/node_v2.py:139: UserWarning: .doc_id is deprecated, use .node.ref_doc_id instead\n", + " warnings.warn(\".doc_id is deprecated, use .node.ref_doc_id instead\")\n", + "/home/loganm/miniconda3/envs/llama_index/lib/python3.11/site-packages/llama_index/data_structs/node_v2.py:159: UserWarning: .similarity is deprecated, use .score instead instead\n", + " warnings.warn(\".similarity is deprecated, use .score instead instead\")\n", + "/home/loganm/miniconda3/envs/llama_index/lib/python3.11/site-packages/llama_index/data_structs/node_v2.py:164: UserWarning: .image is deprecated, check if Node is an ImageNode and use .node.image 
instead\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/markdown": [ + "**Document ID:** 2421376f-875d-4d51-a828-ce66354b5145<br>**Similarity:** 0.7982013908158432<br>**Text:** file_name: data/receipts/1100-receipt.jpg\n", + "\n", + "<s_menu><s_nm> Story</s_nm><s_num> 16725 Stony Platin ...<br>**Image:**" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=384x512>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display_response(receipts_response)" + ] + }, + { + "cell_type": "markdown", + "id": "fa834925", + "metadata": {}, + "source": [ + "## Q & A over LlamaIndex Documentation" + ] + }, + { + "cell_type": "markdown", + "id": "f1a82a54", + "metadata": {}, + "source": [ + "We now demo the same for Q&A over LlamaIndex documentations. \n", + "This demo higlights the ability to synthesize multimodal output with a mixture of text and image documents" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "d5f04295", + "metadata": {}, + "outputs": [], + "source": [ + "llama_reader = SimpleDirectoryReader(\n", + " input_dir='data/llama',\n", + " file_extractor=file_extractor, \n", + " file_metadata=filename_fn,\n", + ")\n", + "llama_documents = llama_reader.load_data(concatenate=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "46db4191", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens\n", + "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 965 tokens\n" + ] + } + ], + "source": [ + "llama_index = GPTSimpleVectorIndex.from_documents(llama_documents)" + ] + }, + { + "cell_type": "code", + 
"execution_count": 9, + "id": "4a4cc090", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 1475 tokens\n", + "INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 13 tokens\n" + ] + } + ], + "source": [ + "llama_response = llama_index.query(\n", + " 'Show an image to illustrate how tree index works and explain briefly.', \n", + " query_transform=ImageOutputQueryTransform(width=400),\n", + " similarity_top_k=2\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "559624a6", + "metadata": {}, + "source": [ + "By inspecting the 2 source nodes, we see relevant text and image describing the tree index are retrieved for synthesizing the final multimodal response." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "5c5721d6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "**`Final Response:`** This image illustrates how a tree index works. A tree index is a type of data structure that stores data in a hierarchical structure. It is composed of nodes, which can have multiple children and a single parent. Each node contains data, such as a key, value, or other information. The tree index is used to quickly search for data by traversing the tree from the root node to the desired node. During query time, we traverse from root nodes down to leaf nodes. By default, (`child_branch_factor=1`), a query chooses one child node given a parent node. If `child_branch_factor=2`, a query chooses two child nodes per parent. LlamaIndex also offers different methods of synthesizing a response, such as Create and Refine and Tree Summarize. Create and Refine is an iterative way of generating a response, while Tree Summarize builds a tree index over the set of candidate nodes with a summary prompt seeded with the query." 
+ ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "--" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**`Source Node 1/2`**" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**Document ID:** 8d50b8e8-4505-4b75-979c-2f068f3afaba<br>**Similarity:** 0.8147614150754062<br>**Text:** file_name: data/llama/tree_index.png\n", + "\n", + "<s_menu><s_nm> Root Node</s_nm><s_unitprice> Parent</s_nm><...<br>**Image:**" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=512x431>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "--" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**`Source Node 2/2`**" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**Document ID:** 673418ff-7052-4921-9c9f-9d3fe8f6af49<br>**Similarity:** 0.8138599661902355<br>**Text:** How Each Index Works\n", + "\n", + "This guide describes how each index works with diagrams. 
We also visually h...<br>" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display_response(llama_response)" + ] + }, + { + "cell_type": "markdown", + "id": "dbd7376e", + "metadata": {}, + "source": [ + "We show another example asking about vector store index instead." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "92569825", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 1404 tokens\n", + "INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 14 tokens\n" + ] + } + ], + "source": [ + "llama_response = llama_index.query(\n", + " 'Show an image to illustrate how vector store index works and explain briefly.', \n", + " query_transform=ImageOutputQueryTransform(width=400),\n", + " similarity_top_k=2\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "7cfdd68d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "**`Final Response:`** <img src=\"data/llama/vector_store_index.png\" width=\"400\" />\n", + "Vector store index is a way of storing data in a vector format. It is used to store data in a way that is easy to access and manipulate. The data is stored in a vector format, which is a collection of numbers that represent the data. This makes it easier to access and manipulate the data, as well as to store it in a more efficient way. Vector store index stores each Node and a corresponding embedding in a Vector Store. During query time, we extract relevant keywords from the query, and match those with pre-extracted Node keywords to fetch the corresponding Nodes. 
The extracted Nodes are passed to our Response Synthesis module, which can be configured to use different methods of synthesizing a response, such as Create and Refine or Tree Summarize." + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "--" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**`Source Node 1/2`**" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**Document ID:** 2c05e859-9d49-4a87-8cb7-4c755aa72dd4<br>**Similarity:** 0.816241967681269<br>**Text:** file_name: data/llama/vector_store_index.png\n", + "\n", + "<s_menu><s_nm> Nodel</s_nm><s_unitprice> Node2</s_u...<br>**Image:**" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=512x313>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "--" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**`Source Node 2/2`**" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**Document ID:** 673418ff-7052-4921-9c9f-9d3fe8f6af49<br>**Similarity:** 0.7878806797032482<br>**Text:** How Each Index Works\n", + "\n", + "This guide describes how each index works with diagrams. 
We also visually h...<br>" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display_response(llama_response)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f459baab", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "llama_index", + "language": "python", + "name": "llama_index" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/optimizer/OptimizerDemo.ipynb b/examples/optimizer/OptimizerDemo.ipynb index 7ff686f826..84e6be9edf 100644 --- a/examples/optimizer/OptimizerDemo.ipynb +++ b/examples/optimizer/OptimizerDemo.ipynb @@ -1,198 +1,204 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "839c4a87", - "metadata": {}, - "outputs": [], - "source": [ - "# My OpenAI Key\n", - "import os\n", - "os.environ['OPENAI_API_KEY'] = \"INSERT OPENAI KEY\"" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "40cf0773", - "metadata": {}, - "source": [ - "### Setup" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "fa34cd83", - "metadata": {}, - "outputs": [], - "source": [ - "from llama_index import download_loader\n", - "\n", - "WikipediaReader = download_loader(\"WikipediaReader\")\n", - "\n", - "loader = WikipediaReader()\n", - "documents = loader.load_data(pages=['Berlin'])" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "f59e6c18", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "<class 'llama_index.readers.schema.base.Document'>\n" - ] - }, - { - "name": "stderr", - "output_type": 
"stream", - "text": [ - "INFO:root:> [build_index_from_documents] Total LLM token usage: 0 tokens\n", - "INFO:root:> [build_index_from_documents] Total embedding token usage: 18390 tokens\n" - ] - } - ], - "source": [ - "from llama_index import GPTSimpleVectorIndex\n", - "index = GPTSimpleVectorIndex.from_documents(documents)\n", - "# save index to file\n", - "index.save_to_disk(\"simple_vector_index.json\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "827ada33", - "metadata": {}, - "source": [ - "Compare query with and without optimization for LLM token usage, Embedding Model usage on query, Embedding model usage for optimizer, and total time." - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "a04e4535", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Without optimization\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:root:> [query] Total LLM token usage: 3545 tokens\n", - "INFO:root:> [query] Total embedding token usage: 7 tokens\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Total time elapsed: 2.8928110599517822\n", - "Answer: \n", - "The population of Berlin in 1949 was approximately 2.2 million inhabitants. 
After the fall of the Berlin Wall in 1989, the population of Berlin increased to approximately 3.7 million inhabitants.\n", - "\n", - "With optimization\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:root:> [optimize] Total embedding token usage: 7 tokens\n", - "INFO:root:> [query] Total LLM token usage: 1779 tokens\n", - "INFO:root:> [query] Total embedding token usage: 7 tokens\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Total time elapsed: 2.346346139907837\n", - "Answer: \n", - "The population of Berlin is around 4.5 million.\n", - "Alternate optimization cutoff\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:root:> [optimize] Total embedding token usage: 7 tokens\n", - "INFO:root:> [query] Total LLM token usage: 3215 tokens\n", - "INFO:root:> [query] Total embedding token usage: 7 tokens\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Total time elapsed: 2.101111888885498\n", - "Answer: \n", - "The population of Berlin is around 4.5 million.\n" - ] - } - ], - "source": [ - "import time\n", - "from gpt_index import GPTSimpleVectorIndex\n", - "from gpt_index.optimization.optimizer import SentenceEmbeddingOptimizer\n", - "# load from disk\n", - "index = GPTSimpleVectorIndex.load_from_disk('simple_vector_index.json')\n", - "\n", - "print(\"Without optimization\")\n", - "start_time = time.time()\n", - "res = index.query(\"What is the population of Berlin?\")\n", - "end_time = time.time()\n", - "print(\"Total time elapsed: {}\".format(end_time - start_time))\n", - "print(\"Answer: {}\".format(res))\n", - "\n", - "print(\"With optimization\")\n", - "start_time = time.time()\n", - "res = index.query(\"What is the population of Berlin?\", optimizer=SentenceEmbeddingOptimizer(percentile_cutoff=0.5))\n", - "end_time = time.time()\n", - "print(\"Total time elapsed: {}\".format(end_time - start_time))\n", - "print(\"Answer: 
{}\".format(res))\n", - "\n", - "print(\"Alternate optimization cutoff\")\n", - "start_time = time.time()\n", - "res = index.query(\"What is the population of Berlin?\", optimizer=SentenceEmbeddingOptimizer(threshold_cutoff=0.7))\n", - "end_time = time.time()\n", - "print(\"Total time elapsed: {}\".format(end_time - start_time))\n", - "print(\"Answer: {}\".format(res))" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.0" - } + "cells": [ + { + "cell_type": "markdown", + "id": "df32a9f0", + "metadata": {}, + "source": [ + "# Optimizer Demo" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "839c4a87", + "metadata": {}, + "outputs": [], + "source": [ + "# My OpenAI Key\n", + "import os\n", + "os.environ['OPENAI_API_KEY'] = \"INSERT OPENAI KEY\"" + ] + }, + { + "cell_type": "markdown", + "id": "40cf0773", + "metadata": {}, + "source": [ + "### Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "fa34cd83", + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index import download_loader\n", + "\n", + "WikipediaReader = download_loader(\"WikipediaReader\")\n", + "\n", + "loader = WikipediaReader()\n", + "documents = loader.load_data(pages=['Berlin'])" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "f59e6c18", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<class 'llama_index.readers.schema.base.Document'>\n" + ] }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:root:> [build_index_from_documents] Total LLM token usage: 0 
tokens\n", + "INFO:root:> [build_index_from_documents] Total embedding token usage: 18390 tokens\n" + ] + } + ], + "source": [ + "from llama_index import GPTSimpleVectorIndex\n", + "index = GPTSimpleVectorIndex.from_documents(documents)\n", + "# save index to file\n", + "index.save_to_disk(\"simple_vector_index.json\")" + ] + }, + { + "cell_type": "markdown", + "id": "827ada33", + "metadata": {}, + "source": [ + "Compare query with and without optimization for LLM token usage, Embedding Model usage on query, Embedding model usage for optimizer, and total time." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "a04e4535", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Without optimization\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:root:> [query] Total LLM token usage: 3545 tokens\n", + "INFO:root:> [query] Total embedding token usage: 7 tokens\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total time elapsed: 2.8928110599517822\n", + "Answer: \n", + "The population of Berlin in 1949 was approximately 2.2 million inhabitants. 
After the fall of the Berlin Wall in 1989, the population of Berlin increased to approximately 3.7 million inhabitants.\n", + "\n", + "With optimization\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:root:> [optimize] Total embedding token usage: 7 tokens\n", + "INFO:root:> [query] Total LLM token usage: 1779 tokens\n", + "INFO:root:> [query] Total embedding token usage: 7 tokens\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total time elapsed: 2.346346139907837\n", + "Answer: \n", + "The population of Berlin is around 4.5 million.\n", + "Alternate optimization cutoff\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:root:> [optimize] Total embedding token usage: 7 tokens\n", + "INFO:root:> [query] Total LLM token usage: 3215 tokens\n", + "INFO:root:> [query] Total embedding token usage: 7 tokens\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total time elapsed: 2.101111888885498\n", + "Answer: \n", + "The population of Berlin is around 4.5 million.\n" + ] + } + ], + "source": [ + "import time\n", + "from gpt_index import GPTSimpleVectorIndex\n", + "from gpt_index.optimization.optimizer import SentenceEmbeddingOptimizer\n", + "# load from disk\n", + "index = GPTSimpleVectorIndex.load_from_disk('simple_vector_index.json')\n", + "\n", + "print(\"Without optimization\")\n", + "start_time = time.time()\n", + "res = index.query(\"What is the population of Berlin?\")\n", + "end_time = time.time()\n", + "print(\"Total time elapsed: {}\".format(end_time - start_time))\n", + "print(\"Answer: {}\".format(res))\n", + "\n", + "print(\"With optimization\")\n", + "start_time = time.time()\n", + "res = index.query(\"What is the population of Berlin?\", optimizer=SentenceEmbeddingOptimizer(percentile_cutoff=0.5))\n", + "end_time = time.time()\n", + "print(\"Total time elapsed: {}\".format(end_time - start_time))\n", + "print(\"Answer: 
{}\".format(res))\n", + "\n", + "print(\"Alternate optimization cutoff\")\n", + "start_time = time.time()\n", + "res = index.query(\"What is the population of Berlin?\", optimizer=SentenceEmbeddingOptimizer(threshold_cutoff=0.7))\n", + "end_time = time.time()\n", + "print(\"Total time elapsed: {}\".format(end_time - start_time))\n", + "print(\"Answer: {}\".format(res))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/paul_graham_essay/DavinciComparison.ipynb b/examples/paul_graham_essay/DavinciComparison.ipynb index 978ad7d6a6..40670a9fb4 100644 --- a/examples/paul_graham_essay/DavinciComparison.ipynb +++ b/examples/paul_graham_essay/DavinciComparison.ipynb @@ -1,17 +1,5 @@ { "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "df7cb4c1-a1b8-4e80-ad88-8878008bde89", - "metadata": {}, - "outputs": [], - "source": [ - "# My OpenAI Key\n", - "import os\n", - "os.environ['OPENAI_API_KEY'] = \"INSERT OPENAI KEY\"" - ] - }, { "cell_type": "markdown", "id": "7096589b-daaf-440a-b89d-b4956f2db4b2", @@ -24,6 +12,18 @@ "Does text-davinci-003 do better?" 
] }, + { + "cell_type": "code", + "execution_count": null, + "id": "df7cb4c1-a1b8-4e80-ad88-8878008bde89", + "metadata": {}, + "outputs": [], + "source": [ + "# My OpenAI Key\n", + "import os\n", + "os.environ['OPENAI_API_KEY'] = \"INSERT OPENAI KEY\"" + ] + }, { "cell_type": "markdown", "id": "d8cfbe6f-4c50-4c4f-90f9-03bb91201ef5", @@ -209,7 +209,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.10" + "version": "3.11.0" } }, "nbformat": 4, diff --git a/examples/paul_graham_essay/GPT4Comparison.ipynb b/examples/paul_graham_essay/GPT4Comparison.ipynb index e555cc38b5..ce45dbb13b 100644 --- a/examples/paul_graham_essay/GPT4Comparison.ipynb +++ b/examples/paul_graham_essay/GPT4Comparison.ipynb @@ -1,646 +1,654 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": 62, - "id": "4921c412", - "metadata": {}, - "outputs": [], - "source": [ - "from gpt_index import GPTListIndex, SimpleDirectoryReader, LLMPredictor, PromptHelper, ServiceContext\n", - "from gpt_index.response.notebook_utils import display_response\n", - "from langchain import OpenAI\n", - "from langchain.chat_models import ChatOpenAI\n", - "from IPython.display import Markdown, display" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "261d923e", - "metadata": {}, - "outputs": [], - "source": [ - "documents = SimpleDirectoryReader('data').load_data()" - ] - }, - { - "cell_type": "markdown", - "id": "f23b5169", - "metadata": {}, - "source": [ - "# davinci-003" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "id": "0c635cdb", - "metadata": {}, - "outputs": [], - "source": [ - "llm_predictor = LLMPredictor(llm=OpenAI(temperature=0, model_name=\"text-davinci-003\"))\n", - "service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "id": "b8ad1a2a", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - 
"text": [ - "INFO:gpt_index.token_counter.token_counter:> [build_index_from_documents] Total LLM token usage: 0 tokens\n", - "INFO:gpt_index.token_counter.token_counter:> [build_index_from_documents] Total embedding token usage: 0 tokens\n" - ] - } - ], - "source": [ - "davinci_index = GPTListIndex.from_documents(documents, service_context=service_context)" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "id": "c9925597", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'Document is split into 6 nodes.'" - ] - }, - "execution_count": 45, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "f'Document is split into {len(davinci_index._index_struct.nodes)} nodes.'" - ] - }, - { - "cell_type": "code", - "execution_count": 68, - "id": "fa1d7242", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:gpt_index.indices.common.tree.base:> Building index from nodes: 5 chunks\n", - "INFO:gpt_index.token_counter.token_counter:> [query] Total LLM token usage: 19882 tokens\n", - "INFO:gpt_index.token_counter.token_counter:> [query] Total embedding token usage: 0 tokens\n" - ] - } - ], - "source": [ - "response = davinci_index.query(\n", - " \"What happened on one night in October 2003?\", \n", - " response_mode=\"tree_summarize\"\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 69, - "id": "d758bdb7", - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/markdown": [ - "**`Final Response:`** It is not possible to answer this question with the given context information." 
- ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "---" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "**`Source Node 1/6`**" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "**Document ID:** 2ea119c7-fc3d-4090-a47a-8dd2a0d37416<br>**Similarity:** None<br>**Text:** What I Worked On\n", - "\n", - "February 2021\n", - "\n", - "Before college the two main things I worked on, outside of schoo...<br>" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "---" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "**`Source Node 2/6`**" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "**Document ID:** 2ea119c7-fc3d-4090-a47a-8dd2a0d37416<br>**Similarity:** None<br>**Text:** whereby the students wouldn't require the faculty to teach anything, and in return the faculty wo...<br>" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "---" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "**`Source Node 3/6`**" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { 
- "data": { - "text/markdown": [ - "**Document ID:** 2ea119c7-fc3d-4090-a47a-8dd2a0d37416<br>**Similarity:** None<br>**Text:** fact that our software worked via the web, and we got $10,000 in seed funding from Idelle's husba...<br>" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "---" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "**`Source Node 4/6`**" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "**Document ID:** 2ea119c7-fc3d-4090-a47a-8dd2a0d37416<br>**Similarity:** None<br>**Text:** project was the new Lisp, whose parentheses I now wouldn't even have to hide. A lot of Lisp hacke...<br>" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "---" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "**`Source Node 5/6`**" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "**Document ID:** 2ea119c7-fc3d-4090-a47a-8dd2a0d37416<br>**Similarity:** None<br>**Text:** chance it had to do with HN, and a 40% chance it had do with everything else combined. 
[17]\n", - "\n", - "As w...<br>" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "---" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "**`Source Node 6/6`**" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "**Document ID:** 2ea119c7-fc3d-4090-a47a-8dd2a0d37416<br>**Similarity:** None<br>**Text:** and some people dislike being told such things.\n", - "\n", - "[11] People put plenty of stuff on the internet ...<br>" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "display_response(response)" - ] - }, - { - "cell_type": "markdown", - "id": "3f843a73", - "metadata": {}, - "source": [ - "# gpt-4" - ] - }, - { - "cell_type": "code", - "execution_count": 63, - "id": "0849d860", - "metadata": {}, - "outputs": [], - "source": [ - "llm_predictor = LLMPredictor(llm=ChatOpenAI(temperature=0, model_name=\"gpt-4\"))\n", - "service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)" - ] - }, - { - "cell_type": "code", - "execution_count": 64, - "id": "bb9eff4a", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:gpt_index.token_counter.token_counter:> [build_index_from_documents] Total LLM token usage: 0 tokens\n", - "INFO:gpt_index.token_counter.token_counter:> [build_index_from_documents] Total embedding token usage: 0 tokens\n" - ] - } - ], - "source": [ - "gpt4_index = GPTListIndex.from_documents(documents, service_context=service_context)" - ] - }, - { - "cell_type": "code", - "execution_count": 65, - "id": "cb56a205", - "metadata": {}, - "outputs": 
[ - { - "data": { - "text/plain": [ - "'Document is split into 3 nodes.'" - ] - }, - "execution_count": 65, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "f'Document is split into {len(gpt4_index._index_struct.nodes)} nodes.'" - ] - }, - { - "cell_type": "code", - "execution_count": 70, - "id": "44dda700", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:gpt_index.indices.common.tree.base:> Building index from nodes: 2 chunks\n", - "INFO:gpt_index.token_counter.token_counter:> [query] Total LLM token usage: 18006 tokens\n", - "INFO:gpt_index.token_counter.token_counter:> [query] Total embedding token usage: 0 tokens\n" - ] - } - ], - "source": [ - "response = gpt4_index.query(\n", - " \"What happened on one night in October 2003?\", \n", - " response_mode=\"tree_summarize\"\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 71, - "id": "42bd0984", - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/markdown": [ - "**`Final Response:`** On one night in October 2003, there was a big party at Paul Graham's house, organized by his friend Maria Daniels. At this party, Paul met Jessica Livingston, who would later become his partner in starting Y Combinator. Additionally, Paul Graham had a conversation with his friend Robert Morris about starting a new kind of venture firm that would fund startups in batches, which eventually led to the creation of Y Combinator." 
- ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "---" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "**`Source Node 1/3`**" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "**Document ID:** 2ea119c7-fc3d-4090-a47a-8dd2a0d37416<br>**Similarity:** 0.740238551627948<br>**Text:** What I Worked On\n", - "\n", - "February 2021\n", - "\n", - "Before college the two main things I worked on, outside of schoo...<br>" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "---" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "**`Source Node 2/3`**" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "**Document ID:** 2ea119c7-fc3d-4090-a47a-8dd2a0d37416<br>**Similarity:** None<br>**Text:** really good. 
He recommended Trevor Blackwell, which surprised me at first, because at that point ...<br>" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "---" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "**`Source Node 3/3`**" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "**Document ID:** 2ea119c7-fc3d-4090-a47a-8dd2a0d37416<br>**Similarity:** None<br>**Text:** make nuclear reactors. But I kept at it, and in October 2013 he finally agreed. We decided he'd t...<br>" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "display_response(response)" - ] - }, - { - "cell_type": "markdown", - "id": "fd981e5e", - "metadata": {}, - "source": [ - "# gpt-4-32k" - ] - }, - { - "cell_type": "markdown", - "id": "9d9f20a9", - "metadata": {}, - "source": [ - "NOTE: not available yet" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "71137f57", - "metadata": {}, - "outputs": [], - "source": [ - "llm_predictor = LLMPredictor(llm=ChatOpenAI(temperature=0, model_name=\"gpt-4-32k\"))\n", - "service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bb619782", - "metadata": {}, - "outputs": [], - "source": [ - "gpt4_32k_index = GPTSimpleVectorIndex.from_documents(documents, service_context=service_context)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7d417f6a", - "metadata": {}, - "outputs": [], - "source": [ - "len(gpt4_32k_index._index_struct.nodes_dict)" - ] - } - ], - "metadata": { - "kernelspec": 
{ - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.9" - } + "cells": [ + { + "cell_type": "markdown", + "id": "65850ded", + "metadata": {}, + "source": [ + "# GPT-4 Comparison" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "4921c412", + "metadata": {}, + "outputs": [], + "source": [ + "from gpt_index import GPTListIndex, SimpleDirectoryReader, LLMPredictor, PromptHelper, ServiceContext\n", + "from gpt_index.response.notebook_utils import display_response\n", + "from langchain import OpenAI\n", + "from langchain.chat_models import ChatOpenAI\n", + "from IPython.display import Markdown, display" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "261d923e", + "metadata": {}, + "outputs": [], + "source": [ + "documents = SimpleDirectoryReader('data').load_data()" + ] + }, + { + "cell_type": "markdown", + "id": "f23b5169", + "metadata": {}, + "source": [ + "## davinci-003" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "0c635cdb", + "metadata": {}, + "outputs": [], + "source": [ + "llm_predictor = LLMPredictor(llm=OpenAI(temperature=0, model_name=\"text-davinci-003\"))\n", + "service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "b8ad1a2a", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:gpt_index.token_counter.token_counter:> [build_index_from_documents] Total LLM token usage: 0 tokens\n", + "INFO:gpt_index.token_counter.token_counter:> [build_index_from_documents] Total embedding token usage: 0 tokens\n" + ] + } + ], + "source": [ + "davinci_index = 
GPTListIndex.from_documents(documents, service_context=service_context)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "c9925597", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Document is split into 6 nodes.'" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "f'Document is split into {len(davinci_index._index_struct.nodes)} nodes.'" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "id": "fa1d7242", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:gpt_index.indices.common.tree.base:> Building index from nodes: 5 chunks\n", + "INFO:gpt_index.token_counter.token_counter:> [query] Total LLM token usage: 19882 tokens\n", + "INFO:gpt_index.token_counter.token_counter:> [query] Total embedding token usage: 0 tokens\n" + ] + } + ], + "source": [ + "response = davinci_index.query(\n", + " \"What happened on one night in October 2003?\", \n", + " response_mode=\"tree_summarize\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "id": "d758bdb7", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/markdown": [ + "**`Final Response:`** It is not possible to answer this question with the given context information." 
+ ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" }, - "nbformat": 4, - "nbformat_minor": 5 + { + "data": { + "text/markdown": [ + "--" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**`Source Node 1/6`**" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**Document ID:** 2ea119c7-fc3d-4090-a47a-8dd2a0d37416<br>**Similarity:** None<br>**Text:** What I Worked On\n", + "\n", + "February 2021\n", + "\n", + "Before college the two main things I worked on, outside of schoo...<br>" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "--" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**`Source Node 2/6`**" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**Document ID:** 2ea119c7-fc3d-4090-a47a-8dd2a0d37416<br>**Similarity:** None<br>**Text:** whereby the students wouldn't require the faculty to teach anything, and in return the faculty wo...<br>" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "--" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**`Source Node 3/6`**" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + 
"output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**Document ID:** 2ea119c7-fc3d-4090-a47a-8dd2a0d37416<br>**Similarity:** None<br>**Text:** fact that our software worked via the web, and we got $10,000 in seed funding from Idelle's husba...<br>" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "--" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**`Source Node 4/6`**" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**Document ID:** 2ea119c7-fc3d-4090-a47a-8dd2a0d37416<br>**Similarity:** None<br>**Text:** project was the new Lisp, whose parentheses I now wouldn't even have to hide. A lot of Lisp hacke...<br>" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "--" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**`Source Node 5/6`**" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**Document ID:** 2ea119c7-fc3d-4090-a47a-8dd2a0d37416<br>**Similarity:** None<br>**Text:** chance it had to do with HN, and a 40% chance it had do with everything else combined. 
[17]\n", + "\n", + "As w...<br>" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "--" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**`Source Node 6/6`**" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**Document ID:** 2ea119c7-fc3d-4090-a47a-8dd2a0d37416<br>**Similarity:** None<br>**Text:** and some people dislike being told such things.\n", + "\n", + "[11] People put plenty of stuff on the internet ...<br>" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display_response(response)" + ] + }, + { + "cell_type": "markdown", + "id": "3f843a73", + "metadata": {}, + "source": [ + "## gpt-4" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "0849d860", + "metadata": {}, + "outputs": [], + "source": [ + "llm_predictor = LLMPredictor(llm=ChatOpenAI(temperature=0, model_name=\"gpt-4\"))\n", + "service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "bb9eff4a", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:gpt_index.token_counter.token_counter:> [build_index_from_documents] Total LLM token usage: 0 tokens\n", + "INFO:gpt_index.token_counter.token_counter:> [build_index_from_documents] Total embedding token usage: 0 tokens\n" + ] + } + ], + "source": [ + "gpt4_index = GPTListIndex.from_documents(documents, service_context=service_context)" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "id": "cb56a205", + "metadata": {}, + "outputs": 
[ + { + "data": { + "text/plain": [ + "'Document is split into 3 nodes.'" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "f'Document is split into {len(gpt4_index._index_struct.nodes)} nodes.'" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "id": "44dda700", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:gpt_index.indices.common.tree.base:> Building index from nodes: 2 chunks\n", + "INFO:gpt_index.token_counter.token_counter:> [query] Total LLM token usage: 18006 tokens\n", + "INFO:gpt_index.token_counter.token_counter:> [query] Total embedding token usage: 0 tokens\n" + ] + } + ], + "source": [ + "response = gpt4_index.query(\n", + " \"What happened on one night in October 2003?\", \n", + " response_mode=\"tree_summarize\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "42bd0984", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/markdown": [ + "**`Final Response:`** On one night in October 2003, there was a big party at Paul Graham's house, organized by his friend Maria Daniels. At this party, Paul met Jessica Livingston, who would later become his partner in starting Y Combinator. Additionally, Paul Graham had a conversation with his friend Robert Morris about starting a new kind of venture firm that would fund startups in batches, which eventually led to the creation of Y Combinator." 
+ ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "--" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**`Source Node 1/3`**" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**Document ID:** 2ea119c7-fc3d-4090-a47a-8dd2a0d37416<br>**Similarity:** 0.740238551627948<br>**Text:** What I Worked On\n", + "\n", + "February 2021\n", + "\n", + "Before college the two main things I worked on, outside of schoo...<br>" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "--" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**`Source Node 2/3`**" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**Document ID:** 2ea119c7-fc3d-4090-a47a-8dd2a0d37416<br>**Similarity:** None<br>**Text:** really good. 
He recommended Trevor Blackwell, which surprised me at first, because at that point ...<br>" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "--" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**`Source Node 3/3`**" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "**Document ID:** 2ea119c7-fc3d-4090-a47a-8dd2a0d37416<br>**Similarity:** None<br>**Text:** make nuclear reactors. But I kept at it, and in October 2013 he finally agreed. We decided he'd t...<br>" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display_response(response)" + ] + }, + { + "cell_type": "markdown", + "id": "fd981e5e", + "metadata": {}, + "source": [ + "## gpt-4-32k" + ] + }, + { + "cell_type": "markdown", + "id": "9d9f20a9", + "metadata": {}, + "source": [ + "NOTE: not available yet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "71137f57", + "metadata": {}, + "outputs": [], + "source": [ + "llm_predictor = LLMPredictor(llm=ChatOpenAI(temperature=0, model_name=\"gpt-4-32k\"))\n", + "service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bb619782", + "metadata": {}, + "outputs": [], + "source": [ + "gpt4_32k_index = GPTSimpleVectorIndex.from_documents(documents, service_context=service_context)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7d417f6a", + "metadata": {}, + "outputs": [], + "source": [ + "len(gpt4_32k_index._index_struct.nodes_dict)" + ] + } + ], + "metadata": { + "kernelspec": 
{ + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/examples/paul_graham_essay/InsertDemo.ipynb b/examples/paul_graham_essay/InsertDemo.ipynb index 3e6e23cd6c..afdd4334f4 100644 --- a/examples/paul_graham_essay/InsertDemo.ipynb +++ b/examples/paul_graham_essay/InsertDemo.ipynb @@ -1,5 +1,13 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "48d79af8", + "metadata": {}, + "source": [ + "# Insert Demo" + ] + }, { "cell_type": "markdown", "id": "46e5110c-ed35-463e-a9f6-cff9cda6221b", @@ -321,7 +329,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.10" + "version": "3.11.0" } }, "nbformat": 4, diff --git a/examples/paul_graham_essay/KeywordTableComparison.ipynb b/examples/paul_graham_essay/KeywordTableComparison.ipynb index 23c964f600..d2923c566c 100644 --- a/examples/paul_graham_essay/KeywordTableComparison.ipynb +++ b/examples/paul_graham_essay/KeywordTableComparison.ipynb @@ -1,428 +1,428 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "a6457769-dfaf-4241-ab32-dcf901dde902", - "metadata": { - "tags": [] - }, - "source": [ - "## GPT Keyword Table Index Comparisons\n", - "\n", - "Comparing GPTSimpleKeywordTableIndex, GPTRAKEKeywordTableIndex, GPTKeywordTableIndex.\n", - "\n", - "- GPTSimpleKeywordTableIndex - uses simple regex to extract keywords.\n", - "- GPTRAKEKeywordTableIndex - uses RAKE to extract keywords.\n", - "- GPTKeywordTableIndex - uses GPT to extract keywords." 
- ] - }, - { - "cell_type": "markdown", - "id": "075080e5-c255-4a5c-9330-9da11532e1c8", - "metadata": { - "tags": [] - }, - "source": [ - "#### GPTSimpleKeywordTableIndex" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "b367b7ef-6a7d-4aee-b174-dba6ec4d2e21", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[nltk_data] Downloading package stopwords to /home/jerry/nltk_data...\n", - "[nltk_data] Package stopwords is already up-to-date!\n" - ] - } - ], - "source": [ - "from llama_index import GPTSimpleKeywordTableIndex, SimpleDirectoryReader\n", - "from IPython.display import Markdown, display" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1f8248fa-e0bd-494a-ad68-8192ccc87696", - "metadata": {}, - "outputs": [], - "source": [ - "# build keyword index\n", - "documents = SimpleDirectoryReader('data').load_data()\n", - "index = GPTSimpleKeywordTableIndex(documents)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "53833655-0296-4bcb-b501-259b043d68b3", - "metadata": {}, - "outputs": [], - "source": [ - "response = index.query(\"What did the author do after his time at YC?\")" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "62bcca18-b644-4393-ad29-6c5f0424fb22", - "metadata": {}, - "outputs": [ - { - "data": { - "text/markdown": [ - "<b>\n", - "\n", - "The author went on to write essays and work on other projects, including a new version of the Arc programming language and Hacker News. He also started painting, but stopped after a few months. In 2015, he started working on a new Lisp programming language, which he finished in 2019. The author then moved to England in 2016 with his family and continued writing essays. In 2019, he finished Bel and wrote a bunch of essays on various topics.\n", - "\n", - "The author also worked on building online stores in 1995 after finishing ANSI Common Lisp. 
He ran the software on servers and let users control it by clicking on links, which was a new concept at the time. In 1996, he co-founded Viaweb with Robert Morris, which was later acquired by Yahoo in 1998. After leaving Yahoo, the author moved back to New York and started painting again. In 2000, he had the idea for a web application that would let people edit code on a server and host the resulting applications, which later became known as \"Reddit\".</b>" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "display(Markdown(f\"<b>{response}</b>\"))" - ] - }, - { - "cell_type": "markdown", - "id": "d24f9a20-48a6-4131-91b9-b01448c6ecb5", - "metadata": {}, - "source": [ - "#### GPTRAKEKeywordTableIndex" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "c4d3f293-e608-4b90-86aa-9bce666dbcd5", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[nltk_data] Downloading package stopwords to /home/jerry/nltk_data...\n", - "[nltk_data] Package stopwords is already up-to-date!\n" - ] - } - ], - "source": [ - "from llama_index import GPTRAKEKeywordTableIndex, SimpleDirectoryReader\n", - "from IPython.display import Markdown, display" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "66b1da3b-8231-4da9-8026-4f95481c79df", - "metadata": { - "scrolled": true, - "tags": [] - }, - "outputs": [], - "source": [ - "# build keyword index\n", - "documents = SimpleDirectoryReader('data').load_data()\n", - "index = GPTRAKEKeywordTableIndex(documents)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "f13e5543-c6cb-4651-986c-ecde0f4bf789", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> Starting query: What did the author do after his time at YC?\n", - "Extracted keywords: []\n" - ] - } - ], - "source": [ - "response = 
index.query(\"What did the author do after his time at YC?\")" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "5ae01ac3-55fa-43a3-9b24-f733072d5f8d", - "metadata": {}, - "outputs": [ - { - "data": { - "text/markdown": [ - "<b>Empty response</b>" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "display(Markdown(f\"<b>{response}</b>\"))" - ] - }, - { - "cell_type": "markdown", - "id": "59cee6cf-92df-40d8-8dad-a40b792de96f", - "metadata": {}, - "source": [ - "#### GPTKeywordTableIndex" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "78d59ef6-70b0-47bb-818d-7237a3b7de75", - "metadata": {}, - "outputs": [], - "source": [ - "from llama_index import GPTKeywordTableIndex, SimpleDirectoryReader\n", - "from IPython.display import Markdown, display" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5a3f1c67-6d73-4f37-afcf-9e637002fcff", - "metadata": {}, - "outputs": [], - "source": [ - "# build keyword index\n", - "documents = SimpleDirectoryReader('data').load_data()\n", - "index = GPTKeywordTableIndex.from_documents(documents)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "69d4f686-6825-49cf-a113-d2fdd484de77", - "metadata": {}, - "outputs": [], - "source": [ - "response = index.query(\"What did the author do after his time at Y Combinator?\")" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "a483514d-4ab5-489d-8b99-7250df491ce3", - "metadata": {}, - "outputs": [ - { - "data": { - "text/markdown": [ - "<b>\n", - "\n", - "After a few years, the author decided to step away from Y Combinator to focus on other projects, such as painting and writing essays. In 2013, he handed over control of Y Combinator to Sam Altman. The author's mother passed away in 2014, and after taking some time to grieve, he returned to writing essays and working on Lisp. 
He continued working on Lisp until 2019, when he finally completed the project.\n", - "\n", - "In 2015, the author decided to move to England with his family. They originally intended to only stay for a year, but ended up liking it so much that they remained there. The author wrote Bel while living in England. In 2019, he finally finished the project. After completing Bel, the author wrote a number of essays on various topics. He continued writing essays through 2020, but also started thinking about other things he could work on.</b>" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "display(Markdown(f\"<b>{response}</b>\"))" - ] - }, - { - "cell_type": "markdown", - "id": "112e21ee-587c-4d8b-871e-cb99b94e3778", - "metadata": {}, - "source": [ - "## GPT Keyword Table Query Comparisons\n", - "Compare mode={\"default\", \"simple\", \"rake\"}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3029961a-ec22-42a1-90d6-f5892eb81e34", - "metadata": {}, - "outputs": [], - "source": [ - "# build table with default GPTKeywordTableIndex\n", - "from llama_index import GPTKeywordTableIndex, SimpleDirectoryReader\n", - "from IPython.display import Markdown, display\n", - "\n", - "documents = SimpleDirectoryReader('data').load_data()\n", - "index = GPTKeywordTableIndex.from_documents(documents)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "d75b31da-4788-4295-8642-07ac5c4f11a5", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> Starting query: What did the author do after his time at Y Combinator?\n", - "Extracted keywords: ['y combinator', 'combinator']\n", - "> Querying with idx: 235042210695008001: of excluding them, because there were so many s...\n", - "> Querying with idx: 7029274505691774319: it was like living in another country, and sinc...\n", - "> Querying with idx: 
1773317813360405038: browser, and then host the resulting applicatio...\n", - "> Querying with idx: 3866067077574405334: person, and from those we picked 8 to fund. The...\n" - ] - }, - { - "data": { - "text/markdown": [ - "<b>\n", - "\n", - "The author went on to write a book about his experiences at Y Combinator, and then moved to England. He started writing essays again and also began working on a new Lisp programming language. He also wrote an essay about how he chooses what to work on.</b>" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# default\n", - "response = index.query(\"What did the author do after his time at Y Combinator?\", mode=\"default\")\n", - "display(Markdown(f\"<b>{response}</b>\"))" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "07b713f4-adfc-46f7-a795-5b333e33d49d", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> Starting query: What did the author do after his time at Y Combinator?\n", - "Extracted keywords: ['combinator']\n", - "> Querying with idx: 235042210695008001: of excluding them, because there were so many s...\n", - "> Querying with idx: 7029274505691774319: it was like living in another country, and sinc...\n", - "> Querying with idx: 1773317813360405038: browser, and then host the resulting applicatio...\n", - "> Querying with idx: 3866067077574405334: person, and from those we picked 8 to fund. The...\n" - ] - }, - { - "data": { - "text/markdown": [ - "<b>\n", - "\n", - "The author went on to write a book about his experiences at Y Combinator, and then moved to England. He started writing essays again and also began working on a new Lisp programming language. 
He also wrote an essay about how he chooses what to work on.</b>" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# simple\n", - "response = index.query(\"What did the author do after his time at Y Combinator?\", mode=\"simple\")\n", - "display(Markdown(f\"<b>{response}</b>\"))" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "d2e19ad9-3190-45e5-a28d-235c28296d70", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> Starting query: What did the author do after his time at Y Combinator?\n", - "Extracted keywords: ['combinator']\n", - "> Querying with idx: 235042210695008001: of excluding them, because there were so many s...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[nltk_data] Downloading package punkt to /home/jerry/nltk_data...\n", - "[nltk_data] Package punkt is already up-to-date!\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> Querying with idx: 7029274505691774319: it was like living in another country, and sinc...\n", - "> Querying with idx: 1773317813360405038: browser, and then host the resulting applicatio...\n", - "> Querying with idx: 3866067077574405334: person, and from those we picked 8 to fund. The...\n" - ] - }, - { - "data": { - "text/markdown": [ - "<b>\n", - "\n", - "The author went on to write a book about his experiences at Y Combinator, and then moved to England. He started writing essays again and also began working on a new Lisp programming language. 
He also wrote an essay about how he chooses what to work on.</b>" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# rake\n", - "response = index.query(\"What did the author do after his time at Y Combinator?\", mode=\"rake\")\n", - "display(Markdown(f\"<b>{response}</b>\"))" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "myvenv", - "language": "python", - "name": "myvenv" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.0" - } + "cells": [ + { + "cell_type": "markdown", + "id": "a6457769-dfaf-4241-ab32-dcf901dde902", + "metadata": { + "tags": [] + }, + "source": [ + "# GPT Keyword Table Index Comparisons\n", + "\n", + "Comparing GPTSimpleKeywordTableIndex, GPTRAKEKeywordTableIndex, GPTKeywordTableIndex.\n", + "\n", + "- GPTSimpleKeywordTableIndex - uses simple regex to extract keywords.\n", + "- GPTRAKEKeywordTableIndex - uses RAKE to extract keywords.\n", + "- GPTKeywordTableIndex - uses GPT to extract keywords." 
+ ] + }, + { + "cell_type": "markdown", + "id": "075080e5-c255-4a5c-9330-9da11532e1c8", + "metadata": { + "tags": [] + }, + "source": [ + "#### GPTSimpleKeywordTableIndex" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "b367b7ef-6a7d-4aee-b174-dba6ec4d2e21", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package stopwords to /home/jerry/nltk_data...\n", + "[nltk_data] Package stopwords is already up-to-date!\n" + ] + } + ], + "source": [ + "from llama_index import GPTSimpleKeywordTableIndex, SimpleDirectoryReader\n", + "from IPython.display import Markdown, display" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1f8248fa-e0bd-494a-ad68-8192ccc87696", + "metadata": {}, + "outputs": [], + "source": [ + "# build keyword index\n", + "documents = SimpleDirectoryReader('data').load_data()\n", + "index = GPTSimpleKeywordTableIndex(documents)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "53833655-0296-4bcb-b501-259b043d68b3", + "metadata": {}, + "outputs": [], + "source": [ + "response = index.query(\"What did the author do after his time at YC?\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "62bcca18-b644-4393-ad29-6c5f0424fb22", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "<b>\n", + "\n", + "The author went on to write essays and work on other projects, including a new version of the Arc programming language and Hacker News. He also started painting, but stopped after a few months. In 2015, he started working on a new Lisp programming language, which he finished in 2019. The author then moved to England in 2016 with his family and continued writing essays. In 2019, he finished Bel and wrote a bunch of essays on various topics.\n", + "\n", + "The author also worked on building online stores in 1995 after finishing ANSI Common Lisp. 
He ran the software on servers and let users control it by clicking on links, which was a new concept at the time. In 1996, he co-founded Viaweb with Robert Morris, which was later acquired by Yahoo in 1998. After leaving Yahoo, the author moved back to New York and started painting again. In 2000, he had the idea for a web application that would let people edit code on a server and host the resulting applications, which later became known as \"Reddit\".</b>" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(Markdown(f\"<b>{response}</b>\"))" + ] + }, + { + "cell_type": "markdown", + "id": "d24f9a20-48a6-4131-91b9-b01448c6ecb5", + "metadata": {}, + "source": [ + "#### GPTRAKEKeywordTableIndex" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "c4d3f293-e608-4b90-86aa-9bce666dbcd5", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package stopwords to /home/jerry/nltk_data...\n", + "[nltk_data] Package stopwords is already up-to-date!\n" + ] + } + ], + "source": [ + "from llama_index import GPTRAKEKeywordTableIndex, SimpleDirectoryReader\n", + "from IPython.display import Markdown, display" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "66b1da3b-8231-4da9-8026-4f95481c79df", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "# build keyword index\n", + "documents = SimpleDirectoryReader('data').load_data()\n", + "index = GPTRAKEKeywordTableIndex(documents)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "f13e5543-c6cb-4651-986c-ecde0f4bf789", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> Starting query: What did the author do after his time at YC?\n", + "Extracted keywords: []\n" + ] + } + ], + "source": [ + "response = 
index.query(\"What did the author do after his time at YC?\")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "5ae01ac3-55fa-43a3-9b24-f733072d5f8d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "<b>Empty response</b>" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(Markdown(f\"<b>{response}</b>\"))" + ] + }, + { + "cell_type": "markdown", + "id": "59cee6cf-92df-40d8-8dad-a40b792de96f", + "metadata": {}, + "source": [ + "#### GPTKeywordTableIndex" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "78d59ef6-70b0-47bb-818d-7237a3b7de75", + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index import GPTKeywordTableIndex, SimpleDirectoryReader\n", + "from IPython.display import Markdown, display" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5a3f1c67-6d73-4f37-afcf-9e637002fcff", + "metadata": {}, + "outputs": [], + "source": [ + "# build keyword index\n", + "documents = SimpleDirectoryReader('data').load_data()\n", + "index = GPTKeywordTableIndex.from_documents(documents)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "69d4f686-6825-49cf-a113-d2fdd484de77", + "metadata": {}, + "outputs": [], + "source": [ + "response = index.query(\"What did the author do after his time at Y Combinator?\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "a483514d-4ab5-489d-8b99-7250df491ce3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "<b>\n", + "\n", + "After a few years, the author decided to step away from Y Combinator to focus on other projects, such as painting and writing essays. In 2013, he handed over control of Y Combinator to Sam Altman. The author's mother passed away in 2014, and after taking some time to grieve, he returned to writing essays and working on Lisp. 
He continued working on Lisp until 2019, when he finally completed the project.\n", + "\n", + "In 2015, the author decided to move to England with his family. They originally intended to only stay for a year, but ended up liking it so much that they remained there. The author wrote Bel while living in England. In 2019, he finally finished the project. After completing Bel, the author wrote a number of essays on various topics. He continued writing essays through 2020, but also started thinking about other things he could work on.</b>" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(Markdown(f\"<b>{response}</b>\"))" + ] + }, + { + "cell_type": "markdown", + "id": "112e21ee-587c-4d8b-871e-cb99b94e3778", + "metadata": {}, + "source": [ + "## GPT Keyword Table Query Comparisons\n", + "Compare mode={\"default\", \"simple\", \"rake\"}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3029961a-ec22-42a1-90d6-f5892eb81e34", + "metadata": {}, + "outputs": [], + "source": [ + "# build table with default GPTKeywordTableIndex\n", + "from llama_index import GPTKeywordTableIndex, SimpleDirectoryReader\n", + "from IPython.display import Markdown, display\n", + "\n", + "documents = SimpleDirectoryReader('data').load_data()\n", + "index = GPTKeywordTableIndex.from_documents(documents)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "d75b31da-4788-4295-8642-07ac5c4f11a5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> Starting query: What did the author do after his time at Y Combinator?\n", + "Extracted keywords: ['y combinator', 'combinator']\n", + "> Querying with idx: 235042210695008001: of excluding them, because there were so many s...\n", + "> Querying with idx: 7029274505691774319: it was like living in another country, and sinc...\n", + "> Querying with idx: 
1773317813360405038: browser, and then host the resulting applicatio...\n", + "> Querying with idx: 3866067077574405334: person, and from those we picked 8 to fund. The...\n" + ] }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file + { + "data": { + "text/markdown": [ + "<b>\n", + "\n", + "The author went on to write a book about his experiences at Y Combinator, and then moved to England. He started writing essays again and also began working on a new Lisp programming language. He also wrote an essay about how he chooses what to work on.</b>" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# default\n", + "response = index.query(\"What did the author do after his time at Y Combinator?\", mode=\"default\")\n", + "display(Markdown(f\"<b>{response}</b>\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "07b713f4-adfc-46f7-a795-5b333e33d49d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> Starting query: What did the author do after his time at Y Combinator?\n", + "Extracted keywords: ['combinator']\n", + "> Querying with idx: 235042210695008001: of excluding them, because there were so many s...\n", + "> Querying with idx: 7029274505691774319: it was like living in another country, and sinc...\n", + "> Querying with idx: 1773317813360405038: browser, and then host the resulting applicatio...\n", + "> Querying with idx: 3866067077574405334: person, and from those we picked 8 to fund. The...\n" + ] + }, + { + "data": { + "text/markdown": [ + "<b>\n", + "\n", + "The author went on to write a book about his experiences at Y Combinator, and then moved to England. He started writing essays again and also began working on a new Lisp programming language. 
He also wrote an essay about how he chooses what to work on.</b>" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# simple\n", + "response = index.query(\"What did the author do after his time at Y Combinator?\", mode=\"simple\")\n", + "display(Markdown(f\"<b>{response}</b>\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "d2e19ad9-3190-45e5-a28d-235c28296d70", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> Starting query: What did the author do after his time at Y Combinator?\n", + "Extracted keywords: ['combinator']\n", + "> Querying with idx: 235042210695008001: of excluding them, because there were so many s...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package punkt to /home/jerry/nltk_data...\n", + "[nltk_data] Package punkt is already up-to-date!\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> Querying with idx: 7029274505691774319: it was like living in another country, and sinc...\n", + "> Querying with idx: 1773317813360405038: browser, and then host the resulting applicatio...\n", + "> Querying with idx: 3866067077574405334: person, and from those we picked 8 to fund. The...\n" + ] + }, + { + "data": { + "text/markdown": [ + "<b>\n", + "\n", + "The author went on to write a book about his experiences at Y Combinator, and then moved to England. He started writing essays again and also began working on a new Lisp programming language. 
He also wrote an essay about how he chooses what to work on.</b>" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# rake\n", + "response = index.query(\"What did the author do after his time at Y Combinator?\", mode=\"rake\")\n", + "display(Markdown(f\"<b>{response}</b>\"))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/paul_graham_essay/TestEssay.ipynb b/examples/paul_graham_essay/TestEssay.ipynb index 3e1661492c..064cddeb66 100644 --- a/examples/paul_graham_essay/TestEssay.ipynb +++ b/examples/paul_graham_essay/TestEssay.ipynb @@ -1,660 +1,668 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "f1a9eb90-335c-4214-8bb6-fd1edbe3ccbd", - "metadata": {}, - "outputs": [], - "source": [ - "# My OpenAI Key\n", - "import os\n", - "os.environ['OPENAI_API_KEY'] = \"INSERT OPENAI KEY\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6a712b56", - "metadata": {}, - "outputs": [], - "source": [ - "import logging\n", - "import sys\n", - "\n", - "logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n", - "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))" - ] - }, - { - "cell_type": "markdown", - "id": "be3f7baa-1c0a-430b-981b-83ddca9e71f2", - "metadata": { - "tags": [] - }, - "source": [ - "## Using GPT Tree Index" - ] - }, - { - "cell_type": "markdown", - "id": "0881f151-279e-4910-95c7-f49d3d6a4c69", - "metadata": {}, - "source": [ - "#### [Demo] Default leaf traversal " - ] - }, - { - 
"cell_type": "code", - "execution_count": null, - "id": "8d0b2364-4806-4656-81e7-3f6e4b910b5b", - "metadata": {}, - "outputs": [], - "source": [ - "from llama_index import GPTTreeIndex, SimpleDirectoryReader\n", - "from IPython.display import Markdown, display" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "1c297fd3-3424-41d8-9d0d-25fe6310ab62", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "documents = SimpleDirectoryReader('data').load_data()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "370fd08f-56ff-4c24-b0c4-c93116a6d482", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "index = GPTTreeIndex.from_documents(documents)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0b4fe9b6-5762-4e86-b51e-aac45d3ecdb1", - "metadata": {}, - "outputs": [], - "source": [ - "index.save_to_disk('index.json')" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "5eec265d-211b-4f26-b05b-5b4e7072bc6e", - "metadata": {}, - "outputs": [], - "source": [ - "# try loading\n", - "new_index = GPTTreeIndex.load_from_disk('index.json')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bd14686d-1c53-4637-9340-3745f2121ae2", - "metadata": {}, - "outputs": [], - "source": [ - "# set Logging to DEBUG for more detailed outputs\n", - "response = new_index.query(\"What did the author do growing up?\")" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "b4c87d14-d2d8-4d80-89f6-1e5972973528", - "metadata": {}, - "outputs": [ - { - "data": { - "text/markdown": [ - "<b>The author wrote short stories and tried to program on an IBM 1401.</b>" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "display(Markdown(f\"<b>{response}</b>\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": 
"68c9ebfe-b1b6-4f4e-9278-174346de8c90", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# set Logging to DEBUG for more detailed outputs\n", - "response = new_index.query(\"What did the author do after his time at Y Combinator?\")" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "a5ab5943-7c84-4c2b-ac99-ec4b5fc67e64", - "metadata": {}, - "outputs": [ - { - "data": { - "text/markdown": [ - "<b>The author went on to start his own company.</b>" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "display(Markdown(f\"<b>{response}</b>\"))" - ] - }, - { - "cell_type": "markdown", - "id": "85c62ec3-c3cf-467e-ab0f-88ffb9f990be", - "metadata": {}, - "source": [ - "#### [Demo] Leaf traversal with child_branch_factor=2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "46714db4-9592-4c55-9ca7-916758f2ce68", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# try using branching factor 2\n", - "response = new_index.query(\"What did the author do growing up?\", child_branch_factor=2)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "1ea7f891-b7e1-497a-a965-14201b220404", - "metadata": {}, - "outputs": [ - { - "data": { - "text/markdown": [ - "<b>The author grew up writing simple programs on a TRS-80 computer, as well as trying to program on an IBM 1401. In the early 1990s, the author was a student at the Rhode Island School of Design (RISD) and then the Accademia di Belle Arti in Florence, Italy. They eventually dropped out of RISD and moved to New York City, where they got a job at Interleaf, a software company. While working there, they learned about a new markup language called HTML, which would later become a big part of their life. 
He also wrote a book on Lisp programming.</b>" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "display(Markdown(f\"<b>{response}</b>\"))" - ] - }, - { - "cell_type": "markdown", - "id": "3c572726-bb95-49c3-a762-d966de59ee5f", - "metadata": {}, - "source": [ - "#### [Demo] Build Tree Index during Query-Time" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "255fb052-1ff6-4f27-881f-28d4790e9520", - "metadata": {}, - "outputs": [], - "source": [ - "documents = SimpleDirectoryReader('data').load_data()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "85371256-292c-473e-9485-7de5c1997a59", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> [build_index_from_documents] Total token usage: 0 tokens\n" - ] - } - ], - "source": [ - "index_light = GPTTreeIndex.from_documents(documents, build_tree=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "77b0acb3-5593-4f00-8eef-315a031fedc2", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> Starting query: What did the author do after his time at Y Combinator?\n", - "> Building index from nodes: 5 chunks\n", - "0/57\n", - "10/57\n", - "20/57\n", - "30/57\n", - "40/57\n", - "50/57\n", - "> [query] Total token usage: 18200 tokens\n" - ] - }, - { - "data": { - "text/plain": [ - "'\\nThe author went back to painting.'" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "index_light.query(\"What did the author do after his time at Y Combinator?\", mode=\"summarize\")" - ] - }, - { - "cell_type": "markdown", - "id": "f9773497-9aa6-4a16-884a-cd882e63d012", - "metadata": {}, - "source": [ - "#### [Demo] Build Tree Index with a custom Summary Prompt, directly retrieve answer from root node" - ] - }, - { - "cell_type": 
"code", - "execution_count": 20, - "id": "8ab6d3ad-95e1-477a-a0dc-2ce4763ff2c4", - "metadata": {}, - "outputs": [], - "source": [ - "from llama_index import SummaryPrompt" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "5a91a445-6ab2-457c-850e-79c5386129db", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> Building index from nodes: 5 chunks\n", - "0/57\n", - "10/57\n", - "20/57\n", - "30/57\n", - "40/57\n", - "50/57\n", - "> [build_index_from_documents] Total token usage: 18031 tokens\n" - ] - } - ], - "source": [ - "documents = SimpleDirectoryReader('data').load_data()\n", - "\n", - "query_str = \"What did the author do growing up?\"\n", - "SUMMARY_PROMPT_TMPL = (\n", - " \"Context information is below. \\n\"\n", - " \"---------------------\\n\"\n", - " \"{context_str}\"\n", - " \"\\n---------------------\\n\"\n", - " \"Given the context information and not prior knowledge, \"\n", - " f\"answer the question: {query_str}\\n\"\n", - ")\n", - "SUMMARY_PROMPT = SummaryPrompt(SUMMARY_PROMPT_TMPL)\n", - "index_with_query = GPTTreeIndex.from_documents(documents, summary_template=SUMMARY_PROMPT)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "985dad0c-1ede-4576-a4c9-c077b815edd8", - "metadata": {}, - "outputs": [], - "source": [ - "index_with_query.save_to_disk(\"index_with_query.json\")" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "de04fce5-88f9-41b7-87d9-dcde8f84a872", - "metadata": {}, - "outputs": [], - "source": [ - "index_with_query = GPTTreeIndex.load_from_disk(\"index_with_query.json\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9223ffa8-d49d-4de3-821a-701b2a0352d4", - "metadata": {}, - "outputs": [], - "source": [ - "# directly retrieve response from root nodes instead of traversing tree\n", - "response = index_with_query.query(query_str, mode=\"retrieve\")" - ] - }, - { - "cell_type": "code", - "execution_count": 
10, - "id": "fdca6970-2f3f-4741-ae98-555db8d3d9a0", - "metadata": {}, - "outputs": [ - { - "data": { - "text/markdown": [ - "<b>\n", - "The author was homeschooled and then attended a prestigious art school. The author grew up writing essays and thinking about other things he could work on.</b>" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "display(Markdown(f\"<b>{response}</b>\"))" - ] - }, - { - "cell_type": "markdown", - "id": "a6457769-dfaf-4241-ab32-dcf901dde902", - "metadata": {}, - "source": [ - "## Using GPT Keyword Table Index" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "78d59ef6-70b0-47bb-818d-7237a3b7de75", - "metadata": {}, - "outputs": [], - "source": [ - "from llama_index import GPTKeywordTableIndex, SimpleDirectoryReader\n", - "from IPython.display import Markdown, display" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "5a3f1c67-6d73-4f37-afcf-9e637002fcff", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> Processing chunk 0 of 6: \t\t\n", - "\n", - "What I Worked On\n", - "\n", - "February 2021\n", - "\n", - "Before col...\n", - "> Keywords: ['painting', 'computers', 'programming', 'lisp', 'ai', 'college', 'graduate school', 'graduate', 'school', 'writing']\n", - "> Processing chunk 1 of 6: of excluding them, because there were so many s...\n", - "> Keywords: ['school', 'students', 'painting', 'florence', 'risd', 'accademia', 'still lives', 'still', 'lives', 'color', 'new york', 'new', 'york', 'yorkville', 'idelle weber', 'idelle', 'weber', 'harvard', 'world wide web', 'world', 'wide', 'web', 'y combinator', 'combinator', 'software', 'lisp']\n", - "> Processing chunk 2 of 6: an alarming prospect, because neither of us kne...\n", - "> Keywords: ['windows', 'unix', 'lisp', 'web app', 'web', 'app', 'browser', 'store builder', 'store', 'builder', 
'ecommerce', 'startup', 'painting']\n", - "> Processing chunk 3 of 6: browser, and then host the resulting applicatio...\n", - "> Keywords: ['y combinator', 'combinator', 'investment', 'summer founders program', 'summer', 'founders', 'program', 'microsoft', 'goldman sachs', 'goldman', 'sachs']\n", - "> Processing chunk 4 of 6: person, and from those we picked 8 to fund. The...\n", - "> Keywords: ['y combinator', 'combinator', 'yc', 'lisp', 'bel', 'essays', 'writing', 'software', 'programming', 'arc']\n", - "> Processing chunk 5 of 6: it was like living in another country, and sinc...\n", - "> Keywords: ['software', 'technology', 'y combinator', 'combinator', 'essays', 'online publishing', 'online', 'publishing', 'venture capital', 'venture', 'capital', 'startups', 'space aliens', 'space', 'aliens', 'lisp']\n" - ] - } - ], - "source": [ - "# build keyword index\n", - "documents = SimpleDirectoryReader('data').load_data()\n", - "index = GPTKeywordTableIndex.from_documents(documents)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "7ec97988-0190-4df7-b19a-e3130122298f", - "metadata": {}, - "outputs": [], - "source": [ - "# save index\n", - "index.save_to_disk('index_table.json')" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "d94d0fe0-43c1-41cd-901b-0d748d30f1c7", - "metadata": {}, - "outputs": [], - "source": [ - "# reload index\n", - "index = GPTKeywordTableIndex.load_from_disk('index_table.json')" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "69d4f686-6825-49cf-a113-d2fdd484de77", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> Starting query: What did the author do after his time at Y Combinator?\n", - "Extracted keywords: ['y combinator', 'combinator']\n", - "> Querying with idx: 7143669651211954504: of excluding them, because there were so many s...\n", - "> Querying with idx: 4978118451876167434: browser, and then host the resulting 
applicatio...\n", - "> Querying with idx: 7378313280237489139: person, and from those we picked 8 to fund. The...\n", - "> Querying with idx: 2670584622494666310: it was like living in another country, and sinc...\n" - ] - } - ], - "source": [ - "# set Logging to DEBUG for more detailed outputs\n", - "response = index.query(\"What did the author do after his time at Y Combinator?\")" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "a483514d-4ab5-489d-8b99-7250df491ce3", - "metadata": {}, - "outputs": [ - { - "data": { - "text/markdown": [ - "<b>\n", - "\n", - "After a few years, the author decided to step away from Y Combinator to focus on other projects, such as painting and writing essays. In 2013, he handed over control of Y Combinator to Sam Altman. The author's mother passed away in 2014, and after taking some time to grieve, he returned to writing essays and working on Lisp. He continued working on Lisp until 2019, when he finally completed the project.\n", - "\n", - "In 2015, the author decided to move to England with his family. They originally intended to only stay for a year, but ended up liking it so much that they remained there. The author wrote Bel while living in England. In 2019, he finally finished the project. After completing Bel, the author wrote a number of essays on various topics. 
He continued writing essays through 2020, but also started thinking about other things he could work on.</b>" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "display(Markdown(f\"<b>{response}</b>\"))" - ] - }, - { - "cell_type": "markdown", - "id": "aae1bea9-b534-430a-a52b-1f4414957ac9", - "metadata": {}, - "source": [ - "## Using GPT List Index" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1aa8c8c1-7fce-4737-9141-d14fd37a779c", - "metadata": {}, - "outputs": [], - "source": [ - "from llama_index import GPTListIndex, SimpleDirectoryReader\n", - "from IPython.display import Markdown, display" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "191caa65-a77f-4d8c-b095-4aed61300ea5", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> Adding chunk: \t\t\n", - "\n", - "What I Worked On\n", - "\n", - "February 2021\n", - "\n", - "Before col...\n", - "> Adding chunk: only up to age 25 and already there are such co...\n", - "> Adding chunk: clear that it was even possible. 
To find out, w...\n", - "> Adding chunk: a name for the kind of company Viaweb was, an \"...\n", - "> Adding chunk: get their initial set of customers almost entir...\n", - "> Adding chunk: had smart people and built impressive technolog...\n", - "> [build_index_from_documents] Total token usage: 0 tokens\n" - ] - } - ], - "source": [ - "# build linked list index\n", - "documents = SimpleDirectoryReader('data').load_data()\n", - "index = GPTListIndex.from_documents(documents)\n", - "# save index\n", - "index.save_to_disk('index_list.json')" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "af2d049d-518d-4ec4-b84f-1fab8aece04f", - "metadata": {}, - "outputs": [], - "source": [ - "# load index from disk\n", - "index = GPTListIndex.load_from_disk('index_list.json')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1b3d4bd8-7540-4c6f-8616-ab2d8c6ae2b2", - "metadata": {}, - "outputs": [], - "source": [ - "# set Logging to DEBUG for more detailed outputs\n", - "response = index.query(\"What did the author do after his time at Y Combinator?\")" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "5101b979-175f-490e-9b32-27689fe4b789", - "metadata": {}, - "outputs": [ - { - "data": { - "text/markdown": [ - "<b>\n", - "\n", - "After his time at Y Combinator, the author moved back to Providence to continue at RISD. However, he found that art school was not what he expected it to be and dropped out. He then moved to New York City and started writing a book on Lisp. When that didn't work out, he started a company to put art galleries online. However, that also failed. He then had the idea to start a company to build online stores, which became a success.\n", - "\n", - "The author then left his position at Yahoo to pursue painting full-time. However, he found it difficult to get back into the painting mindset and eventually returned to New York City. 
It was there that he had the idea to create a web application that would allow users to create and host their own web applications.</b>" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "display(Markdown(f\"<b>{response}</b>\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "65cfce56-853e-431b-888e-946771c3b07e", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.16" - } + "cells": [ + { + "cell_type": "markdown", + "id": "9f9e2065", + "metadata": {}, + "source": [ + "# Paul Graham Essay Test" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f1a9eb90-335c-4214-8bb6-fd1edbe3ccbd", + "metadata": {}, + "outputs": [], + "source": [ + "# My OpenAI Key\n", + "import os\n", + "os.environ['OPENAI_API_KEY'] = \"INSERT OPENAI KEY\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a712b56", + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "import sys\n", + "\n", + "logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n", + "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))" + ] + }, + { + "cell_type": "markdown", + "id": "be3f7baa-1c0a-430b-981b-83ddca9e71f2", + "metadata": { + "tags": [] + }, + "source": [ + "## Using GPT Tree Index" + ] + }, + { + "cell_type": "markdown", + "id": "0881f151-279e-4910-95c7-f49d3d6a4c69", + "metadata": {}, + "source": [ + "#### [Demo] Default leaf traversal " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"8d0b2364-4806-4656-81e7-3f6e4b910b5b", + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index import GPTTreeIndex, SimpleDirectoryReader\n", + "from IPython.display import Markdown, display" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "1c297fd3-3424-41d8-9d0d-25fe6310ab62", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "documents = SimpleDirectoryReader('data').load_data()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "370fd08f-56ff-4c24-b0c4-c93116a6d482", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "index = GPTTreeIndex.from_documents(documents)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b4fe9b6-5762-4e86-b51e-aac45d3ecdb1", + "metadata": {}, + "outputs": [], + "source": [ + "index.save_to_disk('index.json')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "5eec265d-211b-4f26-b05b-5b4e7072bc6e", + "metadata": {}, + "outputs": [], + "source": [ + "# try loading\n", + "new_index = GPTTreeIndex.load_from_disk('index.json')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bd14686d-1c53-4637-9340-3745f2121ae2", + "metadata": {}, + "outputs": [], + "source": [ + "# set Logging to DEBUG for more detailed outputs\n", + "response = new_index.query(\"What did the author do growing up?\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "b4c87d14-d2d8-4d80-89f6-1e5972973528", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "<b>The author wrote short stories and tried to program on an IBM 1401.</b>" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(Markdown(f\"<b>{response}</b>\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "68c9ebfe-b1b6-4f4e-9278-174346de8c90", + "metadata": { + "tags": [] + }, + "outputs": [], + 
"source": [ + "# set Logging to DEBUG for more detailed outputs\n", + "response = new_index.query(\"What did the author do after his time at Y Combinator?\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "a5ab5943-7c84-4c2b-ac99-ec4b5fc67e64", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "<b>The author went on to start his own company.</b>" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(Markdown(f\"<b>{response}</b>\"))" + ] + }, + { + "cell_type": "markdown", + "id": "85c62ec3-c3cf-467e-ab0f-88ffb9f990be", + "metadata": {}, + "source": [ + "#### [Demo] Leaf traversal with child_branch_factor=2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "46714db4-9592-4c55-9ca7-916758f2ce68", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# try using branching factor 2\n", + "response = new_index.query(\"What did the author do growing up?\", child_branch_factor=2)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "1ea7f891-b7e1-497a-a965-14201b220404", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "<b>The author grew up writing simple programs on a TRS-80 computer, as well as trying to program on an IBM 1401. In the early 1990s, the author was a student at the Rhode Island School of Design (RISD) and then the Accademia di Belle Arti in Florence, Italy. They eventually dropped out of RISD and moved to New York City, where they got a job at Interleaf, a software company. While working there, they learned about a new markup language called HTML, which would later become a big part of their life. 
He also wrote a book on Lisp programming.</b>" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(Markdown(f\"<b>{response}</b>\"))" + ] + }, + { + "cell_type": "markdown", + "id": "3c572726-bb95-49c3-a762-d966de59ee5f", + "metadata": {}, + "source": [ + "#### [Demo] Build Tree Index during Query-Time" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "255fb052-1ff6-4f27-881f-28d4790e9520", + "metadata": {}, + "outputs": [], + "source": [ + "documents = SimpleDirectoryReader('data').load_data()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "85371256-292c-473e-9485-7de5c1997a59", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> [build_index_from_documents] Total token usage: 0 tokens\n" + ] + } + ], + "source": [ + "index_light = GPTTreeIndex.from_documents(documents, build_tree=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "77b0acb3-5593-4f00-8eef-315a031fedc2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> Starting query: What did the author do after his time at Y Combinator?\n", + "> Building index from nodes: 5 chunks\n", + "0/57\n", + "10/57\n", + "20/57\n", + "30/57\n", + "40/57\n", + "50/57\n", + "> [query] Total token usage: 18200 tokens\n" + ] }, - "nbformat": 4, - "nbformat_minor": 5 + { + "data": { + "text/plain": [ + "'\\nThe author went back to painting.'" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "index_light.query(\"What did the author do after his time at Y Combinator?\", mode=\"summarize\")" + ] + }, + { + "cell_type": "markdown", + "id": "f9773497-9aa6-4a16-884a-cd882e63d012", + "metadata": {}, + "source": [ + "#### [Demo] Build Tree Index with a custom Summary Prompt, directly retrieve answer from 
root node" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "8ab6d3ad-95e1-477a-a0dc-2ce4763ff2c4", + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index import SummaryPrompt" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "5a91a445-6ab2-457c-850e-79c5386129db", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> Building index from nodes: 5 chunks\n", + "0/57\n", + "10/57\n", + "20/57\n", + "30/57\n", + "40/57\n", + "50/57\n", + "> [build_index_from_documents] Total token usage: 18031 tokens\n" + ] + } + ], + "source": [ + "documents = SimpleDirectoryReader('data').load_data()\n", + "\n", + "query_str = \"What did the author do growing up?\"\n", + "SUMMARY_PROMPT_TMPL = (\n", + " \"Context information is below. \\n\"\n", + " \"---------------------\\n\"\n", + " \"{context_str}\"\n", + " \"\\n---------------------\\n\"\n", + " \"Given the context information and not prior knowledge, \"\n", + " f\"answer the question: {query_str}\\n\"\n", + ")\n", + "SUMMARY_PROMPT = SummaryPrompt(SUMMARY_PROMPT_TMPL)\n", + "index_with_query = GPTTreeIndex.from_documents(documents, summary_template=SUMMARY_PROMPT)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "985dad0c-1ede-4576-a4c9-c077b815edd8", + "metadata": {}, + "outputs": [], + "source": [ + "index_with_query.save_to_disk(\"index_with_query.json\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "de04fce5-88f9-41b7-87d9-dcde8f84a872", + "metadata": {}, + "outputs": [], + "source": [ + "index_with_query = GPTTreeIndex.load_from_disk(\"index_with_query.json\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9223ffa8-d49d-4de3-821a-701b2a0352d4", + "metadata": {}, + "outputs": [], + "source": [ + "# directly retrieve response from root nodes instead of traversing tree\n", + "response = index_with_query.query(query_str, mode=\"retrieve\")" + ] + }, + { + 
"cell_type": "code", + "execution_count": 10, + "id": "fdca6970-2f3f-4741-ae98-555db8d3d9a0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "<b>\n", + "The author was homeschooled and then attended a prestigious art school. The author grew up writing essays and thinking about other things he could work on.</b>" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(Markdown(f\"<b>{response}</b>\"))" + ] + }, + { + "cell_type": "markdown", + "id": "a6457769-dfaf-4241-ab32-dcf901dde902", + "metadata": {}, + "source": [ + "## Using GPT Keyword Table Index" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "78d59ef6-70b0-47bb-818d-7237a3b7de75", + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index import GPTKeywordTableIndex, SimpleDirectoryReader\n", + "from IPython.display import Markdown, display" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "5a3f1c67-6d73-4f37-afcf-9e637002fcff", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> Processing chunk 0 of 6: \t\t\n", + "\n", + "What I Worked On\n", + "\n", + "February 2021\n", + "\n", + "Before col...\n", + "> Keywords: ['painting', 'computers', 'programming', 'lisp', 'ai', 'college', 'graduate school', 'graduate', 'school', 'writing']\n", + "> Processing chunk 1 of 6: of excluding them, because there were so many s...\n", + "> Keywords: ['school', 'students', 'painting', 'florence', 'risd', 'accademia', 'still lives', 'still', 'lives', 'color', 'new york', 'new', 'york', 'yorkville', 'idelle weber', 'idelle', 'weber', 'harvard', 'world wide web', 'world', 'wide', 'web', 'y combinator', 'combinator', 'software', 'lisp']\n", + "> Processing chunk 2 of 6: an alarming prospect, because neither of us kne...\n", + "> Keywords: ['windows', 'unix', 'lisp', 'web app', 'web', 'app', 'browser', 
'store builder', 'store', 'builder', 'ecommerce', 'startup', 'painting']\n", + "> Processing chunk 3 of 6: browser, and then host the resulting applicatio...\n", + "> Keywords: ['y combinator', 'combinator', 'investment', 'summer founders program', 'summer', 'founders', 'program', 'microsoft', 'goldman sachs', 'goldman', 'sachs']\n", + "> Processing chunk 4 of 6: person, and from those we picked 8 to fund. The...\n", + "> Keywords: ['y combinator', 'combinator', 'yc', 'lisp', 'bel', 'essays', 'writing', 'software', 'programming', 'arc']\n", + "> Processing chunk 5 of 6: it was like living in another country, and sinc...\n", + "> Keywords: ['software', 'technology', 'y combinator', 'combinator', 'essays', 'online publishing', 'online', 'publishing', 'venture capital', 'venture', 'capital', 'startups', 'space aliens', 'space', 'aliens', 'lisp']\n" + ] + } + ], + "source": [ + "# build keyword index\n", + "documents = SimpleDirectoryReader('data').load_data()\n", + "index = GPTKeywordTableIndex.from_documents(documents)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "7ec97988-0190-4df7-b19a-e3130122298f", + "metadata": {}, + "outputs": [], + "source": [ + "# save index\n", + "index.save_to_disk('index_table.json')" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "d94d0fe0-43c1-41cd-901b-0d748d30f1c7", + "metadata": {}, + "outputs": [], + "source": [ + "# reload index\n", + "index = GPTKeywordTableIndex.load_from_disk('index_table.json')" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "69d4f686-6825-49cf-a113-d2fdd484de77", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> Starting query: What did the author do after his time at Y Combinator?\n", + "Extracted keywords: ['y combinator', 'combinator']\n", + "> Querying with idx: 7143669651211954504: of excluding them, because there were so many s...\n", + "> Querying with idx: 4978118451876167434: browser, 
and then host the resulting applicatio...\n", + "> Querying with idx: 7378313280237489139: person, and from those we picked 8 to fund. The...\n", + "> Querying with idx: 2670584622494666310: it was like living in another country, and sinc...\n" + ] + } + ], + "source": [ + "# set Logging to DEBUG for more detailed outputs\n", + "response = index.query(\"What did the author do after his time at Y Combinator?\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "a483514d-4ab5-489d-8b99-7250df491ce3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "<b>\n", + "\n", + "After a few years, the author decided to step away from Y Combinator to focus on other projects, such as painting and writing essays. In 2013, he handed over control of Y Combinator to Sam Altman. The author's mother passed away in 2014, and after taking some time to grieve, he returned to writing essays and working on Lisp. He continued working on Lisp until 2019, when he finally completed the project.\n", + "\n", + "In 2015, the author decided to move to England with his family. They originally intended to only stay for a year, but ended up liking it so much that they remained there. The author wrote Bel while living in England. In 2019, he finally finished the project. After completing Bel, the author wrote a number of essays on various topics. 
He continued writing essays through 2020, but also started thinking about other things he could work on.</b>" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(Markdown(f\"<b>{response}</b>\"))" + ] + }, + { + "cell_type": "markdown", + "id": "aae1bea9-b534-430a-a52b-1f4414957ac9", + "metadata": {}, + "source": [ + "## Using GPT List Index" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1aa8c8c1-7fce-4737-9141-d14fd37a779c", + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index import GPTListIndex, SimpleDirectoryReader\n", + "from IPython.display import Markdown, display" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "191caa65-a77f-4d8c-b095-4aed61300ea5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> Adding chunk: \t\t\n", + "\n", + "What I Worked On\n", + "\n", + "February 2021\n", + "\n", + "Before col...\n", + "> Adding chunk: only up to age 25 and already there are such co...\n", + "> Adding chunk: clear that it was even possible. 
To find out, w...\n", + "> Adding chunk: a name for the kind of company Viaweb was, an \"...\n", + "> Adding chunk: get their initial set of customers almost entir...\n", + "> Adding chunk: had smart people and built impressive technolog...\n", + "> [build_index_from_documents] Total token usage: 0 tokens\n" + ] + } + ], + "source": [ + "# build linked list index\n", + "documents = SimpleDirectoryReader('data').load_data()\n", + "index = GPTListIndex.from_documents(documents)\n", + "# save index\n", + "index.save_to_disk('index_list.json')" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "af2d049d-518d-4ec4-b84f-1fab8aece04f", + "metadata": {}, + "outputs": [], + "source": [ + "# load index from disk\n", + "index = GPTListIndex.load_from_disk('index_list.json')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1b3d4bd8-7540-4c6f-8616-ab2d8c6ae2b2", + "metadata": {}, + "outputs": [], + "source": [ + "# set Logging to DEBUG for more detailed outputs\n", + "response = index.query(\"What did the author do after his time at Y Combinator?\")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "5101b979-175f-490e-9b32-27689fe4b789", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "<b>\n", + "\n", + "After his time at Y Combinator, the author moved back to Providence to continue at RISD. However, he found that art school was not what he expected it to be and dropped out. He then moved to New York City and started writing a book on Lisp. When that didn't work out, he started a company to put art galleries online. However, that also failed. He then had the idea to start a company to build online stores, which became a success.\n", + "\n", + "The author then left his position at Yahoo to pursue painting full-time. However, he found it difficult to get back into the painting mindset and eventually returned to New York City. 
It was there that he had the idea to create a web application that would allow users to create and host their own web applications.</b>" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(Markdown(f\"<b>{response}</b>\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "65cfce56-853e-431b-888e-946771c3b07e", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/examples/playground/PlaygroundDemo.ipynb b/examples/playground/PlaygroundDemo.ipynb index b73d81c932..52c25f1b64 100644 --- a/examples/playground/PlaygroundDemo.ipynb +++ b/examples/playground/PlaygroundDemo.ipynb @@ -1,413 +1,421 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "839c4a87", - "metadata": {}, - "outputs": [], - "source": [ - "# My OpenAI Key\n", - "import os\n", - "os.environ['OPENAI_API_KEY'] = \"INSERT OPENAI KEY\"" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "d726e871", - "metadata": {}, - "outputs": [], - "source": [ - "# Hide INFO logs regarding token usage, etc\n", - "import logging\n", - "logger = logging.getLogger()\n", - "logger.setLevel(logging.CRITICAL)" - ] - }, - { - "cell_type": "markdown", - "id": "40cf0773", - "metadata": {}, - "source": [ - "## Setup\n", - "\n", - "### Generate some example Documents" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fa34cd83", - "metadata": {}, - "outputs": [], - "source": [ - "from llama_index import 
download_loader\n", - "from llama_index.indices.vector_store import GPTSimpleVectorIndex\n", - "from llama_index.indices.tree.base import GPTTreeIndex\n", - "\n", - "WikipediaReader = download_loader(\"WikipediaReader\")\n", - "\n", - "loader = WikipediaReader()\n", - "documents = loader.load_data(pages=['Berlin'])" - ] - }, - { - "cell_type": "markdown", - "id": "0c32392b", - "metadata": {}, - "source": [ - "### Create a list of any sort of indices (custom LLMs, custom embeddings, etc)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f59e6c18", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:root:> [build_index_from_documents] Total LLM token usage: 0 tokens\n", - "INFO:root:> [build_index_from_documents] Total embedding token usage: 18344 tokens\n", - "INFO:root:> Building index from nodes: 5 chunks\n" - ] - } - ], - "source": [ - "indices = [GPTSimpleVectorIndex.from_documents(documents), GPTTreeIndex.from_documents(documents)]" - ] - }, - { - "cell_type": "markdown", - "id": "827ada33", - "metadata": {}, - "source": [ - "## Using the Playground\n", - "\n", - "\n", - "### Initialize with indices" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "a04e4535", - "metadata": {}, - "outputs": [], - "source": [ - "from llama_index.playground import Playground\n", - "\n", - "playground = Playground(indices=indices)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "5f6999fc", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:openai:error_code=None error_message='Rate limit reached for default-global-with-image-limits in organization org-ehTdCqs0FpsxuTTwsJIlNSdZ on requests per min. Limit: 60.000000 / min. Current: 110.000000 / min. Contact support@openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. 
Visit https://platform.openai.com/account/billing to add a payment method.' error_param=None error_type=requests message='OpenAI API error received' stream_error=False\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[1mQuery:\u001b[0m\n", - "What is the population of Berlin?\n", - "\n", - "Trying 10 combinations...\n", - "\n", - "\n", - "\u001b[1mGPTSimpleVectorIndex\u001b[0m, mode = default\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:openai:error_code=None error_message='Rate limit reached for default-global-with-image-limits in organization org-ehTdCqs0FpsxuTTwsJIlNSdZ on requests per min. Limit: 60.000000 / min. Current: 90.000000 / min. Contact support@openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method.' error_param=None error_type=requests message='OpenAI API error received' stream_error=False\n", - "INFO:openai:error_code=None error_message='Rate limit reached for default-global-with-image-limits in organization org-ehTdCqs0FpsxuTTwsJIlNSdZ on requests per min. Limit: 60.000000 / min. Current: 90.000000 / min. Contact support@openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method.' error_param=None error_type=requests message='OpenAI API error received' stream_error=False\n", - "INFO:openai:error_code=None error_message='Rate limit reached for default-global-with-image-limits in organization org-ehTdCqs0FpsxuTTwsJIlNSdZ on requests per min. Limit: 60.000000 / min. Current: 80.000000 / min. Contact support@openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method.' 
error_param=None error_type=requests message='OpenAI API error received' stream_error=False\n", - "INFO:root:> [query] Total LLM token usage: 3545 tokens\n", - "INFO:root:> [query] Total embedding token usage: 7 tokens\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[36;1m\u001b[1;3m\n", - "The population of Berlin in 1949 was approximately 2.2 million inhabitants. After the fall of the Berlin Wall in 1989, the population of Berlin increased to approximately 3.7 million inhabitants.\u001b[0m\n", - "\n", - "\u001b[1mGPTSimpleVectorIndex\u001b[0m, mode = embedding\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:root:> [query] Total LLM token usage: 3545 tokens\n", - "INFO:root:> [query] Total embedding token usage: 7 tokens\n", - "INFO:root:> Starting query: What is the population of Berlin?\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[36;1m\u001b[1;3m\n", - "The population of Berlin in 1949 was approximately 2.2 million inhabitants. 
After the fall of the Berlin Wall in 1989, the population of Berlin increased to approximately 3.7 million inhabitants.\u001b[0m\n", - "\n", - "\u001b[1mGPTTreeIndex\u001b[0m, mode = default\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:root:>[Level 0] Selected node: [1]/[1]\n", - "INFO:root:>[Level 1] Selected node: [3]/[3]\n", - "INFO:root:> [query] Total LLM token usage: 5168 tokens\n", - "INFO:root:> [query] Total embedding token usage: 0 tokens\n", - "INFO:root:> Starting query: What is the population of Berlin?\n", - "INFO:root:> Building index from nodes: 6 chunks\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[33;1m\u001b[1;3mThe population of Berlin is approximately 3.7 million people.\u001b[0m\n", - "\n", - "\u001b[1mGPTTreeIndex\u001b[0m, mode = summarize\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:root:> [query] Total LLM token usage: 21617 tokens\n", - "INFO:root:> [query] Total embedding token usage: 0 tokens\n", - "INFO:root:> Starting query: What is the population of Berlin?\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[33;1m\u001b[1;3m\n", - "The population of Berlin is approximately 3.7 million people.\u001b[0m\n", - "\n", - "\u001b[1mGPTTreeIndex\u001b[0m, mode = embedding\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:root:> [query] Total LLM token usage: 368 tokens\n", - "INFO:root:> [query] Total embedding token usage: 4598 tokens\n", - "INFO:root:> Starting query: What is the population of Berlin?\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[33;1m\u001b[1;3mApproximately 3.7 million people.\u001b[0m\n", - "\n", - "\u001b[1mGPTTreeIndex\u001b[0m, mode = retrieve\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:root:> [query] Total LLM token usage: 1439 tokens\n", - "INFO:root:> [query] 
Total embedding token usage: 0 tokens\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[33;1m\u001b[1;3m\n", - "The population of Berlin is 3.75 million registered inhabitants.\u001b[0m\n", - "\n", - "\n", - "Ran 6 combinations in total.\n" - ] - }, - { - "data": { - "text/html": [ - "<div>\n", - "<style scoped>\n", - " .dataframe tbody tr th:only-of-type {\n", - " vertical-align: middle;\n", - " }\n", - "\n", - " .dataframe tbody tr th {\n", - " vertical-align: top;\n", - " }\n", - "\n", - " .dataframe thead th {\n", - " text-align: right;\n", - " }\n", - "</style>\n", - "<table border=\"1\" class=\"dataframe\">\n", - " <thead>\n", - " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th>Index</th>\n", - " <th>Mode</th>\n", - " <th>Output</th>\n", - " <th>Duration</th>\n", - " <th>LLM Tokens</th>\n", - " <th>Embedding Tokens</th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th>0</th>\n", - " <td>GPTSimpleVectorIndex</td>\n", - " <td>default</td>\n", - " <td>\\nThe population of Berlin in 1949 was approxi...</td>\n", - " <td>52.319133</td>\n", - " <td>3545</td>\n", - " <td>7</td>\n", - " </tr>\n", - " <tr>\n", - " <th>1</th>\n", - " <td>GPTSimpleVectorIndex</td>\n", - " <td>embedding</td>\n", - " <td>\\nThe population of Berlin in 1949 was approxi...</td>\n", - " <td>8.192025</td>\n", - " <td>3545</td>\n", - " <td>7</td>\n", - " </tr>\n", - " <tr>\n", - " <th>2</th>\n", - " <td>GPTTreeIndex</td>\n", - " <td>default</td>\n", - " <td>The population of Berlin is approximately 3.7 ...</td>\n", - " <td>12.542335</td>\n", - " <td>5168</td>\n", - " <td>0</td>\n", - " </tr>\n", - " <tr>\n", - " <th>3</th>\n", - " <td>GPTTreeIndex</td>\n", - " <td>summarize</td>\n", - " <td>\\nThe population of Berlin is approximately 3....</td>\n", - " <td>18.665586</td>\n", - " <td>21617</td>\n", - " <td>0</td>\n", - " </tr>\n", - " <tr>\n", - " <th>4</th>\n", - " <td>GPTTreeIndex</td>\n", - " <td>embedding</td>\n", - " 
<td>Approximately 3.7 million people.</td>\n", - " <td>3.573458</td>\n", - " <td>368</td>\n", - " <td>4598</td>\n", - " </tr>\n", - " <tr>\n", - " <th>5</th>\n", - " <td>GPTTreeIndex</td>\n", - " <td>retrieve</td>\n", - " <td>\\nThe population of Berlin is 3.75 million reg...</td>\n", - " <td>2.269598</td>\n", - " <td>1439</td>\n", - " <td>0</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "</div>" - ], - "text/plain": [ - " Index Mode \\\n", - "0 GPTSimpleVectorIndex default \n", - "1 GPTSimpleVectorIndex embedding \n", - "2 GPTTreeIndex default \n", - "3 GPTTreeIndex summarize \n", - "4 GPTTreeIndex embedding \n", - "5 GPTTreeIndex retrieve \n", - "\n", - " Output Duration LLM Tokens \\\n", - "0 \\nThe population of Berlin in 1949 was approxi... 52.319133 3545 \n", - "1 \\nThe population of Berlin in 1949 was approxi... 8.192025 3545 \n", - "2 The population of Berlin is approximately 3.7 ... 12.542335 5168 \n", - "3 \\nThe population of Berlin is approximately 3.... 18.665586 21617 \n", - "4 Approximately 3.7 million people. 3.573458 368 \n", - "5 \\nThe population of Berlin is 3.75 million reg... 
2.269598 1439 \n", - "\n", - " Embedding Tokens \n", - "0 7 \n", - "1 7 \n", - "2 0 \n", - "3 0 \n", - "4 4598 \n", - "5 0 " - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "playground.compare(\"What is the population of Berlin?\")" - ] - }, - { - "cell_type": "markdown", - "id": "8829a829", - "metadata": {}, - "source": [ - "### Initialize with Documents" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dfbc8ade", - "metadata": {}, - "outputs": [], - "source": [ - "# Uses documents in a preset list of indices\n", - "playground = Playground.from_docs(documents=documents)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.6" - } + "cells": [ + { + "cell_type": "markdown", + "id": "c283d4df", + "metadata": {}, + "source": [ + "# Playground Demo" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "839c4a87", + "metadata": {}, + "outputs": [], + "source": [ + "# My OpenAI Key\n", + "import os\n", + "os.environ['OPENAI_API_KEY'] = \"INSERT OPENAI KEY\"" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "d726e871", + "metadata": {}, + "outputs": [], + "source": [ + "# Hide INFO logs regarding token usage, etc\n", + "import logging\n", + "logger = logging.getLogger()\n", + "logger.setLevel(logging.CRITICAL)" + ] + }, + { + "cell_type": "markdown", + "id": "40cf0773", + "metadata": {}, + "source": [ + "## Setup\n", + "\n", + "### Generate some example Documents" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa34cd83", + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index import 
download_loader\n", + "from llama_index.indices.vector_store import GPTSimpleVectorIndex\n", + "from llama_index.indices.tree.base import GPTTreeIndex\n", + "\n", + "WikipediaReader = download_loader(\"WikipediaReader\")\n", + "\n", + "loader = WikipediaReader()\n", + "documents = loader.load_data(pages=['Berlin'])" + ] + }, + { + "cell_type": "markdown", + "id": "0c32392b", + "metadata": {}, + "source": [ + "### Create a list of any sort of indices (custom LLMs, custom embeddings, etc)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f59e6c18", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:root:> [build_index_from_documents] Total LLM token usage: 0 tokens\n", + "INFO:root:> [build_index_from_documents] Total embedding token usage: 18344 tokens\n", + "INFO:root:> Building index from nodes: 5 chunks\n" + ] + } + ], + "source": [ + "indices = [GPTSimpleVectorIndex.from_documents(documents), GPTTreeIndex.from_documents(documents)]" + ] + }, + { + "cell_type": "markdown", + "id": "827ada33", + "metadata": {}, + "source": [ + "## Using the Playground\n", + "\n", + "\n", + "### Initialize with indices" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "a04e4535", + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index.playground import Playground\n", + "\n", + "playground = Playground(indices=indices)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "5f6999fc", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:openai:error_code=None error_message='Rate limit reached for default-global-with-image-limits in organization org-ehTdCqs0FpsxuTTwsJIlNSdZ on requests per min. Limit: 60.000000 / min. Current: 110.000000 / min. Contact support@openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. 
Visit https://platform.openai.com/account/billing to add a payment method.' error_param=None error_type=requests message='OpenAI API error received' stream_error=False\n" + ] }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1mQuery:\u001b[0m\n", + "What is the population of Berlin?\n", + "\n", + "Trying 10 combinations...\n", + "\n", + "\n", + "\u001b[1mGPTSimpleVectorIndex\u001b[0m, mode = default\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:openai:error_code=None error_message='Rate limit reached for default-global-with-image-limits in organization org-ehTdCqs0FpsxuTTwsJIlNSdZ on requests per min. Limit: 60.000000 / min. Current: 90.000000 / min. Contact support@openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method.' error_param=None error_type=requests message='OpenAI API error received' stream_error=False\n", + "INFO:openai:error_code=None error_message='Rate limit reached for default-global-with-image-limits in organization org-ehTdCqs0FpsxuTTwsJIlNSdZ on requests per min. Limit: 60.000000 / min. Current: 90.000000 / min. Contact support@openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method.' error_param=None error_type=requests message='OpenAI API error received' stream_error=False\n", + "INFO:openai:error_code=None error_message='Rate limit reached for default-global-with-image-limits in organization org-ehTdCqs0FpsxuTTwsJIlNSdZ on requests per min. Limit: 60.000000 / min. Current: 80.000000 / min. Contact support@openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. 
Visit https://platform.openai.com/account/billing to add a payment method.' error_param=None error_type=requests message='OpenAI API error received' stream_error=False\n", + "INFO:root:> [query] Total LLM token usage: 3545 tokens\n", + "INFO:root:> [query] Total embedding token usage: 7 tokens\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[36;1m\u001b[1;3m\n", + "The population of Berlin in 1949 was approximately 2.2 million inhabitants. After the fall of the Berlin Wall in 1989, the population of Berlin increased to approximately 3.7 million inhabitants.\u001b[0m\n", + "\n", + "\u001b[1mGPTSimpleVectorIndex\u001b[0m, mode = embedding\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:root:> [query] Total LLM token usage: 3545 tokens\n", + "INFO:root:> [query] Total embedding token usage: 7 tokens\n", + "INFO:root:> Starting query: What is the population of Berlin?\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[36;1m\u001b[1;3m\n", + "The population of Berlin in 1949 was approximately 2.2 million inhabitants. 
After the fall of the Berlin Wall in 1989, the population of Berlin increased to approximately 3.7 million inhabitants.\u001b[0m\n", + "\n", + "\u001b[1mGPTTreeIndex\u001b[0m, mode = default\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:root:>[Level 0] Selected node: [1]/[1]\n", + "INFO:root:>[Level 1] Selected node: [3]/[3]\n", + "INFO:root:> [query] Total LLM token usage: 5168 tokens\n", + "INFO:root:> [query] Total embedding token usage: 0 tokens\n", + "INFO:root:> Starting query: What is the population of Berlin?\n", + "INFO:root:> Building index from nodes: 6 chunks\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33;1m\u001b[1;3mThe population of Berlin is approximately 3.7 million people.\u001b[0m\n", + "\n", + "\u001b[1mGPTTreeIndex\u001b[0m, mode = summarize\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:root:> [query] Total LLM token usage: 21617 tokens\n", + "INFO:root:> [query] Total embedding token usage: 0 tokens\n", + "INFO:root:> Starting query: What is the population of Berlin?\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33;1m\u001b[1;3m\n", + "The population of Berlin is approximately 3.7 million people.\u001b[0m\n", + "\n", + "\u001b[1mGPTTreeIndex\u001b[0m, mode = embedding\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:root:> [query] Total LLM token usage: 368 tokens\n", + "INFO:root:> [query] Total embedding token usage: 4598 tokens\n", + "INFO:root:> Starting query: What is the population of Berlin?\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33;1m\u001b[1;3mApproximately 3.7 million people.\u001b[0m\n", + "\n", + "\u001b[1mGPTTreeIndex\u001b[0m, mode = retrieve\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:root:> [query] Total LLM token usage: 1439 tokens\n", + "INFO:root:> [query] 
Total embedding token usage: 0 tokens\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33;1m\u001b[1;3m\n", + "The population of Berlin is 3.75 million registered inhabitants.\u001b[0m\n", + "\n", + "\n", + "Ran 6 combinations in total.\n" + ] + }, + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Index</th>\n", + " <th>Mode</th>\n", + " <th>Output</th>\n", + " <th>Duration</th>\n", + " <th>LLM Tokens</th>\n", + " <th>Embedding Tokens</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>GPTSimpleVectorIndex</td>\n", + " <td>default</td>\n", + " <td>\\nThe population of Berlin in 1949 was approxi...</td>\n", + " <td>52.319133</td>\n", + " <td>3545</td>\n", + " <td>7</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>GPTSimpleVectorIndex</td>\n", + " <td>embedding</td>\n", + " <td>\\nThe population of Berlin in 1949 was approxi...</td>\n", + " <td>8.192025</td>\n", + " <td>3545</td>\n", + " <td>7</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>GPTTreeIndex</td>\n", + " <td>default</td>\n", + " <td>The population of Berlin is approximately 3.7 ...</td>\n", + " <td>12.542335</td>\n", + " <td>5168</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>GPTTreeIndex</td>\n", + " <td>summarize</td>\n", + " <td>\\nThe population of Berlin is approximately 3....</td>\n", + " <td>18.665586</td>\n", + " <td>21617</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>GPTTreeIndex</td>\n", + " <td>embedding</td>\n", + " 
<td>Approximately 3.7 million people.</td>\n", + " <td>3.573458</td>\n", + " <td>368</td>\n", + " <td>4598</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>GPTTreeIndex</td>\n", + " <td>retrieve</td>\n", + " <td>\\nThe population of Berlin is 3.75 million reg...</td>\n", + " <td>2.269598</td>\n", + " <td>1439</td>\n", + " <td>0</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Index Mode \\\n", + "0 GPTSimpleVectorIndex default \n", + "1 GPTSimpleVectorIndex embedding \n", + "2 GPTTreeIndex default \n", + "3 GPTTreeIndex summarize \n", + "4 GPTTreeIndex embedding \n", + "5 GPTTreeIndex retrieve \n", + "\n", + " Output Duration LLM Tokens \\\n", + "0 \\nThe population of Berlin in 1949 was approxi... 52.319133 3545 \n", + "1 \\nThe population of Berlin in 1949 was approxi... 8.192025 3545 \n", + "2 The population of Berlin is approximately 3.7 ... 12.542335 5168 \n", + "3 \\nThe population of Berlin is approximately 3.... 18.665586 21617 \n", + "4 Approximately 3.7 million people. 3.573458 368 \n", + "5 \\nThe population of Berlin is 3.75 million reg... 
2.269598 1439 \n", + "\n", + " Embedding Tokens \n", + "0 7 \n", + "1 7 \n", + "2 0 \n", + "3 0 \n", + "4 4598 \n", + "5 0 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "playground.compare(\"What is the population of Berlin?\")" + ] + }, + { + "cell_type": "markdown", + "id": "8829a829", + "metadata": {}, + "source": [ + "### Initialize with Documents" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dfbc8ade", + "metadata": {}, + "outputs": [], + "source": [ + "# Uses documents in a preset list of indices\n", + "playground = Playground.from_docs(documents=documents)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/query_transformations/HyDEQueryTransformDemo.ipynb b/examples/query_transformations/HyDEQueryTransformDemo.ipynb index 945d504591..a069e823db 100644 --- a/examples/query_transformations/HyDEQueryTransformDemo.ipynb +++ b/examples/query_transformations/HyDEQueryTransformDemo.ipynb @@ -389,7 +389,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.10" + "version": "3.11.0" } }, "nbformat": 4, diff --git a/examples/struct_indices/PandasIndexDemo.ipynb b/examples/struct_indices/PandasIndexDemo.ipynb index 9395b2403b..32465aec10 100644 --- a/examples/struct_indices/PandasIndexDemo.ipynb +++ b/examples/struct_indices/PandasIndexDemo.ipynb @@ -335,7 +335,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.10" + "version": "3.11.0" } }, "nbformat": 4, diff --git 
a/examples/struct_indices/SQLIndexDemo-Context.ipynb b/examples/struct_indices/SQLIndexDemo-Context.ipynb index 66498629f0..8eb00a25d5 100644 --- a/examples/struct_indices/SQLIndexDemo-Context.ipynb +++ b/examples/struct_indices/SQLIndexDemo-Context.ipynb @@ -1,319 +1,319 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "e45f9b60-cd6b-4c15-958f-1feca5438128", - "metadata": {}, - "source": [ - "# SQL Index Demo\n", - "\n", - "Demo where table contains context." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "fbd7317b", - "metadata": {}, - "outputs": [], - "source": [ - "import logging\n", - "import sys\n", - "\n", - "logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n", - "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "107396a9-4aa7-49b3-9f0f-a755726c19ba", - "metadata": {}, - "outputs": [], - "source": [ - "from llama_index import GPTSQLStructStoreIndex, SQLDatabase, SimpleDirectoryReader, WikipediaReader, Document\n", - "from llama_index.indices.struct_store import SQLContextContainerBuilder\n", - "from IPython.display import Markdown, display" - ] - }, - { - "cell_type": "markdown", - "id": "77ac8d94-cd61-4869-a32b-0b2e7d18b83f", - "metadata": {}, - "source": [ - "### Load Wikipedia Data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "93301fcf-a52b-430c-98a3-5360e6c8fc4a", - "metadata": {}, - "outputs": [], - "source": [ - "# install wikipedia python package\n", - "!pip install wikipedia" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ba3f7e5e-cdc4-4529-bba9-db45d8457dba", - "metadata": {}, - "outputs": [], - "source": [ - "wiki_docs = WikipediaReader().load_data(pages=['Toronto', 'Berlin', 'Tokyo'])" - ] - }, - { - "cell_type": "markdown", - "id": "461438c8-302d-45c5-8e69-16ad604686d1", - "metadata": {}, - "source": [ - "### Create Database Schema" - ] - }, - { - "cell_type": 
"code", - "execution_count": null, - "id": "a370b266-66f5-4624-bbf9-2ad57f0511f8", - "metadata": {}, - "outputs": [], - "source": [ - "from sqlalchemy import create_engine, MetaData, Table, Column, String, Integer, select, column" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ea24f794-f10b-42e6-922d-9258b7167405", - "metadata": {}, - "outputs": [], - "source": [ - "engine = create_engine(\"sqlite:///:memory:\")\n", - "metadata_obj = MetaData(bind=engine)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b4154b29-7e23-4c26-a507-370a66186ae7", - "metadata": {}, - "outputs": [], - "source": [ - "# create city SQL table\n", - "table_name = \"city_stats\"\n", - "city_stats_table = Table(\n", - " table_name,\n", - " metadata_obj,\n", - " Column(\"city_name\", String(16), primary_key=True),\n", - " Column(\"population\", Integer),\n", - " Column(\"country\", String(16), nullable=False),\n", - ")\n", - "metadata_obj.create_all()" - ] - }, - { - "cell_type": "markdown", - "id": "1c09089a-6bcd-48db-8120-a84c8da3f82e", - "metadata": { - "tags": [] - }, - "source": [ - "### Build Index with Context" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "611319e5-d3c2-4286-a84f-ed2459896c58", - "metadata": {}, - "outputs": [], - "source": [ - "from llama_index import GPTSQLStructStoreIndex, SQLDatabase\n", - "from llama_index.indices.struct_store import SQLContextContainerBuilder" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3fc2dfab-90ea-4f01-9e28-d21fdc5f0758", - "metadata": {}, - "outputs": [], - "source": [ - "sql_database = SQLDatabase(engine, include_tables=[\"city_stats\"])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "89f6f1d1-a022-43d7-b135-a79ec9407956", - "metadata": {}, - "outputs": [], - "source": [ - "sql_database.table_info" - ] - }, - { - "cell_type": "markdown", - "id": "331ff0ce-9131-4680-a5f2-3f41c73e018e", - "metadata": {}, - "source": [ - 
"We either set the context manually, or have GPT extract the context for us" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a743f365-21c6-4eae-a2f4-fc72d4199daa", - "metadata": {}, - "outputs": [], - "source": [ - "# manually set context text\n", - "city_stats_text = (\n", - " \"This table gives information regarding the population and country of a given city.\\n\"\n", - " \"The user will query with codewords, where 'foo' corresponds to population and 'bar'\"\n", - " \"corresponds to city.\"\n", - ")\n", - "table_context_dict={\"city_stats\": city_stats_text}\n", - "context_builder = SQLContextContainerBuilder(sql_database, context_dict=table_context_dict)\n", - "context_container = context_builder.build_context_container()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b15ffc74-4a44-40b4-87b1-44f952ebfd58", - "metadata": {}, - "outputs": [], - "source": [ - "index = GPTSQLStructStoreIndex.from_documents(\n", - " wiki_docs, \n", - " sql_database=sql_database, \n", - " table_name=\"city_stats\",\n", - " sql_context_container=context_container\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5cdcc666-4b51-4ed5-92ca-e98e8a01fdd0", - "metadata": {}, - "outputs": [], - "source": [ - "# extract context from a raw Document using GPT\n", - "city_stats_text = (\n", - " \"This table gives information regarding the population and country of a given city.\\n\"\n", - ")\n", - "context_documents_dict = {\"city_stats\": [Document(city_stats_text)]}\n", - "context_builder = SQLContextContainerBuilder.from_documents(\n", - " context_documents_dict, \n", - " sql_database\n", - ")\n", - "context_container = context_builder.build_context_container()\n", - "\n", - "index = GPTSQLStructStoreIndex.from_documents(\n", - " wiki_docs, \n", - " sql_database=sql_database, \n", - " table_name=\"city_stats\",\n", - " sql_context_container=context_container,\n", - ")" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "id": "b315b8ff-7dd7-4e7d-ac47-8c5a0c3e7ae9", - "metadata": {}, - "outputs": [], - "source": [ - "# view current table\n", - "stmt = select(\n", - " [column(\"city_name\"), column(\"population\"), column(\"country\")]\n", - ").select_from(city_stats_table)\n", - "\n", - "with engine.connect() as connection:\n", - " results = connection.execute(stmt).fetchall()\n", - " print(results)\n" - ] - }, - { - "cell_type": "markdown", - "id": "051a171f-8c97-40ed-ae17-4e3fa3785487", - "metadata": {}, - "source": [ - "### Query Index" - ] - }, - { - "cell_type": "markdown", - "id": "91139712-f232-47e1-9683-cbbd49cd331b", - "metadata": {}, - "source": [ - "Here we show a natural language query, which is translated to a SQL query under the hood." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a76d1816-4f70-4914-80af-7b968c614592", - "metadata": {}, - "outputs": [], - "source": [ - "# set Logging to DEBUG for more detailed outputs\n", - "response = index.query(\"Which city has the highest population?\", mode=\"default\")" - ] - }, - { - "cell_type": "markdown", - "id": "5dc2f7bf-6f6c-42ba-8f42-47afea6606ad", - "metadata": {}, - "source": [ - "We can also use codewords during the NL query! 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d71045c0-7a96-4e86-b38c-c378b7759aa4", - "metadata": {}, - "outputs": [], - "source": [ - "# set Logging to DEBUG for more detailed outputs\n", - "response = index.query(\"Which bar has the highest foo?\", mode=\"default\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e713d73e-73ed-4748-8673-f476899fac8e", - "metadata": {}, - "outputs": [], - "source": [ - "display(Markdown(f\"<b>{response}</b>\"))" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.16" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file + "cells": [ + { + "cell_type": "markdown", + "id": "e45f9b60-cd6b-4c15-958f-1feca5438128", + "metadata": {}, + "source": [ + "# SQL Index Demo - Table Context\n", + "\n", + "Demo where table contains context." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "fbd7317b", + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "import sys\n", + "\n", + "logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n", + "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "107396a9-4aa7-49b3-9f0f-a755726c19ba", + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index import GPTSQLStructStoreIndex, SQLDatabase, SimpleDirectoryReader, WikipediaReader, Document\n", + "from llama_index.indices.struct_store import SQLContextContainerBuilder\n", + "from IPython.display import Markdown, display" + ] + }, + { + "cell_type": "markdown", + "id": "77ac8d94-cd61-4869-a32b-0b2e7d18b83f", + "metadata": {}, + "source": [ + "### Load Wikipedia Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "93301fcf-a52b-430c-98a3-5360e6c8fc4a", + "metadata": {}, + "outputs": [], + "source": [ + "# install wikipedia python package\n", + "!pip install wikipedia" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ba3f7e5e-cdc4-4529-bba9-db45d8457dba", + "metadata": {}, + "outputs": [], + "source": [ + "wiki_docs = WikipediaReader().load_data(pages=['Toronto', 'Berlin', 'Tokyo'])" + ] + }, + { + "cell_type": "markdown", + "id": "461438c8-302d-45c5-8e69-16ad604686d1", + "metadata": {}, + "source": [ + "### Create Database Schema" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a370b266-66f5-4624-bbf9-2ad57f0511f8", + "metadata": {}, + "outputs": [], + "source": [ + "from sqlalchemy import create_engine, MetaData, Table, Column, String, Integer, select, column" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ea24f794-f10b-42e6-922d-9258b7167405", + "metadata": {}, + "outputs": [], + "source": [ + "engine = create_engine(\"sqlite:///:memory:\")\n", + "metadata_obj = 
MetaData(bind=engine)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b4154b29-7e23-4c26-a507-370a66186ae7", + "metadata": {}, + "outputs": [], + "source": [ + "# create city SQL table\n", + "table_name = \"city_stats\"\n", + "city_stats_table = Table(\n", + " table_name,\n", + " metadata_obj,\n", + " Column(\"city_name\", String(16), primary_key=True),\n", + " Column(\"population\", Integer),\n", + " Column(\"country\", String(16), nullable=False),\n", + ")\n", + "metadata_obj.create_all()" + ] + }, + { + "cell_type": "markdown", + "id": "1c09089a-6bcd-48db-8120-a84c8da3f82e", + "metadata": { + "tags": [] + }, + "source": [ + "### Build Index with Context" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "611319e5-d3c2-4286-a84f-ed2459896c58", + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index import GPTSQLStructStoreIndex, SQLDatabase\n", + "from llama_index.indices.struct_store import SQLContextContainerBuilder" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3fc2dfab-90ea-4f01-9e28-d21fdc5f0758", + "metadata": {}, + "outputs": [], + "source": [ + "sql_database = SQLDatabase(engine, include_tables=[\"city_stats\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "89f6f1d1-a022-43d7-b135-a79ec9407956", + "metadata": {}, + "outputs": [], + "source": [ + "sql_database.table_info" + ] + }, + { + "cell_type": "markdown", + "id": "331ff0ce-9131-4680-a5f2-3f41c73e018e", + "metadata": {}, + "source": [ + "We either set the context manually, or have GPT extract the context for us" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a743f365-21c6-4eae-a2f4-fc72d4199daa", + "metadata": {}, + "outputs": [], + "source": [ + "# manually set context text\n", + "city_stats_text = (\n", + " \"This table gives information regarding the population and country of a given city.\\n\"\n", + " \"The user will query with codewords, where 'foo' corresponds 
to population and 'bar'\"\n", + " \"corresponds to city.\"\n", + ")\n", + "table_context_dict={\"city_stats\": city_stats_text}\n", + "context_builder = SQLContextContainerBuilder(sql_database, context_dict=table_context_dict)\n", + "context_container = context_builder.build_context_container()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b15ffc74-4a44-40b4-87b1-44f952ebfd58", + "metadata": {}, + "outputs": [], + "source": [ + "index = GPTSQLStructStoreIndex.from_documents(\n", + " wiki_docs, \n", + " sql_database=sql_database, \n", + " table_name=\"city_stats\",\n", + " sql_context_container=context_container\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5cdcc666-4b51-4ed5-92ca-e98e8a01fdd0", + "metadata": {}, + "outputs": [], + "source": [ + "# extract context from a raw Document using GPT\n", + "city_stats_text = (\n", + " \"This table gives information regarding the population and country of a given city.\\n\"\n", + ")\n", + "context_documents_dict = {\"city_stats\": [Document(city_stats_text)]}\n", + "context_builder = SQLContextContainerBuilder.from_documents(\n", + " context_documents_dict, \n", + " sql_database\n", + ")\n", + "context_container = context_builder.build_context_container()\n", + "\n", + "index = GPTSQLStructStoreIndex.from_documents(\n", + " wiki_docs, \n", + " sql_database=sql_database, \n", + " table_name=\"city_stats\",\n", + " sql_context_container=context_container,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b315b8ff-7dd7-4e7d-ac47-8c5a0c3e7ae9", + "metadata": {}, + "outputs": [], + "source": [ + "# view current table\n", + "stmt = select(\n", + " [column(\"city_name\"), column(\"population\"), column(\"country\")]\n", + ").select_from(city_stats_table)\n", + "\n", + "with engine.connect() as connection:\n", + " results = connection.execute(stmt).fetchall()\n", + " print(results)\n" + ] + }, + { + "cell_type": "markdown", + "id": 
"051a171f-8c97-40ed-ae17-4e3fa3785487", + "metadata": {}, + "source": [ + "### Query Index" + ] + }, + { + "cell_type": "markdown", + "id": "91139712-f232-47e1-9683-cbbd49cd331b", + "metadata": {}, + "source": [ + "Here we show a natural language query, which is translated to a SQL query under the hood." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a76d1816-4f70-4914-80af-7b968c614592", + "metadata": {}, + "outputs": [], + "source": [ + "# set Logging to DEBUG for more detailed outputs\n", + "response = index.query(\"Which city has the highest population?\", mode=\"default\")" + ] + }, + { + "cell_type": "markdown", + "id": "5dc2f7bf-6f6c-42ba-8f42-47afea6606ad", + "metadata": {}, + "source": [ + "We can also use codewords during the NL query! " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d71045c0-7a96-4e86-b38c-c378b7759aa4", + "metadata": {}, + "outputs": [], + "source": [ + "# set Logging to DEBUG for more detailed outputs\n", + "response = index.query(\"Which bar has the highest foo?\", mode=\"default\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e713d73e-73ed-4748-8673-f476899fac8e", + "metadata": {}, + "outputs": [], + "source": [ + "display(Markdown(f\"<b>{response}</b>\"))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/struct_indices/SQLIndexDemo-ManyTables.ipynb b/examples/struct_indices/SQLIndexDemo-ManyTables.ipynb index ba2dd0ce17..c2a0c5067a 100644 --- a/examples/struct_indices/SQLIndexDemo-ManyTables.ipynb +++ 
b/examples/struct_indices/SQLIndexDemo-ManyTables.ipynb @@ -1,400 +1,400 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "e45f9b60-cd6b-4c15-958f-1feca5438128", - "metadata": {}, - "source": [ - "# SQL Index Demo\n", - "\n", - "Demo where table contains context." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "fbd7317b", - "metadata": {}, - "outputs": [], - "source": [ - "import logging\n", - "import sys\n", - "\n", - "logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n", - "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "107396a9-4aa7-49b3-9f0f-a755726c19ba", - "metadata": {}, - "outputs": [], - "source": [ - "from gpt_index import GPTSQLStructStoreIndex, SQLDatabase, SimpleDirectoryReader, WikipediaReader, Document\n", - "from gpt_index.indices.struct_store import SQLContextContainerBuilder\n", - "from IPython.display import Markdown, display" - ] - }, - { - "cell_type": "markdown", - "id": "461438c8-302d-45c5-8e69-16ad604686d1", - "metadata": {}, - "source": [ - "### Create Database Schema + Test Data\n", - "\n", - "Here we introduce a toy scenario where there are 100 tables (too big to fit into the prompt)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "a370b266-66f5-4624-bbf9-2ad57f0511f8", - "metadata": {}, - "outputs": [], - "source": [ - "from sqlalchemy import create_engine, MetaData, Table, Column, String, Integer, select, column" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "ea24f794-f10b-42e6-922d-9258b7167405", - "metadata": {}, - "outputs": [], - "source": [ - "engine = create_engine(\"sqlite:///:memory:\")\n", - "metadata_obj = MetaData(bind=engine)" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "id": "b4154b29-7e23-4c26-a507-370a66186ae7", - "metadata": {}, - "outputs": [], - "source": [ - "# create city SQL table\n", - "table_name = \"city_stats\"\n", - 
"city_stats_table = Table(\n", - " table_name,\n", - " metadata_obj,\n", - " Column(\"city_name\", String(16), primary_key=True),\n", - " Column(\"population\", Integer),\n", - " Column(\"country\", String(16), nullable=False),\n", - ")\n", - "all_table_names = [\"city_stats\"]\n", - "# create a ton of dummy tables\n", - "n = 100\n", - "for i in range(n):\n", - " tmp_table_name = f\"tmp_table_{i}\"\n", - " tmp_table = Table(\n", - " tmp_table_name,\n", - " metadata_obj,\n", - " Column(f\"tmp_field_{i}_1\", String(16), primary_key=True),\n", - " Column(f\"tmp_field_{i}_2\", Integer),\n", - " Column(f\"tmp_field_{i}_3\", String(16), nullable=False),\n", - " )\n", - " all_table_names.append(f\"tmp_table_{i}\")\n", - "\n", - "metadata_obj.create_all()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4768bcb4-c40e-4d5d-8d70-7cb3228b50ab", - "metadata": {}, - "outputs": [], - "source": [ - "# print tables\n", - "metadata_obj.tables.keys()" - ] - }, - { - "cell_type": "markdown", - "id": "4c0eb518-5da3-4215-8280-0776d07806a0", - "metadata": {}, - "source": [ - "We introduce some test data into the `city_stats` table" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "id": "d15192b6-99f9-4f72-b637-82e885ea057f", - "metadata": {}, - "outputs": [], - "source": [ - "from sqlalchemy import insert\n", - "rows = [\n", - " {\"city_name\": \"Toronto\", \"population\": 2930000, \"country\": \"Canada\"},\n", - " {\"city_name\": \"Tokyo\", \"population\": 13960000, \"country\": \"Japan\"},\n", - " {\"city_name\": \"Chicago\", \"population\": 2679000, \"country\": \"United States\"},\n", - " {\"city_name\": \"Seoul\", \"population\": 9776000, \"country\": \"South Korea\"},\n", - "]\n", - "for row in rows:\n", - " stmt = insert(city_stats_table).values(**row)\n", - " with engine.connect() as connection:\n", - " cursor = connection.execute(stmt)" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "id": 
"bfc2e4a4-e11d-4d8f-bf1f-7f777a1dc6e2", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[('Toronto', 2930000, 'Canada'), ('Tokyo', 13960000, 'Japan'), ('Chicago', 2679000, 'United States'), ('Seoul', 9776000, 'South Korea')]\n" - ] - } - ], - "source": [ - "with engine.connect() as connection:\n", - " cursor = connection.exec_driver_sql(\"SELECT * FROM city_stats\")\n", - " print(cursor.fetchall())" - ] - }, - { - "cell_type": "markdown", - "id": "1c09089a-6bcd-48db-8120-a84c8da3f82e", - "metadata": { - "tags": [] - }, - "source": [ - "### Using GPT Index to Store Table Schema Context" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "id": "611319e5-d3c2-4286-a84f-ed2459896c58", - "metadata": {}, - "outputs": [], - "source": [ - "from gpt_index import GPTSQLStructStoreIndex, SQLDatabase, GPTSimpleVectorIndex\n", - "from gpt_index.indices.struct_store import SQLContextContainerBuilder" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "id": "3fc2dfab-90ea-4f01-9e28-d21fdc5f0758", - "metadata": {}, - "outputs": [], - "source": [ - "sql_database = SQLDatabase(engine)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "89f6f1d1-a022-43d7-b135-a79ec9407956", - "metadata": {}, - "outputs": [], - "source": [ - "sql_database.table_info" - ] - }, - { - "cell_type": "markdown", - "id": "331ff0ce-9131-4680-a5f2-3f41c73e018e", - "metadata": {}, - "source": [ - "We dump the table schema information into a vector index. The vector index is stored within the context builder for future use." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1e86d548-a3f4-436a-a754-5247871ebe55", - "metadata": { - "scrolled": true, - "tags": [] - }, - "outputs": [], - "source": [ - "# build a vector index from the table schema information\n", - "context_builder = SQLContextContainerBuilder(sql_database)\n", - "table_schema_index = context_builder.derive_index_from_context(\n", - " GPTSimpleVectorIndex,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a743f365-21c6-4eae-a2f4-fc72d4199daa", - "metadata": {}, - "outputs": [], - "source": [ - "# NOTE: not ingesting any unstructured documents atm\n", - "index = GPTSQLStructStoreIndex.from_documents(\n", - " [],\n", - " sql_database=sql_database, \n", - " table_name=\"city_stats\",\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "051a171f-8c97-40ed-ae17-4e3fa3785487", - "metadata": {}, - "source": [ - "### Query Index" - ] - }, - { - "cell_type": "markdown", - "id": "91139712-f232-47e1-9683-cbbd49cd331b", - "metadata": {}, - "source": [ - "Here we show a natural language query. \n", - "1. We first query for the right table schema. Note that we build a context container during query-time.\n", - "2. Given this context container, we execute the NL query against the db." 
- ] - }, - { - "cell_type": "code", - "execution_count": 44, - "id": "eabededd-3c17-45b7-aabc-06a2457bc3cb", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "INFO:root:> [query] Total LLM token usage: 135 tokens\n", - "> [query] Total LLM token usage: 135 tokens\n", - "INFO:root:> [query] Total embedding token usage: 23 tokens\n", - "> [query] Total embedding token usage: 23 tokens\n", - "\n", - "Table 'city_stats':\n", - "city_name (VARCHAR(16)), population (INTEGER), country (VARCHAR(16))\n" - ] - } - ], - "source": [ - "query_str = \"Which city has the highest population?\"\n", - "context_builder.query_index_for_context(table_schema_index, query_str, store_context_str=True)\n", - "context_container = context_builder.build_context_container()" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "id": "a80ee856-6ac3-4b37-b390-be583024bed4", - "metadata": {}, - "outputs": [ - { - "data": { - "text/markdown": [ - "<b>\n", - "Table 'city_stats':\n", - "city_name (VARCHAR(16)), population (INTEGER), country (VARCHAR(16))</b>" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "display(Markdown(f\"<b>{context_container.context_str}</b>\"))" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "id": "64671ddc-9768-40c2-8898-ab7c0cf10917", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "INFO:root:> Table desc str: \n", - "Table 'city_stats':\n", - "city_name (VARCHAR(16)), population (INTEGER), country (VARCHAR(16))\n", - "> Table desc str: \n", - "Table 'city_stats':\n", - "city_name (VARCHAR(16)), population (INTEGER), country (VARCHAR(16))\n", - "INFO:root:> [query] Total LLM token usage: 134 tokens\n", - "> [query] Total LLM token usage: 134 tokens\n", - "INFO:root:> [query] Total embedding token usage: 0 tokens\n", - "> [query] Total embedding token 
usage: 0 tokens\n" - ] - } - ], - "source": [ - "response = index.query(query_str, sql_context_container=context_container)" - ] - }, - { - "cell_type": "markdown", - "id": "5dc2f7bf-6f6c-42ba-8f42-47afea6606ad", - "metadata": {}, - "source": [ - "We can also use codewords during the NL query! " - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "id": "25c11645-56bd-433a-85f4-420413f8970d", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "\"[('Tokyo',)]\"" - ] - }, - "execution_count": 41, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "str(response)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3f72abc6-54d7-4f85-abf8-32978d94f558", - "metadata": {}, - "outputs": [], - "source": [ - "response.extra_info" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "gpt_retrieve_venv", - "language": "python", - "name": "gpt_retrieve_venv" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.16" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file + "cells": [ + { + "cell_type": "markdown", + "id": "e45f9b60-cd6b-4c15-958f-1feca5438128", + "metadata": {}, + "source": [ + "# SQL Index Demo - Many Tables\n", + "\n", + "Demo where the database contains many tables, too many to fit all of their schemas into a single prompt."
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "fbd7317b", + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "import sys\n", + "\n", + "logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n", + "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "107396a9-4aa7-49b3-9f0f-a755726c19ba", + "metadata": {}, + "outputs": [], + "source": [ + "from gpt_index import GPTSQLStructStoreIndex, SQLDatabase, SimpleDirectoryReader, WikipediaReader, Document\n", + "from gpt_index.indices.struct_store import SQLContextContainerBuilder\n", + "from IPython.display import Markdown, display" + ] + }, + { + "cell_type": "markdown", + "id": "461438c8-302d-45c5-8e69-16ad604686d1", + "metadata": {}, + "source": [ + "### Create Database Schema + Test Data\n", + "\n", + "Here we introduce a toy scenario where there are 100 tables (too big to fit into the prompt)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "a370b266-66f5-4624-bbf9-2ad57f0511f8", + "metadata": {}, + "outputs": [], + "source": [ + "from sqlalchemy import create_engine, MetaData, Table, Column, String, Integer, select, column" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "ea24f794-f10b-42e6-922d-9258b7167405", + "metadata": {}, + "outputs": [], + "source": [ + "engine = create_engine(\"sqlite:///:memory:\")\n", + "metadata_obj = MetaData(bind=engine)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "b4154b29-7e23-4c26-a507-370a66186ae7", + "metadata": {}, + "outputs": [], + "source": [ + "# create city SQL table\n", + "table_name = \"city_stats\"\n", + "city_stats_table = Table(\n", + " table_name,\n", + " metadata_obj,\n", + " Column(\"city_name\", String(16), primary_key=True),\n", + " Column(\"population\", Integer),\n", + " Column(\"country\", String(16), nullable=False),\n", + ")\n", + "all_table_names = 
[\"city_stats\"]\n", + "# create a ton of dummy tables\n", + "n = 100\n", + "for i in range(n):\n", + " tmp_table_name = f\"tmp_table_{i}\"\n", + " tmp_table = Table(\n", + " tmp_table_name,\n", + " metadata_obj,\n", + " Column(f\"tmp_field_{i}_1\", String(16), primary_key=True),\n", + " Column(f\"tmp_field_{i}_2\", Integer),\n", + " Column(f\"tmp_field_{i}_3\", String(16), nullable=False),\n", + " )\n", + " all_table_names.append(f\"tmp_table_{i}\")\n", + "\n", + "metadata_obj.create_all()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4768bcb4-c40e-4d5d-8d70-7cb3228b50ab", + "metadata": {}, + "outputs": [], + "source": [ + "# print tables\n", + "metadata_obj.tables.keys()" + ] + }, + { + "cell_type": "markdown", + "id": "4c0eb518-5da3-4215-8280-0776d07806a0", + "metadata": {}, + "source": [ + "We introduce some test data into the `city_stats` table" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "d15192b6-99f9-4f72-b637-82e885ea057f", + "metadata": {}, + "outputs": [], + "source": [ + "from sqlalchemy import insert\n", + "rows = [\n", + " {\"city_name\": \"Toronto\", \"population\": 2930000, \"country\": \"Canada\"},\n", + " {\"city_name\": \"Tokyo\", \"population\": 13960000, \"country\": \"Japan\"},\n", + " {\"city_name\": \"Chicago\", \"population\": 2679000, \"country\": \"United States\"},\n", + " {\"city_name\": \"Seoul\", \"population\": 9776000, \"country\": \"South Korea\"},\n", + "]\n", + "for row in rows:\n", + " stmt = insert(city_stats_table).values(**row)\n", + " with engine.connect() as connection:\n", + " cursor = connection.execute(stmt)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "bfc2e4a4-e11d-4d8f-bf1f-7f777a1dc6e2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[('Toronto', 2930000, 'Canada'), ('Tokyo', 13960000, 'Japan'), ('Chicago', 2679000, 'United States'), ('Seoul', 9776000, 'South Korea')]\n" + ] + } + ], + 
"source": [ + "with engine.connect() as connection:\n", + " cursor = connection.exec_driver_sql(\"SELECT * FROM city_stats\")\n", + " print(cursor.fetchall())" + ] + }, + { + "cell_type": "markdown", + "id": "1c09089a-6bcd-48db-8120-a84c8da3f82e", + "metadata": { + "tags": [] + }, + "source": [ + "### Using GPT Index to Store Table Schema Context" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "611319e5-d3c2-4286-a84f-ed2459896c58", + "metadata": {}, + "outputs": [], + "source": [ + "from gpt_index import GPTSQLStructStoreIndex, SQLDatabase, GPTSimpleVectorIndex\n", + "from gpt_index.indices.struct_store import SQLContextContainerBuilder" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "3fc2dfab-90ea-4f01-9e28-d21fdc5f0758", + "metadata": {}, + "outputs": [], + "source": [ + "sql_database = SQLDatabase(engine)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "89f6f1d1-a022-43d7-b135-a79ec9407956", + "metadata": {}, + "outputs": [], + "source": [ + "sql_database.table_info" + ] + }, + { + "cell_type": "markdown", + "id": "331ff0ce-9131-4680-a5f2-3f41c73e018e", + "metadata": {}, + "source": [ + "We dump the table schema information into a vector index. The vector index is stored within the context builder for future use." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e86d548-a3f4-436a-a754-5247871ebe55", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "# build a vector index from the table schema information\n", + "context_builder = SQLContextContainerBuilder(sql_database)\n", + "table_schema_index = context_builder.derive_index_from_context(\n", + " GPTSimpleVectorIndex,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a743f365-21c6-4eae-a2f4-fc72d4199daa", + "metadata": {}, + "outputs": [], + "source": [ + "# NOTE: not ingesting any unstructured documents atm\n", + "index = GPTSQLStructStoreIndex.from_documents(\n", + " [],\n", + " sql_database=sql_database, \n", + " table_name=\"city_stats\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "051a171f-8c97-40ed-ae17-4e3fa3785487", + "metadata": {}, + "source": [ + "### Query Index" + ] + }, + { + "cell_type": "markdown", + "id": "91139712-f232-47e1-9683-cbbd49cd331b", + "metadata": {}, + "source": [ + "Here we show a natural language query. \n", + "1. We first query for the right table schema. Note that we build a context container during query-time.\n", + "2. Given this context container, we execute the NL query against the db." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "eabededd-3c17-45b7-aabc-06a2457bc3cb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:root:> [query] Total LLM token usage: 135 tokens\n", + "> [query] Total LLM token usage: 135 tokens\n", + "INFO:root:> [query] Total embedding token usage: 23 tokens\n", + "> [query] Total embedding token usage: 23 tokens\n", + "\n", + "Table 'city_stats':\n", + "city_name (VARCHAR(16)), population (INTEGER), country (VARCHAR(16))\n" + ] + } + ], + "source": [ + "query_str = \"Which city has the highest population?\"\n", + "context_builder.query_index_for_context(table_schema_index, query_str, store_context_str=True)\n", + "context_container = context_builder.build_context_container()" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "a80ee856-6ac3-4b37-b390-be583024bed4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "<b>\n", + "Table 'city_stats':\n", + "city_name (VARCHAR(16)), population (INTEGER), country (VARCHAR(16))</b>" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(Markdown(f\"<b>{context_container.context_str}</b>\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "64671ddc-9768-40c2-8898-ab7c0cf10917", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:root:> Table desc str: \n", + "Table 'city_stats':\n", + "city_name (VARCHAR(16)), population (INTEGER), country (VARCHAR(16))\n", + "> Table desc str: \n", + "Table 'city_stats':\n", + "city_name (VARCHAR(16)), population (INTEGER), country (VARCHAR(16))\n", + "INFO:root:> [query] Total LLM token usage: 134 tokens\n", + "> [query] Total LLM token usage: 134 tokens\n", + "INFO:root:> [query] Total embedding token usage: 0 tokens\n", + "> [query] Total embedding token 
usage: 0 tokens\n" + ] + } + ], + "source": [ + "response = index.query(query_str, sql_context_container=context_container)" + ] + }, + { + "cell_type": "markdown", + "id": "5dc2f7bf-6f6c-42ba-8f42-47afea6606ad", + "metadata": {}, + "source": [ + "We can inspect the response as a string, along with its extra info." + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "25c11645-56bd-433a-85f4-420413f8970d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\"[('Tokyo',)]\"" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "str(response)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3f72abc6-54d7-4f85-abf8-32978d94f558", + "metadata": {}, + "outputs": [], + "source": [ + "response.extra_info" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/struct_indices/SQLIndexDemo.ipynb b/examples/struct_indices/SQLIndexDemo.ipynb index 29b0e5eaf0..4deb993677 100644 --- a/examples/struct_indices/SQLIndexDemo.ipynb +++ b/examples/struct_indices/SQLIndexDemo.ipynb @@ -5,7 +5,7 @@ "id": "e45f9b60-cd6b-4c15-958f-1feca5438128", "metadata": {}, "source": [ - "# SQL Index Demo" + "# SQL Index Demo - Basic" ] }, { @@ -452,7 +452,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.10" + "version": "3.11.0" } }, "nbformat": 4, diff --git a/examples/test_wiki/TestNYC-Benchmark-GPT4.ipynb b/examples/test_wiki/TestNYC-Benchmark-GPT4.ipynb index 70eb75082d..bd9b68b8a6 100644 --- a/examples/test_wiki/TestNYC-Benchmark-GPT4.ipynb +++
b/examples/test_wiki/TestNYC-Benchmark-GPT4.ipynb @@ -1,1700 +1,1708 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": 17, - "id": "9080b39e", - "metadata": {}, - "outputs": [], - "source": [ - "import logging, sys\n", - "logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n", - "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))\n", - "\n", - "# Uncomment if you want to temporarily disable logger\n", - "logging.disable(sys.maxsize)" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "7de92ce3", - "metadata": {}, - "outputs": [], - "source": [ - "# NOTE: only necessary for querying with `use_async=True` in notebook\n", - "import nest_asyncio\n", - "nest_asyncio.apply()" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "f1a9eb90-335c-4214-8bb6-fd1edbe3ccbd", - "metadata": {}, - "outputs": [], - "source": [ - "# My OpenAI Key\n", - "import os\n", - "os.environ['OPENAI_API_KEY'] = \"\"" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "8d0b2364-4806-4656-81e7-3f6e4b910b5b", - "metadata": {}, - "outputs": [], - "source": [ - "from gpt_index import GPTTreeIndex, SimpleDirectoryReader, LLMPredictor, GPTSimpleVectorIndex, GPTListIndex, Prompt, ServiceContext\n", - "from gpt_index.indices.base import BaseGPTIndex\n", - "from gpt_index.langchain_helpers.text_splitter import TokenTextSplitter\n", - "from langchain.chat_models import ChatOpenAI\n", - "from langchain.llms import OpenAI\n", - "from gpt_index.response.schema import Response\n", - "import pandas as pd\n", - "from typing import Tuple" - ] - }, - { - "cell_type": "markdown", - "id": "707662e5", - "metadata": {}, - "source": [ - "# Setup data" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "b4b4387b-413e-4016-ba1e-88b3d9410a38", - "metadata": {}, - "outputs": [], - "source": [ - "# fetch \"New York City\" page from Wikipedia\n", - "from pathlib import Path\n", - "\n", - "import requests\n", - 
"response = requests.get(\n", - " 'https://en.wikipedia.org/w/api.php',\n", - " params={\n", - " 'action': 'query',\n", - " 'format': 'json',\n", - " 'titles': 'New York City',\n", - " 'prop': 'extracts',\n", - " # 'exintro': True,\n", - " 'explaintext': True,\n", - " }\n", - ").json()\n", - "page = next(iter(response['query']['pages'].values()))\n", - "nyc_text = page['extract']\n", - "\n", - "data_path = Path('data')\n", - "if not data_path.exists():\n", - " Path.mkdir(data_path)\n", - "\n", - "with open('data/nyc_text.txt', 'w') as fp:\n", - " fp.write(nyc_text)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "523fbebe-6e79-4d7b-b400-188b711a0e8f", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "DEBUG:gpt_index.readers.file.base:> [SimpleDirectoryReader] Total files added: 1\n", - "> [SimpleDirectoryReader] Total files added: 1\n" - ] - } - ], - "source": [ - "documents = SimpleDirectoryReader('data').load_data()" - ] - }, - { - "cell_type": "markdown", - "id": "f4a269bd", - "metadata": {}, - "source": [ - "# Setup benchmark" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "62f01ddf", - "metadata": {}, - "outputs": [], - "source": [ - "from dataclasses import dataclass\n", - "from typing import List" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "4ff13cd4", - "metadata": {}, - "outputs": [], - "source": [ - "@dataclass\n", - "class TestCase:\n", - " query: str \n", - " must_contain: List[str]" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "9c653b72", - "metadata": {}, - "outputs": [], - "source": [ - "@dataclass\n", - "class TestOutcome:\n", - " test: TestCase\n", - " response: Response\n", - " \n", - " @property\n", - " def is_correct_response(self) -> bool:\n", - " is_correct = True\n", - " for answer in self.test.must_contain:\n", - " if answer not in self.response.response:\n", - " is_correct = False\n", - " 
return is_correct\n", - " \n", - " @property\n", - " def is_correct_source(self) -> bool:\n", - " is_correct = True\n", - " for answer in self.test.must_contain:\n", - " if all(answer not in node.source_text for node in self.response.source_nodes):\n", - " is_correct = False\n", - " return is_correct" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "b9cd18ae", - "metadata": {}, - "outputs": [], - "source": [ - "class Benchmark:\n", - " def __init__(self, tests: List[TestCase]) -> None:\n", - " self._tests = tests\n", - " \n", - " def test(self, index: BaseGPTIndex, llm_predictor: LLMPredictor, **kwargs) -> List[TestOutcome]:\n", - " outcomes: List[TestOutcome] = []\n", - " service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)\n", - " for test in self._tests:\n", - " response = index.query(\n", - " test.query,\n", - " service_context=service_context,\n", - " **kwargs\n", - " )\n", - " outcome = TestOutcome(test=test, response=response)\n", - " outcomes.append(outcome)\n", - " return outcomes" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "8edad985", - "metadata": {}, - "outputs": [], - "source": [ - "def analyze_outcome(outcomes: List[TestOutcome]) -> None:\n", - " rows = []\n", - " for outcome in outcomes:\n", - " row = [outcome.test.query, outcome.is_correct_response, outcome.is_correct_source]\n", - " rows.append(row)\n", - " df = pd.DataFrame(rows, columns=['Test Query', 'Correct Response', 'Correct Source'])\n", - " return df" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "4bc38077", - "metadata": {}, - "outputs": [], - "source": [ - "test_battle = TestCase(\n", - " query=\"What battles took place in New York City in the American Revolution?\",\n", - " must_contain=[\"Battle of Long Island\"]\n", - ")\n", - "\n", - "test_mayor = TestCase(\n", - " query='Who was elected as the mayor after the Great Depression?',\n", - " must_contain=[\"Fiorello La Guardia\"]\n", - ")\n", - 
"\n", - "test_tourists = TestCase(\n", - " query='How many tourists visited New York City in 2019?',\n", - " must_contain=['66.6 million']\n", - ")\n", - "test_airport = TestCase(\n", - " query='What are the airports in New York City?',\n", - " must_contain=['LaGuardia Airport']\n", - ")\n", - "test_visit = TestCase(\n", - " query='When was the first documented visit into New York Harbor?',\n", - " must_contain=['1524']\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "f159dadb", - "metadata": {}, - "outputs": [], - "source": [ - "bm = Benchmark([\n", - " test_battle,\n", - " test_mayor,\n", - " test_tourists,\n", - " test_airport,\n", - " test_visit,\n", - "])" - ] - }, - { - "cell_type": "markdown", - "id": "65ddbd56", - "metadata": {}, - "source": [ - "# LLM based evaluation" - ] - }, - { - "cell_type": "code", - "execution_count": 592, - "id": "ed175de5", - "metadata": {}, - "outputs": [], - "source": [ - "from gpt_index.prompts.prompt_type import PromptType\n", - "\n", - "EVAL_PROMPT_TMPL = (\n", - " \"Given the question below. \\n\"\n", - " \"---------------------\\n\"\n", - " \"{query_str}\"\n", - " \"\\n---------------------\\n\"\n", - " \"Decide if the following retreived context is relevant. \\n\"\n", - " \"\\n---------------------\\n\"\n", - " \"{context_str}\"\n", - " \"\\n---------------------\\n\"\n", - " \"Then decide if the answer is correct. 
\\n\"\n", - " \"\\n---------------------\\n\"\n", - " \"{answer_str}\"\n", - " \"\\n---------------------\\n\"\n", - " \"Answer in the following format:\\n\"\n", - " \"'Context is relevant: <True>\\nAnswer is correct: <True>' \"\n", - " \"and explain why.\"\n", - ")\n", - "\n", - "class EvalPrompt(Prompt):\n", - " prompt_type: PromptType = PromptType.CUSTOM\n", - " input_variables: List[str] = [\"query_str\", 'context_str', 'answer_str']\n", - "\n", - "DEFAULT_EVAL_PROMPT = EvalPrompt(EVAL_PROMPT_TMPL)" - ] - }, - { - "cell_type": "code", - "execution_count": 593, - "id": "93c498b6", - "metadata": {}, - "outputs": [], - "source": [ - "import re\n", - "def extract_eval_result(result_str: str):\n", - " boolean_pattern = r\"(True|False)\"\n", - " matches = re.findall(boolean_pattern, result_str)\n", - " return [match == \"True\" for match in matches] " - ] - }, - { - "cell_type": "code", - "execution_count": 594, - "id": "4c8109c3", - "metadata": {}, - "outputs": [], - "source": [ - "def analyze_outcome_llm_single(outcome: TestOutcome, llm_predictor: LLMPredictor) -> Tuple[bool, bool]:\n", - " try:\n", - " source_text = outcome.response.source_nodes[0].source_text\n", - " except:\n", - " source_text = \"Failed to retrieve any context\"\n", - " result_str, _ = llm_predictor.predict(\n", - " DEFAULT_EVAL_PROMPT,\n", - " query_str=outcome.test.query,\n", - " context_str=source_text,\n", - " answer_str=outcome.response.response\n", - " )\n", - " is_context_relevant, is_answer_correct = extract_eval_result(result_str)\n", - " return is_answer_correct, is_context_relevant, result_str\n", - "\n", - "def analyze_outcome_llm(outcomes: List[TestOutcome], llm_predictor: LLMPredictor) -> None:\n", - " rows = []\n", - " for outcome in outcomes:\n", - " is_correct_response, is_correct_source, result_str = analyze_outcome_llm_single(outcome, llm_predictor)\n", - " row = [outcome.test.query, is_correct_response, is_correct_source, result_str]\n", - " rows.append(row)\n", - " df = 
pd.DataFrame(rows, columns=['Test Query', 'Correct Response (LLM)', 'Correct Source (LLM)', 'Eval (LLM)'])\n", - " return df" - ] - }, - { - "cell_type": "markdown", - "id": "5a9f43a6", - "metadata": {}, - "source": [ - "# Build Indices" - ] - }, - { - "cell_type": "code", - "execution_count": 643, - "id": "790bad05", - "metadata": {}, - "outputs": [], - "source": [ - "vector_index = GPTSimpleVectorIndex.from_documents(\n", - " documents, \n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 473, - "id": "64c970e0", - "metadata": {}, - "outputs": [], - "source": [ - "list_index = GPTListIndex.from_documents(\n", - " documents, \n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 468, - "id": "bacc4f1c", - "metadata": {}, - "outputs": [], - "source": [ - "tree_index = GPTTreeIndex.from_documents(documents)" - ] - }, - { - "cell_type": "code", - "execution_count": 632, - "id": "a600d4de", - "metadata": {}, - "outputs": [], - "source": [ - "# Save indices\n", - "vector_index.save_to_disk('vector_index.json')\n", - "tree_index.save_to_disk('tree_index.json')\n", - "list_index.save_to_disk('list_index.json')" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "5eec265d-211b-4f26-b05b-5b4e7072bc6e", - "metadata": {}, - "outputs": [], - "source": [ - "# Load indices\n", - "tree_index = GPTTreeIndex.load_from_disk('tree_index.json')\n", - "list_index = GPTListIndex.load_from_disk('list_index.json')\n", - "vector_index = GPTSimpleVectorIndex.load_from_disk('vector_index.json')" - ] - }, - { - "cell_type": "markdown", - "id": "5b2e7fdd", - "metadata": {}, - "source": [ - "# Create LLMPredictors" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "4766ac56-ac8d-4f33-b994-6901964241ea", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# gpt-4\n", - "llm_predictor_gpt4 = LLMPredictor(\n", - " llm=ChatOpenAI(temperature=0, model_name=\"gpt-4\")\n", - ")" - ] - }, - { - "cell_type": "code", - 
"execution_count": 169, - "id": "c8692cf6", - "metadata": {}, - "outputs": [], - "source": [ - "# gpt-3 (text-davinci-003)\n", - "llm_predictor_gpt3 = LLMPredictor(llm=OpenAI(temperature=0, model_name=\"text-davinci-003\"))" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "fb74ec62", - "metadata": {}, - "outputs": [], - "source": [ - "# chatgpt (gpt-3.5-turbo)\n", - "llm_predictor_chatgpt = LLMPredictor(llm=ChatOpenAI(temperature=0, model_name=\"gpt-3.5-turbo\"))" - ] - }, - { - "cell_type": "markdown", - "id": "1354f668", - "metadata": {}, - "source": [ - "# Benchmarking " - ] - }, - { - "cell_type": "markdown", - "id": "01124a3f", - "metadata": {}, - "source": [ - "### Tree Index + GPT4" - ] - }, - { - "cell_type": "code", - "execution_count": 583, - "id": "6f418554", - "metadata": {}, - "outputs": [], - "source": [ - "outcomes_tree_gpt4 = bm.test(tree_index, llm_predictor_gpt4)" - ] - }, - { - "cell_type": "code", - "execution_count": 584, - "id": "de98ceba", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "<div>\n", - "<style scoped>\n", - " .dataframe tbody tr th:only-of-type {\n", - " vertical-align: middle;\n", - " }\n", - "\n", - " .dataframe tbody tr th {\n", - " vertical-align: top;\n", - " }\n", - "\n", - " .dataframe thead th {\n", - " text-align: right;\n", - " }\n", - "</style>\n", - "<table border=\"1\" class=\"dataframe\">\n", - " <thead>\n", - " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th>Test Query</th>\n", - " <th>Correct Response</th>\n", - " <th>Correct Source</th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th>0</th>\n", - " <td>What battles took place in New York City in th...</td>\n", - " <td>True</td>\n", - " <td>True</td>\n", - " </tr>\n", - " <tr>\n", - " <th>1</th>\n", - " <td>Who was elected as the mayor after the Great D...</td>\n", - " <td>False</td>\n", - " <td>False</td>\n", - " </tr>\n", - " <tr>\n", - " <th>2</th>\n", - " <td>How many tourists 
visited New York City in 2019?</td>\n", - " <td>False</td>\n", - " <td>False</td>\n", - " </tr>\n", - " <tr>\n", - " <th>3</th>\n", - " <td>What are the airports in New York City?</td>\n", - " <td>False</td>\n", - " <td>False</td>\n", - " </tr>\n", - " <tr>\n", - " <th>4</th>\n", - " <td>When was the first documented visit into New Y...</td>\n", - " <td>False</td>\n", - " <td>False</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "</div>" - ], - "text/plain": [ - " Test Query Correct Response \\\n", - "0 What battles took place in New York City in th... True \n", - "1 Who was elected as the mayor after the Great D... False \n", - "2 How many tourists visited New York City in 2019? False \n", - "3 What are the airports in New York City? False \n", - "4 When was the first documented visit into New Y... False \n", - "\n", - " Correct Source \n", - "0 True \n", - "1 False \n", - "2 False \n", - "3 False \n", - "4 False " - ] - }, - "execution_count": 584, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "analyze_outcome(outcomes_tree_gpt4)" - ] - }, - { - "cell_type": "markdown", - "id": "f5ef33a0", - "metadata": {}, - "source": [ - "### Tree Index + GPT3" - ] - }, - { - "cell_type": "code", - "execution_count": 549, - "id": "ba871d2a", - "metadata": {}, - "outputs": [], - "source": [ - "outcomes_tree_gpt3 = bm.test(tree_index, llm_predictor_gpt3)" - ] - }, - { - "cell_type": "code", - "execution_count": 550, - "id": "7d4c6930", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "<div>\n", - "<style scoped>\n", - " .dataframe tbody tr th:only-of-type {\n", - " vertical-align: middle;\n", - " }\n", - "\n", - " .dataframe tbody tr th {\n", - " vertical-align: top;\n", - " }\n", - "\n", - " .dataframe thead th {\n", - " text-align: right;\n", - " }\n", - "</style>\n", - "<table border=\"1\" class=\"dataframe\">\n", - " <thead>\n", - " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th>Test Query</th>\n", - 
" <th>Correct Response</th>\n", - " <th>Correct Source</th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th>0</th>\n", - " <td>What battles took place in New York City in th...</td>\n", - " <td>True</td>\n", - " <td>False</td>\n", - " </tr>\n", - " <tr>\n", - " <th>1</th>\n", - " <td>Who was elected as the mayor after the Great D...</td>\n", - " <td>False</td>\n", - " <td>False</td>\n", - " </tr>\n", - " <tr>\n", - " <th>2</th>\n", - " <td>How many tourists visited New York City in 2019?</td>\n", - " <td>False</td>\n", - " <td>False</td>\n", - " </tr>\n", - " <tr>\n", - " <th>3</th>\n", - " <td>What are the airports in New York City?</td>\n", - " <td>True</td>\n", - " <td>False</td>\n", - " </tr>\n", - " <tr>\n", - " <th>4</th>\n", - " <td>When was the first documented visit into New Y...</td>\n", - " <td>True</td>\n", - " <td>False</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "</div>" - ], - "text/plain": [ - " Test Query Correct Response \\\n", - "0 What battles took place in New York City in th... True \n", - "1 Who was elected as the mayor after the Great D... False \n", - "2 How many tourists visited New York City in 2019? False \n", - "3 What are the airports in New York City? True \n", - "4 When was the first documented visit into New Y... 
True \n", - "\n", - " Correct Source \n", - "0 False \n", - "1 False \n", - "2 False \n", - "3 False \n", - "4 False " - ] - }, - "execution_count": 550, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "analyze_outcome(outcomes_tree_gpt3)" - ] - }, - { - "cell_type": "markdown", - "id": "30a9ba34", - "metadata": {}, - "source": [ - "### List Index + GPT4" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "bc0f05d1", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "outcomes_list_gpt4 = bm.test(list_index, llm_predictor_gpt4, response_mode=\"tree_summarize\", use_async=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "2d2e879d", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "<div>\n", - "<style scoped>\n", - " .dataframe tbody tr th:only-of-type {\n", - " vertical-align: middle;\n", - " }\n", - "\n", - " .dataframe tbody tr th {\n", - " vertical-align: top;\n", - " }\n", - "\n", - " .dataframe thead th {\n", - " text-align: right;\n", - " }\n", - "</style>\n", - "<table border=\"1\" class=\"dataframe\">\n", - " <thead>\n", - " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th>Test Query</th>\n", - " <th>Correct Response</th>\n", - " <th>Correct Source</th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th>0</th>\n", - " <td>What battles took place in New York City in th...</td>\n", - " <td>False</td>\n", - " <td>True</td>\n", - " </tr>\n", - " <tr>\n", - " <th>1</th>\n", - " <td>Who was elected as the mayor after the Great D...</td>\n", - " <td>False</td>\n", - " <td>True</td>\n", - " </tr>\n", - " <tr>\n", - " <th>2</th>\n", - " <td>How many tourists visited New York City in 2019?</td>\n", - " <td>True</td>\n", - " <td>True</td>\n", - " </tr>\n", - " <tr>\n", - " <th>3</th>\n", - " <td>What are the airports in New York City?</td>\n", - " <td>True</td>\n", - " <td>True</td>\n", - " </tr>\n", - " <tr>\n", - " 
<th>4</th>\n", - " <td>When was the first documented visit into New Y...</td>\n", - " <td>True</td>\n", - " <td>True</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "</div>" - ], - "text/plain": [ - " Test Query Correct Response \\\n", - "0 What battles took place in New York City in th... False \n", - "1 Who was elected as the mayor after the Great D... False \n", - "2 How many tourists visited New York City in 2019? True \n", - "3 What are the airports in New York City? True \n", - "4 When was the first documented visit into New Y... True \n", - "\n", - " Correct Source \n", - "0 True \n", - "1 True \n", - "2 True \n", - "3 True \n", - "4 True " - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "analyze_outcome(outcomes_list_gpt4)" - ] - }, - { - "cell_type": "markdown", - "id": "8cba793c", - "metadata": {}, - "source": [ - "### List Index + GPT3" - ] - }, - { - "cell_type": "code", - "execution_count": 501, - "id": "66cfa3fa", - "metadata": {}, - "outputs": [], - "source": [ - "outcomes_list_gpt3 = bm.test(list_index, llm_predictor_gpt3, response_mode=\"tree_summarize\", use_async=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 502, - "id": "06bc98d8", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "<div>\n", - "<style scoped>\n", - " .dataframe tbody tr th:only-of-type {\n", - " vertical-align: middle;\n", - " }\n", - "\n", - " .dataframe tbody tr th {\n", - " vertical-align: top;\n", - " }\n", - "\n", - " .dataframe thead th {\n", - " text-align: right;\n", - " }\n", - "</style>\n", - "<table border=\"1\" class=\"dataframe\">\n", - " <thead>\n", - " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th>Test Query</th>\n", - " <th>Correct Response</th>\n", - " <th>Correct Source</th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th>0</th>\n", - " <td>What battles took place in New York City in th...</td>\n", - " 
<td>True</td>\n", - " <td>True</td>\n", - " </tr>\n", - " <tr>\n", - " <th>1</th>\n", - " <td>Who was elected as the mayor during the Great ...</td>\n", - " <td>True</td>\n", - " <td>True</td>\n", - " </tr>\n", - " <tr>\n", - " <th>2</th>\n", - " <td>How many tourists visited New York City in 2019?</td>\n", - " <td>False</td>\n", - " <td>True</td>\n", - " </tr>\n", - " <tr>\n", - " <th>3</th>\n", - " <td>What are the airports in New York City?</td>\n", - " <td>True</td>\n", - " <td>True</td>\n", - " </tr>\n", - " <tr>\n", - " <th>4</th>\n", - " <td>When was the first documented visit into New Y...</td>\n", - " <td>True</td>\n", - " <td>True</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "</div>" - ], - "text/plain": [ - " Test Query Correct Response \\\n", - "0 What battles took place in New York City in th... True \n", - "1 Who was elected as the mayor during the Great ... True \n", - "2 How many tourists visited New York City in 2019? False \n", - "3 What are the airports in New York City? True \n", - "4 When was the first documented visit into New Y... 
True \n", - "\n", - " Correct Source \n", - "0 True \n", - "1 True \n", - "2 True \n", - "3 True \n", - "4 True " - ] - }, - "execution_count": 502, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "analyze_outcome(outcomes_list_gpt3)" - ] - }, - { - "cell_type": "markdown", - "id": "c4d0b3eb", - "metadata": {}, - "source": [ - "### List Index + ChatGPT" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "f146c74e", - "metadata": {}, - "outputs": [], - "source": [ - "outcomes_list_chatgpt = bm.test(list_index, llm_predictor_chatgpt, response_mode=\"tree_summarize\", use_async=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "8eb9d392", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "<div>\n", - "<style scoped>\n", - " .dataframe tbody tr th:only-of-type {\n", - " vertical-align: middle;\n", - " }\n", - "\n", - " .dataframe tbody tr th {\n", - " vertical-align: top;\n", - " }\n", - "\n", - " .dataframe thead th {\n", - " text-align: right;\n", - " }\n", - "</style>\n", - "<table border=\"1\" class=\"dataframe\">\n", - " <thead>\n", - " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th>Test Query</th>\n", - " <th>Correct Response</th>\n", - " <th>Correct Source</th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th>0</th>\n", - " <td>What battles took place in New York City in th...</td>\n", - " <td>False</td>\n", - " <td>True</td>\n", - " </tr>\n", - " <tr>\n", - " <th>1</th>\n", - " <td>Who was elected as the mayor after the Great D...</td>\n", - " <td>False</td>\n", - " <td>True</td>\n", - " </tr>\n", - " <tr>\n", - " <th>2</th>\n", - " <td>How many tourists visited New York City in 2019?</td>\n", - " <td>False</td>\n", - " <td>True</td>\n", - " </tr>\n", - " <tr>\n", - " <th>3</th>\n", - " <td>What are the airports in New York City?</td>\n", - " <td>True</td>\n", - " <td>True</td>\n", - " </tr>\n", - " <tr>\n", - " <th>4</th>\n", - " 
<td>When was the first documented visit into New Y...</td>\n", - " <td>True</td>\n", - " <td>True</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "</div>" - ], - "text/plain": [ - " Test Query Correct Response \\\n", - "0 What battles took place in New York City in th... False \n", - "1 Who was elected as the mayor after the Great D... False \n", - "2 How many tourists visited New York City in 2019? False \n", - "3 What are the airports in New York City? True \n", - "4 When was the first documented visit into New Y... True \n", - "\n", - " Correct Source \n", - "0 True \n", - "1 True \n", - "2 True \n", - "3 True \n", - "4 True " - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "analyze_outcome(outcomes_list_chatgpt)" - ] - }, - { - "cell_type": "markdown", - "id": "38fc1438", - "metadata": {}, - "source": [ - "### Vector Store Index + GPT4 " - ] - }, - { - "cell_type": "code", - "execution_count": 487, - "id": "5349d1e7", - "metadata": {}, - "outputs": [], - "source": [ - "outcomes_vector_gpt4 = bm.test(vector_index, llm_predictor_gpt4)" - ] - }, - { - "cell_type": "code", - "execution_count": 488, - "id": "7fc53e19", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "<div>\n", - "<style scoped>\n", - " .dataframe tbody tr th:only-of-type {\n", - " vertical-align: middle;\n", - " }\n", - "\n", - " .dataframe tbody tr th {\n", - " vertical-align: top;\n", - " }\n", - "\n", - " .dataframe thead th {\n", - " text-align: right;\n", - " }\n", - "</style>\n", - "<table border=\"1\" class=\"dataframe\">\n", - " <thead>\n", - " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th>Test Query</th>\n", - " <th>Correct Response</th>\n", - " <th>Correct Source</th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th>0</th>\n", - " <td>What battles took place in New York City in th...</td>\n", - " <td>True</td>\n", - " <td>True</td>\n", - " </tr>\n", - " 
<tr>\n", - " <th>1</th>\n", - " <td>Who was elected as the mayor during the Great ...</td>\n", - " <td>True</td>\n", - " <td>True</td>\n", - " </tr>\n", - " <tr>\n", - " <th>2</th>\n", - " <td>How many tourists visited New York City in 2019?</td>\n", - " <td>False</td>\n", - " <td>False</td>\n", - " </tr>\n", - " <tr>\n", - " <th>3</th>\n", - " <td>What are the airports in New York City?</td>\n", - " <td>True</td>\n", - " <td>True</td>\n", - " </tr>\n", - " <tr>\n", - " <th>4</th>\n", - " <td>When was the first documented visit into New Y...</td>\n", - " <td>True</td>\n", - " <td>True</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "</div>" - ], - "text/plain": [ - " Test Query Correct Response \\\n", - "0 What battles took place in New York City in th... True \n", - "1 Who was elected as the mayor during the Great ... True \n", - "2 How many tourists visited New York City in 2019? False \n", - "3 What are the airports in New York City? True \n", - "4 When was the first documented visit into New Y... 
True \n", - "\n", - " Correct Source \n", - "0 True \n", - "1 True \n", - "2 False \n", - "3 True \n", - "4 True " - ] - }, - "execution_count": 488, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "analyze_outcome(outcomes_vector_gpt4)" - ] - }, - { - "cell_type": "markdown", - "id": "70eb711f", - "metadata": {}, - "source": [ - "### Vector Store Index + GPT3" - ] - }, - { - "cell_type": "code", - "execution_count": 644, - "id": "e35ebdf9", - "metadata": {}, - "outputs": [], - "source": [ - "outcomes_vector_gpt3 = bm.test(vector_index, llm_predictor_gpt3)" - ] - }, - { - "cell_type": "code", - "execution_count": 645, - "id": "95c49697", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "<div>\n", - "<style scoped>\n", - " .dataframe tbody tr th:only-of-type {\n", - " vertical-align: middle;\n", - " }\n", - "\n", - " .dataframe tbody tr th {\n", - " vertical-align: top;\n", - " }\n", - "\n", - " .dataframe thead th {\n", - " text-align: right;\n", - " }\n", - "</style>\n", - "<table border=\"1\" class=\"dataframe\">\n", - " <thead>\n", - " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th>Test Query</th>\n", - " <th>Correct Response</th>\n", - " <th>Correct Source</th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th>0</th>\n", - " <td>What battles took place in New York City in th...</td>\n", - " <td>True</td>\n", - " <td>True</td>\n", - " </tr>\n", - " <tr>\n", - " <th>1</th>\n", - " <td>Who was elected as the mayor after the Great D...</td>\n", - " <td>True</td>\n", - " <td>False</td>\n", - " </tr>\n", - " <tr>\n", - " <th>2</th>\n", - " <td>How many tourists visited New York City in 2019?</td>\n", - " <td>False</td>\n", - " <td>False</td>\n", - " </tr>\n", - " <tr>\n", - " <th>3</th>\n", - " <td>What are the airports in New York City?</td>\n", - " <td>True</td>\n", - " <td>False</td>\n", - " </tr>\n", - " <tr>\n", - " <th>4</th>\n", - " <td>When was the first documented visit 
into New Y...</td>\n", - " <td>True</td>\n", - " <td>False</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "</div>" - ], - "text/plain": [ - " Test Query Correct Response \\\n", - "0 What battles took place in New York City in th... True \n", - "1 Who was elected as the mayor after the Great D... True \n", - "2 How many tourists visited New York City in 2019? False \n", - "3 What are the airports in New York City? True \n", - "4 When was the first documented visit into New Y... True \n", - "\n", - " Correct Source \n", - "0 True \n", - "1 False \n", - "2 False \n", - "3 False \n", - "4 False " - ] - }, - "execution_count": 645, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "analyze_outcome(outcomes_vector_gpt3)" - ] - }, - { - "cell_type": "markdown", - "id": "a36ba2ee", - "metadata": {}, - "source": [ - "# LLM based Evaluation" - ] - }, - { - "cell_type": "code", - "execution_count": 646, - "id": "59ff561c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "<div>\n", - "<style scoped>\n", - " .dataframe tbody tr th:only-of-type {\n", - " vertical-align: middle;\n", - " }\n", - "\n", - " .dataframe tbody tr th {\n", - " vertical-align: top;\n", - " }\n", - "\n", - " .dataframe thead th {\n", - " text-align: right;\n", - " }\n", - "</style>\n", - "<table border=\"1\" class=\"dataframe\">\n", - " <thead>\n", - " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th>Test Query</th>\n", - " <th>Correct Response</th>\n", - " <th>Correct Source</th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th>0</th>\n", - " <td>What battles took place in New York City in th...</td>\n", - " <td>True</td>\n", - " <td>True</td>\n", - " </tr>\n", - " <tr>\n", - " <th>1</th>\n", - " <td>Who was elected as the mayor after the Great D...</td>\n", - " <td>True</td>\n", - " <td>False</td>\n", - " </tr>\n", - " <tr>\n", - " <th>2</th>\n", - " <td>How many tourists visited New York City in 
2019?</td>\n", - " <td>False</td>\n", - " <td>False</td>\n", - " </tr>\n", - " <tr>\n", - " <th>3</th>\n", - " <td>What are the airports in New York City?</td>\n", - " <td>True</td>\n", - " <td>False</td>\n", - " </tr>\n", - " <tr>\n", - " <th>4</th>\n", - " <td>When was the first documented visit into New Y...</td>\n", - " <td>True</td>\n", - " <td>False</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "</div>" - ], - "text/plain": [ - " Test Query Correct Response \\\n", - "0 What battles took place in New York City in th... True \n", - "1 Who was elected as the mayor after the Great D... True \n", - "2 How many tourists visited New York City in 2019? False \n", - "3 What are the airports in New York City? True \n", - "4 When was the first documented visit into New Y... True \n", - "\n", - " Correct Source \n", - "0 True \n", - "1 False \n", - "2 False \n", - "3 False \n", - "4 False " - ] - }, - "execution_count": 646, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "analyze_outcome(outcomes_vector_gpt3)" - ] - }, - { - "cell_type": "code", - "execution_count": 647, - "id": "e4ffaca6", - "metadata": {}, - "outputs": [], - "source": [ - "eval_gpt4 = analyze_outcome_llm(outcomes_vector_gpt3, llm_predictor_gpt4)" - ] - }, - { - "cell_type": "code", - "execution_count": 657, - "id": "85c4e415", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "<div>\n", - "<style scoped>\n", - " .dataframe tbody tr th:only-of-type {\n", - " vertical-align: middle;\n", - " }\n", - "\n", - " .dataframe tbody tr th {\n", - " vertical-align: top;\n", - " }\n", - "\n", - " .dataframe thead th {\n", - " text-align: right;\n", - " }\n", - "</style>\n", - "<table border=\"1\" class=\"dataframe\">\n", - " <thead>\n", - " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th>Test Query</th>\n", - " <th>Correct Response (LLM)</th>\n", - " <th>Correct Source (LLM)</th>\n", - " <th>Eval (LLM)</th>\n", - " </tr>\n", - " 
</thead>\n", - " <tbody>\n", - " <tr>\n", - " <th>0</th>\n", - " <td>What battles took place in New York City in th...</td>\n", - " <td>True</td>\n", - " <td>True</td>\n", - " <td>Context is relevant: True\\nAnswer is correct: ...</td>\n", - " </tr>\n", - " <tr>\n", - " <th>1</th>\n", - " <td>Who was elected as the mayor after the Great D...</td>\n", - " <td>True</td>\n", - " <td>False</td>\n", - " <td>Context is relevant: False\\nAnswer is correct:...</td>\n", - " </tr>\n", - " <tr>\n", - " <th>2</th>\n", - " <td>How many tourists visited New York City in 2019?</td>\n", - " <td>True</td>\n", - " <td>False</td>\n", - " <td>Context is relevant: False\\nAnswer is correct:...</td>\n", - " </tr>\n", - " <tr>\n", - " <th>3</th>\n", - " <td>What are the airports in New York City?</td>\n", - " <td>True</td>\n", - " <td>False</td>\n", - " <td>Context is relevant: False\\nAnswer is correct:...</td>\n", - " </tr>\n", - " <tr>\n", - " <th>4</th>\n", - " <td>When was the first documented visit into New Y...</td>\n", - " <td>True</td>\n", - " <td>False</td>\n", - " <td>Context is relevant: False\\nAnswer is correct:...</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "</div>" - ], - "text/plain": [ - " Test Query Correct Response (LLM) \\\n", - "0 What battles took place in New York City in th... True \n", - "1 Who was elected as the mayor after the Great D... True \n", - "2 How many tourists visited New York City in 2019? True \n", - "3 What are the airports in New York City? True \n", - "4 When was the first documented visit into New Y... True \n", - "\n", - " Correct Source (LLM) Eval (LLM) \n", - "0 True Context is relevant: True\\nAnswer is correct: ... \n", - "1 False Context is relevant: False\\nAnswer is correct:... \n", - "2 False Context is relevant: False\\nAnswer is correct:... \n", - "3 False Context is relevant: False\\nAnswer is correct:... \n", - "4 False Context is relevant: False\\nAnswer is correct:... 
" - ] - }, - "execution_count": 657, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "eval_gpt4" - ] - }, - { - "cell_type": "code", - "execution_count": 651, - "id": "3efb66d6", - "metadata": {}, - "outputs": [], - "source": [ - "eval_chatgpt = analyze_outcome_llm(outcomes_vector_gpt3, llm_predictor_chatgpt)" - ] - }, - { - "cell_type": "code", - "execution_count": 652, - "id": "4c452767", - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/html": [ - "<div>\n", - "<style scoped>\n", - " .dataframe tbody tr th:only-of-type {\n", - " vertical-align: middle;\n", - " }\n", - "\n", - " .dataframe tbody tr th {\n", - " vertical-align: top;\n", - " }\n", - "\n", - " .dataframe thead th {\n", - " text-align: right;\n", - " }\n", - "</style>\n", - "<table border=\"1\" class=\"dataframe\">\n", - " <thead>\n", - " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th>Test Query</th>\n", - " <th>Correct Response (LLM)</th>\n", - " <th>Correct Source (LLM)</th>\n", - " <th>Eval (LLM)</th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th>0</th>\n", - " <td>What battles took place in New York City in th...</td>\n", - " <td>True</td>\n", - " <td>True</td>\n", - " <td>\\n\\nContext is relevant: True\\nAnswer is corre...</td>\n", - " </tr>\n", - " <tr>\n", - " <th>1</th>\n", - " <td>Who was elected as the mayor after the Great D...</td>\n", - " <td>True</td>\n", - " <td>True</td>\n", - " <td>\\n\\nContext is relevant: True\\nAnswer is corre...</td>\n", - " </tr>\n", - " <tr>\n", - " <th>2</th>\n", - " <td>How many tourists visited New York City in 2019?</td>\n", - " <td>False</td>\n", - " <td>False</td>\n", - " <td>\\n\\nContext is relevant: False\\nAnswer is corr...</td>\n", - " </tr>\n", - " <tr>\n", - " <th>3</th>\n", - " <td>What are the airports in New York City?</td>\n", - " <td>True</td>\n", - " <td>False</td>\n", - " <td>\\n\\nContext is relevant: False\\nAnswer is corr...</td>\n", - " 
</tr>\n", - " <tr>\n", - " <th>4</th>\n", - " <td>When was the first documented visit into New Y...</td>\n", - " <td>False</td>\n", - " <td>True</td>\n", - " <td>\\n\\nContext is relevant: True\\nAnswer is corre...</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "</div>" - ], - "text/plain": [ - " Test Query Correct Response (LLM) \\\n", - "0 What battles took place in New York City in th... True \n", - "1 Who was elected as the mayor after the Great D... True \n", - "2 How many tourists visited New York City in 2019? False \n", - "3 What are the airports in New York City? True \n", - "4 When was the first documented visit into New Y... False \n", - "\n", - " Correct Source (LLM) Eval (LLM) \n", - "0 True \\n\\nContext is relevant: True\\nAnswer is corre... \n", - "1 True \\n\\nContext is relevant: True\\nAnswer is corre... \n", - "2 False \\n\\nContext is relevant: False\\nAnswer is corr... \n", - "3 False \\n\\nContext is relevant: False\\nAnswer is corr... \n", - "4 True \\n\\nContext is relevant: True\\nAnswer is corre... 
" - ] - }, - "execution_count": 652, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "eval_chatgpt" - ] - }, - { - "cell_type": "code", - "execution_count": 649, - "id": "61e8dad2", - "metadata": {}, - "outputs": [], - "source": [ - "eval_gpt3 = analyze_outcome_llm(outcomes_vector_gpt3, llm_predictor_gpt3)" - ] - }, - { - "cell_type": "code", - "execution_count": 650, - "id": "170400c3", - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/html": [ - "<div>\n", - "<style scoped>\n", - " .dataframe tbody tr th:only-of-type {\n", - " vertical-align: middle;\n", - " }\n", - "\n", - " .dataframe tbody tr th {\n", - " vertical-align: top;\n", - " }\n", - "\n", - " .dataframe thead th {\n", - " text-align: right;\n", - " }\n", - "</style>\n", - "<table border=\"1\" class=\"dataframe\">\n", - " <thead>\n", - " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th>Test Query</th>\n", - " <th>Correct Response (LLM)</th>\n", - " <th>Correct Source (LLM)</th>\n", - " <th>Eval (LLM)</th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th>0</th>\n", - " <td>What battles took place in New York City in th...</td>\n", - " <td>True</td>\n", - " <td>True</td>\n", - " <td>\\n\\nContext is relevant: True\\nAnswer is corre...</td>\n", - " </tr>\n", - " <tr>\n", - " <th>1</th>\n", - " <td>Who was elected as the mayor after the Great D...</td>\n", - " <td>True</td>\n", - " <td>True</td>\n", - " <td>\\n\\nContext is relevant: True\\nAnswer is corre...</td>\n", - " </tr>\n", - " <tr>\n", - " <th>2</th>\n", - " <td>How many tourists visited New York City in 2019?</td>\n", - " <td>False</td>\n", - " <td>False</td>\n", - " <td>\\n\\nContext is relevant: False\\nAnswer is corr...</td>\n", - " </tr>\n", - " <tr>\n", - " <th>3</th>\n", - " <td>What are the airports in New York City?</td>\n", - " <td>True</td>\n", - " <td>True</td>\n", - " <td>\\n\\nContext is relevant: True\\nAnswer is corre...</td>\n", - " 
</tr>\n", - " <tr>\n", - " <th>4</th>\n", - " <td>When was the first documented visit into New Y...</td>\n", - " <td>True</td>\n", - " <td>True</td>\n", - " <td>\\n\\nContext is relevant: True\\nAnswer is corre...</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "</div>" - ], - "text/plain": [ - " Test Query Correct Response (LLM) \\\n", - "0 What battles took place in New York City in th... True \n", - "1 Who was elected as the mayor after the Great D... True \n", - "2 How many tourists visited New York City in 2019? False \n", - "3 What are the airports in New York City? True \n", - "4 When was the first documented visit into New Y... True \n", - "\n", - " Correct Source (LLM) Eval (LLM) \n", - "0 True \\n\\nContext is relevant: True\\nAnswer is corre... \n", - "1 True \\n\\nContext is relevant: True\\nAnswer is corre... \n", - "2 False \\n\\nContext is relevant: False\\nAnswer is corr... \n", - "3 True \\n\\nContext is relevant: True\\nAnswer is corre... \n", - "4 True \\n\\nContext is relevant: True\\nAnswer is corre... 
" - ] - }, - "execution_count": 650, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "eval_gpt3" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.9" - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "cells": [ + { + "cell_type": "markdown", + "id": "36f92b98", + "metadata": {}, + "source": [ + "# GPT4 NYC Wiki Benchmark" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "9080b39e", + "metadata": {}, + "outputs": [], + "source": [ + "import logging, sys\n", + "logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n", + "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))\n", + "\n", + "# Uncomment if you want to temporarily disable logger\n", + "logging.disable(sys.maxsize)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "7de92ce3", + "metadata": {}, + "outputs": [], + "source": [ + "# NOTE: only necessary for querying with `use_async=True` in notebook\n", + "import nest_asyncio\n", + "nest_asyncio.apply()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "f1a9eb90-335c-4214-8bb6-fd1edbe3ccbd", + "metadata": {}, + "outputs": [], + "source": [ + "# My OpenAI Key\n", + "import os\n", + "os.environ['OPENAI_API_KEY'] = \"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "8d0b2364-4806-4656-81e7-3f6e4b910b5b", + "metadata": {}, + "outputs": [], + "source": [ + "from gpt_index import GPTTreeIndex, SimpleDirectoryReader, LLMPredictor, GPTSimpleVectorIndex, GPTListIndex, Prompt, ServiceContext\n", + "from gpt_index.indices.base import BaseGPTIndex\n", + "from gpt_index.langchain_helpers.text_splitter import 
TokenTextSplitter\n", + "from langchain.chat_models import ChatOpenAI\n", + "from langchain.llms import OpenAI\n", + "from gpt_index.response.schema import Response\n", + "import pandas as pd\n", + "from typing import Tuple" + ] + }, + { + "cell_type": "markdown", + "id": "707662e5", + "metadata": {}, + "source": [ + "## Setup data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "b4b4387b-413e-4016-ba1e-88b3d9410a38", + "metadata": {}, + "outputs": [], + "source": [ + "# fetch \"New York City\" page from Wikipedia\n", + "from pathlib import Path\n", + "\n", + "import requests\n", + "response = requests.get(\n", + " 'https://en.wikipedia.org/w/api.php',\n", + " params={\n", + " 'action': 'query',\n", + " 'format': 'json',\n", + " 'titles': 'New York City',\n", + " 'prop': 'extracts',\n", + " # 'exintro': True,\n", + " 'explaintext': True,\n", + " }\n", + ").json()\n", + "page = next(iter(response['query']['pages'].values()))\n", + "nyc_text = page['extract']\n", + "\n", + "data_path = Path('data')\n", + "if not data_path.exists():\n", + " Path.mkdir(data_path)\n", + "\n", + "with open('data/nyc_text.txt', 'w') as fp:\n", + " fp.write(nyc_text)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "523fbebe-6e79-4d7b-b400-188b711a0e8f", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DEBUG:gpt_index.readers.file.base:> [SimpleDirectoryReader] Total files added: 1\n", + "> [SimpleDirectoryReader] Total files added: 1\n" + ] + } + ], + "source": [ + "documents = SimpleDirectoryReader('data').load_data()" + ] + }, + { + "cell_type": "markdown", + "id": "f4a269bd", + "metadata": {}, + "source": [ + "## Setup benchmark" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "62f01ddf", + "metadata": {}, + "outputs": [], + "source": [ + "from dataclasses import dataclass\n", + "from typing import List" + ] + }, + { + "cell_type": "code", + "execution_count": 
6, + "id": "4ff13cd4", + "metadata": {}, + "outputs": [], + "source": [ + "@dataclass\n", + "class TestCase:\n", + " query: str \n", + " must_contain: List[str]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "9c653b72", + "metadata": {}, + "outputs": [], + "source": [ + "@dataclass\n", + "class TestOutcome:\n", + " test: TestCase\n", + " response: Response\n", + " \n", + " @property\n", + " def is_correct_response(self) -> bool:\n", + " is_correct = True\n", + " for answer in self.test.must_contain:\n", + " if answer not in self.response.response:\n", + " is_correct = False\n", + " return is_correct\n", + " \n", + " @property\n", + " def is_correct_source(self) -> bool:\n", + " is_correct = True\n", + " for answer in self.test.must_contain:\n", + " if all(answer not in node.source_text for node in self.response.source_nodes):\n", + " is_correct = False\n", + " return is_correct" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "b9cd18ae", + "metadata": {}, + "outputs": [], + "source": [ + "class Benchmark:\n", + " def __init__(self, tests: List[TestCase]) -> None:\n", + " self._tests = tests\n", + " \n", + " def test(self, index: BaseGPTIndex, llm_predictor: LLMPredictor, **kwargs) -> List[TestOutcome]:\n", + " outcomes: List[TestOutcome] = []\n", + " service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)\n", + " for test in self._tests:\n", + " response = index.query(\n", + " test.query,\n", + " service_context=service_context,\n", + " **kwargs\n", + " )\n", + " outcome = TestOutcome(test=test, response=response)\n", + " outcomes.append(outcome)\n", + " return outcomes" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "8edad985", + "metadata": {}, + "outputs": [], + "source": [ + "def analyze_outcome(outcomes: List[TestOutcome]) -> None:\n", + " rows = []\n", + " for outcome in outcomes:\n", + " row = [outcome.test.query, outcome.is_correct_response, outcome.is_correct_source]\n", + " 
rows.append(row)\n", + " df = pd.DataFrame(rows, columns=['Test Query', 'Correct Response', 'Correct Source'])\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "4bc38077", + "metadata": {}, + "outputs": [], + "source": [ + "test_battle = TestCase(\n", + " query=\"What battles took place in New York City in the American Revolution?\",\n", + " must_contain=[\"Battle of Long Island\"]\n", + ")\n", + "\n", + "test_mayor = TestCase(\n", + " query='Who was elected as the mayor after the Great Depression?',\n", + " must_contain=[\"Fiorello La Guardia\"]\n", + ")\n", + "\n", + "test_tourists = TestCase(\n", + " query='How many tourists visited New York City in 2019?',\n", + " must_contain=['66.6 million']\n", + ")\n", + "test_airport = TestCase(\n", + " query='What are the airports in New York City?',\n", + " must_contain=['LaGuardia Airport']\n", + ")\n", + "test_visit = TestCase(\n", + " query='When was the first documented visit into New York Harbor?',\n", + " must_contain=['1524']\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "f159dadb", + "metadata": {}, + "outputs": [], + "source": [ + "bm = Benchmark([\n", + " test_battle,\n", + " test_mayor,\n", + " test_tourists,\n", + " test_airport,\n", + " test_visit,\n", + "])" + ] + }, + { + "cell_type": "markdown", + "id": "65ddbd56", + "metadata": {}, + "source": [ + "## LLM based evaluation" + ] + }, + { + "cell_type": "code", + "execution_count": 592, + "id": "ed175de5", + "metadata": {}, + "outputs": [], + "source": [ + "from gpt_index.prompts.prompt_type import PromptType\n", + "\n", + "EVAL_PROMPT_TMPL = (\n", + " \"Given the question below. \\n\"\n", + " \"---------------------\\n\"\n", + " \"{query_str}\"\n", + " \"\\n---------------------\\n\"\n", + " \"Decide if the following retreived context is relevant. 
\\n\"\n", + " \"\\n---------------------\\n\"\n", + " \"{context_str}\"\n", + " \"\\n---------------------\\n\"\n", + " \"Then decide if the answer is correct. \\n\"\n", + " \"\\n---------------------\\n\"\n", + " \"{answer_str}\"\n", + " \"\\n---------------------\\n\"\n", + " \"Answer in the following format:\\n\"\n", + " \"'Context is relevant: <True>\\nAnswer is correct: <True>' \"\n", + " \"and explain why.\"\n", + ")\n", + "\n", + "class EvalPrompt(Prompt):\n", + " prompt_type: PromptType = PromptType.CUSTOM\n", + " input_variables: List[str] = [\"query_str\", 'context_str', 'answer_str']\n", + "\n", + "DEFAULT_EVAL_PROMPT = EvalPrompt(EVAL_PROMPT_TMPL)" + ] + }, + { + "cell_type": "code", + "execution_count": 593, + "id": "93c498b6", + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "def extract_eval_result(result_str: str):\n", + " boolean_pattern = r\"(True|False)\"\n", + " matches = re.findall(boolean_pattern, result_str)\n", + " return [match == \"True\" for match in matches] " + ] + }, + { + "cell_type": "code", + "execution_count": 594, + "id": "4c8109c3", + "metadata": {}, + "outputs": [], + "source": [ + "def analyze_outcome_llm_single(outcome: TestOutcome, llm_predictor: LLMPredictor) -> Tuple[bool, bool]:\n", + " try:\n", + " source_text = outcome.response.source_nodes[0].source_text\n", + " except:\n", + " source_text = \"Failed to retrieve any context\"\n", + " result_str, _ = llm_predictor.predict(\n", + " DEFAULT_EVAL_PROMPT,\n", + " query_str=outcome.test.query,\n", + " context_str=source_text,\n", + " answer_str=outcome.response.response\n", + " )\n", + " is_context_relevant, is_answer_correct = extract_eval_result(result_str)\n", + " return is_answer_correct, is_context_relevant, result_str\n", + "\n", + "def analyze_outcome_llm(outcomes: List[TestOutcome], llm_predictor: LLMPredictor) -> None:\n", + " rows = []\n", + " for outcome in outcomes:\n", + " is_correct_response, is_correct_source, result_str = 
analyze_outcome_llm_single(outcome, llm_predictor)\n", + " row = [outcome.test.query, is_correct_response, is_correct_source, result_str]\n", + " rows.append(row)\n", + " df = pd.DataFrame(rows, columns=['Test Query', 'Correct Response (LLM)', 'Correct Source (LLM)', 'Eval (LLM)'])\n", + " return df" + ] + }, + { + "cell_type": "markdown", + "id": "5a9f43a6", + "metadata": {}, + "source": [ + "## Build Indices" + ] + }, + { + "cell_type": "code", + "execution_count": 643, + "id": "790bad05", + "metadata": {}, + "outputs": [], + "source": [ + "vector_index = GPTSimpleVectorIndex.from_documents(\n", + " documents, \n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 473, + "id": "64c970e0", + "metadata": {}, + "outputs": [], + "source": [ + "list_index = GPTListIndex.from_documents(\n", + " documents, \n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 468, + "id": "bacc4f1c", + "metadata": {}, + "outputs": [], + "source": [ + "tree_index = GPTTreeIndex.from_documents(documents)" + ] + }, + { + "cell_type": "code", + "execution_count": 632, + "id": "a600d4de", + "metadata": {}, + "outputs": [], + "source": [ + "# Save indices\n", + "vector_index.save_to_disk('vector_index.json')\n", + "tree_index.save_to_disk('tree_index.json')\n", + "list_index.save_to_disk('list_index.json')" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "5eec265d-211b-4f26-b05b-5b4e7072bc6e", + "metadata": {}, + "outputs": [], + "source": [ + "# Load indices\n", + "tree_index = GPTTreeIndex.load_from_disk('tree_index.json')\n", + "list_index = GPTListIndex.load_from_disk('list_index.json')\n", + "vector_index = GPTSimpleVectorIndex.load_from_disk('vector_index.json')" + ] + }, + { + "cell_type": "markdown", + "id": "5b2e7fdd", + "metadata": {}, + "source": [ + "## Create LLMPredictors" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "4766ac56-ac8d-4f33-b994-6901964241ea", + "metadata": { + "tags": [] + }, + "outputs": [], + 
"source": [ + "# gpt-4\n", + "llm_predictor_gpt4 = LLMPredictor(\n", + " llm=ChatOpenAI(temperature=0, model_name=\"gpt-4\")\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 169, + "id": "c8692cf6", + "metadata": {}, + "outputs": [], + "source": [ + "# gpt-3 (text-davinci-003)\n", + "llm_predictor_gpt3 = LLMPredictor(llm=OpenAI(temperature=0, model_name=\"text-davinci-003\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "fb74ec62", + "metadata": {}, + "outputs": [], + "source": [ + "# chatgpt (gpt-3.5-turbo)\n", + "llm_predictor_chatgpt = LLMPredictor(llm=ChatOpenAI(temperature=0, model_name=\"gpt-3.5-turbo\"))" + ] + }, + { + "cell_type": "markdown", + "id": "1354f668", + "metadata": {}, + "source": [ + "## Benchmarking " + ] + }, + { + "cell_type": "markdown", + "id": "01124a3f", + "metadata": {}, + "source": [ + "### Tree Index + GPT4" + ] + }, + { + "cell_type": "code", + "execution_count": 583, + "id": "6f418554", + "metadata": {}, + "outputs": [], + "source": [ + "outcomes_tree_gpt4 = bm.test(tree_index, llm_predictor_gpt4)" + ] + }, + { + "cell_type": "code", + "execution_count": 584, + "id": "de98ceba", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Test Query</th>\n", + " <th>Correct Response</th>\n", + " <th>Correct Source</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>What battles took place in New York City in th...</td>\n", + " <td>True</td>\n", + " <td>True</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>Who 
was elected as the mayor after the Great D...</td>\n", + " <td>False</td>\n", + " <td>False</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>How many tourists visited New York City in 2019?</td>\n", + " <td>False</td>\n", + " <td>False</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>What are the airports in New York City?</td>\n", + " <td>False</td>\n", + " <td>False</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>When was the first documented visit into New Y...</td>\n", + " <td>False</td>\n", + " <td>False</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Test Query Correct Response \\\n", + "0 What battles took place in New York City in th... True \n", + "1 Who was elected as the mayor after the Great D... False \n", + "2 How many tourists visited New York City in 2019? False \n", + "3 What are the airports in New York City? False \n", + "4 When was the first documented visit into New Y... False \n", + "\n", + " Correct Source \n", + "0 True \n", + "1 False \n", + "2 False \n", + "3 False \n", + "4 False " + ] + }, + "execution_count": 584, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "analyze_outcome(outcomes_tree_gpt4)" + ] + }, + { + "cell_type": "markdown", + "id": "f5ef33a0", + "metadata": {}, + "source": [ + "### Tree Index + GPT3" + ] + }, + { + "cell_type": "code", + "execution_count": 549, + "id": "ba871d2a", + "metadata": {}, + "outputs": [], + "source": [ + "outcomes_tree_gpt3 = bm.test(tree_index, llm_predictor_gpt3)" + ] + }, + { + "cell_type": "code", + "execution_count": 550, + "id": "7d4c6930", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " 
}\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Test Query</th>\n", + " <th>Correct Response</th>\n", + " <th>Correct Source</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>What battles took place in New York City in th...</td>\n", + " <td>True</td>\n", + " <td>False</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>Who was elected as the mayor after the Great D...</td>\n", + " <td>False</td>\n", + " <td>False</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>How many tourists visited New York City in 2019?</td>\n", + " <td>False</td>\n", + " <td>False</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>What are the airports in New York City?</td>\n", + " <td>True</td>\n", + " <td>False</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>When was the first documented visit into New Y...</td>\n", + " <td>True</td>\n", + " <td>False</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Test Query Correct Response \\\n", + "0 What battles took place in New York City in th... True \n", + "1 Who was elected as the mayor after the Great D... False \n", + "2 How many tourists visited New York City in 2019? False \n", + "3 What are the airports in New York City? True \n", + "4 When was the first documented visit into New Y... 
True \n", + "\n", + " Correct Source \n", + "0 False \n", + "1 False \n", + "2 False \n", + "3 False \n", + "4 False " + ] + }, + "execution_count": 550, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "analyze_outcome(outcomes_tree_gpt3)" + ] + }, + { + "cell_type": "markdown", + "id": "30a9ba34", + "metadata": {}, + "source": [ + "### List Index + GPT4" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "bc0f05d1", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "outcomes_list_gpt4 = bm.test(list_index, llm_predictor_gpt4, response_mode=\"tree_summarize\", use_async=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "2d2e879d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Test Query</th>\n", + " <th>Correct Response</th>\n", + " <th>Correct Source</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>What battles took place in New York City in th...</td>\n", + " <td>False</td>\n", + " <td>True</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>Who was elected as the mayor after the Great D...</td>\n", + " <td>False</td>\n", + " <td>True</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>How many tourists visited New York City in 2019?</td>\n", + " <td>True</td>\n", + " <td>True</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>What are the airports in New York City?</td>\n", + " <td>True</td>\n", + " <td>True</td>\n", + " </tr>\n", + " <tr>\n", + " 
<th>4</th>\n", + " <td>When was the first documented visit into New Y...</td>\n", + " <td>True</td>\n", + " <td>True</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Test Query Correct Response \\\n", + "0 What battles took place in New York City in th... False \n", + "1 Who was elected as the mayor after the Great D... False \n", + "2 How many tourists visited New York City in 2019? True \n", + "3 What are the airports in New York City? True \n", + "4 When was the first documented visit into New Y... True \n", + "\n", + " Correct Source \n", + "0 True \n", + "1 True \n", + "2 True \n", + "3 True \n", + "4 True " + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "analyze_outcome(outcomes_list_gpt4)" + ] + }, + { + "cell_type": "markdown", + "id": "8cba793c", + "metadata": {}, + "source": [ + "### List Index + GPT3" + ] + }, + { + "cell_type": "code", + "execution_count": 501, + "id": "66cfa3fa", + "metadata": {}, + "outputs": [], + "source": [ + "outcomes_list_gpt3 = bm.test(list_index, llm_predictor_gpt3, response_mode=\"tree_summarize\", use_async=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 502, + "id": "06bc98d8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Test Query</th>\n", + " <th>Correct Response</th>\n", + " <th>Correct Source</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>What battles took place in New York City in th...</td>\n", + " 
<td>True</td>\n", + " <td>True</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>Who was elected as the mayor during the Great ...</td>\n", + " <td>True</td>\n", + " <td>True</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>How many tourists visited New York City in 2019?</td>\n", + " <td>False</td>\n", + " <td>True</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>What are the airports in New York City?</td>\n", + " <td>True</td>\n", + " <td>True</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>When was the first documented visit into New Y...</td>\n", + " <td>True</td>\n", + " <td>True</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Test Query Correct Response \\\n", + "0 What battles took place in New York City in th... True \n", + "1 Who was elected as the mayor during the Great ... True \n", + "2 How many tourists visited New York City in 2019? False \n", + "3 What are the airports in New York City? True \n", + "4 When was the first documented visit into New Y... 
True \n", + "\n", + " Correct Source \n", + "0 True \n", + "1 True \n", + "2 True \n", + "3 True \n", + "4 True " + ] + }, + "execution_count": 502, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "analyze_outcome(outcomes_list_gpt3)" + ] + }, + { + "cell_type": "markdown", + "id": "c4d0b3eb", + "metadata": {}, + "source": [ + "### List Index + ChatGPT" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "f146c74e", + "metadata": {}, + "outputs": [], + "source": [ + "outcomes_list_chatgpt = bm.test(list_index, llm_predictor_chatgpt, response_mode=\"tree_summarize\", use_async=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "8eb9d392", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Test Query</th>\n", + " <th>Correct Response</th>\n", + " <th>Correct Source</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>What battles took place in New York City in th...</td>\n", + " <td>False</td>\n", + " <td>True</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>Who was elected as the mayor after the Great D...</td>\n", + " <td>False</td>\n", + " <td>True</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>How many tourists visited New York City in 2019?</td>\n", + " <td>False</td>\n", + " <td>True</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>What are the airports in New York City?</td>\n", + " <td>True</td>\n", + " <td>True</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " 
<td>When was the first documented visit into New Y...</td>\n", + " <td>True</td>\n", + " <td>True</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Test Query Correct Response \\\n", + "0 What battles took place in New York City in th... False \n", + "1 Who was elected as the mayor after the Great D... False \n", + "2 How many tourists visited New York City in 2019? False \n", + "3 What are the airports in New York City? True \n", + "4 When was the first documented visit into New Y... True \n", + "\n", + " Correct Source \n", + "0 True \n", + "1 True \n", + "2 True \n", + "3 True \n", + "4 True " + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "analyze_outcome(outcomes_list_chatgpt)" + ] + }, + { + "cell_type": "markdown", + "id": "38fc1438", + "metadata": {}, + "source": [ + "### Vector Store Index + GPT4 " + ] + }, + { + "cell_type": "code", + "execution_count": 487, + "id": "5349d1e7", + "metadata": {}, + "outputs": [], + "source": [ + "outcomes_vector_gpt4 = bm.test(vector_index, llm_predictor_gpt4)" + ] + }, + { + "cell_type": "code", + "execution_count": 488, + "id": "7fc53e19", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Test Query</th>\n", + " <th>Correct Response</th>\n", + " <th>Correct Source</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>What battles took place in New York City in th...</td>\n", + " <td>True</td>\n", + " <td>True</td>\n", + " </tr>\n", + " 
<tr>\n", + " <th>1</th>\n", + " <td>Who was elected as the mayor during the Great ...</td>\n", + " <td>True</td>\n", + " <td>True</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>How many tourists visited New York City in 2019?</td>\n", + " <td>False</td>\n", + " <td>False</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>What are the airports in New York City?</td>\n", + " <td>True</td>\n", + " <td>True</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>When was the first documented visit into New Y...</td>\n", + " <td>True</td>\n", + " <td>True</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Test Query Correct Response \\\n", + "0 What battles took place in New York City in th... True \n", + "1 Who was elected as the mayor during the Great ... True \n", + "2 How many tourists visited New York City in 2019? False \n", + "3 What are the airports in New York City? True \n", + "4 When was the first documented visit into New Y... 
True \n", + "\n", + " Correct Source \n", + "0 True \n", + "1 True \n", + "2 False \n", + "3 True \n", + "4 True " + ] + }, + "execution_count": 488, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "analyze_outcome(outcomes_vector_gpt4)" + ] + }, + { + "cell_type": "markdown", + "id": "70eb711f", + "metadata": {}, + "source": [ + "### Vector Store Index + GPT3" + ] + }, + { + "cell_type": "code", + "execution_count": 644, + "id": "e35ebdf9", + "metadata": {}, + "outputs": [], + "source": [ + "outcomes_vector_gpt3 = bm.test(vector_index, llm_predictor_gpt3)" + ] + }, + { + "cell_type": "code", + "execution_count": 645, + "id": "95c49697", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Test Query</th>\n", + " <th>Correct Response</th>\n", + " <th>Correct Source</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>What battles took place in New York City in th...</td>\n", + " <td>True</td>\n", + " <td>True</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>Who was elected as the mayor after the Great D...</td>\n", + " <td>True</td>\n", + " <td>False</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>How many tourists visited New York City in 2019?</td>\n", + " <td>False</td>\n", + " <td>False</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>What are the airports in New York City?</td>\n", + " <td>True</td>\n", + " <td>False</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>When was the first documented visit 
into New Y...</td>\n", + " <td>True</td>\n", + " <td>False</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Test Query Correct Response \\\n", + "0 What battles took place in New York City in th... True \n", + "1 Who was elected as the mayor after the Great D... True \n", + "2 How many tourists visited New York City in 2019? False \n", + "3 What are the airports in New York City? True \n", + "4 When was the first documented visit into New Y... True \n", + "\n", + " Correct Source \n", + "0 True \n", + "1 False \n", + "2 False \n", + "3 False \n", + "4 False " + ] + }, + "execution_count": 645, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "analyze_outcome(outcomes_vector_gpt3)" + ] + }, + { + "cell_type": "markdown", + "id": "a36ba2ee", + "metadata": {}, + "source": [ + "## LLM based Evaluation" + ] + }, + { + "cell_type": "code", + "execution_count": 646, + "id": "59ff561c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Test Query</th>\n", + " <th>Correct Response</th>\n", + " <th>Correct Source</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>What battles took place in New York City in th...</td>\n", + " <td>True</td>\n", + " <td>True</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>Who was elected as the mayor after the Great D...</td>\n", + " <td>True</td>\n", + " <td>False</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>How many tourists visited New York City in 
2019?</td>\n", + " <td>False</td>\n", + " <td>False</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>What are the airports in New York City?</td>\n", + " <td>True</td>\n", + " <td>False</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>When was the first documented visit into New Y...</td>\n", + " <td>True</td>\n", + " <td>False</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Test Query Correct Response \\\n", + "0 What battles took place in New York City in th... True \n", + "1 Who was elected as the mayor after the Great D... True \n", + "2 How many tourists visited New York City in 2019? False \n", + "3 What are the airports in New York City? True \n", + "4 When was the first documented visit into New Y... True \n", + "\n", + " Correct Source \n", + "0 True \n", + "1 False \n", + "2 False \n", + "3 False \n", + "4 False " + ] + }, + "execution_count": 646, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "analyze_outcome(outcomes_vector_gpt3)" + ] + }, + { + "cell_type": "code", + "execution_count": 647, + "id": "e4ffaca6", + "metadata": {}, + "outputs": [], + "source": [ + "eval_gpt4 = analyze_outcome_llm(outcomes_vector_gpt3, llm_predictor_gpt4)" + ] + }, + { + "cell_type": "code", + "execution_count": 657, + "id": "85c4e415", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Test Query</th>\n", + " <th>Correct Response (LLM)</th>\n", + " <th>Correct Source (LLM)</th>\n", + " <th>Eval (LLM)</th>\n", + " </tr>\n", + " 
</thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>What battles took place in New York City in th...</td>\n", + " <td>True</td>\n", + " <td>True</td>\n", + " <td>Context is relevant: True\\nAnswer is correct: ...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>Who was elected as the mayor after the Great D...</td>\n", + " <td>True</td>\n", + " <td>False</td>\n", + " <td>Context is relevant: False\\nAnswer is correct:...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>How many tourists visited New York City in 2019?</td>\n", + " <td>True</td>\n", + " <td>False</td>\n", + " <td>Context is relevant: False\\nAnswer is correct:...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>What are the airports in New York City?</td>\n", + " <td>True</td>\n", + " <td>False</td>\n", + " <td>Context is relevant: False\\nAnswer is correct:...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>When was the first documented visit into New Y...</td>\n", + " <td>True</td>\n", + " <td>False</td>\n", + " <td>Context is relevant: False\\nAnswer is correct:...</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Test Query Correct Response (LLM) \\\n", + "0 What battles took place in New York City in th... True \n", + "1 Who was elected as the mayor after the Great D... True \n", + "2 How many tourists visited New York City in 2019? True \n", + "3 What are the airports in New York City? True \n", + "4 When was the first documented visit into New Y... True \n", + "\n", + " Correct Source (LLM) Eval (LLM) \n", + "0 True Context is relevant: True\\nAnswer is correct: ... \n", + "1 False Context is relevant: False\\nAnswer is correct:... \n", + "2 False Context is relevant: False\\nAnswer is correct:... \n", + "3 False Context is relevant: False\\nAnswer is correct:... \n", + "4 False Context is relevant: False\\nAnswer is correct:... 
" + ] + }, + "execution_count": 657, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "eval_gpt4" + ] + }, + { + "cell_type": "code", + "execution_count": 651, + "id": "3efb66d6", + "metadata": {}, + "outputs": [], + "source": [ + "eval_chatgpt = analyze_outcome_llm(outcomes_vector_gpt3, llm_predictor_chatgpt)" + ] + }, + { + "cell_type": "code", + "execution_count": 652, + "id": "4c452767", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Test Query</th>\n", + " <th>Correct Response (LLM)</th>\n", + " <th>Correct Source (LLM)</th>\n", + " <th>Eval (LLM)</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>What battles took place in New York City in th...</td>\n", + " <td>True</td>\n", + " <td>True</td>\n", + " <td>\\n\\nContext is relevant: True\\nAnswer is corre...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>Who was elected as the mayor after the Great D...</td>\n", + " <td>True</td>\n", + " <td>True</td>\n", + " <td>\\n\\nContext is relevant: True\\nAnswer is corre...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>How many tourists visited New York City in 2019?</td>\n", + " <td>False</td>\n", + " <td>False</td>\n", + " <td>\\n\\nContext is relevant: False\\nAnswer is corr...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>What are the airports in New York City?</td>\n", + " <td>True</td>\n", + " <td>False</td>\n", + " <td>\\n\\nContext is relevant: False\\nAnswer is corr...</td>\n", + " 
</tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>When was the first documented visit into New Y...</td>\n", + " <td>False</td>\n", + " <td>True</td>\n", + " <td>\\n\\nContext is relevant: True\\nAnswer is corre...</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Test Query Correct Response (LLM) \\\n", + "0 What battles took place in New York City in th... True \n", + "1 Who was elected as the mayor after the Great D... True \n", + "2 How many tourists visited New York City in 2019? False \n", + "3 What are the airports in New York City? True \n", + "4 When was the first documented visit into New Y... False \n", + "\n", + " Correct Source (LLM) Eval (LLM) \n", + "0 True \\n\\nContext is relevant: True\\nAnswer is corre... \n", + "1 True \\n\\nContext is relevant: True\\nAnswer is corre... \n", + "2 False \\n\\nContext is relevant: False\\nAnswer is corr... \n", + "3 False \\n\\nContext is relevant: False\\nAnswer is corr... \n", + "4 True \\n\\nContext is relevant: True\\nAnswer is corre... 
" + ] + }, + "execution_count": 652, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "eval_chatgpt" + ] + }, + { + "cell_type": "code", + "execution_count": 649, + "id": "61e8dad2", + "metadata": {}, + "outputs": [], + "source": [ + "eval_gpt3 = analyze_outcome_llm(outcomes_vector_gpt3, llm_predictor_gpt3)" + ] + }, + { + "cell_type": "code", + "execution_count": 650, + "id": "170400c3", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Test Query</th>\n", + " <th>Correct Response (LLM)</th>\n", + " <th>Correct Source (LLM)</th>\n", + " <th>Eval (LLM)</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>What battles took place in New York City in th...</td>\n", + " <td>True</td>\n", + " <td>True</td>\n", + " <td>\\n\\nContext is relevant: True\\nAnswer is corre...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>Who was elected as the mayor after the Great D...</td>\n", + " <td>True</td>\n", + " <td>True</td>\n", + " <td>\\n\\nContext is relevant: True\\nAnswer is corre...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>How many tourists visited New York City in 2019?</td>\n", + " <td>False</td>\n", + " <td>False</td>\n", + " <td>\\n\\nContext is relevant: False\\nAnswer is corr...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>What are the airports in New York City?</td>\n", + " <td>True</td>\n", + " <td>True</td>\n", + " <td>\\n\\nContext is relevant: True\\nAnswer is corre...</td>\n", + " 
</tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>When was the first documented visit into New Y...</td>\n", + " <td>True</td>\n", + " <td>True</td>\n", + " <td>\\n\\nContext is relevant: True\\nAnswer is corre...</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Test Query Correct Response (LLM) \\\n", + "0 What battles took place in New York City in th... True \n", + "1 Who was elected as the mayor after the Great D... True \n", + "2 How many tourists visited New York City in 2019? False \n", + "3 What are the airports in New York City? True \n", + "4 When was the first documented visit into New Y... True \n", + "\n", + " Correct Source (LLM) Eval (LLM) \n", + "0 True \\n\\nContext is relevant: True\\nAnswer is corre... \n", + "1 True \\n\\nContext is relevant: True\\nAnswer is corre... \n", + "2 False \\n\\nContext is relevant: False\\nAnswer is corr... \n", + "3 True \\n\\nContext is relevant: True\\nAnswer is corre... \n", + "4 True \\n\\nContext is relevant: True\\nAnswer is corre... 
" + ] + }, + "execution_count": 650, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "eval_gpt3" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/examples/test_wiki/TestNYC-Tree-GPT4.ipynb b/examples/test_wiki/TestNYC-Tree-GPT4.ipynb index 0e4865b09c..ceef063e4c 100644 --- a/examples/test_wiki/TestNYC-Tree-GPT4.ipynb +++ b/examples/test_wiki/TestNYC-Tree-GPT4.ipynb @@ -1,5 +1,13 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "cf0f06c0", + "metadata": {}, + "source": [ + "# GPT4 NYC Wiki Tree Index" + ] + }, { "cell_type": "code", "execution_count": 18, @@ -984,7 +992,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.10" + "version": "3.11.0" } }, "nbformat": 4, diff --git a/examples/test_wiki/TestNYC.ipynb b/examples/test_wiki/TestNYC.ipynb index dd0526b19e..4a91651ae4 100644 --- a/examples/test_wiki/TestNYC.ipynb +++ b/examples/test_wiki/TestNYC.ipynb @@ -1,180 +1,188 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "9080b39e", - "metadata": {}, - "outputs": [], - "source": [ - "import logging\n", - "import sys\n", - "\n", - "logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n", - "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b4b4387b-413e-4016-ba1e-88b3d9410a38", - "metadata": {}, - "outputs": [], - "source": [ - "# fetch \"New York City\" page from Wikipedia\n", - "from pathlib import Path\n", - "\n", - "import requests\n", - "response = 
requests.get(\n", - " 'https://en.wikipedia.org/w/api.php',\n", - " params={\n", - " 'action': 'query',\n", - " 'format': 'json',\n", - " 'titles': 'New York City',\n", - " 'prop': 'extracts',\n", - " # 'exintro': True,\n", - " 'explaintext': True,\n", - " }\n", - ").json()\n", - "page = next(iter(response['query']['pages'].values()))\n", - "nyc_text = page['extract']\n", - "\n", - "data_path = Path('data')\n", - "if not data_path.exists():\n", - " Path.mkdir(data_path)\n", - "\n", - "with open('data/nyc_text.txt', 'w') as fp:\n", - " fp.write(nyc_text)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f1a9eb90-335c-4214-8bb6-fd1edbe3ccbd", - "metadata": {}, - "outputs": [], - "source": [ - "# My OpenAI Key\n", - "import os\n", - "os.environ['OPENAI_API_KEY'] = \"INSERT OPENAI KEY\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8d0b2364-4806-4656-81e7-3f6e4b910b5b", - "metadata": {}, - "outputs": [], - "source": [ - "from llama_index import GPTTreeIndex, SimpleDirectoryReader" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1298bbb4-c99e-431e-93ef-eb32c0a2fc2a", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "documents = SimpleDirectoryReader('data').load_data()\n", - "index = GPTTreeIndex.from_documents(documents)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0b4fe9b6-5762-4e86-b51e-aac45d3ecdb1", - "metadata": {}, - "outputs": [], - "source": [ - "index.save_to_disk('index.json')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5eec265d-211b-4f26-b05b-5b4e7072bc6e", - "metadata": {}, - "outputs": [], - "source": [ - "# try loading\n", - "new_index = GPTTreeIndex.load_from_disk('index.json')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "68c9ebfe-b1b6-4f4e-9278-174346de8c90", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# GPT doesn't find the corresponding evidence in the 
leaf node, but still gives the correct answer\n", - "# set Logging to DEBUG for more detailed outputs\n", - "\n", - "new_index.query(\"What is the name of the professional women's basketball team in New York City?\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4fc3f18a-0ef9-453c-acf8-7aedd784cdcf", - "metadata": {}, - "outputs": [], - "source": [ - "# GPT doesn't find the corresponding evidence in the leaf node, but still gives the correct answer\n", - "# set Logging to DEBUG for more detailed outputs\n", - "\n", - "new_index.query(\"What battles took place in New York City in the American Revolution?\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "97f3ddf1-8dc2-4fb8-831f-2c06649e0955", - "metadata": {}, - "outputs": [], - "source": [ - "# GPT doesn't find the corresponding evidence in the leaf node, but still gives the correct answer\n", - "# set Logging to DEBUG for more detailed outputs\n", - "\n", - "new_index.query(\"What are the airports in New York City?\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "53265fd4-da98-4cf9-abfb-3f76105fd2ff", - "metadata": {}, - "outputs": [], - "source": [ - "# Try using embedding query\n", - "new_index.query(\"What are the airports in New York City?\", mode=\"embedding\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.10" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file + "cells": [ + { + "cell_type": "markdown", + "id": "b8a16e85", + "metadata": {}, + "source": [ + "# NYC Wiki Demo" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9080b39e", + "metadata": 
{}, + "outputs": [], + "source": [ + "import logging\n", + "import sys\n", + "\n", + "logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n", + "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b4b4387b-413e-4016-ba1e-88b3d9410a38", + "metadata": {}, + "outputs": [], + "source": [ + "# fetch \"New York City\" page from Wikipedia\n", + "from pathlib import Path\n", + "\n", + "import requests\n", + "response = requests.get(\n", + " 'https://en.wikipedia.org/w/api.php',\n", + " params={\n", + " 'action': 'query',\n", + " 'format': 'json',\n", + " 'titles': 'New York City',\n", + " 'prop': 'extracts',\n", + " # 'exintro': True,\n", + " 'explaintext': True,\n", + " }\n", + ").json()\n", + "page = next(iter(response['query']['pages'].values()))\n", + "nyc_text = page['extract']\n", + "\n", + "data_path = Path('data')\n", + "if not data_path.exists():\n", + " Path.mkdir(data_path)\n", + "\n", + "with open('data/nyc_text.txt', 'w') as fp:\n", + " fp.write(nyc_text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f1a9eb90-335c-4214-8bb6-fd1edbe3ccbd", + "metadata": {}, + "outputs": [], + "source": [ + "# My OpenAI Key\n", + "import os\n", + "os.environ['OPENAI_API_KEY'] = \"INSERT OPENAI KEY\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d0b2364-4806-4656-81e7-3f6e4b910b5b", + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index import GPTTreeIndex, SimpleDirectoryReader" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1298bbb4-c99e-431e-93ef-eb32c0a2fc2a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "documents = SimpleDirectoryReader('data').load_data()\n", + "index = GPTTreeIndex.from_documents(documents)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b4fe9b6-5762-4e86-b51e-aac45d3ecdb1", + "metadata": {}, + "outputs": [], + 
"source": [ + "index.save_to_disk('index.json')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5eec265d-211b-4f26-b05b-5b4e7072bc6e", + "metadata": {}, + "outputs": [], + "source": [ + "# try loading\n", + "new_index = GPTTreeIndex.load_from_disk('index.json')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "68c9ebfe-b1b6-4f4e-9278-174346de8c90", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# GPT doesn't find the corresponding evidence in the leaf node, but still gives the correct answer\n", + "# set Logging to DEBUG for more detailed outputs\n", + "\n", + "new_index.query(\"What is the name of the professional women's basketball team in New York City?\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4fc3f18a-0ef9-453c-acf8-7aedd784cdcf", + "metadata": {}, + "outputs": [], + "source": [ + "# GPT doesn't find the corresponding evidence in the leaf node, but still gives the correct answer\n", + "# set Logging to DEBUG for more detailed outputs\n", + "\n", + "new_index.query(\"What battles took place in New York City in the American Revolution?\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "97f3ddf1-8dc2-4fb8-831f-2c06649e0955", + "metadata": {}, + "outputs": [], + "source": [ + "# GPT doesn't find the corresponding evidence in the leaf node, but still gives the correct answer\n", + "# set Logging to DEBUG for more detailed outputs\n", + "\n", + "new_index.query(\"What are the airports in New York City?\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "53265fd4-da98-4cf9-abfb-3f76105fd2ff", + "metadata": {}, + "outputs": [], + "source": [ + "# Try using embedding query\n", + "new_index.query(\"What are the airports in New York City?\", mode=\"embedding\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + 
"codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/test_wiki/TestNYC_Embeddings.ipynb b/examples/test_wiki/TestNYC_Embeddings.ipynb index f2b79c1d07..5f61e4e587 100644 --- a/examples/test_wiki/TestNYC_Embeddings.ipynb +++ b/examples/test_wiki/TestNYC_Embeddings.ipynb @@ -1,445 +1,445 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "7a9f093e-e027-405b-ae3d-17dda9e30cd0", - "metadata": {}, - "source": [ - "# NYC Wikipedia Embeddings Demo" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cadae9f2", - "metadata": {}, - "outputs": [], - "source": [ - "import logging\n", - "import sys\n", - "\n", - "logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n", - "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))" - ] - }, - { - "cell_type": "markdown", - "id": "3e594a62-110e-40b3-ad1e-c99f49a4e537", - "metadata": {}, - "source": [ - "Demonstrate embedding capabilities in GPTTreeIndex and GPTListIndex" - ] - }, - { - "cell_type": "markdown", - "id": "b145f093-afb0-46b8-a81f-466af8478439", - "metadata": {}, - "source": [ - "### Setup + Data Prep" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d038dcc1", - "metadata": {}, - "outputs": [], - "source": [ - "import logging\n", - "import sys\n", - "\n", - "logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)\n", - "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b4b4387b-413e-4016-ba1e-88b3d9410a38", - "metadata": {}, - "outputs": [], - "source": [ - "# fetch \"New York City\" page from Wikipedia\n", - "from pathlib import Path\n", - "\n", - "import requests\n", - "response = requests.get(\n", - " 
'https://en.wikipedia.org/w/api.php',\n", - " params={\n", - " 'action': 'query',\n", - " 'format': 'json',\n", - " 'titles': 'New York City',\n", - " 'prop': 'extracts',\n", - " # 'exintro': True,\n", - " 'explaintext': True,\n", - " }\n", - ").json()\n", - "page = next(iter(response['query']['pages'].values()))\n", - "nyc_text = page['extract']\n", - "\n", - "data_path = Path('data')\n", - "if not data_path.exists():\n", - " Path.mkdir(data_path)\n", - "\n", - "with open('data/nyc_text.txt', 'w') as fp:\n", - " fp.write(nyc_text)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f1a9eb90-335c-4214-8bb6-fd1edbe3ccbd", - "metadata": {}, - "outputs": [], - "source": [ - "# My OpenAI Key\n", - "import os\n", - "os.environ['OPENAI_API_KEY'] = \"INSERT OPENAI KEY\"" - ] - }, - { - "cell_type": "markdown", - "id": "def4eca7-ba03-48e2-b18f-fd669b91a5fc", - "metadata": {}, - "source": [ - "### GPTTreeIndex - Embedding-based Query" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8d0b2364-4806-4656-81e7-3f6e4b910b5b", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "None of PyTorch, TensorFlow >= 2.0, or Flax have been found. 
Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n" - ] - } - ], - "source": [ - "from llama_index import GPTTreeIndex, SimpleDirectoryReader\n", - "from IPython.display import Markdown" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1298bbb4-c99e-431e-93ef-eb32c0a2fc2a", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "documents = SimpleDirectoryReader('data').load_data()\n", - "index = GPTTreeIndex.from_documents(documents)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0b4fe9b6-5762-4e86-b51e-aac45d3ecdb1", - "metadata": {}, - "outputs": [], - "source": [ - "index.save_to_disk('index.json')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5eec265d-211b-4f26-b05b-5b4e7072bc6e", - "metadata": {}, - "outputs": [], - "source": [ - "new_index = GPTTreeIndex.load_from_disk('index.json')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "68c9ebfe-b1b6-4f4e-9278-174346de8c90", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# set Logging to DEBUG for more detailed outputs\n", - "response = new_index.query(\"What is the name of the professional women's basketball team in New York City?\", mode=\"embedding\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e1000018-18de-410d-b6d9-c66bf37ccf1d", - "metadata": {}, - "outputs": [], - "source": [ - "display(Markdown(f\"<b>{response}</b>\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4fc3f18a-0ef9-453c-acf8-7aedd784cdcf", - "metadata": {}, - "outputs": [], - "source": [ - "response = new_index.query(\n", - " \"What battles took place in New York City in the American Revolution?\", \n", - " mode=\"embedding\"\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5588289b-9fdc-4b86-bab9-808c97be05e1", - "metadata": {}, - "outputs": [], - "source": [ - 
"display(Markdown(f\"<b>{response}</b>\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "53265fd4-da98-4cf9-abfb-3f76105fd2ff", - "metadata": {}, - "outputs": [], - "source": [ - "# set Logging to DEBUG for more detailed outputs\n", - "response = new_index.query(\"What are the airports in New York City?\", mode=\"embedding\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bc08060f-b031-4dc5-a980-427dd2407b5d", - "metadata": {}, - "outputs": [], - "source": [ - "display(Markdown(f\"<b>{response}</b>\"))" - ] - }, - { - "cell_type": "markdown", - "id": "63009734-deda-4159-9f2b-0af19720e913", - "metadata": {}, - "source": [ - "### GPTListIndex - Embedding-based Query" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fd8920ae-8115-457c-b092-21e50cc3bcc0", - "metadata": {}, - "outputs": [], - "source": [ - "from llama_index import GPTListIndex, SimpleDirectoryReader\n", - "from IPython.display import Markdown" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "27c8bbee-daf5-494d-ba66-b60142592a96", - "metadata": {}, - "outputs": [], - "source": [ - "documents = SimpleDirectoryReader('data').load_data()\n", - "index = GPTListIndex.from_documents(documents)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c3d5a589-ee75-40bd-9529-75f693874ed7", - "metadata": {}, - "outputs": [], - "source": [ - "index.save_to_disk('index_list_emb.json')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9dfbef52-50fb-46ca-b82b-c44cfa2301ef", - "metadata": {}, - "outputs": [], - "source": [ - "# try loading\n", - "new_index = GPTListIndex.load_from_disk('index_list_emb.json')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2cbf24c2-060e-4216-9188-a6746af1830d", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# set Logging to DEBUG for more detailed outputs\n", - "response = new_index.query(\"What is the 
name of the professional women's basketball team in New York City?\", mode=\"embedding\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "14e1b19f-fbf7-49fd-a96f-cbb37bafd498", - "metadata": {}, - "outputs": [], - "source": [ - "display(Markdown(f\"<b>{response}</b>\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "48b86c8d-9149-4395-9d52-6070597c814d", - "metadata": {}, - "outputs": [], - "source": [ - "# set Logging to DEBUG for more detailed outputs\n", - "response = new_index.query(\"What battles took place in New York City in the American Revolution?\", mode=\"embedding\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "57fbd90c-a8d3-4738-8531-e8f48a953167", - "metadata": {}, - "outputs": [], - "source": [ - "display(Markdown(f\"<b>{response}</b>\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7ab01446-9b07-4222-a577-eeb4617ce4fc", - "metadata": {}, - "outputs": [], - "source": [ - "# set Logging to DEBUG for more detailed outputs\n", - "response = new_index.query(\"What are the airports in New York City?\", mode=\"embedding\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "091afaea-a61e-4a7c-b2f1-7df387380b8b", - "metadata": {}, - "outputs": [], - "source": [ - "display(Markdown(f\"<b>{response}</b>\"))" - ] - }, - { - "cell_type": "markdown", - "id": "aca03087-d6cc-4d87-8ec6-185fa03d9fea", - "metadata": {}, - "source": [ - "## Try out other embeddings! 
\n", - "(courtesy of langchain)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "27c24411-7049-45c7-862c-0857c03db580", - "metadata": {}, - "outputs": [], - "source": [ - "from llama_index import GPTListIndex, SimpleDirectoryReader, ServiceContext\n", - "from IPython.display import Markdown" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b9ff1944-a06a-4b05-adae-a2ef25e74e8b", - "metadata": {}, - "outputs": [], - "source": [ - "# load in HF embedding model from langchain\n", - "from langchain.embeddings.huggingface import HuggingFaceEmbeddings\n", - "from llama_index import LangchainEmbedding\n", - "embed_model = LangchainEmbedding(HuggingFaceEmbeddings())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3049d517-05db-459b-9e32-711e380fda67", - "metadata": {}, - "outputs": [], - "source": [ - "# try loading index\n", - "new_index = GPTListIndex.load_from_disk('index_list_emb.json')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1494cabb-0123-408a-9d81-8e02db9b3acd", - "metadata": {}, - "outputs": [], - "source": [ - "# configure\n", - "service_context = ServiceContext.from_defaults(embed_model=embed_model)\n", - "\n", - "# set Logging to DEBUG for more detailed outputs\n", - "response = new_index.query(\n", - " \"What is the name of the professional women's basketball team in New York City?\", \n", - " mode=\"embedding\", \n", - " service_context=service_context, \n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4d96a2e7-4eb1-474e-b855-eca3efed1bad", - "metadata": {}, - "outputs": [], - "source": [ - "response" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "80510d3a-8bf8-47f2-b1d4-3d1bd0d5a1bb", - "metadata": {}, - "outputs": [], - "source": [] - } + "cells": [ + { + "cell_type": "markdown", + "id": "7a9f093e-e027-405b-ae3d-17dda9e30cd0", + "metadata": {}, + "source": [ + "# NYC Wikipedia Embeddings Demo" + ] + 
}, + { + "cell_type": "code", + "execution_count": null, + "id": "cadae9f2", + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "import sys\n", + "\n", + "logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n", + "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))" + ] + }, + { + "cell_type": "markdown", + "id": "3e594a62-110e-40b3-ad1e-c99f49a4e537", + "metadata": {}, + "source": [ + "Demonstrate embedding capabilities in GPTTreeIndex and GPTListIndex" + ] + }, + { + "cell_type": "markdown", + "id": "b145f093-afb0-46b8-a81f-466af8478439", + "metadata": {}, + "source": [ + "### Setup + Data Prep" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d038dcc1", + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "import sys\n", + "\n", + "logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)\n", + "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b4b4387b-413e-4016-ba1e-88b3d9410a38", + "metadata": {}, + "outputs": [], + "source": [ + "# fetch \"New York City\" page from Wikipedia\n", + "from pathlib import Path\n", + "\n", + "import requests\n", + "response = requests.get(\n", + " 'https://en.wikipedia.org/w/api.php',\n", + " params={\n", + " 'action': 'query',\n", + " 'format': 'json',\n", + " 'titles': 'New York City',\n", + " 'prop': 'extracts',\n", + " # 'exintro': True,\n", + " 'explaintext': True,\n", + " }\n", + ").json()\n", + "page = next(iter(response['query']['pages'].values()))\n", + "nyc_text = page['extract']\n", + "\n", + "data_path = Path('data')\n", + "if not data_path.exists():\n", + " Path.mkdir(data_path)\n", + "\n", + "with open('data/nyc_text.txt', 'w') as fp:\n", + " fp.write(nyc_text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f1a9eb90-335c-4214-8bb6-fd1edbe3ccbd", + "metadata": {}, + "outputs": [], + "source": [ + "# My 
OpenAI Key\n", + "import os\n", + "os.environ['OPENAI_API_KEY'] = \"INSERT OPENAI KEY\"" + ] + }, + { + "cell_type": "markdown", + "id": "def4eca7-ba03-48e2-b18f-fd669b91a5fc", + "metadata": {}, + "source": [ + "### GPTTreeIndex - Embedding-based Query" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d0b2364-4806-4656-81e7-3f6e4b910b5b", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n" + ] + } ], + "source": [ + "from llama_index import GPTTreeIndex, SimpleDirectoryReader\n", + "from IPython.display import Markdown" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1298bbb4-c99e-431e-93ef-eb32c0a2fc2a", "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.9" - } + "tags": [] }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file + "outputs": [], + "source": [ + "documents = SimpleDirectoryReader('data').load_data()\n", + "index = GPTTreeIndex.from_documents(documents)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b4fe9b6-5762-4e86-b51e-aac45d3ecdb1", + "metadata": {}, + "outputs": [], + "source": [ + "index.save_to_disk('index.json')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5eec265d-211b-4f26-b05b-5b4e7072bc6e", + "metadata": {}, + "outputs": [], + "source": [ + "new_index = GPTTreeIndex.load_from_disk('index.json')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"68c9ebfe-b1b6-4f4e-9278-174346de8c90", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# set Logging to DEBUG for more detailed outputs\n", + "response = new_index.query(\"What is the name of the professional women's basketball team in New York City?\", mode=\"embedding\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e1000018-18de-410d-b6d9-c66bf37ccf1d", + "metadata": {}, + "outputs": [], + "source": [ + "display(Markdown(f\"<b>{response}</b>\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4fc3f18a-0ef9-453c-acf8-7aedd784cdcf", + "metadata": {}, + "outputs": [], + "source": [ + "response = new_index.query(\n", + " \"What battles took place in New York City in the American Revolution?\", \n", + " mode=\"embedding\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5588289b-9fdc-4b86-bab9-808c97be05e1", + "metadata": {}, + "outputs": [], + "source": [ + "display(Markdown(f\"<b>{response}</b>\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "53265fd4-da98-4cf9-abfb-3f76105fd2ff", + "metadata": {}, + "outputs": [], + "source": [ + "# set Logging to DEBUG for more detailed outputs\n", + "response = new_index.query(\"What are the airports in New York City?\", mode=\"embedding\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc08060f-b031-4dc5-a980-427dd2407b5d", + "metadata": {}, + "outputs": [], + "source": [ + "display(Markdown(f\"<b>{response}</b>\"))" + ] + }, + { + "cell_type": "markdown", + "id": "63009734-deda-4159-9f2b-0af19720e913", + "metadata": {}, + "source": [ + "### GPTListIndex - Embedding-based Query" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fd8920ae-8115-457c-b092-21e50cc3bcc0", + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index import GPTListIndex, SimpleDirectoryReader\n", + "from IPython.display import Markdown" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "id": "27c8bbee-daf5-494d-ba66-b60142592a96", + "metadata": {}, + "outputs": [], + "source": [ + "documents = SimpleDirectoryReader('data').load_data()\n", + "index = GPTListIndex.from_documents(documents)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c3d5a589-ee75-40bd-9529-75f693874ed7", + "metadata": {}, + "outputs": [], + "source": [ + "index.save_to_disk('index_list_emb.json')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9dfbef52-50fb-46ca-b82b-c44cfa2301ef", + "metadata": {}, + "outputs": [], + "source": [ + "# try loading\n", + "new_index = GPTListIndex.load_from_disk('index_list_emb.json')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2cbf24c2-060e-4216-9188-a6746af1830d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# set Logging to DEBUG for more detailed outputs\n", + "response = new_index.query(\"What is the name of the professional women's basketball team in New York City?\", mode=\"embedding\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "14e1b19f-fbf7-49fd-a96f-cbb37bafd498", + "metadata": {}, + "outputs": [], + "source": [ + "display(Markdown(f\"<b>{response}</b>\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "48b86c8d-9149-4395-9d52-6070597c814d", + "metadata": {}, + "outputs": [], + "source": [ + "# set Logging to DEBUG for more detailed outputs\n", + "response = new_index.query(\"What battles took place in New York City in the American Revolution?\", mode=\"embedding\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "57fbd90c-a8d3-4738-8531-e8f48a953167", + "metadata": {}, + "outputs": [], + "source": [ + "display(Markdown(f\"<b>{response}</b>\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ab01446-9b07-4222-a577-eeb4617ce4fc", + "metadata": {}, + "outputs": [], + "source": [ + "# set Logging to 
DEBUG for more detailed outputs\n", + "response = new_index.query(\"What are the airports in New York City?\", mode=\"embedding\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "091afaea-a61e-4a7c-b2f1-7df387380b8b", + "metadata": {}, + "outputs": [], + "source": [ + "display(Markdown(f\"<b>{response}</b>\"))" + ] + }, + { + "cell_type": "markdown", + "id": "aca03087-d6cc-4d87-8ec6-185fa03d9fea", + "metadata": {}, + "source": [ + "## Try out other embeddings! \n", + "(courtesy of langchain)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27c24411-7049-45c7-862c-0857c03db580", + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index import GPTListIndex, SimpleDirectoryReader, ServiceContext\n", + "from IPython.display import Markdown" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b9ff1944-a06a-4b05-adae-a2ef25e74e8b", + "metadata": {}, + "outputs": [], + "source": [ + "# load in HF embedding model from langchain\n", + "from langchain.embeddings.huggingface import HuggingFaceEmbeddings\n", + "from llama_index import LangchainEmbedding\n", + "embed_model = LangchainEmbedding(HuggingFaceEmbeddings())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3049d517-05db-459b-9e32-711e380fda67", + "metadata": {}, + "outputs": [], + "source": [ + "# try loading index\n", + "new_index = GPTListIndex.load_from_disk('index_list_emb.json')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1494cabb-0123-408a-9d81-8e02db9b3acd", + "metadata": {}, + "outputs": [], + "source": [ + "# configure\n", + "service_context = ServiceContext.from_defaults(embed_model=embed_model)\n", + "\n", + "# set Logging to DEBUG for more detailed outputs\n", + "response = new_index.query(\n", + " \"What is the name of the professional women's basketball team in New York City?\", \n", + " mode=\"embedding\", \n", + " service_context=service_context, \n", + ")" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "id": "4d96a2e7-4eb1-474e-b855-eca3efed1bad", + "metadata": {}, + "outputs": [], + "source": [ + "response" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "80510d3a-8bf8-47f2-b1d4-3d1bd0d5a1bb", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/test_wiki/TestWikiReader.ipynb b/examples/test_wiki/TestWikiReader.ipynb index a919e7c340..d8a30a42d6 100644 --- a/examples/test_wiki/TestWikiReader.ipynb +++ b/examples/test_wiki/TestWikiReader.ipynb @@ -1,289 +1,297 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "52295407", - "metadata": {}, - "outputs": [], - "source": [ - "import logging\n", - "import sys\n", - "\n", - "logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n", - "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c5d167a5-81f8-4d2c-b42f-0a190577132f", - "metadata": {}, - "outputs": [], - "source": [ - "# My OpenAI Key\n", - "import os\n", - "os.environ['OPENAI_API_KEY'] = \"INSERT OPENAI KEY\"" - ] - }, - { - "cell_type": "markdown", - "id": "575750cc-479f-4b1f-b93f-4b00ed756d52", - "metadata": {}, - "source": [ - "## Wikipedia Reader + Keyword Table" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "id": "5f60348e-731d-4a95-bae2-426e184a914e", - "metadata": {}, - "outputs": [], - "source": [ - "from llama_index import GPTKeywordTableIndex, WikipediaReader" - ] - }, - { - "cell_type": 
"code", - "execution_count": 37, - "id": "952c4659-7fbb-447e-8caf-06916412cc37", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "page: Covid-19\n" - ] - } - ], - "source": [ - "wiki_docs = WikipediaReader().load_data(pages=['Covid-19'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3be202db-a4c7-41d2-ba7d-446d1f934830", - "metadata": {}, - "outputs": [], - "source": [ - "index = GPTKeywordTableIndex.from_documents(wiki_docs)" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "id": "7f5667a9-6758-447b-9af2-5e5a4d008a29", - "metadata": {}, - "outputs": [], - "source": [ - "# save index to docs\n", - "index.save_to_disk('index_covid.json')" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "id": "77340460-8319-474f-91eb-545ea5790127", - "metadata": {}, - "outputs": [], - "source": [ - "new_index = GPTKeywordTableIndex.load_from_disk('index_covid.json')" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "id": "28d7163e-f26f-4ad8-89d5-9cb7662c4d9c", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> Starting query: Which country included tocilizumab in treatment for covid-19?\n", - "Extracted keywords: ['tocilizumab', 'treatment', 'covid-19', 'covid', '19']\n", - "> Querying with idx: 1105763466456338724: of age or older weighing at least 40 kilograms ...\n", - "> Querying with idx: 2820318727532393752: Coronavirus disease 2019 (COVID-19) is a contag...\n", - "> Querying with idx: 897499143815831368: if the mask includes an exhalation valve, a wea...\n", - "> Querying with idx: 8628144746434065339: pulmonary fibrosis, cystic fibrosis. 
Evidence s...\n" - ] - }, - { - "data": { - "text/plain": [ - "'\\n\\nChina'" - ] - }, - "execution_count": 46, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# GPT doesn't find the corresponding evidence in the leaf node, but still gives the correct answer\n", - "# set Logging to DEBUG for more detailed outputs\n", - "new_index.query(\"Which country included tocilizumab in treatment for covid-19?\")" - ] - }, - { - "cell_type": "markdown", - "id": "addb0c4d-f1ae-40c1-8b69-5a989609672f", - "metadata": {}, - "source": [ - "## Wikipedia Reader + List" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a0fc24e1-eca5-4267-a962-f7fe0fc5c7df", - "metadata": {}, - "outputs": [], - "source": [ - "from llama_index import GPTListIndex, WikipediaReader" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "872a651a-ca4a-43e2-8b29-e4f667f9d3c5", - "metadata": {}, - "outputs": [], - "source": [ - "wiki_docs = WikipediaReader().load_data(pages=['Covid-19'])" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "37e85af0-b1c3-4c18-b239-6e32a7acf8d6", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> Adding chunk: Coronavirus disease 2019 (COVID-19) is a contag...\n", - "> Adding chunk: people with COVID‑19 and acute respiratory dist...\n", - "> Adding chunk: encourage or mandate the use of face masks or c...\n", - "> Adding chunk: have elevated liver enzymes, reflecting liver i...\n", - "> Adding chunk: insofar as their drug use may have caused lung ...\n", - "> Adding chunk: treatment of mild-to-moderate COVID‑19 in adult...\n" - ] - } - ], - "source": [ - "index = GPTListIndex.from_documents(wiki_docs)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "ec0119ef-786e-40ea-89af-f1ca0ad26de6", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> Starting query: Which country included 
tocilizumab in treatment for covid-19?\n" - ] - } - ], - "source": [ - "# set Logging to DEBUG for more detailed outputs\n", - "# with keyword lookup\n", - "response = index.query(\n", - " \"Which country included tocilizumab in treatment for covid-19?\", \n", - " required_keywords=[\"tocilizumab\"]\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "b4087a84-0939-444f-93f2-a1a7aa32db3f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'China'" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "display(response.strip())" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "fb155bc7-cb50-47b6-b92b-895852c2d8f4", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> Starting query: Which country included tocilizumab in treatment for covid-19?\n" - ] - } - ], - "source": [ - "# set Logging to DEBUG for more detailed outputs\n", - "# without keyword lookup\n", - "response = index.query(\n", - " \"Which country included tocilizumab in treatment for covid-19?\"\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "5b45c07a-4e76-4a45-86b6-6b2df1ef4f7b", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'There is no definite answer to this question as different countries have different treatment methods for covid-19. However, according to the context information, it is known that the virus SARS-CoV-2 can cause severe damage to various organs in the human body by inducing systemic inflammation. Therefore, it is possible that tocilizumab, which is a drug that inhibits the virus, may be included in treatment for covid-19 in some countries in order to prevent or reduce the severity of a cytokine storm. 
Additionally, passive antibodies may be used to treat people with active COVID-19 in order to help them recover.'" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "display(response.strip())" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.1" - } + "cells": [ + { + "cell_type": "markdown", + "id": "3e3d4fcb", + "metadata": {}, + "source": [ + "# Test Wiki Reader" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "52295407", + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "import sys\n", + "\n", + "logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n", + "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c5d167a5-81f8-4d2c-b42f-0a190577132f", + "metadata": {}, + "outputs": [], + "source": [ + "# My OpenAI Key\n", + "import os\n", + "os.environ['OPENAI_API_KEY'] = \"INSERT OPENAI KEY\"" + ] + }, + { + "cell_type": "markdown", + "id": "575750cc-479f-4b1f-b93f-4b00ed756d52", + "metadata": {}, + "source": [ + "## Wikipedia Reader + Keyword Table" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "5f60348e-731d-4a95-bae2-426e184a914e", + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index import GPTKeywordTableIndex, WikipediaReader" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "952c4659-7fbb-447e-8caf-06916412cc37", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page: Covid-19\n" + ] + } + ], + "source": [ + "wiki_docs = 
WikipediaReader().load_data(pages=['Covid-19'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3be202db-a4c7-41d2-ba7d-446d1f934830", + "metadata": {}, + "outputs": [], + "source": [ + "index = GPTKeywordTableIndex.from_documents(wiki_docs)" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "7f5667a9-6758-447b-9af2-5e5a4d008a29", + "metadata": {}, + "outputs": [], + "source": [ + "# save index to docs\n", + "index.save_to_disk('index_covid.json')" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "77340460-8319-474f-91eb-545ea5790127", + "metadata": {}, + "outputs": [], + "source": [ + "new_index = GPTKeywordTableIndex.load_from_disk('index_covid.json')" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "28d7163e-f26f-4ad8-89d5-9cb7662c4d9c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> Starting query: Which country included tocilizumab in treatment for covid-19?\n", + "Extracted keywords: ['tocilizumab', 'treatment', 'covid-19', 'covid', '19']\n", + "> Querying with idx: 1105763466456338724: of age or older weighing at least 40 kilograms ...\n", + "> Querying with idx: 2820318727532393752: Coronavirus disease 2019 (COVID-19) is a contag...\n", + "> Querying with idx: 897499143815831368: if the mask includes an exhalation valve, a wea...\n", + "> Querying with idx: 8628144746434065339: pulmonary fibrosis, cystic fibrosis. 
Evidence s...\n" + ] }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file + { + "data": { + "text/plain": [ + "'\\n\\nChina'" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# GPT doesn't find the corresponding evidence in the leaf node, but still gives the correct answer\n", + "# set Logging to DEBUG for more detailed outputs\n", + "new_index.query(\"Which country included tocilizumab in treatment for covid-19?\")" + ] + }, + { + "cell_type": "markdown", + "id": "addb0c4d-f1ae-40c1-8b69-5a989609672f", + "metadata": {}, + "source": [ + "## Wikipedia Reader + List" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0fc24e1-eca5-4267-a962-f7fe0fc5c7df", + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index import GPTListIndex, WikipediaReader" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "872a651a-ca4a-43e2-8b29-e4f667f9d3c5", + "metadata": {}, + "outputs": [], + "source": [ + "wiki_docs = WikipediaReader().load_data(pages=['Covid-19'])" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "37e85af0-b1c3-4c18-b239-6e32a7acf8d6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> Adding chunk: Coronavirus disease 2019 (COVID-19) is a contag...\n", + "> Adding chunk: people with COVID‑19 and acute respiratory dist...\n", + "> Adding chunk: encourage or mandate the use of face masks or c...\n", + "> Adding chunk: have elevated liver enzymes, reflecting liver i...\n", + "> Adding chunk: insofar as their drug use may have caused lung ...\n", + "> Adding chunk: treatment of mild-to-moderate COVID‑19 in adult...\n" + ] + } + ], + "source": [ + "index = GPTListIndex.from_documents(wiki_docs)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "ec0119ef-786e-40ea-89af-f1ca0ad26de6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "> Starting query: Which country included tocilizumab in treatment for covid-19?\n" + ] + } + ], + "source": [ + "# set Logging to DEBUG for more detailed outputs\n", + "# with keyword lookup\n", + "response = index.query(\n", + " \"Which country included tocilizumab in treatment for covid-19?\", \n", + " required_keywords=[\"tocilizumab\"]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "b4087a84-0939-444f-93f2-a1a7aa32db3f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'China'" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(response.strip())" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "fb155bc7-cb50-47b6-b92b-895852c2d8f4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> Starting query: Which country included tocilizumab in treatment for covid-19?\n" + ] + } + ], + "source": [ + "# set Logging to DEBUG for more detailed outputs\n", + "# without keyword lookup\n", + "response = index.query(\n", + " \"Which country included tocilizumab in treatment for covid-19?\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "5b45c07a-4e76-4a45-86b6-6b2df1ef4f7b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'There is no definite answer to this question as different countries have different treatment methods for covid-19. However, according to the context information, it is known that the virus SARS-CoV-2 can cause severe damage to various organs in the human body by inducing systemic inflammation. Therefore, it is possible that tocilizumab, which is a drug that inhibits the virus, may be included in treatment for covid-19 in some countries in order to prevent or reduce the severity of a cytokine storm. 
Additionally, passive antibodies may be used to treat people with active COVID-19 in order to help them recover.'" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(response.strip())" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/vector_indices/AsyncIndexCreationDemo.ipynb b/examples/vector_indices/AsyncIndexCreationDemo.ipynb index fcd75314e6..9f1b82f1f1 100644 --- a/examples/vector_indices/AsyncIndexCreationDemo.ipynb +++ b/examples/vector_indices/AsyncIndexCreationDemo.ipynb @@ -1,215 +1,215 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "f57c7b08", - "metadata": {}, - "source": [ - "# Async Index Creation Demo" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "5db0283d", - "metadata": {}, - "outputs": [], - "source": [ - "import time\n", - "\n", - "# Helps asyncio run within Jupyter\n", - "import nest_asyncio\n", - "nest_asyncio.apply()\n", - "\n", - "# My OpenAI Key\n", - "import os\n", - "os.environ['OPENAI_API_KEY'] = \"[YOUR_API_KEY]\"" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "50e3bb2e", - "metadata": {}, - "outputs": [], - "source": [ - "from llama_index import GPTSimpleVectorIndex, download_loader\n", - "\n", - "WikipediaReader = download_loader(\"WikipediaReader\")\n", - "\n", - "loader = WikipediaReader()\n", - "documents = loader.load_data(pages=['Berlin', 'Santiago', 'Moscow', 'Tokyo', 'Jakarta', 'Cairo', 'Bogota', 'Shanghai', 'Damascus'])" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "d14b17bf", - "metadata": {}, - 
"outputs": [ - { - "data": { - "text/plain": [ - "9" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(documents)" - ] - }, - { - "cell_type": "markdown", - "id": "2684824b", - "metadata": {}, - "source": [ - "9 Wikipedia articles downloaded as documents" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "4537def9", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:root:> [build_index_from_documents] Total LLM token usage: 0 tokens\n", - "INFO:root:> [build_index_from_documents] Total embedding token usage: 142295 tokens\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "7.691995083000052\n" - ] - } - ], - "source": [ - "start_time = time.perf_counter()\n", - "index = GPTSimpleVectorIndex.from_documents(documents)\n", - "duration = time.perf_counter() - start_time\n", - "print(duration)" - ] - }, - { - "cell_type": "markdown", - "id": "6374ac99", - "metadata": {}, - "source": [ - "Standard index creation took 7.69 seconds" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "60a7c522", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/engines/text-embedding-ada-002/embeddings processing_ms=245 request_id=314b145a07f65fd34e707f633cc1a444 response_code=200\n", - "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/engines/text-embedding-ada-002/embeddings processing_ms=432 request_id=bb9e796d0b8f9c2365b68de8a56009ff response_code=200\n", - "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/engines/text-embedding-ada-002/embeddings processing_ms=433 request_id=7a94707fe2f8916e9cdd8276a5748207 response_code=200\n", - "INFO:openai:message='OpenAI API response' 
path=https://api.openai.com/v1/engines/text-embedding-ada-002/embeddings processing_ms=499 request_id=cda679215293c3a13ed57c2eae3dc582 response_code=200\n", - "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/engines/text-embedding-ada-002/embeddings processing_ms=527 request_id=5e1c3e74aa3f9f950e4035f81a0f0a15 response_code=200\n", - "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/engines/text-embedding-ada-002/embeddings processing_ms=585 request_id=81983fe76eab95f73f82df881ff7b2d9 response_code=200\n", - "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/engines/text-embedding-ada-002/embeddings processing_ms=574 request_id=702a182b54a29a33719205f722378c8e response_code=200\n", - "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/engines/text-embedding-ada-002/embeddings processing_ms=575 request_id=d1df11775c59a3ba403dda253081f8eb response_code=200\n", - "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/engines/text-embedding-ada-002/embeddings processing_ms=575 request_id=47929f13469569527505b51958cd8e71 response_code=200\n", - "INFO:root:> [build_index_from_documents] Total LLM token usage: 0 tokens\n", - "INFO:root:> [build_index_from_documents] Total embedding token usage: 142295 tokens\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2.3730635830000892\n" - ] - } - ], - "source": [ - "start_time = time.perf_counter()\n", - "index = GPTSimpleVectorIndex(documents, use_async=True)\n", - "duration = time.perf_counter() - start_time\n", - "print(duration)" - ] - }, - { - "cell_type": "markdown", - "id": "8bd9de0b", - "metadata": {}, - "source": [ - "Async index creation took 2.37 seconds" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "d0db93cb", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:root:> [query] Total LLM token usage: 4075 
tokens\n", - "INFO:root:> [query] Total embedding token usage: 8 tokens\n" - ] - }, - { - "data": { - "text/plain": [ - "Response(response=\"\\n\\nThe name 'Jakarta' is derived from the word Jayakarta (Devanagari: जयकर्त) which is ultimately derived from the Sanskrit जय jaya (victorious), and कृत krta (accomplished, acquired), thus Jayakarta translates as 'victorious deed', 'complete act' or 'complete victory'. It was named for the Muslim troops of Fatahillah which successfully defeated and drove the Portuguese away from the city in 1527. Before it was called Jayakarta, the city was known as 'Sunda Kelapa'. Tomé Pires, a Portuguese apothecary wrote the name of the city on his magnum opus as Jacatra or Jacarta during his journey to East Indies. The city is located in a low-lying area ranging from −2 to 91 m (−7 to 299 ft) with an average elevation of 8 m (26 ft) above sea level with historically extensive swampy areas. Some parts of the city have been constructed on reclaimed tidal flats that occur around the area. Thirteen rivers flow through Jakarta, including the Ciliwung River, Kalibaru, Pesanggra\", source_nodes=[SourceNode(source_text=\"Jakarta (; Indonesian pronunciation: [dʒaˈkarta] (listen)), officially the Special Capital Region of Jakarta (Indonesian: Daerah Khusus Ibukota Jakarta), is the capital and largest city of Indonesia. Lying on the northwest coast of Java, the world's most populous island, Jakarta is the largest city in Southeast Asia and serves as the diplomatic capital of ASEAN.\\nThe city is the economic, cultural, and political centre of Indonesia. It possesses a province-level status and has a population of 10,562,088 as of mid-2021. 
Although Jakarta extends over only 664.01 km2 (256.38 sq mi) and thus has the smallest area of any Indonesian province, its metropolitan area covers 9,957.08 km2 (3,844.45 sq mi), which includes the satellite cities Bogor, Depok, Tangerang, South Tangerang, and Bekasi, and has an estimated population of 35 million as of 2021, making it the largest urban area in Indonesia and the second-largest in the world (after Tokyo). Jakarta ranks first among the Indonesian provinces in the human development index. Jakarta's business and employment opportunities, along with its ability to offer a potentially higher standard of living compared to other parts of the country, have attracted migrants from across the Indonesian archipelago, making it a melting pot of numerous cultures.\\nJakarta is one of the oldest continuously inhabited cities in Southeast Asia. Established in the fourth century as Sunda Kelapa, the city became an important trading port for the Sunda Kingdom. At one time, it was the de facto capital of the Dutch East Indies, when it was known as Batavia. Jakarta was officially a city within West Java until 1960 when its official status was changed to a province with special capital region distinction. As a province, its government consists of five administrative cities and one administrative regency. Jakarta is an alpha world city and is the seat of the ASEAN secretariat. Financial institutions such as the Bank of Indonesia, Indonesia Stock Exchange, and corporate headquarters of numerous Indonesian companies and multinational corporations are located in the city. In 2021, the city's GRP PPP was estimated at US$602.946 billion.\\nJakarta's main challenges include rapid urban growth, ecological breakdown, gridlocked traffic, congestion, and flooding. Jakarta is sinking up to 17 cm (6.7 inches) annually, which coupled with the rising of sea levels, has made the city more prone to flooding. Hence, it is one of the fastest-sinking capitals in the world. 
In response to these challenges, in August 2019, President Joko Widodo announced that the capital of Indonesia would be moved from Jakarta to the planned city of Nusantara, in the province of East Kalimantan on the island of Borneo.\\n\\n\\n== Name ==\\n\\nJakarta has been home to multiple settlements. Below is the list of names used during its existence:\\n\\nSunda Kelapa (397–1527)\\nJayakarta (1527–1619)\\nBatavia (1619–1942)\\nDjakarta (1942–1972)\\nJakarta (1972–present)The name 'Jakarta' is derived from the word Jayakarta (Devanagari: जयकर्त) which is ultimately derived from the Sanskrit जय jaya (victorious), and कृत krta (accomplished, acquired), thus Jayakarta translates as 'victorious deed', 'complete act' or 'complete victory'. It was named for the Muslim troops of Fatahillah which successfully defeated and drove the Portuguese away from the city in 1527. Before it was called Jayakarta, the city was known as 'Sunda Kelapa'. Tomé Pires, a Portuguese apothecary wrote the name of the city on his magnum opus as Jacatra or Jacarta during his journey to East Indies. \\nIn the 17th century, the city was known as Koningin van het Oosten (Queen of the Orient), a name that was given for the urban beauty of downtown Batavia's canals, mansions and ordered city layout. After expanding to the south in the 19th century, this nickname came to be more associated with the suburbs (e.g. Menteng and the area around Merdeka Square), with their wide lanes, green spaces and villas. During the Japanese occupation, the city was renamed as Jakaruta Tokubetsu-shi (ジャカルタ特別市, Jakarta Special City).\\n\\n\\n== History ==\\n\\n\\n=== Precolonial era ===\\n\\nThe north coast area of western Java including Jakarta was the location of prehistoric Buni culture that flourished from 400 BC to 100 AD. The area in and around modern Jakarta was part of the 4th-century Sundanese kingdom of Tarumanagara, one of the oldest Hindu kingdoms in Indonesia. 
The area of North Jakarta around Tugu became a populated settlement in the early 5th century. The Tugu inscription (probably written around 417 AD) discovered in Batutumbuh hamlet, Tugu village, Koja, North Jakarta, mentions that King Purnawarman of Tarumanagara undertook hydraulic projects; the irrigation and water drainage project of the Chandrabhaga river and the Gomati river near his capital. Following the decline of Tarumanagara, its territories, including the Jakarta area, became part of the Hindu Kingdom of Sunda. From the 7th to the early 13th century, the port of Sunda was under the Srivijaya maritime empire. According to the Chinese source, Chu-fan-chi, written circa 1225, Chou Ju-kua reported in the early 13th century that Srivijaya still ruled Sumatra, the Malay peninsula and western Java (Sunda). The source says the port of Sunda is strategic and thriving, mentioning pepper from Sunda as among the best in quality. The people worked in agriculture, and their houses were built on wooden piles. The harbour area became known as Sunda Kelapa, (Sundanese: ᮞᮥᮔ᮪ᮓ ᮊᮨᮜᮕ) and by the 14th century, it was an important trading port for the Sunda Kingdom.\\nThe first European fleet, four Portuguese ships from Malacca, arrived in 1513 while looking for a route for spices. The Sunda Kingdom made an alliance treaty with the Portuguese by allowing them to build a port in 1522 to defend against the rising power of Demak Sultanate from central Java. In 1527, Fatahillah, a Javanese general from Demak attacked and conquered Sunda Kelapa, driving out the Portuguese. Sunda Kelapa was renamed Jayakarta, and became a fiefdom of the Banten Sultanate, which became a major Southeast Asian trading centre.\\nThrough the relationship with Prince Jayawikarta of the Banten Sultanate, Dutch ships arrived in 1596. 
In 1602, the British East India Company's first voyage, commanded by Sir James Lancaster, arrived in Aceh and sailed on to Banten where they were allowed to build a trading post. This site became the centre of British trade in the Indonesian archipelago until 1682. Jayawikarta is thought to have made trading connections with the British merchants, rivals of the Dutch, by allowing them to build houses directly across from the Dutch buildings in 1615.\\n\\n\\n=== Colonial era ===\\n\\nWhen relations between Prince Jayawikarta and the Dutch deteriorated, his soldiers attacked the Dutch fortress. His army and the British, however, were defeated by the Dutch, in part owing to the timely arrival of Jan Pieterszoon Coen. The Dutch burned the British fort and forced them to retreat on their ships. The victory consolidated Dutch power, and they renamed the city Batavia in 1619.\\n\\nCommercial opportunities in the city attracted native and especially Chinese and Arab immigrants. This sudden population increase created burdens on the city. Tensions grew as the colonial government tried to restrict Chinese migration through deportations. Following a revolt, 5,000 Chinese were massacred by the Dutch and natives on 9 October 1740, and the following year, Chinese inhabitants were moved to Glodok outside the city walls. At the beginning of the 19th century, around 400 Arabs and Moors lived in Batavia, a number that changed little during the following decades. Among the commodities traded were fabrics, mainly imported cotton, batik and clothing worn by Arab communities.The city began to expand further south as epidemics in 1835 and 1870 forced residents to move away from the port. The Koningsplein, now Merdeka Square was completed in 1818, the housing park of Menteng was started in 1913, and Kebayoran Baru was the last Dutch-built residential area. 
By 1930, Batavia had more than 500,000 inhabitants, including 37,067 Europeans.On 5 March 1942, the Japanese captured Batavia from Dutch control, and the city was named Jakarta (Jakarta Special City (ジャカルタ特別市, Jakaruta tokubetsu-shi), under the special status that was assigned to the city). After the war, the Dutch name Batavia was internationally recognised until full Indonesian independence on 27 December 1949. The city, now renamed Jakarta, was officially proclaimed the national capital of Indonesia.\\n\\n\\n=== Independence era ===\\n\\nAfter World War II ended, Indonesian nationalists declared independence on 17 August 1945, and the government of Jakarta City was changed into the Jakarta National Administration in the following month. During the Indonesian National Revolution, Indonesian Republicans withdrew from Allied-occupied Jakarta and established their capital in Yogyakarta.\\nAfter securing full independence, Jakarta again became the national capital in 1950. With Jakarta selected to host the 1962 Asian Games, Soekarno, envisaging Jakarta as a great international city, instigated large government-funded projects with openly nationalistic and modernist architecture. Projects included a cloverleaf interchange, a major boulevard (Jalan MH Thamrin-Sudirman), monuments such as The National Monument, Hotel Indonesia, a shopping centre, and a new building intended to be the headquarters of CONEFO. In October 1965, Jakarta was the site of an abortive coup attempt in which six top generals were killed, precipitating a violent anti-communist purge which killed at least 500,000 people, including some ethnic Chinese. The event marked the beginning of Suharto's New Order. The first government was led by a mayor until the end of 1960 when the office was changed to that of a governor. The last mayor of Jakarta was Soediro until he was replaced by Soemarno Sosroatmodjo as governor. Based on law No. 
5 of 1974 relating to regional governments, Jakarta was confirmed as the capital of Indonesia and one of the country's then 26 provinces.In 1966, Jakarta was declared a 'special capital region' (Daerah Khusus Ibukota), with a status equivalent to that of a province. Lieutenant General Ali Sadikin served as governor from 1966 to 1977; he rehabilitated roads and bridges, encouraged the arts, built hospitals and a large number of schools. He cleared out slum dwellers for new development projects — some for the benefit of the Suharto family,— and attempted to eliminate rickshaws and ban street vendors. He began control of migration to the city to stem overcrowding and poverty. Foreign investment contributed to a real estate boom that transformed the face of Jakarta. The boom ended with the 1997 Asian financial crisis, putting Jakarta at the centre of violence, protest, and political manoeuvring.\\nAfter three decades in power, support for President Suharto began to wane. Tensions peaked when four students were shot dead at Trisakti University by security forces. Four days of riots and violence in 1998 ensued that killed an estimated 1,200, and destroyed or damaged 6,000 buildings, forcing Suharto to resign. Much of the rioting targeted Chinese Indonesians. In the post-Suharto era, Jakarta has remained the focal point of democratic change in Indonesia. Jemaah Islamiah-connected bombings occurred almost annually in the city between 2000 and 2005, with another in 2009. In August 2007, Jakarta held its first-ever election to choose a governor as part of a nationwide decentralisation program that allows direct local elections in several areas. Previously, governors were elected by the city's legislative body.During the Jokowi presidency, the Government adopted a plan to move Indonesia's capital to East Kalimantan.Between 2016 and 2017, a series of terrorist attacks rocked Jakarta with scenes of multiple suicide bombings and gunfire. 
In suspicion to its links, the Islamic State, the perpetrator led by Abu Bakr al-Baghdadi claimed responsibility for the attacks.\\n\\n\\n== Geography ==\\n\\nJakarta covers 699.5 km2 (270.1 sq mi), the smallest among any Indonesian provinces. However, its metropolitan area covers 6,392 km2 (2,468 sq mi), which extends into two of the bordering provinces of West Java and Banten. The Greater Jakarta area includes three bordering regencies (Bekasi Regency, Tangerang Regency and Bogor Regency) and five adjacent cities (Bogor, Depok, Bekasi, Tangerang and South Tangerang).\\n\\nJakarta is situated on the northwest coast of Java, at the mouth of the Ciliwung River on Jakarta Bay, an inlet of the Java Sea. It is strategically located near the Sunda Strait. The northern part of Jakarta is plain land, some areas of which are below sea level, and subject to frequent flooding. The southern parts of the city are hilly. It is one of only two Asian capital cities located in the southern hemisphere (along with East Timor's Dili). Officially, the area of the Jakarta Special District is 662 km2 (256 sq mi) of land area and 6,977 km2 (2,694 sq mi) of sea area. The Thousand Islands, which are administratively a part of Jakarta, are located in Jakarta Bay, north of the city.\\nJakarta lies in a low and flat alluvial plain, ranging from −2 to 91 m (−7 to 299 ft) with an average elevation of 8 m (26 ft) above sea level with historically extensive swampy areas. Some parts of the city have been constructed on reclaimed tidal flats that occur around the area. Thirteen rivers flow through Jakarta. They are Ciliwung River, Kalibaru, Pesanggrahan, Cipinang, Angke River, Maja, Mookervart, Krukut, Buaran, West Tarum, Cakung, Petukangan, Sunter River and Grogol River. They flow from the Puncak highlands to the south of the city, then across the city northwards towards the Java Sea. 
The Ciliwung River divides the city into the western and eastern districts.\\nThese rivers, combined with the wet season rains and insufficient\", doc_id='eeb6ef32-c857-44e2-b0c5-dff6e29a9cd7', extra_info=None, node_info={'start': 0, 'end': 13970}, similarity=0.8701780916463354)], extra_info=None)" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "index.query(\"What is the etymology of Jakarta?\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4d2e2a79", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.6" - } + "cells": [ + { + "cell_type": "markdown", + "id": "f57c7b08", + "metadata": {}, + "source": [ + "# Async Index Creation Demo" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "5db0283d", + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "\n", + "# Helps asyncio run within Jupyter\n", + "import nest_asyncio\n", + "nest_asyncio.apply()\n", + "\n", + "# My OpenAI Key\n", + "import os\n", + "os.environ['OPENAI_API_KEY'] = \"[YOUR_API_KEY]\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "50e3bb2e", + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index import GPTSimpleVectorIndex, download_loader\n", + "\n", + "WikipediaReader = download_loader(\"WikipediaReader\")\n", + "\n", + "loader = WikipediaReader()\n", + "documents = loader.load_data(pages=['Berlin', 'Santiago', 'Moscow', 'Tokyo', 'Jakarta', 'Cairo', 'Bogota', 'Shanghai', 'Damascus'])" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "d14b17bf", + 
"metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "9" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(documents)" + ] + }, + { + "cell_type": "markdown", + "id": "2684824b", + "metadata": {}, + "source": [ + "9 Wikipedia articles downloaded as documents" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "4537def9", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:root:> [build_index_from_documents] Total LLM token usage: 0 tokens\n", + "INFO:root:> [build_index_from_documents] Total embedding token usage: 142295 tokens\n" + ] }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file + { + "name": "stdout", + "output_type": "stream", + "text": [ + "7.691995083000052\n" + ] + } + ], + "source": [ + "start_time = time.perf_counter()\n", + "index = GPTSimpleVectorIndex.from_documents(documents)\n", + "duration = time.perf_counter() - start_time\n", + "print(duration)" + ] + }, + { + "cell_type": "markdown", + "id": "6374ac99", + "metadata": {}, + "source": [ + "Standard index creation took 7.69 seconds" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "60a7c522", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/engines/text-embedding-ada-002/embeddings processing_ms=245 request_id=314b145a07f65fd34e707f633cc1a444 response_code=200\n", + "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/engines/text-embedding-ada-002/embeddings processing_ms=432 request_id=bb9e796d0b8f9c2365b68de8a56009ff response_code=200\n", + "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/engines/text-embedding-ada-002/embeddings processing_ms=433 request_id=7a94707fe2f8916e9cdd8276a5748207 response_code=200\n", + 
"INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/engines/text-embedding-ada-002/embeddings processing_ms=499 request_id=cda679215293c3a13ed57c2eae3dc582 response_code=200\n", + "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/engines/text-embedding-ada-002/embeddings processing_ms=527 request_id=5e1c3e74aa3f9f950e4035f81a0f0a15 response_code=200\n", + "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/engines/text-embedding-ada-002/embeddings processing_ms=585 request_id=81983fe76eab95f73f82df881ff7b2d9 response_code=200\n", + "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/engines/text-embedding-ada-002/embeddings processing_ms=574 request_id=702a182b54a29a33719205f722378c8e response_code=200\n", + "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/engines/text-embedding-ada-002/embeddings processing_ms=575 request_id=d1df11775c59a3ba403dda253081f8eb response_code=200\n", + "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/engines/text-embedding-ada-002/embeddings processing_ms=575 request_id=47929f13469569527505b51958cd8e71 response_code=200\n", + "INFO:root:> [build_index_from_documents] Total LLM token usage: 0 tokens\n", + "INFO:root:> [build_index_from_documents] Total embedding token usage: 142295 tokens\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2.3730635830000892\n" + ] + } + ], + "source": [ + "start_time = time.perf_counter()\n", + "index = GPTSimpleVectorIndex(documents, use_async=True)\n", + "duration = time.perf_counter() - start_time\n", + "print(duration)" + ] + }, + { + "cell_type": "markdown", + "id": "8bd9de0b", + "metadata": {}, + "source": [ + "Async index creation took 2.37 seconds" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "d0db93cb", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"INFO:root:> [query] Total LLM token usage: 4075 tokens\n", + "INFO:root:> [query] Total embedding token usage: 8 tokens\n" + ] + }, + { + "data": { + "text/plain": [ + "Response(response=\"\\n\\nThe name 'Jakarta' is derived from the word Jayakarta (Devanagari: जयकर्त) which is ultimately derived from the Sanskrit जय jaya (victorious), and कृत krta (accomplished, acquired), thus Jayakarta translates as 'victorious deed', 'complete act' or 'complete victory'. It was named for the Muslim troops of Fatahillah which successfully defeated and drove the Portuguese away from the city in 1527. Before it was called Jayakarta, the city was known as 'Sunda Kelapa'. Tomé Pires, a Portuguese apothecary wrote the name of the city on his magnum opus as Jacatra or Jacarta during his journey to East Indies. The city is located in a low-lying area ranging from −2 to 91 m (−7 to 299 ft) with an average elevation of 8 m (26 ft) above sea level with historically extensive swampy areas. Some parts of the city have been constructed on reclaimed tidal flats that occur around the area. Thirteen rivers flow through Jakarta, including the Ciliwung River, Kalibaru, Pesanggra\", source_nodes=[SourceNode(source_text=\"Jakarta (; Indonesian pronunciation: [dʒaˈkarta] (listen)), officially the Special Capital Region of Jakarta (Indonesian: Daerah Khusus Ibukota Jakarta), is the capital and largest city of Indonesia. Lying on the northwest coast of Java, the world's most populous island, Jakarta is the largest city in Southeast Asia and serves as the diplomatic capital of ASEAN.\\nThe city is the economic, cultural, and political centre of Indonesia. It possesses a province-level status and has a population of 10,562,088 as of mid-2021. 
Although Jakarta extends over only 664.01 km2 (256.38 sq mi) and thus has the smallest area of any Indonesian province, its metropolitan area covers 9,957.08 km2 (3,844.45 sq mi), which includes the satellite cities Bogor, Depok, Tangerang, South Tangerang, and Bekasi, and has an estimated population of 35 million as of 2021, making it the largest urban area in Indonesia and the second-largest in the world (after Tokyo). Jakarta ranks first among the Indonesian provinces in the human development index. Jakarta's business and employment opportunities, along with its ability to offer a potentially higher standard of living compared to other parts of the country, have attracted migrants from across the Indonesian archipelago, making it a melting pot of numerous cultures.\\nJakarta is one of the oldest continuously inhabited cities in Southeast Asia. Established in the fourth century as Sunda Kelapa, the city became an important trading port for the Sunda Kingdom. At one time, it was the de facto capital of the Dutch East Indies, when it was known as Batavia. Jakarta was officially a city within West Java until 1960 when its official status was changed to a province with special capital region distinction. As a province, its government consists of five administrative cities and one administrative regency. Jakarta is an alpha world city and is the seat of the ASEAN secretariat. Financial institutions such as the Bank of Indonesia, Indonesia Stock Exchange, and corporate headquarters of numerous Indonesian companies and multinational corporations are located in the city. In 2021, the city's GRP PPP was estimated at US$602.946 billion.\\nJakarta's main challenges include rapid urban growth, ecological breakdown, gridlocked traffic, congestion, and flooding. Jakarta is sinking up to 17 cm (6.7 inches) annually, which coupled with the rising of sea levels, has made the city more prone to flooding. Hence, it is one of the fastest-sinking capitals in the world. 
In response to these challenges, in August 2019, President Joko Widodo announced that the capital of Indonesia would be moved from Jakarta to the planned city of Nusantara, in the province of East Kalimantan on the island of Borneo.\\n\\n\\n== Name ==\\n\\nJakarta has been home to multiple settlements. Below is the list of names used during its existence:\\n\\nSunda Kelapa (397–1527)\\nJayakarta (1527–1619)\\nBatavia (1619–1942)\\nDjakarta (1942–1972)\\nJakarta (1972–present)The name 'Jakarta' is derived from the word Jayakarta (Devanagari: जयकर्त) which is ultimately derived from the Sanskrit जय jaya (victorious), and कृत krta (accomplished, acquired), thus Jayakarta translates as 'victorious deed', 'complete act' or 'complete victory'. It was named for the Muslim troops of Fatahillah which successfully defeated and drove the Portuguese away from the city in 1527. Before it was called Jayakarta, the city was known as 'Sunda Kelapa'. Tomé Pires, a Portuguese apothecary wrote the name of the city on his magnum opus as Jacatra or Jacarta during his journey to East Indies. \\nIn the 17th century, the city was known as Koningin van het Oosten (Queen of the Orient), a name that was given for the urban beauty of downtown Batavia's canals, mansions and ordered city layout. After expanding to the south in the 19th century, this nickname came to be more associated with the suburbs (e.g. Menteng and the area around Merdeka Square), with their wide lanes, green spaces and villas. During the Japanese occupation, the city was renamed as Jakaruta Tokubetsu-shi (ジャカルタ特別市, Jakarta Special City).\\n\\n\\n== History ==\\n\\n\\n=== Precolonial era ===\\n\\nThe north coast area of western Java including Jakarta was the location of prehistoric Buni culture that flourished from 400 BC to 100 AD. The area in and around modern Jakarta was part of the 4th-century Sundanese kingdom of Tarumanagara, one of the oldest Hindu kingdoms in Indonesia. 
The area of North Jakarta around Tugu became a populated settlement in the early 5th century. The Tugu inscription (probably written around 417 AD) discovered in Batutumbuh hamlet, Tugu village, Koja, North Jakarta, mentions that King Purnawarman of Tarumanagara undertook hydraulic projects; the irrigation and water drainage project of the Chandrabhaga river and the Gomati river near his capital. Following the decline of Tarumanagara, its territories, including the Jakarta area, became part of the Hindu Kingdom of Sunda. From the 7th to the early 13th century, the port of Sunda was under the Srivijaya maritime empire. According to the Chinese source, Chu-fan-chi, written circa 1225, Chou Ju-kua reported in the early 13th century that Srivijaya still ruled Sumatra, the Malay peninsula and western Java (Sunda). The source says the port of Sunda is strategic and thriving, mentioning pepper from Sunda as among the best in quality. The people worked in agriculture, and their houses were built on wooden piles. The harbour area became known as Sunda Kelapa, (Sundanese: ᮞᮥᮔ᮪ᮓ ᮊᮨᮜᮕ) and by the 14th century, it was an important trading port for the Sunda Kingdom.\\nThe first European fleet, four Portuguese ships from Malacca, arrived in 1513 while looking for a route for spices. The Sunda Kingdom made an alliance treaty with the Portuguese by allowing them to build a port in 1522 to defend against the rising power of Demak Sultanate from central Java. In 1527, Fatahillah, a Javanese general from Demak attacked and conquered Sunda Kelapa, driving out the Portuguese. Sunda Kelapa was renamed Jayakarta, and became a fiefdom of the Banten Sultanate, which became a major Southeast Asian trading centre.\\nThrough the relationship with Prince Jayawikarta of the Banten Sultanate, Dutch ships arrived in 1596. 
In 1602, the British East India Company's first voyage, commanded by Sir James Lancaster, arrived in Aceh and sailed on to Banten where they were allowed to build a trading post. This site became the centre of British trade in the Indonesian archipelago until 1682. Jayawikarta is thought to have made trading connections with the British merchants, rivals of the Dutch, by allowing them to build houses directly across from the Dutch buildings in 1615.\\n\\n\\n=== Colonial era ===\\n\\nWhen relations between Prince Jayawikarta and the Dutch deteriorated, his soldiers attacked the Dutch fortress. His army and the British, however, were defeated by the Dutch, in part owing to the timely arrival of Jan Pieterszoon Coen. The Dutch burned the British fort and forced them to retreat on their ships. The victory consolidated Dutch power, and they renamed the city Batavia in 1619.\\n\\nCommercial opportunities in the city attracted native and especially Chinese and Arab immigrants. This sudden population increase created burdens on the city. Tensions grew as the colonial government tried to restrict Chinese migration through deportations. Following a revolt, 5,000 Chinese were massacred by the Dutch and natives on 9 October 1740, and the following year, Chinese inhabitants were moved to Glodok outside the city walls. At the beginning of the 19th century, around 400 Arabs and Moors lived in Batavia, a number that changed little during the following decades. Among the commodities traded were fabrics, mainly imported cotton, batik and clothing worn by Arab communities.The city began to expand further south as epidemics in 1835 and 1870 forced residents to move away from the port. The Koningsplein, now Merdeka Square was completed in 1818, the housing park of Menteng was started in 1913, and Kebayoran Baru was the last Dutch-built residential area. 
By 1930, Batavia had more than 500,000 inhabitants, including 37,067 Europeans.On 5 March 1942, the Japanese captured Batavia from Dutch control, and the city was named Jakarta (Jakarta Special City (ジャカルタ特別市, Jakaruta tokubetsu-shi), under the special status that was assigned to the city). After the war, the Dutch name Batavia was internationally recognised until full Indonesian independence on 27 December 1949. The city, now renamed Jakarta, was officially proclaimed the national capital of Indonesia.\\n\\n\\n=== Independence era ===\\n\\nAfter World War II ended, Indonesian nationalists declared independence on 17 August 1945, and the government of Jakarta City was changed into the Jakarta National Administration in the following month. During the Indonesian National Revolution, Indonesian Republicans withdrew from Allied-occupied Jakarta and established their capital in Yogyakarta.\\nAfter securing full independence, Jakarta again became the national capital in 1950. With Jakarta selected to host the 1962 Asian Games, Soekarno, envisaging Jakarta as a great international city, instigated large government-funded projects with openly nationalistic and modernist architecture. Projects included a cloverleaf interchange, a major boulevard (Jalan MH Thamrin-Sudirman), monuments such as The National Monument, Hotel Indonesia, a shopping centre, and a new building intended to be the headquarters of CONEFO. In October 1965, Jakarta was the site of an abortive coup attempt in which six top generals were killed, precipitating a violent anti-communist purge which killed at least 500,000 people, including some ethnic Chinese. The event marked the beginning of Suharto's New Order. The first government was led by a mayor until the end of 1960 when the office was changed to that of a governor. The last mayor of Jakarta was Soediro until he was replaced by Soemarno Sosroatmodjo as governor. Based on law No. 
5 of 1974 relating to regional governments, Jakarta was confirmed as the capital of Indonesia and one of the country's then 26 provinces.In 1966, Jakarta was declared a 'special capital region' (Daerah Khusus Ibukota), with a status equivalent to that of a province. Lieutenant General Ali Sadikin served as governor from 1966 to 1977; he rehabilitated roads and bridges, encouraged the arts, built hospitals and a large number of schools. He cleared out slum dwellers for new development projects — some for the benefit of the Suharto family,— and attempted to eliminate rickshaws and ban street vendors. He began control of migration to the city to stem overcrowding and poverty. Foreign investment contributed to a real estate boom that transformed the face of Jakarta. The boom ended with the 1997 Asian financial crisis, putting Jakarta at the centre of violence, protest, and political manoeuvring.\\nAfter three decades in power, support for President Suharto began to wane. Tensions peaked when four students were shot dead at Trisakti University by security forces. Four days of riots and violence in 1998 ensued that killed an estimated 1,200, and destroyed or damaged 6,000 buildings, forcing Suharto to resign. Much of the rioting targeted Chinese Indonesians. In the post-Suharto era, Jakarta has remained the focal point of democratic change in Indonesia. Jemaah Islamiah-connected bombings occurred almost annually in the city between 2000 and 2005, with another in 2009. In August 2007, Jakarta held its first-ever election to choose a governor as part of a nationwide decentralisation program that allows direct local elections in several areas. Previously, governors were elected by the city's legislative body.During the Jokowi presidency, the Government adopted a plan to move Indonesia's capital to East Kalimantan.Between 2016 and 2017, a series of terrorist attacks rocked Jakarta with scenes of multiple suicide bombings and gunfire. 
In suspicion to its links, the Islamic State, the perpetrator led by Abu Bakr al-Baghdadi claimed responsibility for the attacks.\\n\\n\\n== Geography ==\\n\\nJakarta covers 699.5 km2 (270.1 sq mi), the smallest among any Indonesian provinces. However, its metropolitan area covers 6,392 km2 (2,468 sq mi), which extends into two of the bordering provinces of West Java and Banten. The Greater Jakarta area includes three bordering regencies (Bekasi Regency, Tangerang Regency and Bogor Regency) and five adjacent cities (Bogor, Depok, Bekasi, Tangerang and South Tangerang).\\n\\nJakarta is situated on the northwest coast of Java, at the mouth of the Ciliwung River on Jakarta Bay, an inlet of the Java Sea. It is strategically located near the Sunda Strait. The northern part of Jakarta is plain land, some areas of which are below sea level, and subject to frequent flooding. The southern parts of the city are hilly. It is one of only two Asian capital cities located in the southern hemisphere (along with East Timor's Dili). Officially, the area of the Jakarta Special District is 662 km2 (256 sq mi) of land area and 6,977 km2 (2,694 sq mi) of sea area. The Thousand Islands, which are administratively a part of Jakarta, are located in Jakarta Bay, north of the city.\\nJakarta lies in a low and flat alluvial plain, ranging from −2 to 91 m (−7 to 299 ft) with an average elevation of 8 m (26 ft) above sea level with historically extensive swampy areas. Some parts of the city have been constructed on reclaimed tidal flats that occur around the area. Thirteen rivers flow through Jakarta. They are Ciliwung River, Kalibaru, Pesanggrahan, Cipinang, Angke River, Maja, Mookervart, Krukut, Buaran, West Tarum, Cakung, Petukangan, Sunter River and Grogol River. They flow from the Puncak highlands to the south of the city, then across the city northwards towards the Java Sea. 
The Ciliwung River divides the city into the western and eastern districts.\\nThese rivers, combined with the wet season rains and insufficient\", doc_id='eeb6ef32-c857-44e2-b0c5-dff6e29a9cd7', extra_info=None, node_info={'start': 0, 'end': 13970}, similarity=0.8701780916463354)], extra_info=None)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "index.query(\"What is the etymology of Jakarta?\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d2e2a79", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/vector_indices/SimpleIndexDemo-ChatGPT.ipynb b/examples/vector_indices/SimpleIndexDemo-ChatGPT.ipynb index 12a648260b..a74dcb2292 100644 --- a/examples/vector_indices/SimpleIndexDemo-ChatGPT.ipynb +++ b/examples/vector_indices/SimpleIndexDemo-ChatGPT.ipynb @@ -1,426 +1,433 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "9c48213d-6e6a-4c10-838a-2a7c710c3a05", - "metadata": {}, - "source": [ - "# Simple Index Demo + ChatGPT" - ] - }, - { - "cell_type": "markdown", - "id": "e34da56e-bc3b-433e-b65c-96edea4db5dd", - "metadata": {}, - "source": [ - "Use a very simple wrapper around the ChatGPT API" - ] - }, - { - "cell_type": "markdown", - "id": "50d3b817-b70e-4667-be4f-d3a0fe4bd119", - "metadata": {}, - "source": [ - "#### Load documents, build the GPTSimpleVectorIndex" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "690a6918-7c75-4f95-9ccc-d2c4a1fe00d7", - "metadata": { - "tags": [] - }, - "outputs": [ - { 
- "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jerryliu/Programming/gpt_index/.venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], - "source": [ - "import logging\n", - "import sys\n", - "\n", - "logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n", - "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))\n", - "\n", - "from gpt_index import GPTSimpleVectorIndex, SimpleDirectoryReader, LLMPredictor, ServiceContext\n", - "from langchain.chat_models import ChatOpenAI\n", - "from IPython.display import Markdown, display" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "03d1691e-544b-454f-825b-5ee12f7faa8a", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# load documents\n", - "documents = SimpleDirectoryReader('../paul_graham_essay/data').load_data()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6cc980e8-f4e1-4fad-93f8-ab1bbaa874f3", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "INFO:gpt_index.token_counter.token_counter:> [build_index_from_documents] Total LLM token usage: 0 tokens\n", - "> [build_index_from_documents] Total LLM token usage: 0 tokens\n", - "INFO:gpt_index.token_counter.token_counter:> [build_index_from_documents] Total embedding token usage: 18579 tokens\n", - "> [build_index_from_documents] Total embedding token usage: 18579 tokens\n" - ] - } - ], - "source": [ - "# LLM Predictor (gpt-3.5-turbo) + service context\n", - "llm_predictor = LLMPredictor(llm=ChatOpenAI(temperature=0, model_name=\"gpt-3.5-turbo\"))\n", - "service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, chunk_size_limit=512)" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "id": "ad144ee7-96da-4dd6-be00-fd6cf0c78e58", - "metadata": { - "scrolled": true, - "tags": [] - }, - "outputs": [], - "source": [ - "index = GPTSimpleVectorIndex.from_documents(documents, service_context=service_context)" - ] - }, - { - "cell_type": "markdown", - "id": "b6caf93b-6345-4c65-a346-a95b0f1746c4", - "metadata": {}, - "source": [ - "#### Query Index" - ] - }, - { - "cell_type": "markdown", - "id": "83e2905e-3789-4793-82b9-0ac488246824", - "metadata": {}, - "source": [ - "By default, with the help of langchain's PromptSelector abstraction, we use \n", - "a modified refine prompt tailored for ChatGPT-use if the ChatGPT model is used." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "85466fdf-93f3-4cb1-a5f9-0056a8245a6f", - "metadata": { - "scrolled": true, - "tags": [] - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "KeyboardInterrupt\n", - "\n" - ] - } - ], - "source": [ - "response = index.query(\n", - " \"What did the author do growing up?\", \n", - " service_context=service_context,\n", - " similarity_top_k=3\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "bdda1b2c-ae46-47cf-91d7-3153e8d0473b", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/markdown": [ - "<b>Before college, the author worked on writing essays and programming. They wrote short stories and essays on various topics. They also tried programming on an IBM 1401 in 9th grade using an early version of Fortran. In college, the author studied painting and art history and spent a year in Florence, Italy, where they painted portraits of people and still life. After college, the author worked on software development and co-founded a startup called Viaweb. Later, the author spent three months writing essays in 2015 before returning to work on Bel, a programming language they had been developing for years. 
The author worked intensively on Bel, often having chunks of the code in their head and working on it while watching their children play. Most of Bel was written while the author was living in England, where they moved with their family in 2016. In the fall of 2019, Bel was finally finished, and the author resumed writing essays and thinking about other projects.</b>" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "display(Markdown(f\"<b>{response}</b>\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ec88df57", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "response = index.query(\n", - " \"What did the author do during his time at RISD?\", \n", - " service_context=service_context,\n", - " similarity_top_k=5\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "67e8e675-1b03-423a-b53e-23ab278ba03b", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/markdown": [ - "<b>The author attended RISD to learn how to paint and took a color class there. However, he mostly taught himself to paint and dropped out in 1993. 
He then moved to New York City to pursue his career as an artist.</b>" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "display(Markdown(f\"<b>{response}</b>\"))" - ] - }, - { - "cell_type": "markdown", - "id": "88ca1808-d112-4c28-b110-b65dcc9b7207", - "metadata": {}, - "source": [ - "**Refine Prompt**: Here is the chat refine prompt " - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "2f0c270d-9de5-40bf-88fc-83a360523db0", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from gpt_index.prompts.chat_prompts import CHAT_REFINE_PROMPT" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "4db38651-9790-4a61-ac3d-689ce6dfa369", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'input_variables': ['context_msg', 'query_str', 'existing_answer'],\n", - " 'output_parser': None,\n", - " 'partial_variables': {},\n", - " 'messages': [HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['query_str'], output_parser=None, partial_variables={}, template='{query_str}', template_format='f-string', validate_template=True), additional_kwargs={}),\n", - " AIMessagePromptTemplate(prompt=PromptTemplate(input_variables=['existing_answer'], output_parser=None, partial_variables={}, template='{existing_answer}', template_format='f-string', validate_template=True), additional_kwargs={}),\n", - " HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context_msg'], output_parser=None, partial_variables={}, template=\"We have the opportunity to refine the above answer (only if needed) with some more context below.\\n------------\\n{context_msg}\\n------------\\nGiven the new context, refine the original answer to better answer the question. 
If the context isn't useful, output the original answer again.\", template_format='f-string', validate_template=True), additional_kwargs={})]}" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dict(CHAT_REFINE_PROMPT.prompt)" - ] - }, - { - "cell_type": "markdown", - "id": "6cb664e8-f53f-4d6c-a086-1f2784cc1dc8", - "metadata": {}, - "source": [ - "#### Query Index (Using the standard Refine Prompt)\n", - "\n", - "If we use the \"standard\" refine prompt (where the prompt is one text template instead of multiple messages), we find that the results over ChatGPT are worse. " - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "29c416f8-d5ab-47d6-8b16-f615bfa58219", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from gpt_index.prompts.default_prompts import DEFAULT_REFINE_PROMPT" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3df1acc4-735a-48ac-9fb4-73d9d7eabc02", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "response = index.query(\n", - " \"What did the author do during his time at RISD?\", \n", - " service_context=service_context,\n", - " refine_template=DEFAULT_REFINE_PROMPT,\n", - " similarity_top_k=5\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "id": "b8938077-6527-4008-8d0c-af7a8178ff10", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/markdown": [ - "<b>\n", - "\n", - "The existing answer is not relevant to the new context provided and therefore the original answer remains sufficient. 
The author dropped out of RISD in 1993 and moved to New York to pursue painting.</b>" - ], - "text/plain": [ - "<IPython.core.display.Markdown object>" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "display(Markdown(f\"<b>{response}</b>\"))" - ] - }, - { - "cell_type": "markdown", - "id": "2e024521-97b5-417f-8c27-950983f52cda", - "metadata": {}, - "source": [ - "### [Beta] Use ChatGPTLLMPredictor\n", - "\n", - "Very simple GPT-Index-native ChatGPT wrapper. Note: this is a beta feature. If this doesn't work please\n", - "use the suggested flow above." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "a49d9a1b-21fb-4153-ad24-191a13513d64", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# use ChatGPT [beta]\n", - "from gpt_index.llm_predictor.chatgpt import ChatGPTLLMPredictor\n", - "from langchain.prompts.chat import SystemMessagePromptTemplate\n", - "\n", - "llm_predictor = ChatGPTLLMPredictor()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "596af2aa-7ddf-41f2-801b-4a24a4980dd8", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "response = index.query(\n", - " \"What did the author do during his time at RISD?\", \n", - " service_context=service_context\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "771e20ba-ccba-447e-89d6-8d731accc6f3", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "'Arrr, the scallywag went to RISD and had to do the foundation classes in fundamental subjects like drawing, color, and design.'" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "str(response)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "829bb58d-910e-4a19-bbea-8a9546d24b92", - "metadata": {}, - "outputs": [], - "source": [] - } + "cells": [ + { + "cell_type": "markdown", + "id": 
"9c48213d-6e6a-4c10-838a-2a7c710c3a05", + "metadata": {}, + "source": [ + "# Simple Index Demo + ChatGPT" + ] + }, + { + "cell_type": "markdown", + "id": "e34da56e-bc3b-433e-b65c-96edea4db5dd", + "metadata": {}, + "source": [ + "Use a very simple wrapper around the ChatGPT API" + ] + }, + { + "cell_type": "markdown", + "id": "50d3b817-b70e-4667-be4f-d3a0fe4bd119", + "metadata": {}, + "source": [ + "#### Load documents, build the GPTSimpleVectorIndex" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "690a6918-7c75-4f95-9ccc-d2c4a1fe00d7", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/jerryliu/Programming/gpt_index/.venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "import logging\n", + "import sys\n", + "\n", + "logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n", + "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))\n", + "\n", + "from gpt_index import GPTSimpleVectorIndex, SimpleDirectoryReader, LLMPredictor, ServiceContext\n", + "from langchain.chat_models import ChatOpenAI\n", + "from IPython.display import Markdown, display" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "03d1691e-544b-454f-825b-5ee12f7faa8a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# load documents\n", + "documents = SimpleDirectoryReader('../paul_graham_essay/data').load_data()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6cc980e8-f4e1-4fad-93f8-ab1bbaa874f3", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:gpt_index.token_counter.token_counter:> [build_index_from_documents] Total LLM token usage: 0 
tokens\n", + "> [build_index_from_documents] Total LLM token usage: 0 tokens\n", + "INFO:gpt_index.token_counter.token_counter:> [build_index_from_documents] Total embedding token usage: 18579 tokens\n", + "> [build_index_from_documents] Total embedding token usage: 18579 tokens\n" + ] + } + ], + "source": [ + "# LLM Predictor (gpt-3.5-turbo) + service context\n", + "llm_predictor = LLMPredictor(llm=ChatOpenAI(temperature=0, model_name=\"gpt-3.5-turbo\"))\n", + "service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, chunk_size_limit=512)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ad144ee7-96da-4dd6-be00-fd6cf0c78e58", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "index = GPTSimpleVectorIndex.from_documents(documents, service_context=service_context)" + ] + }, + { + "cell_type": "markdown", + "id": "b6caf93b-6345-4c65-a346-a95b0f1746c4", + "metadata": {}, + "source": [ + "#### Query Index" + ] + }, + { + "cell_type": "markdown", + "id": "83e2905e-3789-4793-82b9-0ac488246824", + "metadata": {}, + "source": [ + "By default, with the help of langchain's PromptSelector abstraction, we use \n", + "a modified refine prompt tailored for ChatGPT-use if the ChatGPT model is used." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "85466fdf-93f3-4cb1-a5f9-0056a8245a6f", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "KeyboardInterrupt\n", + "\n" + ] + } + ], + "source": [ + "response = index.query(\n", + " \"What did the author do growing up?\", \n", + " service_context=service_context,\n", + " similarity_top_k=3\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "bdda1b2c-ae46-47cf-91d7-3153e8d0473b", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/markdown": [ + "<b>Before college, the author worked on writing essays and programming. They wrote short stories and essays on various topics. They also tried programming on an IBM 1401 in 9th grade using an early version of Fortran. In college, the author studied painting and art history and spent a year in Florence, Italy, where they painted portraits of people and still life. After college, the author worked on software development and co-founded a startup called Viaweb. Later, the author spent three months writing essays in 2015 before returning to work on Bel, a programming language they had been developing for years. The author worked intensively on Bel, often having chunks of the code in their head and working on it while watching their children play. Most of Bel was written while the author was living in England, where they moved with their family in 2016. 
In the fall of 2019, Bel was finally finished, and the author resumed writing essays and thinking about other projects.</b>" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(Markdown(f\"<b>{response}</b>\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ec88df57", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "response = index.query(\n", + " \"What did the author do during his time at RISD?\", \n", + " service_context=service_context,\n", + " similarity_top_k=5\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "67e8e675-1b03-423a-b53e-23ab278ba03b", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/markdown": [ + "<b>The author attended RISD to learn how to paint and took a color class there. However, he mostly taught himself to paint and dropped out in 1993. He then moved to New York City to pursue his career as an artist.</b>" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(Markdown(f\"<b>{response}</b>\"))" + ] + }, + { + "cell_type": "markdown", + "id": "88ca1808-d112-4c28-b110-b65dcc9b7207", + "metadata": {}, + "source": [ + "**Refine Prompt**: Here is the chat refine prompt " + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "2f0c270d-9de5-40bf-88fc-83a360523db0", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from gpt_index.prompts.chat_prompts import CHAT_REFINE_PROMPT" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "4db38651-9790-4a61-ac3d-689ce6dfa369", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'input_variables': ['context_msg', 'query_str', 'existing_answer'],\n", + " 'output_parser': None,\n", + " 'partial_variables': {},\n", + " 
'messages': [HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['query_str'], output_parser=None, partial_variables={}, template='{query_str}', template_format='f-string', validate_template=True), additional_kwargs={}),\n", + " AIMessagePromptTemplate(prompt=PromptTemplate(input_variables=['existing_answer'], output_parser=None, partial_variables={}, template='{existing_answer}', template_format='f-string', validate_template=True), additional_kwargs={}),\n", + " HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context_msg'], output_parser=None, partial_variables={}, template=\"We have the opportunity to refine the above answer (only if needed) with some more context below.\\n------------\\n{context_msg}\\n------------\\nGiven the new context, refine the original answer to better answer the question. If the context isn't useful, output the original answer again.\", template_format='f-string', validate_template=True), additional_kwargs={})]}" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } ], + "source": [ + "dict(CHAT_REFINE_PROMPT.prompt)" + ] + }, + { + "cell_type": "markdown", + "id": "6cb664e8-f53f-4d6c-a086-1f2784cc1dc8", + "metadata": {}, + "source": [ + "#### Query Index (Using the standard Refine Prompt)\n", + "\n", + "If we use the \"standard\" refine prompt (where the prompt is one text template instead of multiple messages), we find that the results over ChatGPT are worse. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "29c416f8-d5ab-47d6-8b16-f615bfa58219", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from gpt_index.prompts.default_prompts import DEFAULT_REFINE_PROMPT" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3df1acc4-735a-48ac-9fb4-73d9d7eabc02", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "response = index.query(\n", + " \"What did the author do during his time at RISD?\", \n", + " service_context=service_context,\n", + " refine_template=DEFAULT_REFINE_PROMPT,\n", + " similarity_top_k=5\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "b8938077-6527-4008-8d0c-af7a8178ff10", "metadata": { - "kernelspec": { - "display_name": "llama_index", - "language": "python", - "name": "llama_index" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.10" - } + "tags": [] + }, + "outputs": [ + { + "data": { + "text/markdown": [ + "<b>\n", + "\n", + "The existing answer is not relevant to the new context provided and therefore the original answer remains sufficient. The author dropped out of RISD in 1993 and moved to New York to pursue painting.</b>" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(Markdown(f\"<b>{response}</b>\"))" + ] + }, + { + "cell_type": "markdown", + "id": "2e024521-97b5-417f-8c27-950983f52cda", + "metadata": {}, + "source": [ + "### [Beta] Use ChatGPTLLMPredictor\n", + "\n", + "Very simple GPT-Index-native ChatGPT wrapper. Note: this is a beta feature. If this doesn't work please\n", + "use the suggested flow above." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a49d9a1b-21fb-4153-ad24-191a13513d64", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# use ChatGPT [beta]\n", + "from llama_index.llm_predictor.chatgpt import ChatGPTLLMPredictor\n", + "from langchain.prompts.chat import SystemMessagePromptTemplate\n", + "\n", + "prepend_messages = [\n", + " SystemMessagePromptTemplate.from_template(\n", + " \"Talk like a pirate in every response.\"\n", + " )\n", + "]\n", + "\n", + "llm_predictor = ChatGPTLLMPredictor(prepend_messages=prepend_messages)\n", + "service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, chunk_size_limit=512)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "596af2aa-7ddf-41f2-801b-4a24a4980dd8", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "response = index.query(\n", + " \"What did the author do during his time at RISD?\", \n", + " service_context=service_context\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "771e20ba-ccba-447e-89d6-8d731accc6f3", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'Arrr, the scallywag went to RISD and had to do the foundation classes in fundamental subjects like drawing, color, and design.'" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "str(response)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "829bb58d-910e-4a19-bbea-8a9546d24b92", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "llama_index", + "language": "python", + "name": "llama_index" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + 
"nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/vector_indices/SimpleIndexDemo-multistep.ipynb b/examples/vector_indices/SimpleIndexDemo-multistep.ipynb index 120c2cd156..c5cb8a3f71 100644 --- a/examples/vector_indices/SimpleIndexDemo-multistep.ipynb +++ b/examples/vector_indices/SimpleIndexDemo-multistep.ipynb @@ -5,7 +5,7 @@ "id": "9c48213d-6e6a-4c10-838a-2a7c710c3a05", "metadata": {}, "source": [ - "# Simple Index Demo" + "# Simple Index Demo - Multistep" ] }, { @@ -739,7 +739,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.10" + "version": "3.11.0" } }, "nbformat": 4, diff --git a/examples/vector_indices/SimpleIndexDemo-streaming.ipynb b/examples/vector_indices/SimpleIndexDemo-streaming.ipynb index af20ea2071..672d0a652a 100644 --- a/examples/vector_indices/SimpleIndexDemo-streaming.ipynb +++ b/examples/vector_indices/SimpleIndexDemo-streaming.ipynb @@ -1,140 +1,140 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "9c48213d-6e6a-4c10-838a-2a7c710c3a05", - "metadata": {}, - "source": [ - "# Simple Index Demo" - ] - }, - { - "cell_type": "markdown", - "id": "50d3b817-b70e-4667-be4f-d3a0fe4bd119", - "metadata": {}, - "source": [ - "#### Load documents, build the GPTSimpleVectorIndex" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "690a6918-7c75-4f95-9ccc-d2c4a1fe00d7", - "metadata": {}, - "outputs": [], - "source": [ - "import logging\n", - "import sys\n", - "\n", - "logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n", - "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))\n", - "\n", - "from gpt_index import GPTSimpleVectorIndex, SimpleDirectoryReader\n", - "from IPython.display import Markdown, display" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "03d1691e-544b-454f-825b-5ee12f7faa8a", - "metadata": {}, - "outputs": 
[], - "source": [ - "# load documents\n", - "documents = SimpleDirectoryReader('../paul_graham_essay/data').load_data()" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "ad144ee7-96da-4dd6-be00-fd6cf0c78e58", - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "INFO:root:> [build_index_from_documents] Total LLM token usage: 0 tokens\n", - "> [build_index_from_documents] Total LLM token usage: 0 tokens\n", - "> [build_index_from_documents] Total LLM token usage: 0 tokens\n", - "INFO:root:> [build_index_from_documents] Total embedding token usage: 18509 tokens\n", - "> [build_index_from_documents] Total embedding token usage: 18509 tokens\n", - "> [build_index_from_documents] Total embedding token usage: 18509 tokens\n" - ] - } - ], - "source": [ - "index = GPTSimpleVectorIndex.from_documents(documents, chunk_size_limit=1024)" - ] - }, - { - "cell_type": "markdown", - "id": "b6caf93b-6345-4c65-a346-a95b0f1746c4", - "metadata": {}, - "source": [ - "#### Query Index" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "85466fdf-93f3-4cb1-a5f9-0056a8245a6f", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "# set Logging to DEBUG for more detailed outputs\n", - "response_stream = index.query(\n", - " \"What did the author do growing up?\", \n", - " streaming=True,\n", - " similarity_top_k=1\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "16c15a25-15ed-4aed-813a-5c4c9182d7eb", - "metadata": {}, - "outputs": [], - "source": [ - "response_stream.print_response_stream()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bdda1b2c-ae46-47cf-91d7-3153e8d0473b", - "metadata": {}, - "outputs": [], - "source": [ - "# can also get a normal response object\n", - "response = response_stream.get_response()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "gpt_retrieve_venv", - 
"language": "python", - "name": "gpt_retrieve_venv" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.16" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file + "cells": [ + { + "cell_type": "markdown", + "id": "9c48213d-6e6a-4c10-838a-2a7c710c3a05", + "metadata": {}, + "source": [ + "# Simple Index Demo - Streaming" + ] + }, + { + "cell_type": "markdown", + "id": "50d3b817-b70e-4667-be4f-d3a0fe4bd119", + "metadata": {}, + "source": [ + "#### Load documents, build the GPTSimpleVectorIndex" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "690a6918-7c75-4f95-9ccc-d2c4a1fe00d7", + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "import sys\n", + "\n", + "logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n", + "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))\n", + "\n", + "from gpt_index import GPTSimpleVectorIndex, SimpleDirectoryReader\n", + "from IPython.display import Markdown, display" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "03d1691e-544b-454f-825b-5ee12f7faa8a", + "metadata": {}, + "outputs": [], + "source": [ + "# load documents\n", + "documents = SimpleDirectoryReader('../paul_graham_essay/data').load_data()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "ad144ee7-96da-4dd6-be00-fd6cf0c78e58", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:root:> [build_index_from_documents] Total LLM token usage: 0 tokens\n", + "> [build_index_from_documents] Total LLM token usage: 0 tokens\n", + "> [build_index_from_documents] Total LLM token usage: 0 tokens\n", + "INFO:root:> [build_index_from_documents] Total embedding token usage: 18509 
tokens\n", + "> [build_index_from_documents] Total embedding token usage: 18509 tokens\n", + "> [build_index_from_documents] Total embedding token usage: 18509 tokens\n" + ] + } + ], + "source": [ + "index = GPTSimpleVectorIndex.from_documents(documents, chunk_size_limit=1024)" + ] + }, + { + "cell_type": "markdown", + "id": "b6caf93b-6345-4c65-a346-a95b0f1746c4", + "metadata": {}, + "source": [ + "#### Query Index" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "85466fdf-93f3-4cb1-a5f9-0056a8245a6f", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# set Logging to DEBUG for more detailed outputs\n", + "response_stream = index.query(\n", + " \"What did the author do growing up?\", \n", + " streaming=True,\n", + " similarity_top_k=1\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16c15a25-15ed-4aed-813a-5c4c9182d7eb", + "metadata": {}, + "outputs": [], + "source": [ + "response_stream.print_response_stream()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bdda1b2c-ae46-47cf-91d7-3153e8d0473b", + "metadata": {}, + "outputs": [], + "source": [ + "# can also get a normal response object\n", + "response = response_stream.get_response()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/vector_indices/SimpleIndexDemo.ipynb b/examples/vector_indices/SimpleIndexDemo.ipynb index 7450584124..fcc61febb2 100644 --- a/examples/vector_indices/SimpleIndexDemo.ipynb +++ b/examples/vector_indices/SimpleIndexDemo.ipynb @@ -5,7 +5,7 @@ "id": 
"9c48213d-6e6a-4c10-838a-2a7c710c3a05", "metadata": {}, "source": [ - "# Simple Index Demo" + "# Simple Index Demo - Basic" ] }, { @@ -266,7 +266,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.10" + "version": "3.11.0" } }, "nbformat": 4, -- GitLab