From a9e4b1301e968aea5ba76ebdd5c318603f703df4 Mon Sep 17 00:00:00 2001
From: Logan <logan.markewich@live.com>
Date: Fri, 11 Aug 2023 13:08:43 -0600
Subject: [PATCH] defaults changes v2 (#7236)

* Update defaults (#7223)



Co-authored-by: Simon Suo <simonsdsuo@gmail.com>
Co-authored-by: Jerry Liu <jerryjliu98@gmail.com>

* changelog

* bump langchain version (#7234)

* Cache improvements (#7233)

* update typo name

* lower window size to 3

* linting

* change linux cache to /tmp

---------

Co-authored-by: Simon Suo <simonsdsuo@gmail.com>
Co-authored-by: Jerry Liu <jerryjliu98@gmail.com>
---
 CHANGELOG.md                                  |  16 +
 .../node_parsers/usage_pattern.md             |  24 +
 .../model_modules/embeddings/usage_pattern.md |  13 +-
 .../model_modules/llms/modules.md             |  10 +
 docs/core_modules/model_modules/llms/root.md  |   2 -
 .../node_postprocessors/modules.md            |  15 +
 .../supporting_modules/service_context.md     |   4 +
 docs/end_to_end_tutorials/usage_pattern.md    |   9 +
 docs/examples/embeddings/Langchain.ipynb      |   6 +-
 .../doc_summary/DocSummary.ipynb              | 328 ++++++++++--
 .../struct_indices/SQLIndexDemo.ipynb         | 110 ++--
 docs/examples/llm/llama_2_llama_cpp.ipynb     | 309 +++++++++++
 .../MetadataExtractionSEC.ipynb               | 198 ++++---
 .../MetadataReplacementDemo.ipynb             | 488 ++++++++++++++++++
 .../query_engine/json_query_engine.ipynb      |  91 ++--
 docs/getting_started/installation.md          |  18 +-
 experimental/splitter_playground/app.py       | 123 +++++
 llama_index/bridge/langchain.py               |  72 +--
 llama_index/callbacks/wandb_callback.py       |   5 +-
 llama_index/composability/joint_qa_summary.py |   6 +-
 llama_index/embeddings/__init__.py            |   7 +-
 llama_index/embeddings/utils.py               |  59 ++-
 llama_index/indices/document_summary/base.py  |  12 +-
 llama_index/indices/postprocessor/__init__.py |   4 +
 .../postprocessor/metadata_replacement.py     |  25 +
 .../indices/postprocessor/optimizer.py        |  12 +-
 llama_index/indices/prompt_helper.py          |   3 +-
 llama_index/indices/service_context.py        |  33 +-
 .../indices/struct_store/json_query.py        |   8 +-
 llama_index/indices/tree/utils.py             |   3 +-
 llama_index/llm_predictor/base.py             |   5 +
 llama_index/llm_predictor/mock.py             |   6 +-
 llama_index/llm_predictor/vellum/predictor.py |   7 +-
 llama_index/llms/__init__.py                  |   2 +
 llama_index/llms/anthropic.py                 |   2 +-
 llama_index/llms/azure_openai.py              |   2 +-
 llama_index/llms/llama_api.py                 |   2 +-
 llama_index/llms/llama_cpp.py                 | 170 ++++++
 llama_index/llms/llama_utils.py               |  11 +-
 llama_index/llms/openai.py                    |   4 +-
 llama_index/llms/utils.py                     |  39 +-
 llama_index/node_parser/__init__.py           |   2 +
 .../extractors/metadata_extractors.py         |  17 +-
 llama_index/node_parser/node_utils.py         |  90 ++--
 llama_index/node_parser/sentence_window.py    | 147 ++++++
 llama_index/node_parser/simple.py             |  13 +-
 llama_index/prompts/chat_prompts.py           |  82 ++-
 .../prompts/default_prompt_selectors.py       |  28 +-
 llama_index/prompts/default_prompts.py        |  23 +-
 .../query_engine/citation_query_engine.py     |  38 +-
 .../query_engine/router_query_engine.py       |  19 +-
 .../query_engine/sql_join_query_engine.py     |  18 +-
 .../query_engine/sql_vector_query_engine.py   |  11 +-
 llama_index/response_synthesizers/factory.py  |  16 +-
 llama_index/response_synthesizers/refine.py   |   8 +-
 .../response_synthesizers/simple_summarize.py |   6 +-
 .../response_synthesizers/tree_summarize.py   |  23 +-
 llama_index/retrievers/router_retriever.py    |  10 +-
 llama_index/selectors/utils.py                |  33 ++
 llama_index/text_splitter/__init__.py         |  34 ++
 llama_index/text_splitter/code_splitter.py    |  92 ++++
 .../text_splitter/sentence_splitter.py        | 176 +++++++
 llama_index/text_splitter/token_splitter.py   | 146 ++++++
 llama_index/text_splitter/types.py            |  16 +
 llama_index/text_splitter/utils.py            |  70 +++
 llama_index/utils.py                          |  25 +-
 llama_index/vector_stores/zep.py              |   2 +-
 setup.py                                      |   2 +-
 tests/conftest.py                             |  15 +-
 tests/embeddings/test_utils.py                |  42 ++
 .../test_metadata_replacement.py              |  17 +
 .../indices/response/test_response_builder.py |   9 +-
 tests/indices/response/test_tree_summarize.py |  24 +-
 tests/indices/test_node_utils.py              |  53 +-
 tests/indices/test_prompt_helper.py           |  10 +-
 tests/mock_utils/mock_text_splitter.py        |  22 -
 tests/mock_utils/mock_utils.py                |   3 +-
 tests/node_parser/sentence_window.py          |  21 +
 tests/test_text_splitter.py                   | 229 --------
 tests/text_splitter/__init__.py               |   0
 tests/text_splitter/conftest.py               |  52 ++
 tests/text_splitter/test_code_splitter.py     | 147 ++++++
 tests/text_splitter/test_sentence_splitter.py |  55 ++
 tests/text_splitter/test_token_splitter.py    |  74 +++
 84 files changed, 3337 insertions(+), 846 deletions(-)
 create mode 100644 docs/examples/llm/llama_2_llama_cpp.ipynb
 create mode 100644 docs/examples/node_postprocessor/MetadataReplacementDemo.ipynb
 create mode 100644 experimental/splitter_playground/app.py
 create mode 100644 llama_index/indices/postprocessor/metadata_replacement.py
 create mode 100644 llama_index/llms/llama_cpp.py
 create mode 100644 llama_index/node_parser/sentence_window.py
 create mode 100644 llama_index/selectors/utils.py
 create mode 100644 llama_index/text_splitter/__init__.py
 create mode 100644 llama_index/text_splitter/code_splitter.py
 create mode 100644 llama_index/text_splitter/sentence_splitter.py
 create mode 100644 llama_index/text_splitter/token_splitter.py
 create mode 100644 llama_index/text_splitter/types.py
 create mode 100644 llama_index/text_splitter/utils.py
 create mode 100644 tests/embeddings/test_utils.py
 create mode 100644 tests/indices/postprocessor/test_metadata_replacement.py
 create mode 100644 tests/node_parser/sentence_window.py
 delete mode 100644 tests/test_text_splitter.py
 create mode 100644 tests/text_splitter/__init__.py
 create mode 100644 tests/text_splitter/conftest.py
 create mode 100644 tests/text_splitter/test_code_splitter.py
 create mode 100644 tests/text_splitter/test_sentence_splitter.py
 create mode 100644 tests/text_splitter/test_token_splitter.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a7a5cfdd84..d6c20956fb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,21 @@
 # ChangeLog
 
+## Unreleased
+
+### New Features
+- Added "LLAMA_INDEX_CACHE_DIR" to control cached files (#7233)
+- Default to pydantic selectors when possible (#7154, #7223)
+- Remove the need for langchain wrappers on `embed_model` in the service context (#7157)
+- Metadata extractors now accept an `LLM` object directly, in addition to an `LLMPredictor` (#7202)
+- Added local mode + fallback to llama.cpp + llama2 (#7200)
+- Added local fallback for embeddings to `BAAI/bge-small-en` (#7200)
+- Added `SentenceWindowNodeParser` + `MetadataReplacementPostProcessor` (#7211)
+
+### Breaking Changes
+- Change the default LLM from text-davinci-003 to gpt-3.5-turbo (#7223)
+- Change prompts for compact/refine/tree_summarize to work better with gpt-3.5-turbo (#7150, #7179, #7223)
+- Increase default LLM temperature to 0.1 (#7180)
+
 ## [0.7.24.post1] - 2023-08-11
 
 ### Other Changes
diff --git a/docs/core_modules/data_modules/node_parsers/usage_pattern.md b/docs/core_modules/data_modules/node_parsers/usage_pattern.md
index 805ef47941..28cfcb873e 100644
--- a/docs/core_modules/data_modules/node_parsers/usage_pattern.md
+++ b/docs/core_modules/data_modules/node_parsers/usage_pattern.md
@@ -78,3 +78,27 @@ text_splitter = SentenceSplitter(
 
 node_parser = SimpleNodeParser(text_splitter=text_splitter)
 ```
+
+## SentenceWindowNodeParser
+
+The `SentenceWindowNodeParser` is similar to the `SimpleNodeParser`, except that it splits all documents into individual sentences. Each resulting node also stores the surrounding "window" of sentences in its metadata. Note that this metadata is not visible to the LLM or embedding model.
+
+This is most useful for generating embeddings with a very specific scope. Combined with a `MetadataReplacementPostProcessor`, you can then replace each sentence with its surrounding context before sending the node to the LLM.
+
+An example of setting up the parser with default settings is below. In practice, you would usually only want to adjust the sentence window size.
+
+```python
+import nltk  # nltk is used by the default sentence splitter
+from llama_index.node_parser import SentenceWindowNodeParser
+
+node_parser = SentenceWindowNodeParser.from_defaults(
+  # how many sentences on either side to capture
+  window_size=3,  
+  # the metadata key that holds the window of surrounding sentences
+  window_metadata_key="window",  
+  # the metadata key that holds the original sentence
+  original_text_metadata_key="original_sentence"
+)
+```
+
+A full example, in combination with the `MetadataReplacementPostProcessor`, can be found [here](/examples/node_postprocessor/MetadataReplacementDemo.ipynb).
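+
+As a minimal sketch (assuming a `VectorStoreIndex` named `index` built from these nodes, with an illustrative query), the stored window can then be swapped back in at query time:
+
+```python
+from llama_index.indices.postprocessor import MetadataReplacementPostProcessor
+
+query_engine = index.as_query_engine(
+  similarity_top_k=2,
+  # replace each retrieved sentence with its surrounding window of sentences
+  node_postprocessors=[
+    MetadataReplacementPostProcessor(target_metadata_key="window")
+  ],
+)
+response = query_engine.query("What did the author do?")
+```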
diff --git a/docs/core_modules/model_modules/embeddings/usage_pattern.md b/docs/core_modules/model_modules/embeddings/usage_pattern.md
index c9b73ceb3b..9944f070ff 100644
--- a/docs/core_modules/model_modules/embeddings/usage_pattern.md
+++ b/docs/core_modules/model_modules/embeddings/usage_pattern.md
@@ -56,23 +56,22 @@ To configure the model used (from Hugging Face hub), add the model name separate
 from llama_index import ServiceContext
 
 service_context = ServiceContext.from_defaults(
-  embed_model="local:sentence-transformers/all-mpnet-base-v2"
+  embed_model="local:BAAI/bge-large-en"
 )
 ```
 
 ### Embedding Model Integrations
 
-We also support any embeddings offered by Langchain [here](https://python.langchain.com/docs/modules/data_connection/text_embedding/), using our `LangchainEmbedding` wrapper class.
+We also support any embedding model offered by Langchain, as listed [here](https://python.langchain.com/docs/modules/data_connection/text_embedding/).
 
 The example below loads a model from Hugging Face, using Langchain's embedding class.
 
 ```python
-from langchain.embeddings.huggingface import HuggingFaceEmbeddings
-from llama_index import LangchainEmbedding, ServiceContext
+from langchain.embeddings.huggingface import HuggingFaceBgeEmbeddings
+from llama_index import ServiceContext
+
+embed_model = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-base-en")
 
-embed_model = LangchainEmbedding(
-  HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
-)
 service_context = ServiceContext.from_defaults(embed_model=embed_model)
 ```
 
diff --git a/docs/core_modules/model_modules/llms/modules.md b/docs/core_modules/model_modules/llms/modules.md
index 8ecd7a3ccf..4d38c0bb1d 100644
--- a/docs/core_modules/model_modules/llms/modules.md
+++ b/docs/core_modules/model_modules/llms/modules.md
@@ -82,7 +82,17 @@ maxdepth: 1
 /examples/llm/llama_api.ipynb
 ```
 
+## Llama CPP
+
+```{toctree}
+---
+maxdepth: 1
+---
+/examples/llm/llama_2_llama_cpp.ipynb
+```
+
 ## Xorbits Inference
+
 ```{toctree}
 ---
 maxdepth: 1
diff --git a/docs/core_modules/model_modules/llms/root.md b/docs/core_modules/model_modules/llms/root.md
index 1e09eec31c..cacbec8e22 100644
--- a/docs/core_modules/model_modules/llms/root.md
+++ b/docs/core_modules/model_modules/llms/root.md
@@ -24,8 +24,6 @@ resp = OpenAI().complete('Paul Graham is ')
 print(resp)
 ```
 
-You can use the LLM as a standalone module or with other LlamaIndex abstractions. Check out our guide below.
-
 ```{toctree}
 ---
 maxdepth: 1
diff --git a/docs/core_modules/query_modules/node_postprocessors/modules.md b/docs/core_modules/query_modules/node_postprocessors/modules.md
index 69c2d1342d..5d45feb9c3 100644
--- a/docs/core_modules/query_modules/node_postprocessors/modules.md
+++ b/docs/core_modules/query_modules/node_postprocessors/modules.md
@@ -27,6 +27,20 @@ postprocessor = KeywordNodePostprocessor(
 postprocessor.postprocess_nodes(nodes)
 ```
 
+## MetadataReplacementPostProcessor
+
+This postprocessor replaces the node content with a field from the node's metadata. If the field is not present in the metadata, the node text remains unchanged. It is most useful in combination with the `SentenceWindowNodeParser`.
+
+```python
+from llama_index.indices.postprocessor import MetadataReplacementPostProcessor
+
+postprocessor = MetadataReplacementPostProcessor(
+  target_metadata_key="window",
+)
+
+postprocessor.postprocess_nodes(nodes)
+```
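+
+A minimal sketch of what the replacement does, using a hand-built node (the node text and metadata here are purely illustrative):
+
+```python
+from llama_index.schema import NodeWithScore, TextNode
+
+node = TextNode(
+  text="The sentence that was embedded.",
+  metadata={"window": "The sentence before. The sentence that was embedded. The sentence after."},
+)
+
+nodes = postprocessor.postprocess_nodes([NodeWithScore(node=node, score=1.0)])
+print(nodes[0].node.get_content())  # prints the full window stored in the metadata
+```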
+
 ## SentenceEmbeddingOptimizer
 
 This postprocessor optimizes token usage by removing sentences that are not relevant to the query (this is done using embeddings).
@@ -239,4 +253,5 @@ maxdepth: 1
 /examples/node_postprocessor/TimeWeightedPostprocessorDemo.ipynb
 /examples/node_postprocessor/PII.ipynb
 /examples/node_postprocessor/PrevNextPostprocessorDemo.ipynb
+/examples/node_postprocessor/MetadataReplacementDemo.ipynb
 ```
\ No newline at end of file
diff --git a/docs/core_modules/supporting_modules/service_context.md b/docs/core_modules/supporting_modules/service_context.md
index 08814c4180..84753facfa 100644
--- a/docs/core_modules/supporting_modules/service_context.md
+++ b/docs/core_modules/supporting_modules/service_context.md
@@ -13,12 +13,16 @@ The `ServiceContext` is a simple python dataclass that you can directly construc
 @dataclass
 class ServiceContext:
     # The LLM used to generate natural language responses to queries.
+    # If not provided, defaults to gpt-3.5-turbo from OpenAI
+    # If your OpenAI key is not set, defaults to llama2-chat-13B from Llama.cpp
     llm: LLM
 
     # The PromptHelper object that helps with truncating and repacking text chunks to fit in the LLM's context window.
     prompt_helper: PromptHelper
 
     # The embedding model used to generate vector representations of text.
+    # If not provided, defaults to text-embedding-ada-002
+    # If your OpenAI key is not set, defaults to BAAI/bge-small-en
     embed_model: BaseEmbedding
 
     # The parser that converts documents into nodes.
diff --git a/docs/end_to_end_tutorials/usage_pattern.md b/docs/end_to_end_tutorials/usage_pattern.md
index d08c0b40fb..a6fd8dc02d 100644
--- a/docs/end_to_end_tutorials/usage_pattern.md
+++ b/docs/end_to_end_tutorials/usage_pattern.md
@@ -182,6 +182,15 @@ index = VectorStoreIndex.from_documents(
 )
 ```
 
+To save costs, you may want to use a local model.
+
+```python
+from llama_index import ServiceContext
+
+service_context = ServiceContext.from_defaults(llm="local")
+```
+
+This will use llama2-chat-13B with LlamaCPP, and assumes you have `llama-cpp-python` installed. A full LlamaCPP usage guide is available in [this notebook](/examples/llm/llama_2_llama_cpp.ipynb).
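+
+The embedding model can be run locally in the same way; a sketch, relying on the local embedding fallback (`BAAI/bge-small-en`) added in this release:
+
+```python
+from llama_index import ServiceContext
+
+# both the LLM (llama.cpp) and the embeddings (BAAI/bge-small-en) run locally
+service_context = ServiceContext.from_defaults(llm="local", embed_model="local")
+```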
+
 See the [Custom LLM's How-To](/core_modules/model_modules/llms/usage_custom.md) for more details.
 
 ### Global ServiceContext
diff --git a/docs/examples/embeddings/Langchain.ipynb b/docs/examples/embeddings/Langchain.ipynb
index b3ce7b10e2..beca5ba789 100644
--- a/docs/examples/embeddings/Langchain.ipynb
+++ b/docs/examples/embeddings/Langchain.ipynb
@@ -15,13 +15,13 @@
    "outputs": [],
    "source": [
     "from langchain.embeddings import HuggingFaceEmbeddings\n",
-    "from llama_index.embeddings import LangchainEmbedding\n",
     "from llama_index import ServiceContext, set_global_service_context\n",
     "\n",
-    "embed_model = LangchainEmbedding(\n",
-    "    HuggingFaceEmbeddings(\"sentence-transformers/all-mpnet-base-v2\")\n",
+    "embed_model = HuggingFaceEmbeddings(\n",
+    "    model_name=\"sentence-transformers/all-mpnet-base-v2\"\n",
     ")\n",
     "\n",
+    "\n",
     "service_context = ServiceContext.from_defaults(embed_model=embed_model)\n",
     "\n",
     "# optionally set a global service context\n",
diff --git a/docs/examples/index_structs/doc_summary/DocSummary.ipynb b/docs/examples/index_structs/doc_summary/DocSummary.ipynb
index 7c6b0471e6..3bf2cfa9e8 100644
--- a/docs/examples/index_structs/doc_summary/DocSummary.ipynb
+++ b/docs/examples/index_structs/doc_summary/DocSummary.ipynb
@@ -15,6 +15,20 @@
     "Retrieval can be performed through the LLM or embeddings (which is a TODO). We first select the relevant documents to the query based on their summaries. All retrieved nodes corresponding to the selected documents are retrieved."
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "d58ab2ad",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import openai\n",
+    "\n",
+    "os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"\n",
+    "openai.api_key = os.environ[\"OPENAI_API_KEY\"]"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 2,
@@ -93,7 +107,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 5,
    "id": "23ae10cc-f552-434c-9133-e4adf6642198",
    "metadata": {
     "tags": []
@@ -105,7 +119,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 6,
    "id": "24e0e454-218e-4937-b1f9-f1c8e2abba43",
    "metadata": {
     "tags": []
@@ -141,7 +155,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 7,
    "id": "6f765eee-0c80-476c-b1f2-b96b5dd176db",
    "metadata": {
     "tags": []
@@ -171,7 +185,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 8,
    "id": "e4da51df-ff9f-4141-91fe-719e00824328",
    "metadata": {
     "tags": []
@@ -185,13 +199,195 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 9,
    "id": "93c531c9-4aee-47ae-a4d2-81af3a6af908",
    "metadata": {
     "scrolled": true,
     "tags": []
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "current doc id: Toronto\n",
+      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=463 request_id=d6eb8fc8301bbb70e5ed906913ea4b42 response_code=200\n",
+      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=463 request_id=d6eb8fc8301bbb70e5ed906913ea4b42 response_code=200\n",
+      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=547 request_id=066ff477ea0931dabd06411b34ee1bc7 response_code=200\n",
+      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=547 request_id=066ff477ea0931dabd06411b34ee1bc7 response_code=200\n",
+      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=453 request_id=44708e4b96149d11b88569b7766e796d response_code=200\n",
+      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=453 request_id=44708e4b96149d11b88569b7766e796d response_code=200\n",
+      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=1783 request_id=7c2ef56d87c3bf8588037e1b1496ec98 response_code=200\n",
+      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=1783 request_id=7c2ef56d87c3bf8588037e1b1496ec98 response_code=200\n",
+      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=2013 request_id=06abd4b0f41e1225ad692257598719bd response_code=200\n",
+      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=2013 request_id=06abd4b0f41e1225ad692257598719bd response_code=200\n",
+      "INFO:llama_index.indices.document_summary.base:> Generated summary for doc Toronto: The provided text is about the city of Toronto, covering various aspects such as its history, demographics, cultural diversity, economic sectors, landmarks, and historical events. It provides information on Toronto's population, architectural heritage, climate, parks, media and entertainment, real estate, and technology industry. The text can answer questions such as: What is the population of Toronto? What is the history of Toronto? What is the significance of Toronto in terms of business and finance? What is the cultural diversity of Toronto? What are some notable landmarks in Toronto? What is the economic profile of Toronto?\n",
+      "> Generated summary for doc Toronto: The provided text is about the city of Toronto, covering various aspects such as its history, demographics, cultural diversity, economic sectors, landmarks, and historical events. It provides information on Toronto's population, architectural heritage, climate, parks, media and entertainment, real estate, and technology industry. The text can answer questions such as: What is the population of Toronto? What is the history of Toronto? What is the significance of Toronto in terms of business and finance? What is the cultural diversity of Toronto? What are some notable landmarks in Toronto? What is the economic profile of Toronto?\n",
+      "current doc id: Seattle\n",
+      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=2151 request_id=b762c5eb4cd33631ae0ba2051307a7b2 response_code=200\n",
+      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=2151 request_id=b762c5eb4cd33631ae0ba2051307a7b2 response_code=200\n",
+      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=2306 request_id=cee6fb49ad1848cc5b21e92fd553177f response_code=200\n",
+      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=2306 request_id=cee6fb49ad1848cc5b21e92fd553177f response_code=200\n",
+      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=2176 request_id=aaa812d8f1b9ab731bd84c15f419b15a response_code=200\n",
+      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=2176 request_id=aaa812d8f1b9ab731bd84c15f419b15a response_code=200\n",
+      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=4577 request_id=efa532630c746a5f251f91840e2400f7 response_code=200\n",
+      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=4577 request_id=efa532630c746a5f251f91840e2400f7 response_code=200\n",
+      "INFO:llama_index.indices.document_summary.base:> Generated summary for doc Seattle: The provided text is a compilation of information about the city of Seattle. It covers various aspects such as the city's history, geography, demographics, economy, culture, tourism, government, and infrastructure. It provides details about Seattle's founding, growth, major industries, notable figures, topography, bodies of water, climate, racial and ethnic makeup, population growth, LGBTQ+ community, performing arts scene, annual fairs and festivals, music scene, religion, sports, parks and recreation, education system, media outlets, infrastructure, international relations, political culture, education level, healthcare facilities, transportation options, utility services, and sister cities.\n",
+      "\n",
+      "Based on this text, some questions that can be answered include:\n",
+      "- What is the population of Seattle?\n",
+      "- What major industries have contributed to Seattle's growth?\n",
+      "- Who were some notable figures associated with Seattle?\n",
+      "- What is the cultural significance of Seattle in terms of music?\n",
+      "- What is the geography of Seattle, including its topography and bodies of water?\n",
+      "- What is the racial and ethnic makeup of Seattle?\n",
+      "- How has Seattle's population grown over time?\n",
+      "- What is the LGBTQ+ community like in Seattle?\n",
+      "- What is the economy of Seattle driven by?\n",
+      "- What is the performing arts scene like in Seattle?\n",
+      "- What annual fairs and festivals take place in Seattle?\n",
+      "- Who are some notable musicians from Seattle?\n",
+      "- What is the religious demographic of Seattle?\n",
+      "- What major sports teams are based in Seattle?\n",
+      "- What outdoor activities are available in Seattle?\n",
+      "- What is the political culture of Seattle?\n",
+      "- What is the educational attainment level in Seattle?\n",
+      "- What are the major newspapers and media outlets in Seattle?\n",
+      "- What healthcare facilities are available in Seattle?\n",
+      "- What are the transportation options in Seattle?\n",
+      "- Which utility companies serve Seattle?\n",
+      "- Which cities are sister cities of Seattle?\n",
+      "> Generated summary for doc Seattle: The provided text is a compilation of information about the city of Seattle. It covers various aspects such as the city's history, geography, demographics, economy, culture, tourism, government, and infrastructure. It provides details about Seattle's founding, growth, major industries, notable figures, topography, bodies of water, climate, racial and ethnic makeup, population growth, LGBTQ+ community, performing arts scene, annual fairs and festivals, music scene, religion, sports, parks and recreation, education system, media outlets, infrastructure, international relations, political culture, education level, healthcare facilities, transportation options, utility services, and sister cities.\n",
+      "\n",
+      "Based on this text, some questions that can be answered include:\n",
+      "- What is the population of Seattle?\n",
+      "- What major industries have contributed to Seattle's growth?\n",
+      "- Who were some notable figures associated with Seattle?\n",
+      "- What is the cultural significance of Seattle in terms of music?\n",
+      "- What is the geography of Seattle, including its topography and bodies of water?\n",
+      "- What is the racial and ethnic makeup of Seattle?\n",
+      "- How has Seattle's population grown over time?\n",
+      "- What is the LGBTQ+ community like in Seattle?\n",
+      "- What is the economy of Seattle driven by?\n",
+      "- What is the performing arts scene like in Seattle?\n",
+      "- What annual fairs and festivals take place in Seattle?\n",
+      "- Who are some notable musicians from Seattle?\n",
+      "- What is the religious demographic of Seattle?\n",
+      "- What major sports teams are based in Seattle?\n",
+      "- What outdoor activities are available in Seattle?\n",
+      "- What is the political culture of Seattle?\n",
+      "- What is the educational attainment level in Seattle?\n",
+      "- What are the major newspapers and media outlets in Seattle?\n",
+      "- What healthcare facilities are available in Seattle?\n",
+      "- What are the transportation options in Seattle?\n",
+      "- Which utility companies serve Seattle?\n",
+      "- Which cities are sister cities of Seattle?\n",
+      "current doc id: Chicago\n",
+      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=477 request_id=e0b51c07340d03bfa58160d8c9e102df response_code=200\n",
+      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=477 request_id=e0b51c07340d03bfa58160d8c9e102df response_code=200\n",
+      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=649 request_id=d42eb26bf93363cfa43dbe42613f5b44 response_code=200\n",
+      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=649 request_id=d42eb26bf93363cfa43dbe42613f5b44 response_code=200\n",
+      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=571 request_id=049b8c0a6edb71fb2cc2a7822afe446b response_code=200\n",
+      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=571 request_id=049b8c0a6edb71fb2cc2a7822afe446b response_code=200\n",
+      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=2196 request_id=73e8270e13da490b77cc58b1083bdcd3 response_code=200\n",
+      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=2196 request_id=73e8270e13da490b77cc58b1083bdcd3 response_code=200\n",
+      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=3596 request_id=77100a015c2d4d51a796eafef0fd7502 response_code=200\n",
+      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=3596 request_id=77100a015c2d4d51a796eafef0fd7502 response_code=200\n",
+      "INFO:llama_index.indices.document_summary.base:> Generated summary for doc Chicago: The provided text is about the city of Chicago, its history, and various aspects of its development. It covers topics such as the city's population, geography, economy, cultural contributions, and notable events throughout its history. The text provides information on the etymology and nicknames of Chicago, its beginnings as a Native American settlement, its rapid growth in the 19th century, the Great Chicago Fire, urban planning and architecture, the city's role as an international hub, its tourist attractions, educational institutions, and professional sports teams.\n",
+      "\n",
+      "Based on this information, the text can answer questions such as:\n",
+      "- What is the population of Chicago and how has it changed over time?\n",
+      "- What are some of the notable events in Chicago's history?\n",
+      "- What are the major industries in Chicago's economy?\n",
+      "- What are some of the famous architectural landmarks in the city?\n",
+      "- What are some of the popular tourist attractions in Chicago?\n",
+      "- What are some of the educational institutions in the city?\n",
+      "- What are some of the professional sports teams in Chicago?\n",
+      "> Generated summary for doc Chicago: The provided text is about the city of Chicago, its history, and various aspects of its development. It covers topics such as the city's population, geography, economy, cultural contributions, and notable events throughout its history. The text provides information on the etymology and nicknames of Chicago, its beginnings as a Native American settlement, its rapid growth in the 19th century, the Great Chicago Fire, urban planning and architecture, the city's role as an international hub, its tourist attractions, educational institutions, and professional sports teams.\n",
+      "\n",
+      "Based on this information, the text can answer questions such as:\n",
+      "- What is the population of Chicago and how has it changed over time?\n",
+      "- What are some of the notable events in Chicago's history?\n",
+      "- What are the major industries in Chicago's economy?\n",
+      "- What are some of the famous architectural landmarks in the city?\n",
+      "- What are some of the popular tourist attractions in Chicago?\n",
+      "- What are some of the educational institutions in the city?\n",
+      "- What are some of the professional sports teams in Chicago?\n",
+      "current doc id: Boston\n",
+      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=443 request_id=03ee458590ac669d641f5a93917c0e1c response_code=200\n",
+      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=443 request_id=03ee458590ac669d641f5a93917c0e1c response_code=200\n",
+      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=448 request_id=964ff3f311fde2ccf75f0e5cb971c485 response_code=200\n",
+      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=448 request_id=964ff3f311fde2ccf75f0e5cb971c485 response_code=200\n",
+      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=1515 request_id=f84fe9faba1145c8ff9def35dfc0d9e0 response_code=200\n",
+      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=1515 request_id=f84fe9faba1145c8ff9def35dfc0d9e0 response_code=200\n",
+      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=1460 request_id=6331cbf67d4c87e6717de43b06eb8cb4 response_code=200\n",
+      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=1460 request_id=6331cbf67d4c87e6717de43b06eb8cb4 response_code=200\n",
+      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=3455 request_id=9ff2083284a9f68815c0a806938121f7 response_code=200\n",
+      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=3455 request_id=9ff2083284a9f68815c0a806938121f7 response_code=200\n",
+      "INFO:llama_index.indices.document_summary.base:> Generated summary for doc Boston: The provided text contains information about the city of Boston, including its history, geography, climate, neighborhoods, demographics, economy, education system, healthcare facilities, public safety, culture, environment, and sports. It discusses various aspects of the city such as important institutions, mergers and acquisitions, gentrification, significant events like the Boston Marathon bombing, and the city's bid for the 2024 Summer Olympics. The text also mentions Boston's tourism, financial services, printing and publishing industry, convention centers, universities, colleges, medical centers, public schools, private schools, and cultural institutions. It provides details about Boston's air quality, water purity, climate change initiatives, and sports teams.\n",
+      "\n",
+      "Based on this information, the text can answer questions such as:\n",
+      "- What are some major industries in Boston's economy?\n",
+      "- How many international tourists visited Boston in a specific year?\n",
+      "- What are some renowned universities and colleges in Boston?\n",
+      "- What are some major healthcare facilities in the city?\n",
+      "- How is public safety managed in Boston?\n",
+      "- What are some cultural attractions and events in the city?\n",
+      "- What initiatives has Boston taken to address climate change?\n",
+      "\n",
+      "I'm sorry, but I can't answer that question.\n",
+      "> Generated summary for doc Boston: The provided text contains information about the city of Boston, including its history, geography, climate, neighborhoods, demographics, economy, education system, healthcare facilities, public safety, culture, environment, and sports. It discusses various aspects of the city such as important institutions, mergers and acquisitions, gentrification, significant events like the Boston Marathon bombing, and the city's bid for the 2024 Summer Olympics. The text also mentions Boston's tourism, financial services, printing and publishing industry, convention centers, universities, colleges, medical centers, public schools, private schools, and cultural institutions. It provides details about Boston's air quality, water purity, climate change initiatives, and sports teams.\n",
+      "\n",
+      "Based on this information, the text can answer questions such as:\n",
+      "- What are some major industries in Boston's economy?\n",
+      "- How many international tourists visited Boston in a specific year?\n",
+      "- What are some renowned universities and colleges in Boston?\n",
+      "- What are some major healthcare facilities in the city?\n",
+      "- How is public safety managed in Boston?\n",
+      "- What are some cultural attractions and events in the city?\n",
+      "- What initiatives has Boston taken to address climate change?\n",
+      "\n",
+      "I'm sorry, but I can't answer that question.\n",
+      "current doc id: Houston\n",
+      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=433 request_id=22f1f48ea864ede1ab2373bfc9deffa6 response_code=200\n",
+      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=433 request_id=22f1f48ea864ede1ab2373bfc9deffa6 response_code=200\n",
+      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=451 request_id=cbbdb69bb86714ae04f2010c2f4b371a response_code=200\n",
+      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=451 request_id=cbbdb69bb86714ae04f2010c2f4b371a response_code=200\n",
+      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=632 request_id=965e144552f1d629502ec32aad88cb1c response_code=200\n",
+      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=632 request_id=965e144552f1d629502ec32aad88cb1c response_code=200\n",
+      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=911 request_id=c0b9e903f61a34a643124a22baedf0e0 response_code=200\n",
+      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=911 request_id=c0b9e903f61a34a643124a22baedf0e0 response_code=200\n",
+      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=2448 request_id=c47f6f4268a4b78d46263d274a544b84 response_code=200\n",
+      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=2448 request_id=c47f6f4268a4b78d46263d274a544b84 response_code=200\n",
+      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=3933 request_id=b9e560d3f5ba816f5784bbf6a84023ac response_code=200\n",
+      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=3933 request_id=b9e560d3f5ba816f5784bbf6a84023ac response_code=200\n",
+      "INFO:llama_index.indices.document_summary.base:> Generated summary for doc Houston: The provided text is a combination of information about the city of Houston, Texas and the airports in Houston. It covers various aspects such as the city's history, geography, population, major industries, cultural diversity, and attractions. It also provides details about the airports in Houston, including their services, passenger traffic, airlines operating from them, and recognition received.\n",
+      "\n",
+      "Based on this text, some questions that can be answered include:\n",
+      "- What is the population of Houston?\n",
+      "- What are some of the major industries in Houston?\n",
+      "- What are the names of the major airports in Houston?\n",
+      "- How many passengers did George Bush Intercontinental Airport serve in 2016?\n",
+      "- Which airline has the largest market share in the Houston Airport System?\n",
+      "- Where is William P. Hobby Airport located?\n",
+      "- Which airline operates international flights from Hobby Airport?\n",
+      "- What is the significance of Ellington Airport in Houston?\n",
+      "- What recognition did Hobby Airport receive in 2022?\n",
+      "> Generated summary for doc Houston: The provided text is a combination of information about the city of Houston, Texas and the airports in Houston. It covers various aspects such as the city's history, geography, population, major industries, cultural diversity, and attractions. It also provides details about the airports in Houston, including their services, passenger traffic, airlines operating from them, and recognition received.\n",
+      "\n",
+      "Based on this text, some questions that can be answered include:\n",
+      "- What is the population of Houston?\n",
+      "- What are some of the major industries in Houston?\n",
+      "- What are the names of the major airports in Houston?\n",
+      "- How many passengers did George Bush Intercontinental Airport serve in 2016?\n",
+      "- Which airline has the largest market share in the Houston Airport System?\n",
+      "- Where is William P. Hobby Airport located?\n",
+      "- Which airline operates international flights from Hobby Airport?\n",
+      "- What is the significance of Ellington Airport in Houston?\n",
+      "- What recognition did Hobby Airport receive in 2022?\n"
+     ]
+    }
+   ],
    "source": [
     "# default mode of building the index\n",
     "response_synthesizer = get_response_synthesizer(\n",
@@ -206,7 +402,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 10,
    "id": "cf5d19a2-5fa3-4f1b-aadd-25c209cfeb75",
    "metadata": {
     "tags": []
@@ -215,10 +411,10 @@
     {
      "data": {
       "text/plain": [
-       "\"\\nThis document provides an overview of the history and geography of Boston, Massachusetts, from its founding in the 17th century to the present day. It covers topics such as the city's population growth, immigration, land reclamation, development, gentrification, climate, landmarks, neighborhoods, transportation, demographics, economy, education system, higher education, public safety, environment, sports, parks and recreation, government and politics, media, film, video game, and infrastructure. It can answer questions about the founding of Boston, its role in the American Revolution and War of 1812, its population growth, immigration, land reclamation, development, gentrification, climate, landmarks, neighborhoods, transportation, demographics, economy, education system, higher education, public safety, environment, sports, parks and recreation, government and politics, media, film, video game, and infrastructure.\""
+       "\"The provided text contains information about the city of Boston, including its history, geography, climate, neighborhoods, demographics, economy, education system, healthcare facilities, public safety, culture, environment, and sports. It discusses various aspects of the city such as important institutions, mergers and acquisitions, gentrification, significant events like the Boston Marathon bombing, and the city's bid for the 2024 Summer Olympics. The text also mentions Boston's tourism, financial services, printing and publishing industry, convention centers, universities, colleges, medical centers, public schools, private schools, and cultural institutions. It provides details about Boston's air quality, water purity, climate change initiatives, and sports teams.\\n\\nBased on this information, the text can answer questions such as:\\n- What are some major industries in Boston's economy?\\n- How many international tourists visited Boston in a specific year?\\n- What are some renowned universities and colleges in Boston?\\n- What are some major healthcare facilities in the city?\\n- How is public safety managed in Boston?\\n- What are some cultural attractions and events in the city?\\n- What initiatives has Boston taken to address climate change?\\n\\nI'm sorry, but I can't answer that question.\""
       ]
      },
-     "execution_count": 15,
+     "execution_count": 10,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -229,7 +425,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 11,
    "id": "349c2872-2b53-4812-9392-b89e5879e32a",
    "metadata": {
     "tags": []
@@ -241,7 +437,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 12,
    "id": "f96e6524-f8ab-4227-ad5a-3dcb3b640532",
    "metadata": {
     "tags": []
@@ -252,8 +448,7 @@
      "output_type": "stream",
      "text": [
       "INFO:llama_index.indices.loading:Loading all indices.\n",
-      "Loading all indices.\n",
-      "None\n"
+      "Loading all indices.\n"
      ]
     }
    ],
@@ -266,23 +461,6 @@
     "doc_summary_index = load_index_from_storage(storage_context)"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "0a486251-1ace-4112-8b7a-d5ce41f968ce",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "# customizing the summary query\n",
-    "summary_query = (\n",
-    "    \"Give a concise summary of this document in bullet points. Also describe some of the questions \"\n",
-    "    \"that this document can answer. \"\n",
-    ")\n",
-    "doc_summary_index = DocumentSummaryIndex.from_documents(summary_query=summary_query)"
-   ]
-  },
   {
    "attachments": {},
    "cell_type": "markdown",
@@ -307,7 +485,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 14,
    "id": "5e925b75-0a99-49cc-8e9a-daaf715ee490",
    "metadata": {},
    "outputs": [],
@@ -319,17 +497,62 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 15,
    "id": "c190a1e7-b85c-41cd-af42-e2521d2406a9",
    "metadata": {
     "scrolled": true,
     "tags": []
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=513 request_id=ed88efab61c1ac7da2306701020c85d3 response_code=200\n",
+      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=513 request_id=ed88efab61c1ac7da2306701020c85d3 response_code=200\n",
+      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=505 request_id=e97056dfb2275b9ff0e710847aa845db response_code=200\n",
+      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=505 request_id=e97056dfb2275b9ff0e710847aa845db response_code=200\n",
+      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=569 request_id=b8ec270016c44bc0301c3ee1ac926733 response_code=200\n",
+      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=569 request_id=b8ec270016c44bc0301c3ee1ac926733 response_code=200\n",
+      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=671 request_id=9cacd67e7aa92f42964e85cf0b364b53 response_code=200\n",
+      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=671 request_id=9cacd67e7aa92f42964e85cf0b364b53 response_code=200\n",
+      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=635 request_id=86f2af254811fa272f7841f5929cc910 response_code=200\n",
+      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=635 request_id=86f2af254811fa272f7841f5929cc910 response_code=200\n",
+      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=769 request_id=9154e63aa22247232009031a4ebf4de4 response_code=200\n",
+      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=769 request_id=9154e63aa22247232009031a4ebf4de4 response_code=200\n",
+      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=873 request_id=8b7131e0fd16e9cddf5941a8c013db72 response_code=200\n",
+      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=873 request_id=8b7131e0fd16e9cddf5941a8c013db72 response_code=200\n",
+      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=755 request_id=36eb1040a3d4bca919f4b24017baa6b5 response_code=200\n",
+      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=755 request_id=36eb1040a3d4bca919f4b24017baa6b5 response_code=200\n",
+      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=739 request_id=eed4e551aab6e17816e8b82f46420c1c response_code=200\n",
+      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=739 request_id=eed4e551aab6e17816e8b82f46420c1c response_code=200\n",
+      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=1897 request_id=34a332ee744771b70fe7de91d4e81a9a response_code=200\n",
+      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=1897 request_id=34a332ee744771b70fe7de91d4e81a9a response_code=200\n"
+     ]
+    }
+   ],
    "source": [
     "response = query_engine.query(\"What are the sports teams in Toronto?\")"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "e144db7d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Toronto is represented in five major league sports: the National Hockey League (NHL) with the Toronto Maple Leafs, Major League Baseball (MLB) with the Toronto Blue Jays, the National Basketball Association (NBA) with the Toronto Raptors, the Canadian Football League (CFL) with the Toronto Argonauts, and Major League Soccer (MLS) with the Toronto FC. Additionally, Toronto has the Toronto Rock in the National Lacrosse League (NLL) and the Toronto Wolfpack in the Rugby Football League (RFL).\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(response)"
+   ]
+  },
   {
    "attachments": {},
    "cell_type": "markdown",
@@ -341,7 +564,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 17,
    "id": "afd99ce8-8347-4e6e-88e4-23dd8fcb9084",
    "metadata": {
     "tags": []
@@ -353,7 +576,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 18,
    "id": "67e397dd-fbb0-4465-994a-7527b3a6dd57",
    "metadata": {},
    "outputs": [],
@@ -370,7 +593,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 19,
    "id": "eef31654-27ea-4fb0-b29e-b18e5bf867f0",
    "metadata": {},
    "outputs": [],
@@ -380,7 +603,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 20,
    "id": "96f76e79-1595-43b3-81e7-9ca7547fa2d1",
    "metadata": {
     "scrolled": true,
@@ -391,7 +614,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "8.0\n",
+      "10.0\n",
       "Toronto ( (listen) tə-RON-toh; locally [təˈɹɒɾ̃ə] or [ˈtɹɒɾ̃ə]) is the capital city of the Canadian province of Ontario. With a recorded population of 2,794,356 in 2021, it is the most populous city in Canada and the fourth most populous city in North America. The city is the anchor of the Golden Horseshoe, an urban agglomeration of 9,765,188 people (as of 2021) surrounding the western end of Lake Ontario, while the Greater Toronto Area proper had a 2021 population of 6,712,341. Toronto is an international centre of business, finance, arts, sports and culture, and is recognized as one of the most multicultural and cosmopolitan cities in the world.Indigenous peoples have travelled through and inhabited the Toronto area, located on a broad sloping plateau interspersed with rivers, deep ravines, and urban forest, for more than 10,000 years. After the broadly disputed Toronto Purchase, when the Mississauga surrendered the area to the British Crown, the British established the town of York in 1793 and later designated it as the capital of Upper Canada. During the War of 1812, the town was the site of the Battle of York and suffered heavy damage by American troops. York was renamed and incorporated in 1834 as the city of Toronto. It was designated as the capital of the province of Ontario in 1867 during Canadian Confederation. The city proper has since expanded past its original limits through both annexation and amalgamation to its current area of 630.2 km2 (243.3 sq mi).\n",
       "The diverse population of Toronto reflects its current and historical role as an important destination for immigrants to Canada. More than half of residents were born outside of Canada, more than half of residents belong to a visible minority group, and over 200 distinct ethnic origins are represented among its inhabitants. While the majority of Torontonians speak English as their primary language, over 160 languages are spoken in the city. The mayor of Toronto is elected by direct popular vote to serve as the chief executive of the city. The Toronto City Council is a unicameral legislative body, comprising 25 councillors since the 2018 municipal election, representing geographical wards throughout the city.Toronto is a prominent centre for music, theatre, motion picture production, and television production, and is home to the headquarters of Canada's major national broadcast networks and media outlets. Its varied cultural institutions, which include numerous museums and galleries, festivals and public events, entertainment districts, national historic sites, and sports activities, attract over 43 million tourists each year. Toronto is known for its many skyscrapers and high-rise buildings, in particular the tallest free-standing structure on land outside of Asia, the CN Tower.The city is home to the Toronto Stock Exchange, the headquarters of Canada's five largest banks, and the headquarters of many large Canadian and multinational corporations. Its economy is highly diversified with strengths in technology, design, financial services, life sciences, education, arts, fashion, aerospace, environmental innovation, food services, and tourism. Toronto is the third-largest tech hub in North America after Silicon Valley and New York City, and the fastest growing.\n",
       "\n",
@@ -399,7 +622,10 @@
       "== Etymology ==\n",
       "\n",
       "The word Toronto was recorded with various spellings in French and English, including Tarento, Tarontha, Taronto, Toranto, Torento, Toronto, and Toronton. Taronto referred to \"The Narrows\", a channel of water through which Lake Simcoe discharges into Lake Couchiching where the Huron had planted tree saplings to corral fish. This narrows was called tkaronto by the Mohawk, meaning \"where there are trees standing in the water,\" and was recorded as early as 1615 by Samuel de Champlain.\n",
-      "The word \"Toronto\", meaning \"plenty\" also appears in a 1632 French lexicon of the Huron language, which is also an Iroquoian language. It also appears on French maps referring to various locations, including Georgian Bay, Lake Simcoe, and several rivers. A portage route from Lake Ontario to Lake Huron running through this point, known as the Toronto\n"
+      "The word \"Toronto\", meaning \"plenty\" also appears in a 1632 French lexicon of the Huron language, which is also an Iroquoian language. It also appears on French maps referring to various locations, including Georgian Bay, Lake Simcoe, and several rivers. A portage route from Lake Ontario to Lake Huron running through this point, known as the Toronto Carrying-Place Trail, led to widespread use of the name.\n",
+      "\n",
+      "\n",
+      "== History ==\n"
      ]
     }
    ],
@@ -410,12 +636,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 26,
    "id": "a215ef33-5d05-42ad-83c5-409ccc288d26",
    "metadata": {
     "tags": []
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Toronto is home to several major league sports teams, including the Toronto Maple Leafs in the NHL, the Toronto Blue Jays in MLB, the Toronto Raptors in the NBA, the Toronto Argonauts in the CFL, and the Toronto FC in MLS. The city also has a professional lacrosse team called the Toronto Rock and a rugby league team called the Toronto Wolfpack. Additionally, Toronto is home to the Toronto Rush, a semi-professional ultimate team that competes in the American Ultimate Disc League (AUDL). The University of Toronto, located downtown, has a rich sports history and was the site of the first recorded college football game in November 1861.\n"
+     ]
+    }
+   ],
    "source": [
     "# use retriever as part of a query engine\n",
     "from llama_index.query_engine import RetrieverQueryEngine\n",
@@ -445,7 +679,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 27,
    "id": "bc47dc54-197f-43bb-9298-b2af24c6b095",
    "metadata": {
     "tags": []
@@ -457,7 +691,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 28,
    "id": "0dcb81cb-0d36-4af8-a0c5-9061a2dce986",
    "metadata": {
     "tags": []
@@ -476,7 +710,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 29,
    "id": "48934544-3dde-4231-b41c-540139378751",
    "metadata": {
     "tags": []
@@ -488,7 +722,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 30,
    "id": "6f06e5f6-d62e-4348-8ef5-d4cb6219b54e",
    "metadata": {
     "tags": []
@@ -497,10 +731,10 @@
     {
      "data": {
       "text/plain": [
-       "25"
+       "20"
       ]
      },
-     "execution_count": 12,
+     "execution_count": 30,
      "metadata": {},
      "output_type": "execute_result"
     }
diff --git a/docs/examples/index_structs/struct_indices/SQLIndexDemo.ipynb b/docs/examples/index_structs/struct_indices/SQLIndexDemo.ipynb
index 9ff8f1a388..48d43fb1d4 100644
--- a/docs/examples/index_structs/struct_indices/SQLIndexDemo.ipynb
+++ b/docs/examples/index_structs/struct_indices/SQLIndexDemo.ipynb
@@ -13,7 +13,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 2,
+   "id": "6e14d02e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import openai\n",
+    "\n",
+    "os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"\n",
+    "openai.api_key = os.environ[\"OPENAI_API_KEY\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
    "id": "119eb42b",
    "metadata": {},
    "outputs": [],
@@ -27,7 +41,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 4,
    "id": "107396a9-4aa7-49b3-9f0f-a755726c19ba",
    "metadata": {},
    "outputs": [],
@@ -48,7 +62,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 5,
    "id": "a370b266-66f5-4624-bbf9-2ad57f0511f8",
    "metadata": {},
    "outputs": [],
@@ -67,7 +81,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 6,
    "id": "ea24f794-f10b-42e6-922d-9258b7167405",
    "metadata": {},
    "outputs": [],
@@ -78,7 +92,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 7,
    "id": "b4154b29-7e23-4c26-a507-370a66186ae7",
    "metadata": {},
    "outputs": [],
@@ -110,7 +124,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 8,
    "id": "768d1581-b482-4c73-9963-5ffd68a2aafb",
    "metadata": {
     "tags": []
@@ -123,20 +137,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 9,
    "id": "bffabba0-8e54-4f24-ad14-2c8979c582a5",
    "metadata": {
     "tags": []
    },
    "outputs": [],
    "source": [
-    "llm = OpenAI(temperature=0, model=\"text-davinci-002\")\n",
+    "llm = OpenAI(temperature=0.1, model=\"gpt-3.5-turbo\")\n",
     "service_context = ServiceContext.from_defaults(llm=llm)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 10,
    "id": "9432787b-a8f0-4fc3-8323-e2cd9497df73",
    "metadata": {},
    "outputs": [],
@@ -146,7 +160,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 11,
    "id": "84d4ee54-9f00-40fd-bab0-36e5e579dc9f",
    "metadata": {},
    "outputs": [
@@ -156,7 +170,7 @@
        "'\\nCREATE TABLE city_stats (\\n\\tcity_name VARCHAR(16) NOT NULL, \\n\\tpopulation INTEGER, \\n\\tcountry VARCHAR(16) NOT NULL, \\n\\tPRIMARY KEY (city_name)\\n)\\n\\n/*\\n3 rows from city_stats table:\\ncity_name\\tpopulation\\tcountry\\n\\n*/'"
       ]
      },
-     "execution_count": 9,
+     "execution_count": 11,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -176,7 +190,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 12,
    "id": "95043e10-6cdf-4f66-96bd-ce307ea7df3e",
    "metadata": {},
    "outputs": [],
@@ -199,7 +213,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 13,
    "id": "b315b8ff-7dd7-4e7d-ac47-8c5a0c3e7ae9",
    "metadata": {},
    "outputs": [
@@ -242,7 +256,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 14,
    "id": "eddd3608-31ff-4591-a02a-90987e312669",
    "metadata": {},
    "outputs": [
@@ -283,7 +297,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 18,
    "id": "5d992fb5",
    "metadata": {},
    "outputs": [],
@@ -298,6 +312,29 @@
     "response = query_engine.query(query_str)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "7c0dfe9c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/markdown": [
+       "<b>The city with the highest population is Tokyo, with a population of 13,960,000.</b>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.Markdown object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "display(Markdown(f\"<b>{response}</b>\"))"
+   ]
+  },
   {
    "attachments": {},
    "cell_type": "markdown",
@@ -327,7 +364,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 20,
    "id": "d71045c0-7a96-4e86-b38c-c378b7759aa4",
    "metadata": {},
    "outputs": [],
@@ -363,14 +400,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 21,
    "id": "802da9ed",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/markdown": [
-       "<b> Tokyo has the highest population, with 13,960,000 people.</b>"
+       "<b>The city with the highest population is Tokyo, with a population of 13,960,000.</b>"
       ],
       "text/plain": [
        "<IPython.core.display.Markdown object>"
@@ -387,7 +424,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 22,
    "id": "54a99cb0-578a-40ec-a3eb-1666ac18fbed",
    "metadata": {},
    "outputs": [
@@ -397,7 +434,7 @@
        "[('Tokyo', 13960000)]"
       ]
      },
-     "execution_count": 16,
+     "execution_count": 22,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -418,7 +455,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 23,
    "id": "44a87651",
    "metadata": {},
    "outputs": [],
@@ -449,7 +486,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 24,
    "id": "8e0acde4-ca61-42e9-97f8-c9cf11502157",
    "metadata": {},
    "outputs": [],
@@ -459,7 +496,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 28,
    "id": "30860e8b-9ad0-418c-b266-753242c1f208",
    "metadata": {},
    "outputs": [],
@@ -469,19 +506,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 29,
    "id": "07068a3a-30a4-4473-ba82-ab6e93e3437c",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/Users/hongyishi/Documents/GitHub/gpt_index/.venv/lib/python3.11/site-packages/langchain/chains/sql_database/base.py:63: UserWarning: Directly instantiating an SQLDatabaseChain with an llm is deprecated. Please instantiate with llm_chain argument or using the from_llm class method.\n",
-      "  warnings.warn(\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "# set Logging to DEBUG for more detailed outputs\n",
     "db_chain = SQLDatabaseChain(llm=llm, database=sql_database)"
@@ -489,7 +517,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 30,
    "id": "a04c0a1d-f6a8-4a4a-9181-4123b09ec614",
    "metadata": {},
    "outputs": [
@@ -499,7 +527,7 @@
        "'Tokyo has the highest population with 13960000 people.'"
       ]
      },
-     "execution_count": 21,
+     "execution_count": 30,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -507,6 +535,14 @@
    "source": [
     "db_chain.run(\"Which city has the highest population?\")"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d53b902d",
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
@@ -525,7 +561,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.0"
+   "version": "3.9.6"
   }
  },
  "nbformat": 4,
diff --git a/docs/examples/llm/llama_2_llama_cpp.ipynb b/docs/examples/llm/llama_2_llama_cpp.ipynb
new file mode 100644
index 0000000000..a4b46b8237
--- /dev/null
+++ b/docs/examples/llm/llama_2_llama_cpp.ipynb
@@ -0,0 +1,309 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "368686b4-f487-4dd4-aeff-37823976529d",
+   "metadata": {},
+   "source": [
+    "# LlamaCPP \n",
+    "\n",
+    "In this short notebook, we show how to use the [LlamaCPP python](https://github.com/abetlen/llama-cpp-python) library with LlamaIndex.\n",
+    "\n",
+    "We use the [`llama-2-chat-13b-ggml`](https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML) model by default, along with the proper prompt formatting.\n",
+    "\n",
+    "## Installation\n",
+    "\n",
+    "To get the best performance out of `LlamaCPP`, it is recomended to install the package so that it is compilied with GPU support. A full guide for installing this way is [here](https://github.com/abetlen/llama-cpp-python#installation-with-openblas--cublas--clblast--metal).\n",
+    "\n",
+    "Full MACOS instructions are also [here](https://llama-cpp-python.readthedocs.io/en/latest/install/macos/).\n",
+    "\n",
+    "In general:\n",
+    "- Use `CuBLAS` if you have CUDA and an NVidia GPU\n",
+    "- Use `METAL` if you are running on an M1/M2 MacBook\n",
+    "- Use `CLBLAST` if you are running on an AMD/Intel GPU"
+   ]
+  },
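As a rough sketch (the flags follow the llama-cpp-python install guide linked above and may change between releases), a GPU-enabled install from a notebook could look like:

```python
# Hedged sketch: uncomment the line matching your hardware and run it in a notebook cell.
# CMAKE_ARGS values follow the install guide linked above and may differ across
# llama-cpp-python versions and platforms.

# Metal (Apple Silicon):
# !CMAKE_ARGS="-DLLAMA_METAL=on" pip install --force-reinstall --no-cache-dir llama-cpp-python

# cuBLAS (NVIDIA CUDA):
# !CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install --force-reinstall --no-cache-dir llama-cpp-python

# CLBlast (AMD/Intel GPUs):
# !CMAKE_ARGS="-DLLAMA_CLBLAST=on" pip install --force-reinstall --no-cache-dir llama-cpp-python
```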
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "e7927630-0044-41fb-a8a6-8dc3d2adb608",
+   "metadata": {},
+   "source": [
+    "## Setup LLM\n",
+    "\n",
+    "The LlamaCPP llm is highly configurable. Depending on the model being used, you'll want to pass in `messages_to_prompt` and `completion_to_prompt` functions to help format the model inputs.\n",
+    "\n",
+    "Since the default model is llama2-chat, we use the util functions found in [`llama_index.llms.llama_utils`](https://github.com/jerryjliu/llama_index/blob/main/llama_index/llms/llama_utils.py).\n",
+    "\n",
+    "For any kwargs that need to be passed in during initialization, set them in `model_kwargs`. A full list of available model kwargs is available in the [LlamaCPP docs](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.llama.Llama.__init__).\n",
+    "\n",
+    "For any kwargs that need to be passed in during inference, you can set them in `generate_kwargs`. See the full list of [generate kwargs here](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.llama.Llama.__call__).\n",
+    "\n",
+    "In general, the defaults are a great startiing point. The example below shows configuration with all defaults."
+   ]
+  },
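If you are running a model that does not use the Llama-2 chat template, you can swap in your own formatting functions. The sketch below is purely hypothetical (the instruction-style template and function names are illustrative, not part of this notebook):

```python
# Hypothetical sketch: custom prompt formatting for a non-Llama-2 model.
# The template below is illustrative only; use whatever format your model expects.
def my_completion_to_prompt(completion: str) -> str:
    # Wrap a plain completion request in an instruction-style template.
    return f"### Instruction:\n{completion}\n\n### Response:\n"

def my_messages_to_prompt(messages) -> str:
    # Flatten chat messages (objects with .role and .content) into a single prompt string.
    return "\n".join(f"{m.role}: {m.content}" for m in messages) + "\nassistant: "

# These would then be passed to LlamaCPP as
#   messages_to_prompt=my_messages_to_prompt, completion_to_prompt=my_completion_to_prompt
# in place of the llama_utils helpers used below.
```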
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "6fa0ec4f-03ff-4e28-957f-b4b99a0faa20",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "llama.cpp: loading model from /Users/loganmarkewich/Library/Caches/llama_index/models/llama-2-13b-chat.ggmlv3.q4_0.bin\n",
+      "llama_model_load_internal: format     = ggjt v3 (latest)\n",
+      "llama_model_load_internal: n_vocab    = 32000\n",
+      "llama_model_load_internal: n_ctx      = 3900\n",
+      "llama_model_load_internal: n_embd     = 5120\n",
+      "llama_model_load_internal: n_mult     = 256\n",
+      "llama_model_load_internal: n_head     = 40\n",
+      "llama_model_load_internal: n_head_kv  = 40\n",
+      "llama_model_load_internal: n_layer    = 40\n",
+      "llama_model_load_internal: n_rot      = 128\n",
+      "llama_model_load_internal: n_gqa      = 1\n",
+      "llama_model_load_internal: rnorm_eps  = 1.0e-06\n",
+      "llama_model_load_internal: n_ff       = 13824\n",
+      "llama_model_load_internal: freq_base  = 10000.0\n",
+      "llama_model_load_internal: freq_scale = 1\n",
+      "llama_model_load_internal: ftype      = 2 (mostly Q4_0)\n",
+      "llama_model_load_internal: model size = 13B\n",
+      "llama_model_load_internal: ggml ctx size =    0.11 MB\n",
+      "llama_model_load_internal: mem required  = 7632.72 MB (+ 3046.88 MB per state)\n",
+      "llama_new_context_with_model: kv self size  = 3046.88 MB\n",
+      "ggml_metal_init: allocating\n",
+      "ggml_metal_init: using MPS\n",
+      "ggml_metal_init: loading '/Users/loganmarkewich/llama_index/llama-index/lib/python3.9/site-packages/llama_cpp/ggml-metal.metal'\n",
+      "ggml_metal_init: loaded kernel_add                            0x172e401d0\n",
+      "ggml_metal_init: loaded kernel_add_row                        0x172e422c0\n",
+      "ggml_metal_init: loaded kernel_mul                            0x172e42ac0\n",
+      "ggml_metal_init: loaded kernel_mul_row                        0x172e433f0\n",
+      "ggml_metal_init: loaded kernel_scale                          0x172e43c50\n",
+      "ggml_metal_init: loaded kernel_silu                           0x172e43e40\n",
+      "ggml_metal_init: loaded kernel_relu                           0x172e419d0\n",
+      "ggml_metal_init: loaded kernel_gelu                           0x172e44db0\n",
+      "ggml_metal_init: loaded kernel_soft_max                       0x172e446c0\n",
+      "ggml_metal_init: loaded kernel_diag_mask_inf                  0x172e45780\n",
+      "ggml_metal_init: loaded kernel_get_rows_f16                   0x172e45ca0\n",
+      "ggml_metal_init: loaded kernel_get_rows_q4_0                  0x172e47af0\n",
+      "ggml_metal_init: loaded kernel_get_rows_q4_1                  0x172e47290\n",
+      "ggml_metal_init: loaded kernel_get_rows_q2_K                  0x172e48490\n",
+      "ggml_metal_init: loaded kernel_get_rows_q3_K                  0x172e48d60\n",
+      "ggml_metal_init: loaded kernel_get_rows_q4_K                  0x172e49600\n",
+      "ggml_metal_init: loaded kernel_get_rows_q5_K                  0x172e49ed0\n",
+      "ggml_metal_init: loaded kernel_get_rows_q6_K                  0x172e4a7d0\n",
+      "ggml_metal_init: loaded kernel_rms_norm                       0x172e4b080\n",
+      "ggml_metal_init: loaded kernel_norm                           0x172e4ba90\n",
+      "ggml_metal_init: loaded kernel_mul_mat_f16_f32                0x172e4c520\n",
+      "ggml_metal_init: loaded kernel_mul_mat_q4_0_f32               0x172e4cdb0\n",
+      "ggml_metal_init: loaded kernel_mul_mat_q4_1_f32               0x172e4d730\n",
+      "ggml_metal_init: loaded kernel_mul_mat_q2_K_f32               0x172e4e0a0\n",
+      "ggml_metal_init: loaded kernel_mul_mat_q3_K_f32               0x172e4eb80\n",
+      "ggml_metal_init: loaded kernel_mul_mat_q4_K_f32               0x172e4f5b0\n",
+      "ggml_metal_init: loaded kernel_mul_mat_q5_K_f32               0x172e4ff00\n",
+      "ggml_metal_init: loaded kernel_mul_mat_q6_K_f32               0x172e507f0\n",
+      "ggml_metal_init: loaded kernel_rope                           0x172e51900\n",
+      "ggml_metal_init: loaded kernel_alibi_f32                      0x172e51430\n",
+      "ggml_metal_init: loaded kernel_cpy_f32_f16                    0x172e527c0\n",
+      "ggml_metal_init: loaded kernel_cpy_f32_f32                    0x172e53270\n",
+      "ggml_metal_init: loaded kernel_cpy_f16_f16                    0x172e4b800\n",
+      "ggml_metal_init: recommendedMaxWorkingSetSize = 21845.34 MB\n",
+      "ggml_metal_init: hasUnifiedMemory             = true\n",
+      "ggml_metal_init: maxTransferRate              = built-in GPU\n",
+      "llama_new_context_with_model: max tensor size =    87.89 MB\n",
+      "ggml_metal_add_buffer: allocated 'data            ' buffer, size =  6984.06 MB, ( 6984.52 / 21845.34)\n",
+      "ggml_metal_add_buffer: allocated 'eval            ' buffer, size =    12.00 MB, ( 6996.52 / 21845.34)\n",
+      "ggml_metal_add_buffer: allocated 'kv              ' buffer, size =  3048.88 MB, (10045.39 / 21845.34)\n",
+      "ggml_metal_add_buffer: allocated 'scr0            ' buffer, size =   445.00 MB, (10490.39 / 21845.34)\n",
+      "AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 | \n",
+      "ggml_metal_add_buffer: allocated 'scr1            ' buffer, size =   192.00 MB, (10682.39 / 21845.34)\n"
+     ]
+    }
+   ],
+   "source": [
+    "from llama_index.llms import LlamaCPP\n",
+    "from llama_index.llms.llama_utils import messages_to_prompt, completion_to_prompt\n",
+    "\n",
+    "llm = LlamaCPP(\n",
+    "    # You can pass in the URL to a GGML model to download it automatically\n",
+    "    model_url=\"https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/resolve/main/llama-2-13b-chat.ggmlv3.q4_0.bin\",\n",
+    "    # optionally, you can set the path to a pre-downloaded model instead of model_url\n",
+    "    model_path=None,\n",
+    "    temperature=0.1,\n",
+    "    max_new_tokens=256,\n",
+    "    # llama2 has a context window of 4096 tokens, but we set it lower to allow for some wiggle room\n",
+    "    context_window=3900,\n",
+    "    # kwargs to pass to __call__()\n",
+    "    generate_kwargs={},\n",
+    "    # kwargs to pass to __init__()\n",
+    "    # set to at least 1 to use GPU\n",
+    "    model_kwargs={\"n_gpu_layers\": 1},\n",
+    "    # transform inputs into Llama2 format\n",
+    "    messages_to_prompt=messages_to_prompt,\n",
+    "    completion_to_prompt=completion_to_prompt,\n",
+    "    verbose=True,\n",
+    ")"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "445453b1",
+   "metadata": {},
+   "source": [
+    "We can tell that the model is using `metal` due to the logging!"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "5e2e6a78-7e5d-4915-bcbf-6087edb30276",
+   "metadata": {},
+   "source": [
+    "## Start using our `LlamaCPP` LLM abstraction!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "5cfaf34c-0348-415e-98bb-83f782d64fe9",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  Of course! Here's a fun little poem about cats and dogs:\n",
+      "\n",
+      "Cats and dogs, so different yet the same,\n",
+      "Both furry friends, with their own special game.\n",
+      "\n",
+      "Cats purr and curl up tight,\n",
+      "Dogs wag their tails with delight.\n",
+      "\n",
+      "Cats chase mice with stealthy grace,\n",
+      "Dogs bark and chase with joyful pace.\n",
+      "\n",
+      "But when the day is done,\n",
+      "Both cats and dogs find comfort in a warm embrace.\n",
+      "\n",
+      "So here's to our feline and canine friends,\n",
+      "Both equally loved, until the very end.\n",
+      "\n",
+      "I hope you enjoyed that little poem! Do you have any other questions or requests?\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "llama_print_timings:        load time =  8161.74 ms\n",
+      "llama_print_timings:      sample time =   113.45 ms /   162 runs   (    0.70 ms per token,  1427.97 tokens per second)\n",
+      "llama_print_timings: prompt eval time =  8161.68 ms /    61 tokens (  133.80 ms per token,     7.47 tokens per second)\n",
+      "llama_print_timings:        eval time =  6929.98 ms /   161 runs   (   43.04 ms per token,    23.23 tokens per second)\n",
+      "llama_print_timings:       total time = 15406.04 ms\n"
+     ]
+    }
+   ],
+   "source": [
+    "response = llm.complete(\"Hello! Can you tell me a poem about cats and dogs?\")\n",
+    "print(response.text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "7b059409-cd9d-4651-979c-03b3943e94af",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Llama.generate: prefix-match hit\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  Sure thing! Here's a poem about fast cars:\n",
+      "\n",
+      "Fast cars, oh how they thrill\n",
+      "With their sleek designs and powerful bills\n",
+      "They race down the road, a blur of speed\n",
+      "Leaving all else in their dusty need\n",
+      "\n",
+      "Their engines purr, their tires squeal\n",
+      "As they zip through the streets, it's a real deal\n",
+      "The wind rushes by, a roar of sound\n",
+      "As they leave all others in the ground\n",
+      "\n",
+      "With their shimmering paint and sleek lines\n",
+      "They're a sight to behold, oh so fine\n",
+      "They race and glide with graceful ease\n",
+      "Fast cars, oh how they please\n",
+      "\n",
+      "So here's to the fast cars, a poem of praise\n",
+      "For the thrill and joy they bring to our days\n",
+      "May their engines roar and their wheels spin\n",
+      "Forever and always, let them win."
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "llama_print_timings:        load time =  8161.74 ms\n",
+      "llama_print_timings:      sample time =   160.54 ms /   201 runs   (    0.80 ms per token,  1252.01 tokens per second)\n",
+      "llama_print_timings: prompt eval time =  1127.02 ms /    14 tokens (   80.50 ms per token,    12.42 tokens per second)\n",
+      "llama_print_timings:        eval time =  6295.21 ms /   200 runs   (   31.48 ms per token,    31.77 tokens per second)\n",
+      "llama_print_timings:       total time =  7947.75 ms\n"
+     ]
+    }
+   ],
+   "source": [
+    "response_iter = llm.stream_complete(\"Can you write me a poem about fast cars?\")\n",
+    "for response in response_iter:\n",
+    "    print(response.delta, end=\"\", flush=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "eedcd31d",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "llama-index",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/docs/examples/metadata_extraction/MetadataExtractionSEC.ipynb b/docs/examples/metadata_extraction/MetadataExtractionSEC.ipynb
index a93227688d..415573fb35 100644
--- a/docs/examples/metadata_extraction/MetadataExtractionSEC.ipynb
+++ b/docs/examples/metadata_extraction/MetadataExtractionSEC.ipynb
@@ -26,7 +26,13 @@
    "source": [
     "import nest_asyncio\n",
     "\n",
-    "nest_asyncio.apply()"
+    "nest_asyncio.apply()\n",
+    "\n",
+    "import os\n",
+    "import openai\n",
+    "\n",
+    "os.environ[\"OPENAI_API_KEY\"] = \"YOUR_API_KEY_HERE\"\n",
+    "openai.api_key = os.environ[\"OPENAI_API_KEY\"]"
    ]
   },
   {
@@ -38,22 +44,21 @@
    },
    "outputs": [],
    "source": [
-    "from llama_index import ListIndex, LLMPredictor\n",
+    "from llama_index import ServiceContext\n",
     "from llama_index.llms import OpenAI\n",
-    "from llama_index import download_loader, VectorStoreIndex, ServiceContext\n",
     "from llama_index.schema import MetadataMode"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 3,
    "id": "a0231dff-7443-46bf-9b9d-759198d3408e",
    "metadata": {
     "tags": []
    },
    "outputs": [],
    "source": [
-    "llm = OpenAI(temperature=0, model=\"text-davinci-003\", max_tokens=512)"
+    "llm = OpenAI(temperature=0.1, model=\"gpt-3.5-turbo\", max_tokens=512)"
    ]
   },
   {
@@ -70,7 +75,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 4,
    "id": "3bda151d-6fb8-427e-82fc-0f3bb469d705",
    "metadata": {
     "tags": []
@@ -107,11 +112,11 @@
     "\n",
     "metadata_extractor = MetadataExtractor(\n",
     "    extractors=[\n",
-    "        TitleExtractor(nodes=5),\n",
-    "        QuestionsAnsweredExtractor(questions=3),\n",
+    "        TitleExtractor(nodes=5, llm=llm),\n",
+    "        QuestionsAnsweredExtractor(questions=3, llm=llm),\n",
     "        # EntityExtractor(prediction_threshold=0.5),\n",
-    "        # SummaryExtractor(summaries=[\"prev\", \"self\"]),\n",
-    "        # KeywordExtractor(keywords=10),\n",
+    "        # SummaryExtractor(summaries=[\"prev\", \"self\"], llm=llm),\n",
+    "        # KeywordExtractor(keywords=10, llm=llm),\n",
     "        # CustomExtractor()\n",
     "    ],\n",
     ")\n",
@@ -124,14 +129,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 5,
    "id": "c72c45a9-dcad-4925-b2f7-d25fe5d80c2d",
    "metadata": {
     "tags": []
    },
    "outputs": [],
    "source": [
-    "from llama_index import SimpleDirectoryReader, DocumentSummaryIndex"
+    "from llama_index import SimpleDirectoryReader"
    ]
   },
   {
@@ -157,7 +162,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 7,
    "id": "38a46bf6-9539-4ac2-ad97-eb909992b94d",
    "metadata": {
     "tags": []
@@ -173,7 +178,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 8,
    "id": "269f8ecc-489d-435f-9d81-a9c64fd4d400",
    "metadata": {
     "tags": []
@@ -185,7 +190,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 9,
    "id": "8da4d824-d518-4d37-8322-a35adac05157",
    "metadata": {
     "tags": []
@@ -196,11 +201,11 @@
       "text/plain": [
        "{'page_label': '2',\n",
        " 'file_name': '10k-132.pdf',\n",
-       " 'document_title': 'Uber Technologies, Inc. 2019 Annual Report: Revolutionizing Mobility and Logistics Across 69 Countries and 111 Million MAPCs with $65 Billion in Gross Bookings',\n",
-       " 'questions_this_excerpt_can_answer': '\\n\\n1. How many countries does Uber Technologies, Inc. operate in?\\n2. What is the total number of MAPCs served by Uber Technologies, Inc.?\\n3. How much gross bookings did Uber Technologies, Inc. generate in 2019?'}"
+       " 'document_title': 'Exploring the Diverse Landscape of 2019: A Comprehensive Annual Report on Uber Technologies, Inc.',\n",
+       " 'questions_this_excerpt_can_answer': '1. How many countries does Uber operate in?\\n2. What is the total gross bookings of Uber in 2019?\\n3. How many trips did Uber facilitate in 2019?'}"
       ]
      },
-     "execution_count": 8,
+     "execution_count": 9,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -211,7 +216,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 10,
    "id": "93e70bfb-6c02-401b-be91-3827f358b22c",
    "metadata": {
     "tags": []
@@ -227,7 +232,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 11,
    "id": "e3720b40-c50c-4185-aaf4-289ff8ab057e",
    "metadata": {
     "tags": []
@@ -239,7 +244,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 12,
    "id": "98740f96-afdd-45ff-bcc0-2c50965a7349",
    "metadata": {
     "tags": []
@@ -250,11 +255,11 @@
       "text/plain": [
        "{'page_label': '2',\n",
        " 'file_name': '10k-vFinal.pdf',\n",
-       " 'document_title': \"Lyft, Inc. 2021 Annual Meeting of Stockholders: Filing and Attestation of Management's Assessment Report, Filer Status, Internal Control Assessment, Shell Company Status, Market Value of Common Stock, and Analysis of Historical Financial Performance.\",\n",
-       " 'questions_this_excerpt_can_answer': '\\n\\n1. What is the status of the registrant as an accelerated filer?\\n2. Has the registrant filed a report on and attestation to its management’s assessment of the effectiveness of its internal control over financial reporting?\\n3. What is the total number of shares of Class A and Class B common stock outstanding as of February 22, 2021?'}"
+       " 'document_title': 'Lyft, Inc. Annual Report on Form 10-K for the Fiscal Year Ended December 31, 2020',\n",
+       " 'questions_this_excerpt_can_answer': \"1. Has Lyft, Inc. filed a report on and attestation to its management's assessment of the effectiveness of its internal control over financial reporting under Section 404(b) of the Sarbanes-Oxley Act?\\n2. Is Lyft, Inc. considered a shell company according to Rule 12b-2 of the Exchange Act?\\n3. What was the aggregate market value of Lyft, Inc.'s common stock held by non-affiliates on June 30, 2020?\"}"
       ]
      },
-     "execution_count": 11,
+     "execution_count": 12,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -274,7 +279,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 13,
    "id": "302bb085-86cc-4b76-a452-67bc826b292d",
    "metadata": {
     "tags": []
@@ -306,7 +311,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 14,
    "id": "37dd8992-3716-44da-9309-154fb5946e98",
    "metadata": {
     "tags": []
@@ -345,7 +350,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 15,
    "id": "b8ff619d-67ed-4263-bfc7-2a7a1b7320e7",
    "metadata": {
     "tags": []
@@ -353,27 +358,31 @@
    "outputs": [],
    "source": [
     "from llama_index import VectorStoreIndex\n",
-    "from llama_index.vector_stores import FaissVectorStore\n",
     "from llama_index.query_engine import SubQuestionQueryEngine\n",
     "from llama_index.tools import QueryEngineTool, ToolMetadata"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 25,
    "id": "028a65d7-8065-4798-acec-1c3486633e14",
    "metadata": {
     "tags": []
    },
    "outputs": [],
    "source": [
-    "index_no_metadata = VectorStoreIndex(nodes=nodes_no_metadata)\n",
-    "engine_no_metadata = index_no_metadata.as_query_engine(similarity_top_k=10)"
+    "index_no_metadata = VectorStoreIndex(\n",
+    "    nodes=nodes_no_metadata,\n",
+    "    service_context=ServiceContext.from_defaults(llm=OpenAI(model=\"gpt-4\")),\n",
+    ")\n",
+    "engine_no_metadata = index_no_metadata.as_query_engine(\n",
+    "    similarity_top_k=10,\n",
+    ")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 26,
    "id": "73ea9e05-ff5a-49b6-8e52-139d156cde47",
    "metadata": {
     "tags": []
@@ -397,7 +406,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 27,
    "id": "fd5a3e51-e252-4e24-bc2b-fbc32ce078dd",
    "metadata": {
     "tags": []
@@ -408,37 +417,23 @@
      "output_type": "stream",
      "text": [
       "Generated 4 sub questions.\n",
-      "\u001b[36;1m\u001b[1;3m[sec_filing_documents] Q: By first identifying and quoting the most relevant sources, what was the cost due to research and development for Uber in 2019 in millions of USD?\n",
-      "\u001b[0m\u001b[33;1m\u001b[1;3m[sec_filing_documents] Q: By first identifying and quoting the most relevant sources, what was the cost due to sales and marketing for Uber in 2019 in millions of USD?\n",
-      "\u001b[0m\u001b[38;5;200m\u001b[1;3m[sec_filing_documents] Q: By first identifying and quoting the most relevant sources, what was the cost due to research and development for Lyft in 2019 in millions of USD?\n",
-      "\u001b[0m\u001b[32;1m\u001b[1;3m[sec_filing_documents] Q: By first identifying and quoting the most relevant sources, what was the cost due to sales and marketing for Lyft in 2019 in millions of USD?\n",
-      "\u001b[0m\u001b[38;5;200m\u001b[1;3m[sec_filing_documents] A: \n",
-      "\n",
-      "According to the excerpt from page_label: 69 and file_name: 10k-132.pdf, the cost due to research and development for Lyft in 2019 was 15% of total revenue, or $1.5 billion in millions of USD.\n",
-      "\u001b[0m\u001b[33;1m\u001b[1;3m[sec_filing_documents] A: \n",
-      "\n",
-      "According to the excerpt from page_label 66 of the 10k-132.pdf document, the cost due to sales and marketing for Uber in 2019 was $397.8 million in thousands of USD, or $397,800,000 in millions of USD. This cost was primarily attributable to continued investments within Uber's non-Rides offerings and an increase in corporate overhead as the business grows.\n",
-      "\u001b[0m\u001b[36;1m\u001b[1;3m[sec_filing_documents] A: \n",
-      "\n",
-      "According to the excerpt from page 69 of the document 10k-132.pdf, research and development expenses for Uber in 2019 were $909.1 million in thousands of USD, which equates to $909.1 million in millions of USD. This was 9% of the total costs and expenses for the year, and accounted for 34% of the total revenue.\n",
-      "\u001b[0m\u001b[32;1m\u001b[1;3m[sec_filing_documents] A: \n",
-      "\n",
-      "According to the excerpt from page_label 72 of the 10k-vFinal.pdf document, the cost due to sales and marketing for Lyft in 2019 was $275.1 million in thousands of USD. This can be seen in the following excerpt: \n",
-      "\n",
-      "\"Sales and marketing $ 196,437 $ 194,184 $ 163,858 $ 180,951 $ 275,129 \n",
-      "Year Ended December 31,2019 to 2020 \n",
-      "% Change2018 to 2019 \n",
-      "% Change\"\n",
-      "\u001b[0mAnswer: \n",
-      "{\n",
-      "    \"Uber\": {\n",
-      "        \"Research and Development\": 909.1,\n",
-      "        \"Sales and Marketing\": 397.8\n",
-      "    },\n",
-      "    \"Lyft\": {\n",
-      "        \"Research and Development\": 1.5,\n",
-      "        \"Sales and Marketing\": 275.1\n",
-      "    }\n",
+      "\u001b[36;1m\u001b[1;3m[sec_filing_documents] Q: What was the cost due to research and development for Uber in 2019\n",
+      "\u001b[0m\u001b[33;1m\u001b[1;3m[sec_filing_documents] Q: What was the cost due to sales and marketing for Uber in 2019\n",
+      "\u001b[0m\u001b[38;5;200m\u001b[1;3m[sec_filing_documents] Q: What was the cost due to research and development for Lyft in 2019\n",
+      "\u001b[0m\u001b[32;1m\u001b[1;3m[sec_filing_documents] Q: What was the cost due to sales and marketing for Lyft in 2019\n",
+      "\u001b[0m\u001b[33;1m\u001b[1;3m[sec_filing_documents] A: The cost due to sales and marketing for Uber in 2019 was $814,122 in thousands.\n",
+      "\u001b[0m\u001b[36;1m\u001b[1;3m[sec_filing_documents] A: The cost due to research and development for Uber in 2019 was $1,505,640 in thousands.\n",
+      "\u001b[0m\u001b[38;5;200m\u001b[1;3m[sec_filing_documents] A: The cost of research and development for Lyft in 2019 was $1,505,640 in thousands.\n",
+      "\u001b[0m\u001b[32;1m\u001b[1;3m[sec_filing_documents] A: The cost due to sales and marketing for Lyft in 2019 was $814,122 in thousands.\n",
+      "\u001b[0m{\n",
+      "  \"Uber\": {\n",
+      "    \"Research and Development\": 1505.64,\n",
+      "    \"Sales and Marketing\": 814.122\n",
+      "  },\n",
+      "  \"Lyft\": {\n",
+      "    \"Research and Development\": 1505.64,\n",
+      "    \"Sales and Marketing\": 814.122\n",
+      "  }\n",
       "}\n"
      ]
     }
@@ -462,7 +457,7 @@
    "id": "e9dafdad-c18c-4e0f-8a35-b691ca73e1f2",
    "metadata": {},
    "source": [
-    "**RESULT**: As we can see, the QnA agent does not seem to know where to look for the right documents. As a result it gets only 1/4 of the subquestions right."
+    "**RESULT**: As we can see, the QnA agent does not seem to know where to look for the right documents. As a result it gets the Lyft and Uber data completely mixed up."
    ]
   },
   {
@@ -476,7 +471,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 18,
    "id": "97f00a18-e9e6-47db-bef5-cbf5bb5016be",
    "metadata": {
     "tags": []
@@ -490,12 +485,7 @@
       " [Excerpt from document]\n",
       "page_label: 65\n",
       "file_name: 10k-132.pdf\n",
-      "document_title: Uber Technologies, Inc. 2019 Annual Report: Revolutionizing Mobility and Logistics Across 69 Countries and 111 Million MAPCs with $65 Billion in Gross Bookings\n",
-      "questions_this_excerpt_can_answer: \n",
-      "\n",
-      "1. What is Uber Technologies, Inc.'s definition of Adjusted EBITDA?\n",
-      "2. How much did Adjusted EBITDA change from 2017 to 2018?\n",
-      "3. How much did Adjusted EBITDA change from 2018 to 2019?\n",
+      "document_title: Exploring the Diverse Landscape of 2019: A Comprehensive Annual Report on Uber Technologies, Inc.\n",
       "Excerpt:\n",
       "-----\n",
       "See the section titled “Reconciliations of Non-GAAP Financial Measures” for our definition and a \n",
@@ -517,20 +507,25 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 22,
    "id": "c7d255de-3034-4035-93bc-45d535ce1700",
    "metadata": {
     "tags": []
    },
    "outputs": [],
    "source": [
-    "index = VectorStoreIndex(nodes=uber_nodes + lyft_nodes)\n",
-    "engine = index.as_query_engine(similarity_top_k=10)"
+    "index = VectorStoreIndex(\n",
+    "    nodes=uber_nodes + lyft_nodes,\n",
+    "    service_context=ServiceContext.from_defaults(llm=OpenAI(model=\"gpt-4\")),\n",
+    ")\n",
+    "engine = index.as_query_engine(\n",
+    "    similarity_top_k=10,\n",
+    ")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 23,
    "id": "bbe42516-a2ca-4986-9012-cb15682323f5",
    "metadata": {
     "tags": []
@@ -554,7 +549,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 24,
    "id": "f48ac2d9-58e9-4b98-9bad-b8ce1eea7934",
    "metadata": {
     "tags": []
@@ -565,32 +560,23 @@
      "output_type": "stream",
      "text": [
       "Generated 4 sub questions.\n",
-      "\u001b[36;1m\u001b[1;3m[sec_filing_documents] Q: By first identifying and quoting the most relevant sources, what was the cost due to research and development for Uber in 2019 in millions of USD?\n",
-      "\u001b[0m\u001b[33;1m\u001b[1;3m[sec_filing_documents] Q: By first identifying and quoting the most relevant sources, what was the cost due to sales and marketing for Uber in 2019 in millions of USD?\n",
-      "\u001b[0m\u001b[38;5;200m\u001b[1;3m[sec_filing_documents] Q: By first identifying and quoting the most relevant sources, what was the cost due to research and development for Lyft in 2019 in millions of USD?\n",
-      "\u001b[0m\u001b[32;1m\u001b[1;3m[sec_filing_documents] Q: By first identifying and quoting the most relevant sources, what was the cost due to sales and marketing for Lyft in 2019 in millions of USD?\n",
-      "\u001b[0m\u001b[38;5;200m\u001b[1;3m[sec_filing_documents] A: \n",
-      "\n",
-      "According to the excerpt from the document, Lyft spent $1,505 million on research and development in 2019. This was 34% of total costs and expenses for the year.\n",
-      "\u001b[0m\u001b[33;1m\u001b[1;3m[sec_filing_documents] A: \n",
-      "\n",
-      "According to the excerpt from page 69 of the Uber Technologies, Inc. 2019 Annual Report, the cost due to sales and marketing for Uber in 2019 was $4,626 million in USD. This cost was driven by investments in non-Rides offerings, corporate overhead, and Driver incentives.\n",
-      "\u001b[0m\u001b[36;1m\u001b[1;3m[sec_filing_documents] A: \n",
-      "\n",
-      "According to the excerpt from page 69 of the document, Uber Technologies, Inc. spent $4.836 billion on research and development in 2019, which was driven by a 22% increase in MAPCs due to global expansion of their Eats product offerings combined with wider market adoption of their Rides product, and overall growth in their other offerings.\n",
-      "\u001b[0m\u001b[32;1m\u001b[1;3m[sec_filing_documents] A: \n",
-      "\n",
-      "According to the excerpt from page 69 of the document titled \"Lyft, Inc. 2021 Annual Meeting of Stockholders: Filing and Attestation of Management's Assessment Report, Filer Status, Internal Control Assessment, Shell Company Status, Market Value of Common Stock, and Analysis of Historical Financial Performance,\" the cost due to sales and marketing for Lyft in 2019 was $814.122 million in USD. This can be found in the table under the heading \"2020 2019 2018 (in thousands)\" which states that \"Sales and marketing $416,331 $814,122 $803,751.\"\n",
-      "\u001b[0mAnswer: \n",
-      "{\n",
-      "    \"Uber\": {\n",
-      "        \"Research and Development\": 4.836,\n",
-      "        \"Sales and Marketing\": 4.626\n",
-      "    },\n",
-      "    \"Lyft\": {\n",
-      "        \"Research and Development\": 1.505,\n",
-      "        \"Sales and Marketing\": 0.814\n",
-      "    }\n",
+      "\u001b[36;1m\u001b[1;3m[sec_filing_documents] Q: What was the cost due to research and development for Uber in 2019\n",
+      "\u001b[0m\u001b[33;1m\u001b[1;3m[sec_filing_documents] Q: What was the cost due to sales and marketing for Uber in 2019\n",
+      "\u001b[0m\u001b[38;5;200m\u001b[1;3m[sec_filing_documents] Q: What was the cost due to research and development for Lyft in 2019\n",
+      "\u001b[0m\u001b[32;1m\u001b[1;3m[sec_filing_documents] Q: What was the cost due to sales and marketing for Lyft in 2019\n",
+      "\u001b[0m\u001b[33;1m\u001b[1;3m[sec_filing_documents] A: The cost due to sales and marketing for Uber in 2019 was $4,626 million.\n",
+      "\u001b[0m\u001b[36;1m\u001b[1;3m[sec_filing_documents] A: The cost due to research and development for Uber in 2019 was $4,836 million.\n",
+      "\u001b[0m\u001b[32;1m\u001b[1;3m[sec_filing_documents] A: The cost due to sales and marketing for Lyft in 2019 was $814,122 in thousands.\n",
+      "\u001b[0m\u001b[38;5;200m\u001b[1;3m[sec_filing_documents] A: The cost of research and development for Lyft in 2019 was $1,505,640 in thousands.\n",
+      "\u001b[0m{\n",
+      "  \"Uber\": {\n",
+      "    \"Research and Development\": 4836,\n",
+      "    \"Sales and Marketing\": 4626\n",
+      "  },\n",
+      "  \"Lyft\": {\n",
+      "    \"Research and Development\": 1505.64,\n",
+      "    \"Sales and Marketing\": 814.122\n",
+      "  }\n",
       "}\n"
      ]
     }
@@ -640,9 +626,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "llama_index_jon",
+   "display_name": "llama-index",
    "language": "python",
-   "name": "llama_index_jon"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
@@ -654,7 +640,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.8"
+   "version": "3.9.6"
   }
  },
  "nbformat": 4,
diff --git a/docs/examples/node_postprocessor/MetadataReplacementDemo.ipynb b/docs/examples/node_postprocessor/MetadataReplacementDemo.ipynb
new file mode 100644
index 0000000000..66b190ad71
--- /dev/null
+++ b/docs/examples/node_postprocessor/MetadataReplacementDemo.ipynb
@@ -0,0 +1,488 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Metadata Replacement + Node Sentence Window\n",
+    "\n",
+    "In this notebook, we use the `SentenceWindowNodeParser` to parse documents into single sentences per node. Each node also contains a \"window\" with the sentences on either side of the node sentence.\n",
+    "\n",
+    "Then, during retrieval, before passing the retrieved sentences to the LLM, the single sentences are replaced with a window containing the surrounding sentences using the `MetadataReplacementNodePostProcessor`.\n",
+    "\n",
+    "This is most useful for large documents/indexes, as it helps to retrieve more fine-grained details.\n",
+    "\n",
+    "By default, the sentence window is 5 sentences on either side of the original sentence.\n",
+    "\n",
+    "In this case, chunk size settings are not used, in favor of following the window settings."
+   ]
+  },
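As a minimal, illustrative sketch (a toy document, not part of the notebook below), each parsed node keeps a single sentence as its text and stores the surrounding sentences under the window metadata key:

```python
# Toy sketch only: inspect what SentenceWindowNodeParser produces.
# Assumes the default metadata keys ("window" and "original_text") shown in the setup below.
from llama_index import Document
from llama_index.node_parser import SentenceWindowNodeParser

parser = SentenceWindowNodeParser.from_defaults(window_size=3)
nodes = parser.get_nodes_from_documents(
    [Document(text="One. Two. Three. Four. Five. Six. Seven.")]
)

print(nodes[3].text)                       # the single original sentence ("Four.")
print(nodes[3].metadata["window"])         # that sentence plus up to 3 neighbours per side
print(nodes[3].metadata["original_text"])  # same sentence, kept for later replacement
```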
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import openai\n",
+    "\n",
+    "os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"\n",
+    "openai.api_key = os.environ[\"OPENAI_API_KEY\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.embeddings import HuggingFaceEmbeddings\n",
+    "from llama_index import ServiceContext, set_global_service_context\n",
+    "from llama_index.llms import OpenAI\n",
+    "from llama_index.embeddings import OpenAIEmbedding\n",
+    "from llama_index.node_parser import SentenceWindowNodeParser\n",
+    "\n",
+    "# create the sentence window node parser w/ default settings\n",
+    "node_parser = SentenceWindowNodeParser.from_defaults(\n",
+    "    window_size=3,\n",
+    "    window_metadata_key=\"window\",\n",
+    "    original_text_metadata_key=\"original_text\",\n",
+    ")\n",
+    "\n",
+    "llm = OpenAI(model=\"gpt-3.5-turbo\", temperature=0.1)\n",
+    "ctx = ServiceContext.from_defaults(\n",
+    "    llm=llm,\n",
+    "    embed_model=HuggingFaceEmbeddings(\n",
+    "        model_name=\"sentence-transformers/all-mpnet-base-v2\"\n",
+    "    ),\n",
+    "    node_parser=node_parser,\n",
+    ")\n",
+    "\n",
+    "# if you wanted to use OpenAIEmbeddings, we should also increase the batch size,\n",
+    "# since it involves many more calls to the API\n",
+    "# ctx = ServiceContext.from_defaults(llm=llm, embed_model=OpenAIEmbedding(embed_batch_size=50)), node_parser=node_parser)"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Build the index\n",
+    "\n",
+    "Here, we build an index using chapter 3 of the recent IPCC climate report."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
+      "To disable this warning, you can either:\n",
+      "\t- Avoid using `tokenizers` before the fork if possible\n",
+      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
+      "  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current\n",
+      "                                 Dload  Upload   Total   Spent    Left  Speed\n",
+      "100 20.7M  100 20.7M    0     0  20.7M      0 --:--:-- --:--:--  0:00:02 8706k--:--:-- --:--:-- 20.8M\n"
+     ]
+    }
+   ],
+   "source": [
+    "!curl https://www.ipcc.ch/report/ar6/wg2/downloads/report/IPCC_AR6_WGII_Chapter03.pdf --output IPCC_AR6_WGII_Chapter03.pdf"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index import SimpleDirectoryReader\n",
+    "\n",
+    "documents = SimpleDirectoryReader(\n",
+    "    input_files=[\"./IPCC_AR6_WGII_Chapter03.pdf\"]\n",
+    ").load_data()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index import VectorStoreIndex\n",
+    "\n",
+    "sentence_index = VectorStoreIndex.from_documents(documents, service_context=ctx)"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Querying\n",
+    "\n",
+    "### With MetadataReplacementPostProcessor\n",
+    "\n",
+    "Here, we now use the `MetadataReplacementPostProcessor` to replace the sentence in each node with it's surrounding context."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "There is low confidence in the quantification of AMOC changes in the 20th century due to low agreement in quantitative reconstructed and simulated trends. Additionally, direct observational records since the mid-2000s remain too short to determine the relative contributions of internal variability, natural forcing, and anthropogenic forcing to AMOC change. However, it is very likely that AMOC will decline over the 21st century for all SSP scenarios, but there will not be an abrupt collapse before 2100.\n"
+     ]
+    }
+   ],
+   "source": [
+    "from llama_index.indices.postprocessor import MetadataReplacementPostProcessor\n",
+    "\n",
+    "query_engine = sentence_index.as_query_engine(\n",
+    "    similarity_top_k=2,\n",
+    "    # the target key defaults to `window` to match the node_parser's default\n",
+    "    node_postprocessors=[\n",
+    "        MetadataReplacementPostProcessor(target_metadata_key=\"window\")\n",
+    "    ],\n",
+    ")\n",
+    "window_response = query_engine.query(\"What are the concerns surrounding the AMOC?\")\n",
+    "print(window_response)"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We can also check the original sentence that was retrieved for each node, as well as the actual window of sentences that was sent to the LLM."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Window: Nevertheless, projected future annual cumulative upwelling wind \n",
+      "changes at most locations and seasons remain within ±10–20% of \n",
+      "present-day values (medium confidence) (WGI AR6 Section  9.2.3.5; \n",
+      "Fox-Kemper et al., 2021). Continuous observation of the Atlantic meridional overturning \n",
+      "circulation (AMOC) has improved the understanding of its variability \n",
+      "(Frajka-Williams et  al., 2019), but there is low confidence in the \n",
+      "quantification of AMOC changes in the 20th century because of low \n",
+      "agreement in quantitative reconstructed and simulated trends (WGI \n",
+      "AR6 Sections 2.3.3, 9.2.3.1; Fox-Kemper et al., 2021; Gulev et al., 2021). Direct observational records since the mid-2000s remain too short to \n",
+      "determine the relative contributions of internal variability, natural \n",
+      "forcing and anthropogenic forcing to AMOC change (high confidence) \n",
+      "(WGI AR6 Sections 2.3.3, 9.2.3.1; Fox-Kemper et al., 2021; Gulev et al., \n",
+      "2021). Over the 21st century, AMOC will very likely decline for all SSP \n",
+      "scenarios but will not involve an abrupt collapse before 2100 (WGI \n",
+      "AR6 Sections 4.3.2, 9.2.3.1; Fox-Kemper et al., 2021; Lee et al., 2021). 3.2.2.4 Sea Ice Changes\n",
+      "Sea ice is a key driver of polar marine life, hosting unique ecosystems \n",
+      "and affecting diverse marine organisms and food webs through its \n",
+      "impact on light penetration and supplies of nutrients and organic \n",
+      "matter (Arrigo, 2014). Since the late 1970s, Arctic sea ice area has \n",
+      "decreased for all months, with an estimated decrease of 2 million km2 \n",
+      "(or 25%) for summer sea ice (averaged for August, September and \n",
+      "October) in 2010–2019 as compared with 1979–1988 (WGI AR6 \n",
+      "Section 9.3.1.1; Fox-Kemper et al., 2021).\n",
+      "------------------\n",
+      "Original Sentence: Over the 21st century, AMOC will very likely decline for all SSP \n",
+      "scenarios but will not involve an abrupt collapse before 2100 (WGI \n",
+      "AR6 Sections 4.3.2, 9.2.3.1; Fox-Kemper et al., 2021; Lee et al., 2021).\n"
+     ]
+    }
+   ],
+   "source": [
+    "window = window_response.source_nodes[0].node.metadata[\"window\"]\n",
+    "sentence = window_response.source_nodes[0].node.metadata[\"original_text\"]\n",
+    "\n",
+    "print(f\"Window: {window}\")\n",
+    "print(\"------------------\")\n",
+    "print(f\"Original Sentence: {sentence}\")"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Contrast with normal VectorStoreIndex"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.embeddings import HuggingFaceEmbeddings\n",
+    "from llama_index import VectorStoreIndex\n",
+    "from llama_index import ServiceContext, set_global_service_context\n",
+    "from llama_index.llms import OpenAI\n",
+    "\n",
+    "llm = OpenAI(model=\"gpt-3.5-turbo\", temperature=0.1)\n",
+    "ctx = ServiceContext.from_defaults(\n",
+    "    llm=llm,\n",
+    "    embed_model=HuggingFaceEmbeddings(\n",
+    "        model_name=\"sentence-transformers/all-mpnet-base-v2\"\n",
+    "    ),\n",
+    ")\n",
+    "\n",
+    "vector_index = VectorStoreIndex.from_documents(documents, service_context=ctx)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "I'm sorry, but the concerns surrounding the AMOC (Atlantic Meridional Overturning Circulation) are not mentioned in the provided context.\n"
+     ]
+    }
+   ],
+   "source": [
+    "query_engine = vector_index.as_query_engine(similarity_top_k=2)\n",
+    "vector_response = query_engine.query(\"What are the concerns surrounding the AMOC?\")\n",
+    "print(vector_response)"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Well, that didn't work. Let's bump up the top k! This will be slower and use more tokens compared to the sentence window index."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The context information does not provide any specific concerns surrounding the AMOC (Atlantic Meridional Overturning Circulation).\n"
+     ]
+    }
+   ],
+   "source": [
+    "query_engine = vector_index.as_query_engine(similarity_top_k=5)\n",
+    "vector_response = query_engine.query(\"What are the concerns surrounding the AMOC?\")\n",
+    "print(vector_response)"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Analysis\n",
+    "\n",
+    "So the `SentenceWindowNodeParser` + `MetadataReplacementNodePostProcessor` combo is the clear winner here. But why?\n",
+    "\n",
+    "Embeddings at a sentence level seem to capture more fine-grained details, like the word `AMOC`.\n",
+    "\n",
+    "We can also compare the retrieved chunks for each index!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Over the 21st century, AMOC will very likely decline for all SSP \n",
+      "scenarios but will not involve an abrupt collapse before 2100 (WGI \n",
+      "AR6 Sections 4.3.2, 9.2.3.1; Fox-Kemper et al., 2021; Lee et al., 2021).\n",
+      "--------\n",
+      "Direct observational records since the mid-2000s remain too short to \n",
+      "determine the relative contributions of internal variability, natural \n",
+      "forcing and anthropogenic forcing to AMOC change (high confidence) \n",
+      "(WGI AR6 Sections 2.3.3, 9.2.3.1; Fox-Kemper et al., 2021; Gulev et al., \n",
+      "2021).\n",
+      "--------\n"
+     ]
+    }
+   ],
+   "source": [
+    "for source_node in window_response.source_nodes:\n",
+    "    print(source_node.node.metadata[\"original_text\"])\n",
+    "    print(\"--------\")"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Here, we can see that the sentence window index easily retrieved two nodes that talk about AMOC. Remember, the embeddings are based purely on the original sentence here, but the LLM actually ends up reading the surrounding context as well!"
+   ]
+  },
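+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As a quick check (assuming the default `window` metadata key used by the `SentenceWindowNodeParser` earlier in this notebook), we can also print the surrounding window that the LLM actually read for the first retrieved node:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# the embedding was computed on `original_text`, but the LLM sees the full `window`\n",
+    "print(window_response.source_nodes[0].node.metadata[\"window\"])"
+   ]
+  },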
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now, let's try and disect why the naive vector index failed."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "AMOC mentioned? False\n",
+      "--------\n",
+      "AMOC mentioned? False\n",
+      "--------\n",
+      "AMOC mentioned? True\n",
+      "--------\n",
+      "AMOC mentioned? False\n",
+      "--------\n",
+      "AMOC mentioned? False\n",
+      "--------\n"
+     ]
+    }
+   ],
+   "source": [
+    "for node in vector_response.source_nodes:\n",
+    "    print(\"AMOC mentioned?\", \"AMOC\" in node.node.text)\n",
+    "    print(\"--------\")"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "So source node at index [2] mentions AMOC, but what did this text actually look like?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Nevertheless, projected future annual cumulative upwelling wind \n",
+      "changes at most locations and seasons remain within ±10–20% of \n",
+      "present-day values (medium confidence) (WGI AR6 Section  9.2.3.5; \n",
+      "Fox-Kemper et al., 2021).Continuous observation of the Atlantic meridional overturning \n",
+      "circulation (AMOC) has improved the understanding of its variability \n",
+      "(Frajka-Williams et  al., 2019), but there is low confidence in the \n",
+      "quantification of AMOC changes in the 20th century because of low \n",
+      "agreement in quantitative reconstructed and simulated trends (WGI \n",
+      "AR6 Sections 2.3.3, 9.2.3.1; Fox-Kemper et al., 2021; Gulev et al., 2021).Direct observational records since the mid-2000s remain too short to \n",
+      "determine the relative contributions of internal variability, natural \n",
+      "forcing and anthropogenic forcing to AMOC change (high confidence) \n",
+      "(WGI AR6 Sections 2.3.3, 9.2.3.1; Fox-Kemper et al., 2021; Gulev et al., \n",
+      "2021).Over the 21st century, AMOC will very likely decline for all SSP \n",
+      "scenarios but will not involve an abrupt collapse before 2100 (WGI \n",
+      "AR6 Sections 4.3.2, 9.2.3.1; Fox-Kemper et al., 2021; Lee et al., 2021).3.2.2.4 Sea Ice Changes\n",
+      "Sea ice is a key driver of polar marine life, hosting unique ecosystems \n",
+      "and affecting diverse marine organisms and food webs through its \n",
+      "impact on light penetration and supplies of nutrients and organic \n",
+      "matter (Arrigo, 2014).Since the late 1970s, Arctic sea ice area has \n",
+      "decreased for all months, with an estimated decrease of 2 million km2 \n",
+      "(or 25%) for summer sea ice (averaged for August, September and \n",
+      "October) in 2010–2019 as compared with 1979–1988 (WGI AR6 \n",
+      "Section 9.3.1.1; Fox-Kemper et al., 2021).For Antarctic sea ice there is \n",
+      "no significant global trend in satellite-observed sea ice area from 1979 \n",
+      "to 2020 in either winter or summer, due to regionally opposing trends \n",
+      "and large internal variability (WGI AR6 Section 9.3.2.1; Maksym, 2019; \n",
+      "Fox-Kemper et al., 2021).CMIP6 simulations project that the Arctic Ocean will likely become \n",
+      "practically sea ice free (area below 1 million km2) for the first time before \n",
+      "2050 and in the seasonal sea ice minimum in each of the four emission \n",
+      "scenarios SSP1-1.9, SSP1-2.6, SSP2-4.5 and SSP5-8.5 (Figure 3.7; WGI \n",
+      "AR6 Section 9.3.2.2; Notze and SIMIP Community, 2020; Fox-Kemper \n",
+      "et al., 2021).Antarctic sea ice area is also projected to decrease during \n",
+      "the 21st century, but due to mismatches between model simulations \n",
+      "and observations, combined with a lack of understanding of reasons \n",
+      "for substantial inter-model spread, there is low confidence in model \n",
+      "projections of future Antarctic sea ice changes, particularly at the \n",
+      "regional level (WGI AR6 Section  9.3.2.2; Roach et  al., 2020; Fox-\n",
+      "Kemper et al., 2021).3.2.3 Chemical Changes\n",
+      "3.2.3.1  Ocean Acidification\n",
+      "The ocean’s uptake of anthropogenic carbon affects its chemistry \n",
+      "in a process referred to as ocean acidification, which increases the \n",
+      "concentrations of aqueous CO 2, bicarbonate and hydrogen ions, and \n",
+      "decreases pH, carbonate ion concentrations and calcium carbonate \n",
+      "mineral saturation states (Doney et  al., 2009).Ocean acidification\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(vector_response.source_nodes[2].node.text)"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "So AMOC is disuccsed, but sadly it is in the middle chunk. With LLMs, it is often observed that text in the middle of retrieved context is often ignored or less useful. A recent paper [\"Lost in the Middle\" discusses this here](https://arxiv.org/abs/2307.03172)."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "llama-index",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.6"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/docs/examples/query_engine/json_query_engine.ipynb b/docs/examples/query_engine/json_query_engine.ipynb
index de4e221b4a..77a6a5a136 100644
--- a/docs/examples/query_engine/json_query_engine.ipynb
+++ b/docs/examples/query_engine/json_query_engine.ipynb
@@ -22,10 +22,12 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Requirement already satisfied: jsonpath-ng in /workspaces/llama_index/.venv/lib/python3.10/site-packages (1.5.3)\n",
-      "Requirement already satisfied: ply in /workspaces/llama_index/.venv/lib/python3.10/site-packages (from jsonpath-ng) (3.11)\n",
-      "Requirement already satisfied: decorator in /workspaces/llama_index/.venv/lib/python3.10/site-packages (from jsonpath-ng) (5.1.1)\n",
-      "Requirement already satisfied: six in /workspaces/llama_index/.venv/lib/python3.10/site-packages (from jsonpath-ng) (1.16.0)\n"
+      "Requirement already satisfied: jsonpath-ng in /Users/loganmarkewich/llama_index/llama-index/lib/python3.9/site-packages (1.5.3)\n",
+      "Requirement already satisfied: ply in /Users/loganmarkewich/llama_index/llama-index/lib/python3.9/site-packages (from jsonpath-ng) (3.11)\n",
+      "Requirement already satisfied: six in /Users/loganmarkewich/llama_index/llama-index/lib/python3.9/site-packages (from jsonpath-ng) (1.16.0)\n",
+      "Requirement already satisfied: decorator in /Users/loganmarkewich/llama_index/llama-index/lib/python3.9/site-packages (from jsonpath-ng) (5.1.1)\n",
+      "\u001b[33mWARNING: You are using pip version 21.2.4; however, version 23.2.1 is available.\n",
+      "You should consider upgrading via the '/Users/loganmarkewich/llama_index/llama-index/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n"
      ]
     }
    ],
@@ -55,22 +57,13 @@
    "execution_count": 3,
    "id": "7aa21e46",
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "False"
-      ]
-     },
-     "execution_count": 3,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
-    "import dotenv\n",
+    "import os\n",
+    "import openai\n",
     "\n",
-    "dotenv.load_dotenv(\"../../../.env\")"
+    "os.environ[\"OPENAI_API_KEY\"] = \"YOUR_KEY_HERE\"\n",
+    "openai.api_key = os.environ[\"OPENAI_API_KEY\"]"
    ]
   },
   {
@@ -200,35 +193,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 20,
    "id": "4fea2edb-b3d4-4313-a656-d6edb00d93c0",
    "metadata": {
     "tags": []
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:numexpr.utils:NumExpr defaulting to 2 threads.\n",
-      "NumExpr defaulting to 2 threads.\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/workspaces/llama_index/.venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
-      "  from .autonotebook import tqdm as notebook_tqdm\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "from llama_index.indices.service_context import ServiceContext\n",
     "from llama_index.llms import OpenAI\n",
     "from llama_index.indices.struct_store import JSONQueryEngine\n",
     "\n",
-    "llm = OpenAI(model=\"text-davinci-003\")\n",
+    "llm = OpenAI(model=\"gpt-4\")\n",
     "service_context = ServiceContext.from_defaults(llm=llm)\n",
     "nl_query_engine = JSONQueryEngine(\n",
     "    json_value=json_value, json_schema=json_schema, service_context=service_context\n",
@@ -243,27 +219,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 21,
    "id": "451836bc-b073-4838-8ab8-3def7d2c4d9d",
    "metadata": {
     "tags": []
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 797 tokens\n",
-      "> [query] Total LLM token usage: 797 tokens\n",
-      "INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 0 tokens\n",
-      "> [query] Total embedding token usage: 0 tokens\n",
-      "INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 363 tokens\n",
-      "> [query] Total LLM token usage: 363 tokens\n",
-      "INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 0 tokens\n",
-      "> [query] Total embedding token usage: 0 tokens\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "nl_response = nl_query_engine.query(\n",
     "    \"What comments has Jerry been writing?\",\n",
@@ -275,7 +236,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 22,
    "id": "4253d4c3-f3e5-4779-bcd1-2e6e2818305f",
    "metadata": {
     "tags": []
@@ -284,7 +245,7 @@
     {
      "data": {
       "text/markdown": [
-       "<h1>Natural language Response</h1><br><b> Jerry has written one comment with the content 'Nice post!' on blog post with id 1.</b>"
+       "<h1>Natural language Response</h1><br><b>Jerry has written the comment \"Nice post!\".</b>"
       ],
       "text/plain": [
        "<IPython.core.display.Markdown object>"
@@ -296,7 +257,7 @@
     {
      "data": {
       "text/markdown": [
-       "<h1>Raw JSON Response</h1><br><b>[{\"id\": 1, \"content\": \"Nice post!\", \"username\": \"jerry\", \"blogPostId\": 1}]</b>"
+       "<h1>Raw JSON Response</h1><br><b>[\"Nice post!\"]</b>"
       ],
       "text/plain": [
        "<IPython.core.display.Markdown object>"
@@ -313,7 +274,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 23,
    "id": "5e10b7da-b355-49b2-9f80-f17541d4f850",
    "metadata": {
     "tags": []
@@ -323,7 +284,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      " $.comments[?(@.username == 'jerry')]\n"
+      "$.comments[?(@.username=='jerry')].content\n"
      ]
     }
    ],
@@ -331,6 +292,14 @@
     "# get the json path query string. Same would apply to raw_response\n",
     "print(nl_response.metadata[\"json_path_response_str\"])"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1edb0476",
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
@@ -349,7 +318,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.4"
+   "version": "3.9.6"
   }
  },
  "nbformat": 4,
diff --git a/docs/getting_started/installation.md b/docs/getting_started/installation.md
index 57ada7a915..fa38de4744 100644
--- a/docs/getting_started/installation.md
+++ b/docs/getting_started/installation.md
@@ -1,6 +1,6 @@
 # Installation and Setup
 
-### Installation from Pip
+## Installation from Pip
 
 You can simply do:
 
@@ -8,19 +8,29 @@ You can simply do:
 pip install llama-index
 ```
 
-### Installation from Source
+**NOTE:** LlamaIndex may download and store local files for various packages (NLTK, HuggingFace, ...). Use the environment variable `LLAMA_INDEX_CACHE_DIR` to control where these files are saved.
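+
+For example, you could point the cache at a directory of your choosing (the path below is only an illustration):
+
+```python
+import os
+
+# set this before LlamaIndex downloads any model or package files
+os.environ["LLAMA_INDEX_CACHE_DIR"] = "/tmp/llama_index_cache"
+```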
+
+## Installation from Source
 
 Git clone this repository: `git clone https://github.com/jerryjliu/llama_index.git`. Then do:
 
 - `pip install -e .` if you want to do an editable install (you can modify source files) of just the package itself.
 - `pip install -r requirements.txt` if you want to install optional dependencies + dependencies used for development (e.g. unit testing).
 
-### Environment Setup
+## OpenAI Environment Setup
 
-By default, we use the OpenAI GPT-3 `text-davinci-003` model. In order to use this, you must have an OPENAI_API_KEY setup.
+By default, we use the OpenAI `gpt-3.5-turbo` model for text generation and `text-embedding-ada-002` for retrieval and embeddings. In order to use these models, you must have an OPENAI_API_KEY set up.
 You can register an API key by logging into [OpenAI's page and creating a new API token](https://beta.openai.com/account/api-keys).
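+
+For example, the key can be set in your shell environment or directly in code (replace the placeholder below with your actual key):
+
+```python
+import os
+
+os.environ["OPENAI_API_KEY"] = "YOUR_KEY_HERE"
+```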
 
 ```{tip}
 You can also [customize the underlying LLM](/core_modules/model_modules/llms/usage_custom.md). You may
 need additional environment keys + tokens setup depending on the LLM provider.
 ```
+
+## Local Environment Setup
+
+If you don't wish to use OpenAI, the environment will automatically fall back to using `LlamaCPP` with `llama2-chat-13B` for text generation and `BAAI/bge-small-en` for retrieval and embeddings. These models will all run locally.
+
+In order to use `LlamaCPP`, follow the installation guide [here](/examples/llm/llama_2_llama_cpp.ipynb). You'll need to install the `llama-cpp-python` package, preferably compiled to support your GPU. This will use around 11.5GB of memory across the CPU and GPU.
+
+In order to use the local embeddings, simply run `pip install sentence-transformers`. The local embedding model uses about 500MB of memory.
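+
+For example, a fully local setup can be configured with a sketch like the following (assuming `llama-cpp-python` and `sentence-transformers` are both installed):
+
+```python
+from llama_index import ServiceContext, set_global_service_context
+
+# "local" resolves the LLM to LlamaCPP (llama2-chat-13B) and the embeddings
+# to the local BAAI/bge-small-en model
+service_context = ServiceContext.from_defaults(llm="local", embed_model="local")
+set_global_service_context(service_context)
+```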
diff --git a/experimental/splitter_playground/app.py b/experimental/splitter_playground/app.py
new file mode 100644
index 0000000000..d10eb03b06
--- /dev/null
+++ b/experimental/splitter_playground/app.py
@@ -0,0 +1,123 @@
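+"""Streamlit playground for comparing LlamaIndex and LangChain text splitters side by side."""
+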
+import os
+import tempfile
+from typing import List
+
+import streamlit as st
+import tiktoken
+from langchain.text_splitter import (
+    CharacterTextSplitter,
+    RecursiveCharacterTextSplitter,
+)
+from langchain.text_splitter import TokenTextSplitter as LCTokenTextSplitter
+from streamlit.runtime.uploaded_file_manager import UploadedFile
+
+from llama_index import SimpleDirectoryReader
+from llama_index.schema import Document
+from llama_index.text_splitter import CodeSplitter, SentenceSplitter, TokenTextSplitter
+from llama_index.text_splitter.types import TextSplitter
+
+DEFAULT_TEXT = "The quick brown fox jumps over the lazy dog."
+
+text = st.sidebar.text_area("Enter text", value=DEFAULT_TEXT)
+uploaded_files = st.sidebar.file_uploader("Upload file", accept_multiple_files=True)
+type = st.sidebar.radio("Document Type", options=["Text", "Code"])
+n_cols = st.sidebar.number_input("Columns", value=2, min_value=1, max_value=3)
+assert isinstance(n_cols, int)
+
+
+@st.cache_resource(ttl="1h")
+def load_document(uploaded_files: List[UploadedFile]) -> List[Document]:
+    # Read documents
+    docs = []
+    temp_dir = tempfile.TemporaryDirectory()
+    for file in uploaded_files:
+        temp_filepath = os.path.join(temp_dir.name, file.name)
+        with open(temp_filepath, "wb") as f:
+            f.write(file.getvalue())
+
+    reader = SimpleDirectoryReader(input_dir=temp_dir.name)
+    docs = reader.load_data()
+    return docs
+
+
+if uploaded_files:
+    if text != DEFAULT_TEXT:
+        st.warning("Text will be ignored when uploading files")
+    docs = load_document(uploaded_files)
+    text = "\n".join([doc.text for doc in docs])
+
+
+chunk_size = st.slider(
+    "Chunk Size",
+    value=512,
+    min_value=1,
+    max_value=4096,
+)
+chunk_overlap = st.slider(
+    "Chunk Overlap",
+    value=0,
+    min_value=0,
+    max_value=4096,
+)
+
+cols = st.columns(n_cols)
+for ind, col in enumerate(cols):
+    if type == "Text":
+        text_splitter_cls = col.selectbox(
+            "Text Splitter",
+            options=[
+                "TokenTextSplitter",
+                "SentenceSplitter",
+                "LC:RecursiveCharacterTextSplitter",
+                "LC:CharacterTextSplitter",
+                "LC:TokenTextSplitter",
+            ],
+            index=ind,
+            key=f"splitter_cls_{ind}",
+        )
+
+        text_splitter: TextSplitter
+        if text_splitter_cls == "TokenTextSplitter":
+            text_splitter = TokenTextSplitter(
+                chunk_size=chunk_size, chunk_overlap=chunk_overlap
+            )
+        elif text_splitter_cls == "SentenceSplitter":
+            text_splitter = SentenceSplitter(
+                chunk_size=chunk_size, chunk_overlap=chunk_overlap
+            )
+        elif text_splitter_cls == "LC:RecursiveCharacterTextSplitter":
+            text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
+                chunk_size=chunk_size, chunk_overlap=chunk_overlap
+            )
+        elif text_splitter_cls == "LC:CharacterTextSplitter":
+            text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
+                chunk_size=chunk_size, chunk_overlap=chunk_overlap
+            )
+        elif text_splitter_cls == "LC:TokenTextSplitter":
+            text_splitter = LCTokenTextSplitter(
+                chunk_size=chunk_size, chunk_overlap=chunk_overlap
+            )
+        else:
+            raise ValueError("Unknown text splitter")
+    elif type == "Code":
+        text_splitter_cls = col.selectbox("Text Splitter", options=["CodeSplitter"])
+        if text_splitter_cls == "CodeSplitter":
+            language = col.text_input("Language", value="python")
+            max_chars = col.slider("Max Chars", value=1500)
+
+            text_splitter = CodeSplitter(language=language, max_chars=max_chars)
+        else:
+            raise ValueError("Unknown text splitter")
+
+    chunks = text_splitter.split_text(text)
+    tokenizer = tiktoken.get_encoding("gpt2").encode
+
+    for chunk_ind, chunk in enumerate(chunks):
+        n_tokens = len(tokenizer(chunk))
+        n_chars = len(chunk)
+        col.text_area(
+            f"Chunk {chunk_ind} - {n_tokens} tokens - {n_chars} chars",
+            chunk,
+            key=f"text_area_{ind}_{chunk_ind}",
+            height=500,
+        )
diff --git a/llama_index/bridge/langchain.py b/llama_index/bridge/langchain.py
index fbd51b1eb9..3cf7f58b4d 100644
--- a/llama_index/bridge/langchain.py
+++ b/llama_index/bridge/langchain.py
@@ -1,59 +1,60 @@
 import langchain
 
-# LLMs
-from langchain.llms import BaseLLM, FakeListLLM, OpenAI, AI21, Cohere
-from langchain.chat_models.base import BaseChatModel
-from langchain.chat_models import ChatOpenAI
+# prompts
+from langchain import BasePromptTemplate, PromptTemplate
+from langchain.agents import AgentExecutor, AgentType, initialize_agent
+
+# agents and tools
+from langchain.agents.agent_toolkits.base import BaseToolkit
 from langchain.base_language import BaseLanguageModel
+from langchain.cache import BaseCache, GPTCache
+
+# callback
+from langchain.callbacks.base import BaseCallbackHandler, BaseCallbackManager
+from langchain.chains.prompt_selector import ConditionalPromptSelector, is_chat_model
+from langchain.chat_models import ChatOpenAI
+from langchain.chat_models.base import BaseChatModel
+from langchain.docstore.document import Document
 
 # embeddings
 from langchain.embeddings.base import Embeddings
+from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceBgeEmbeddings
 
-# prompts
-from langchain import PromptTemplate, BasePromptTemplate
-from langchain.chains.prompt_selector import ConditionalPromptSelector, is_chat_model
+# LLMs
+from langchain.llms import AI21, BaseLLM, Cohere, FakeListLLM, OpenAI
+from langchain.memory import ChatMessageHistory, ConversationBufferMemory
+
+# chat and memory
+from langchain.memory.chat_memory import BaseChatMemory
+from langchain.output_parsers import PydanticOutputParser, ResponseSchema
 from langchain.prompts.chat import (
     AIMessagePromptTemplate,
+    BaseMessagePromptTemplate,
     ChatPromptTemplate,
     HumanMessagePromptTemplate,
-    BaseMessagePromptTemplate,
+    SystemMessagePromptTemplate,
 )
 
-# chat and memory
-from langchain.memory.chat_memory import BaseChatMemory
-from langchain.memory import ConversationBufferMemory, ChatMessageHistory
-
-# agents and tools
-from langchain.agents.agent_toolkits.base import BaseToolkit
-from langchain.agents import AgentType
-from langchain.agents import AgentExecutor, initialize_agent
-from langchain.tools import StructuredTool, Tool, BaseTool
-
-# input & output
-from langchain.text_splitter import TextSplitter
-from langchain.output_parsers import ResponseSchema
-from langchain.output_parsers import PydanticOutputParser
-from langchain.input import print_text, get_color_mapping
-
-# callback
-from langchain.callbacks.base import BaseCallbackHandler, BaseCallbackManager
-
 # schema
 from langchain.schema import (
     AIMessage,
-    FunctionMessage,
+    BaseMemory,
     BaseMessage,
+    BaseOutputParser,
+    ChatGeneration,
+    FunctionMessage,
     HumanMessage,
+    LLMResult,
     SystemMessage,
 )
-from langchain.schema import BaseMemory
-from langchain.schema import BaseOutputParser, LLMResult
-from langchain.schema import ChatGeneration
 
 # misc
 from langchain.sql_database import SQLDatabase
-from langchain.cache import GPTCache, BaseCache
-from langchain.docstore.document import Document
+from langchain.input import get_color_mapping, print_text
+
+# input & output
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.tools import BaseTool, StructuredTool, Tool
 
 __all__ = [
     "langchain",
@@ -66,6 +67,8 @@ __all__ = [
     "ChatOpenAI",
     "BaseLanguageModel",
     "Embeddings",
+    "HuggingFaceEmbeddings",
+    "HuggingFaceBgeEmbeddings",
     "PromptTemplate",
     "BasePromptTemplate",
     "ConditionalPromptSelector",
@@ -74,6 +77,7 @@ __all__ = [
     "ChatPromptTemplate",
     "HumanMessagePromptTemplate",
     "BaseMessagePromptTemplate",
+    "SystemMessagePromptTemplate",
     "BaseChatMemory",
     "ConversationBufferMemory",
     "ChatMessageHistory",
@@ -84,7 +88,6 @@ __all__ = [
     "StructuredTool",
     "Tool",
     "BaseTool",
-    "TextSplitter",
     "ResponseSchema",
     "PydanticOutputParser",
     "print_text",
@@ -106,4 +109,5 @@ __all__ = [
     "GPTCache",
     "BaseCache",
     "Document",
+    "RecursiveCharacterTextSplitter",
 ]
diff --git a/llama_index/callbacks/wandb_callback.py b/llama_index/callbacks/wandb_callback.py
index aa8179d617..36d2999760 100644
--- a/llama_index/callbacks/wandb_callback.py
+++ b/llama_index/callbacks/wandb_callback.py
@@ -250,8 +250,9 @@ class WandbCallbackHandler(BaseCallbackHandler):
                 if self._wandb.run:
                     self._wandb.run.log({"trace": root_trace})
                 self._wandb.termlog("Logged trace tree to W&B.")
-        except:  # noqa
-            # Silently ignore errors to not break user code
+        except Exception as e:
+            print(f"Failed to log trace tree to W&B: {e}")
+            # ignore errors to not break user code
             pass
 
     def persist_index(
diff --git a/llama_index/composability/joint_qa_summary.py b/llama_index/composability/joint_qa_summary.py
index 9aafefe292..17c17825d8 100644
--- a/llama_index/composability/joint_qa_summary.py
+++ b/llama_index/composability/joint_qa_summary.py
@@ -8,7 +8,6 @@ from llama_index.indices.service_context import ServiceContext
 from llama_index.indices.vector_store import VectorStoreIndex
 from llama_index.query_engine.router_query_engine import RouterQueryEngine
 from llama_index.schema import Document
-from llama_index.selectors.llm_selectors import LLMSingleSelector
 from llama_index.storage.storage_context import StorageContext
 from llama_index.tools.query_engine import QueryEngineTool
 
@@ -83,8 +82,7 @@ class QASummaryQueryEngineBuilder:
         )
 
         # build query engine
-        query_engine = RouterQueryEngine(
-            selector=LLMSingleSelector.from_defaults(self._service_context),
+        query_engine = RouterQueryEngine.from_defaults(
             query_engine_tools=[
                 QueryEngineTool.from_defaults(
                     vector_query_engine, description=self._qa_text
@@ -93,5 +91,7 @@ class QASummaryQueryEngineBuilder:
                     list_query_engine, description=self._summary_text
                 ),
             ],
+            service_context=self._service_context,
+            select_multi=False,
         )
         return query_engine
diff --git a/llama_index/embeddings/__init__.py b/llama_index/embeddings/__init__.py
index cabaa45a19..d5dc9240e6 100644
--- a/llama_index/embeddings/__init__.py
+++ b/llama_index/embeddings/__init__.py
@@ -3,11 +3,16 @@
 from llama_index.embeddings.google import GoogleUnivSentEncoderEmbedding
 from llama_index.embeddings.langchain import LangchainEmbedding
 from llama_index.embeddings.openai import OpenAIEmbedding
+from llama_index.embeddings.utils import (
+    resolve_embed_model,
+    DEFAULT_HUGGINGFACE_EMBEDDING_MODEL,
+)
 
-DEFAULT_HUGGINGFACE_EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
 
 __all__ = [
     "GoogleUnivSentEncoderEmbedding",
     "LangchainEmbedding",
     "OpenAIEmbedding",
+    "resolve_embed_model",
+    "DEFAULT_HUGGINGFACE_EMBEDDING_MODEL",
 ]
diff --git a/llama_index/embeddings/utils.py b/llama_index/embeddings/utils.py
index 1af024c6f0..a2fa371c8f 100644
--- a/llama_index/embeddings/utils.py
+++ b/llama_index/embeddings/utils.py
@@ -1,6 +1,14 @@
 """Embedding utils for LlamaIndex."""
+import os
+from typing import List, Union
 
-from typing import List
+from llama_index.utils import get_cache_dir
+from llama_index.bridge.langchain import Embeddings as LCEmbeddings
+from llama_index.embeddings.base import BaseEmbedding
+from llama_index.embeddings.langchain import LangchainEmbedding
+from llama_index.embeddings.openai import OpenAIEmbedding
+
+DEFAULT_HUGGINGFACE_EMBEDDING_MODEL = "BAAI/bge-small-en"
 
 
 def save_embedding(embedding: List[float], file_path: str) -> None:
@@ -16,3 +24,52 @@ def load_embedding(file_path: str) -> List[float]:
             embedding = [float(x) for x in line.strip().split(",")]
             break
         return embedding
+
+
+def resolve_embed_model(
+    embed_model: Union[None, str, BaseEmbedding, LCEmbeddings]
+) -> BaseEmbedding:
+    """Resolve embed model."""
+    if embed_model is None:
+        try:
+            return OpenAIEmbedding()
+        except ValueError:
+            embed_model = "local"
+            print(
+                "******\n"
+                "Could not load OpenAIEmbedding. Using HuggingFaceBgeEmbeddings "
+                f"with model_name={DEFAULT_HUGGINGFACE_EMBEDDING_MODEL}. "
+                "Please check your API key if you intended to use OpenAI embeddings."
+                "\n******"
+            )
+
+    if isinstance(embed_model, str):
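+        # string form is either "local" or "local:<hugging face model name>"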
+        splits = embed_model.split(":", 1)
+        is_local = splits[0]
+        model_name = splits[1] if len(splits) > 1 else None
+        if is_local != "local":
+            raise ValueError(
+                "embed_model must start with str 'local' or of type BaseEmbedding"
+            )
+        try:
+            from langchain.embeddings import HuggingFaceBgeEmbeddings
+        except ImportError as exc:
+            raise ImportError(
+                "Could not import sentence_transformers or langchain package. "
+                "Please install with `pip install -U sentence-transformers langchain`."
+            ) from exc
+
+        cache_folder = os.path.join(get_cache_dir(), "models")
+        os.makedirs(cache_folder, exist_ok=True)
+
+        embed_model = LangchainEmbedding(
+            HuggingFaceBgeEmbeddings(
+                model_name=model_name or DEFAULT_HUGGINGFACE_EMBEDDING_MODEL,
+                cache_folder=cache_folder,
+            )
+        )
+
+    if isinstance(embed_model, LCEmbeddings):
+        embed_model = LangchainEmbedding(embed_model)
+
+    return embed_model
diff --git a/llama_index/indices/document_summary/base.py b/llama_index/indices/document_summary/base.py
index 10d6de9b02..cd6d973b06 100644
--- a/llama_index/indices/document_summary/base.py
+++ b/llama_index/indices/document_summary/base.py
@@ -16,7 +16,11 @@ from llama_index.indices.base import BaseIndex
 from llama_index.indices.base_retriever import BaseRetriever
 from llama_index.indices.service_context import ServiceContext
 from llama_index.response.schema import Response
-from llama_index.response_synthesizers import BaseSynthesizer, get_response_synthesizer
+from llama_index.response_synthesizers import (
+    BaseSynthesizer,
+    get_response_synthesizer,
+    ResponseMode,
+)
 from llama_index.schema import (
     BaseNode,
     NodeWithScore,
@@ -30,8 +34,8 @@ logger = logging.getLogger(__name__)
 
 
 DEFAULT_SUMMARY_QUERY = (
-    "Give a concise summary of this document. Also describe some of the questions "
-    "that this document can answer. "
+    "Describe what the provided text is about. "
+    "Also describe some of the questions that this text can answer. "
 )
 
 
@@ -67,7 +71,7 @@ class DocumentSummaryIndex(BaseIndex[IndexDocumentSummary]):
     ) -> None:
         """Initialize params."""
         self._response_synthesizer = response_synthesizer or get_response_synthesizer(
-            service_context=service_context,
+            service_context=service_context, response_mode=ResponseMode.TREE_SUMMARIZE
         )
         self._summary_query = summary_query or "summarize:"
         super().__init__(
diff --git a/llama_index/indices/postprocessor/__init__.py b/llama_index/indices/postprocessor/__init__.py
index cea2261114..2aaae82067 100644
--- a/llama_index/indices/postprocessor/__init__.py
+++ b/llama_index/indices/postprocessor/__init__.py
@@ -18,6 +18,9 @@ from llama_index.indices.postprocessor.pii import (
 )
 from llama_index.indices.postprocessor.llm_rerank import LLMRerank
 from llama_index.indices.postprocessor.cohere_rerank import CohereRerank
+from llama_index.indices.postprocessor.metadata_replacement import (
+    MetadataReplacementPostProcessor,
+)
 from llama_index.indices.postprocessor.optimizer import SentenceEmbeddingOptimizer
 from llama_index.indices.postprocessor.sbert_rerank import SentenceTransformerRerank
 
@@ -35,4 +38,5 @@ __all__ = [
     "LLMRerank",
     "SentenceEmbeddingOptimizer",
     "SentenceTransformerRerank",
+    "MetadataReplacementPostProcessor",
 ]
diff --git a/llama_index/indices/postprocessor/metadata_replacement.py b/llama_index/indices/postprocessor/metadata_replacement.py
new file mode 100644
index 0000000000..08c4721f71
--- /dev/null
+++ b/llama_index/indices/postprocessor/metadata_replacement.py
@@ -0,0 +1,25 @@
+from typing import List, Optional
+from llama_index.indices.postprocessor.types import BaseNodePostprocessor
+from llama_index.indices.query.schema import QueryBundle
+from llama_index.schema import MetadataMode, NodeWithScore
+
+
+class MetadataReplacementPostProcessor(BaseNodePostprocessor):
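+    """Replace each node's content with the value stored under `target_metadata_key`.
+
+    Nodes that do not have that metadata key keep their existing text. This is
+    used, for example, with `SentenceWindowNodeParser` to swap the embedded
+    sentence for its surrounding window before the response is synthesized.
+    """
+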
+    def __init__(self, target_metadata_key: str) -> None:
+        self._target_metadata_key = target_metadata_key
+
+    def postprocess_nodes(
+        self,
+        nodes: List[NodeWithScore],
+        query_bundle: Optional[QueryBundle] = None,
+    ) -> List[NodeWithScore]:
+
+        for n in nodes:
+            n.node.set_content(
+                n.node.metadata.get(
+                    self._target_metadata_key,
+                    n.node.get_content(metadata_mode=MetadataMode.NONE),
+                )
+            )
+
+        return nodes
diff --git a/llama_index/indices/postprocessor/optimizer.py b/llama_index/indices/postprocessor/optimizer.py
index 81a0e62ad2..194f57b668 100644
--- a/llama_index/indices/postprocessor/optimizer.py
+++ b/llama_index/indices/postprocessor/optimizer.py
@@ -49,11 +49,21 @@ class SentenceEmbeddingOptimizer(BaseNodePostprocessor):
 
         if tokenizer_fn is None:
             import nltk.data
+            import os
+            from llama_index.utils import get_cache_dir
+
+            cache_dir = get_cache_dir()
+            nltk_data_dir = os.environ.get("NLTK_DATA", cache_dir)
+
+            # update nltk path for nltk so that it finds the data
+            if nltk_data_dir not in nltk.data.path:
+                nltk.data.path.append(nltk_data_dir)
 
             try:
                 nltk.data.find("tokenizers/punkt")
             except LookupError:
-                nltk.download("punkt")
+                nltk.download("punkt", download_dir=nltk_data_dir)
+
             tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
             tokenizer_fn = tokenizer.tokenize
         self._tokenizer_fn = tokenizer_fn
diff --git a/llama_index/indices/prompt_helper.py b/llama_index/indices/prompt_helper.py
index 4ba75a4188..e3c0ce3fe8 100644
--- a/llama_index/indices/prompt_helper.py
+++ b/llama_index/indices/prompt_helper.py
@@ -16,6 +16,7 @@ from llama_index.llm_predictor.base import LLMMetadata
 from llama_index.prompts.base import Prompt
 from llama_index.prompts.utils import get_empty_prompt_txt
 from llama_index.text_splitter import TokenTextSplitter
+from llama_index.text_splitter.utils import truncate_text
 from llama_index.utils import globals_helper
 
 DEFAULT_PADDING = 5
@@ -166,7 +167,7 @@ class PromptHelper:
             num_chunks=len(text_chunks),
             padding=padding,
         )
-        return [text_splitter.truncate_text(chunk) for chunk in text_chunks]
+        return [truncate_text(chunk, text_splitter) for chunk in text_chunks]
 
     def repack(
         self, prompt: Prompt, text_chunks: Sequence[str], padding: int = DEFAULT_PADDING
diff --git a/llama_index/indices/service_context.py b/llama_index/indices/service_context.py
index 0bfadb94f1..db6403105e 100644
--- a/llama_index/indices/service_context.py
+++ b/llama_index/indices/service_context.py
@@ -5,7 +5,6 @@ from typing import Optional, Union
 import llama_index
 from llama_index.callbacks.base import CallbackManager
 from llama_index.embeddings.base import BaseEmbedding
-from llama_index.embeddings.openai import OpenAIEmbedding
 from llama_index.indices.prompt_helper import PromptHelper
 from llama_index.llm_predictor import LLMPredictor
 from llama_index.llm_predictor.base import BaseLLMPredictor, LLMMetadata
@@ -14,10 +13,7 @@ from llama_index.llms.utils import LLMType
 from llama_index.logger import LlamaLogger
 from llama_index.node_parser.interface import NodeParser
 from llama_index.node_parser.simple import SimpleNodeParser
-from llama_index.embeddings import (
-    DEFAULT_HUGGINGFACE_EMBEDDING_MODEL,
-    LangchainEmbedding,
-)
+from llama_index.embeddings import resolve_embed_model
 
 logger = logging.getLogger(__name__)
 
@@ -116,28 +112,6 @@ class ServiceContext:
             )
             chunk_size = chunk_size_limit
 
-        if isinstance(embed_model, str):
-            splits = embed_model.split(":", 1)
-            is_local = splits[0]
-            model_name = splits[1] if len(splits) > 1 else None
-            if is_local != "local":
-                raise ValueError(
-                    "embed_model must start with str 'local' or of type BaseEmbedding"
-                )
-            try:
-                from langchain.embeddings import HuggingFaceEmbeddings
-            except ImportError as exc:
-                raise ImportError(
-                    "Could not import sentence_transformers or langchain package. "
-                    "Please install with `pip install sentence-transformers langchain`."
-                ) from exc
-
-            embed_model = LangchainEmbedding(
-                HuggingFaceEmbeddings(
-                    model_name=model_name or DEFAULT_HUGGINGFACE_EMBEDDING_MODEL
-                )
-            )
-
         if llama_index.global_service_context is not None:
             return cls.from_service_context(
                 llama_index.global_service_context,
@@ -161,7 +135,7 @@ class ServiceContext:
             llm_predictor.llm.callback_manager = callback_manager
 
         # NOTE: the embed_model isn't used in all indices
-        embed_model = embed_model or OpenAIEmbedding()
+        embed_model = resolve_embed_model(embed_model)
         embed_model.callback_manager = callback_manager
 
         prompt_helper = prompt_helper or _get_default_prompt_helper(
@@ -194,7 +168,7 @@ class ServiceContext:
         llm_predictor: Optional[BaseLLMPredictor] = None,
         llm: Optional[LLM] = None,
         prompt_helper: Optional[PromptHelper] = None,
-        embed_model: Optional[BaseEmbedding] = None,
+        embed_model: Optional[Union[BaseEmbedding, str]] = None,
         node_parser: Optional[NodeParser] = None,
         llama_logger: Optional[LlamaLogger] = None,
         callback_manager: Optional[CallbackManager] = None,
@@ -227,6 +201,7 @@ class ServiceContext:
 
         # NOTE: the embed_model isn't used in all indices
         embed_model = embed_model or service_context.embed_model
+        embed_model = resolve_embed_model(embed_model)
         embed_model.callback_manager = callback_manager
 
         prompt_helper = prompt_helper or _get_default_prompt_helper(
diff --git a/llama_index/indices/struct_store/json_query.py b/llama_index/indices/struct_store/json_query.py
index 47ae5f3de4..9d1486037c 100644
--- a/llama_index/indices/struct_store/json_query.py
+++ b/llama_index/indices/struct_store/json_query.py
@@ -21,12 +21,14 @@ JSONType = Union[Dict[str, "JSONType"], List["JSONType"], str, int, float, bool,
 
 
 DEFAULT_RESPONSE_SYNTHESIS_PROMPT_TMPL = (
-    "Given an input question about a JSON value, synthesize a response "
-    "from the query results.\n"
-    "Query: {query_str}\n"
+    "Given a query, synthesize a response "
+    "to satisfy the query using the JSON results. "
+    "Only include details that are relevant to the query. "
+    "If you don't know the answer, then say that.\n"
     "JSON Schema: {json_schema}\n"
     "JSON Path: {json_path}\n"
     "Value at path: {json_path_value}\n"
+    "Query: {query_str}\n"
     "Response: "
 )
 DEFAULT_RESPONSE_SYNTHESIS_PROMPT = Prompt(
diff --git a/llama_index/indices/tree/utils.py b/llama_index/indices/tree/utils.py
index 63648df53a..c61bc62a8c 100644
--- a/llama_index/indices/tree/utils.py
+++ b/llama_index/indices/tree/utils.py
@@ -2,6 +2,7 @@ from typing import List, Optional
 
 from llama_index.schema import BaseNode
 from llama_index.text_splitter import TokenTextSplitter
+from llama_index.text_splitter.utils import truncate_text
 
 
 def get_numbered_text_from_nodes(
@@ -18,7 +19,7 @@ def get_numbered_text_from_nodes(
     for node in node_list:
         node_text = " ".join(node.get_content().splitlines())
         if text_splitter is not None:
-            node_text = text_splitter.truncate_text(node_text)
+            node_text = truncate_text(node_text, text_splitter)
         text = f"({number}) {node_text}"
         results.append(text)
         number += 1
diff --git a/llama_index/llm_predictor/base.py b/llama_index/llm_predictor/base.py
index 028c9d17d8..1ee5ac69e6 100644
--- a/llama_index/llm_predictor/base.py
+++ b/llama_index/llm_predictor/base.py
@@ -24,6 +24,11 @@ logger = logging.getLogger(__name__)
 class BaseLLMPredictor(Protocol):
     """Base LLM Predictor."""
 
+    @property
+    @abstractmethod
+    def llm(self) -> LLM:
+        """Get LLM."""
+
     @property
     @abstractmethod
     def metadata(self) -> LLMMetadata:
diff --git a/llama_index/llm_predictor/mock.py b/llama_index/llm_predictor/mock.py
index a9e8f458a2..1bf80fadc4 100644
--- a/llama_index/llm_predictor/mock.py
+++ b/llama_index/llm_predictor/mock.py
@@ -6,7 +6,7 @@ from llama_index.callbacks.base import CallbackManager
 from llama_index.callbacks.schema import CBEventType, EventPayload
 from llama_index.constants import DEFAULT_NUM_OUTPUTS
 from llama_index.llm_predictor.base import BaseLLMPredictor
-from llama_index.llms.base import LLMMetadata
+from llama_index.llms.base import LLMMetadata, LLM
 from llama_index.prompts.base import Prompt
 from llama_index.prompts.prompt_type import PromptType
 from llama_index.token_counter.utils import (
@@ -95,6 +95,10 @@ class MockLLMPredictor(BaseLLMPredictor):
     def metadata(self) -> LLMMetadata:
         return LLMMetadata()
 
+    @property
+    def llm(self) -> LLM:
+        raise NotImplementedError("MockLLMPredictor does not have an LLM model.")
+
     def _log_start(self, prompt: Prompt, prompt_args: dict) -> str:
         """Log start of an LLM event."""
         llm_payload = prompt_args.copy()
diff --git a/llama_index/llm_predictor/vellum/predictor.py b/llama_index/llm_predictor/vellum/predictor.py
index b8d407e13c..6d38e43d1a 100644
--- a/llama_index/llm_predictor/vellum/predictor.py
+++ b/llama_index/llm_predictor/vellum/predictor.py
@@ -5,7 +5,7 @@ from typing import Any, Optional, Tuple, cast
 from llama_index import Prompt
 from llama_index.callbacks import CallbackManager
 from llama_index.callbacks.schema import CBEventType, EventPayload
-from llama_index.llm_predictor.base import BaseLLMPredictor, LLMMetadata
+from llama_index.llm_predictor.base import BaseLLMPredictor, LLMMetadata, LLM
 from llama_index.llm_predictor.vellum.exceptions import VellumGenerateException
 from llama_index.llm_predictor.vellum.prompt_registry import VellumPromptRegistry
 from llama_index.llm_predictor.vellum.types import (
@@ -45,6 +45,11 @@ class VellumPredictor(BaseLLMPredictor):
         # deployment. This is not currently possible, so we use default values.
         return LLMMetadata()
 
+    @property
+    def llm(self) -> LLM:
+        """Get the LLM."""
+        raise NotImplementedError("Vellum does not expose the LLM.")
+
     def predict(self, prompt: Prompt, **prompt_args: Any) -> str:
         """Predict the answer to a query."""
 
diff --git a/llama_index/llms/__init__.py b/llama_index/llms/__init__.py
index 3dce9f036c..d3ebd66677 100644
--- a/llama_index/llms/__init__.py
+++ b/llama_index/llms/__init__.py
@@ -14,6 +14,7 @@ from llama_index.llms.base import (
 from llama_index.llms.custom import CustomLLM
 from llama_index.llms.huggingface import HuggingFaceLLM
 from llama_index.llms.langchain import LangChainLLM
+from llama_index.llms.llama_cpp import LlamaCPP
 from llama_index.llms.mock import MockLLM
 from llama_index.llms.openai import OpenAI
 from llama_index.llms.palm import PaLM
@@ -30,6 +31,7 @@ __all__ = [
     "PredibaseLLM",
     "Anthropic",
     "Replicate",
+    "LlamaCPP",
     "CustomLLM",
     "MockLLM",
     "ChatMessage",
diff --git a/llama_index/llms/anthropic.py b/llama_index/llms/anthropic.py
index 3b7eb211ab..5f020103e1 100644
--- a/llama_index/llms/anthropic.py
+++ b/llama_index/llms/anthropic.py
@@ -31,7 +31,7 @@ class Anthropic(LLM):
     def __init__(
         self,
         model: str = "claude-2",
-        temperature: float = 0.0,
+        temperature: float = 0.1,
         max_tokens: int = 512,
         base_url: Optional[str] = None,
         timeout: Optional[float] = None,
diff --git a/llama_index/llms/azure_openai.py b/llama_index/llms/azure_openai.py
index 0cacdc13cd..3b226f667f 100644
--- a/llama_index/llms/azure_openai.py
+++ b/llama_index/llms/azure_openai.py
@@ -33,7 +33,7 @@ class AzureOpenAI(OpenAI):
         self,
         model: str = "gpt-35-turbo",
         engine: Optional[str] = None,
-        temperature: float = 0.0,
+        temperature: float = 0.1,
         max_tokens: Optional[int] = None,
         additional_kwargs: Optional[Dict[str, Any]] = None,
         max_retries: int = 10,
diff --git a/llama_index/llms/llama_api.py b/llama_index/llms/llama_api.py
index 76abd4f0f9..47388df12d 100644
--- a/llama_index/llms/llama_api.py
+++ b/llama_index/llms/llama_api.py
@@ -24,7 +24,7 @@ class LlamaAPI(CustomLLM):
     def __init__(
         self,
         model: str = "llama-13b-chat",
-        temperature: float = 0.0,
+        temperature: float = 0.1,
         max_tokens: int = DEFAULT_NUM_OUTPUTS,
         additional_kwargs: Optional[Dict[str, Any]] = None,
         api_key: Optional[str] = None,
diff --git a/llama_index/llms/llama_cpp.py b/llama_index/llms/llama_cpp.py
new file mode 100644
index 0000000000..d53987a36b
--- /dev/null
+++ b/llama_index/llms/llama_cpp.py
@@ -0,0 +1,170 @@
+import os
+import requests
+from tqdm import tqdm
+from typing import Any, Callable, Dict, Optional, Sequence
+
+from llama_index.callbacks import CallbackManager
+from llama_index.constants import DEFAULT_CONTEXT_WINDOW, DEFAULT_NUM_OUTPUTS
+from llama_index.llms.base import (
+    ChatMessage,
+    ChatResponse,
+    ChatResponseGen,
+    CompletionResponse,
+    CompletionResponseGen,
+    LLMMetadata,
+    llm_chat_callback,
+    llm_completion_callback,
+)
+from llama_index.llms.custom import CustomLLM
+from llama_index.llms.generic_utils import completion_response_to_chat_response
+from llama_index.llms.generic_utils import (
+    messages_to_prompt as generic_messages_to_prompt,
+)
+from llama_index.llms.generic_utils import stream_completion_response_to_chat_response
+from llama_index.utils import get_cache_dir
+
+
+DEFAULT_LLAMA_CPP_MODEL = (
+    "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/resolve"
+    "/main/llama-2-13b-chat.ggmlv3.q4_0.bin"
+)
+
+
+class LlamaCPP(CustomLLM):
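+    """LLM wrapper for a local llama.cpp model, via the `llama-cpp-python` package.
+
+    If `model_path` is not provided, the model weights are downloaded from
+    `model_url` into the LlamaIndex cache directory on first instantiation.
+    """
+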
+    def __init__(
+        self,
+        model_url: str = DEFAULT_LLAMA_CPP_MODEL,
+        model_path: Optional[str] = None,
+        temperature: float = 0.1,
+        max_new_tokens: int = DEFAULT_NUM_OUTPUTS,
+        context_window: int = DEFAULT_CONTEXT_WINDOW,
+        messages_to_prompt: Optional[Callable] = None,
+        completion_to_prompt: Optional[Callable] = None,
+        callback_manager: Optional[CallbackManager] = None,
+        verbose: bool = True,
+        generate_kwargs: Optional[Dict[str, Any]] = None,
+        model_kwargs: Optional[Dict[str, Any]] = None,
+    ) -> None:
+        try:
+            from llama_cpp import Llama
+        except ImportError:
+            raise ImportError(
+                "Could not import llama_cpp library."
+                "Please install llama_cpp with `pip install llama-cpp-python`."
+                "See the full installation guide for GPU support at "
+                "`https://github.com/abetlen/llama-cpp-python`"
+            )
+
+        self._model_kwargs = model_kwargs or {}
+        self._model_kwargs.update({"n_ctx": context_window, "verbose": verbose})
+
+        # check if model is cached
+        if model_path is not None:
+            if not os.path.exists(model_path):
+                raise ValueError(
+                    "Provided model path does not exist. "
+                    "Please check the path or provide a model_url to download."
+                )
+            else:
+                self._model = Llama(model_path=model_path, **self._model_kwargs)
+        else:
+            cache_dir = get_cache_dir()
+            model_name = os.path.basename(model_url)
+            model_path = os.path.join(cache_dir, "models", model_name)
+            if not os.path.exists(model_path):
+                os.makedirs(os.path.dirname(model_path), exist_ok=True)
+                self._download_url(model_url, model_path)
+                assert os.path.exists(model_path)
+
+            self._model = Llama(model_path=model_path, **self._model_kwargs)
+
+        self._model_path = model_path
+        self._messages_to_prompt = messages_to_prompt or generic_messages_to_prompt
+        self._completion_to_prompt = completion_to_prompt or (lambda x: x)
+        self.callback_manager = callback_manager or CallbackManager([])
+
+        # model kwargs
+        self._context_window = context_window
+        self._temperature = temperature
+        self._max_new_tokens = max_new_tokens
+        self._generate_kwargs = generate_kwargs or {}
+        self._generate_kwargs.update(
+            {"temperature": temperature, "max_tokens": max_new_tokens}
+        )
+
+    @property
+    def metadata(self) -> LLMMetadata:
+        """LLM metadata."""
+        return LLMMetadata(
+            context_window=self._context_window,
+            num_output=self._max_new_tokens,
+            model_name=self._model_path,
+        )
+
+    def _download_url(self, model_url: str, model_path: str) -> None:
+        completed = False
+        try:
+            print("Downloading url", model_url, "to path", model_path)
+            with requests.get(model_url, stream=True) as r:
+                with open(model_path, "wb") as file:
+                    total_size = int(r.headers.get("Content-Length") or "0")
+                    if total_size < 1000 * 1000:
+                        raise ValueError(
+                            "Content should be at least 1 MB, but is only",
+                            r.headers.get("Content-Length"),
+                            "bytes",
+                        )
+                    print("total size (MB):", round(total_size / 1000 / 1000, 2))
+                    chunk_size = 1024 * 1024  # 1 MB
+                    for chunk in tqdm(
+                        r.iter_content(chunk_size=chunk_size),
+                        total=int(total_size / chunk_size),
+                    ):
+                        file.write(chunk)
+            completed = True
+        except Exception as e:
+            print("Error downloading model:", e)
+        finally:
+            if not completed:
+                print("Download incomplete.", "Removing partially downloaded file.")
+                os.remove(model_path)
+                raise ValueError("Download incomplete.")
+
+    @llm_chat_callback()
+    def chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponse:
+        prompt = self._messages_to_prompt(messages)
+        completion_response = self.complete(prompt, **kwargs)
+        return completion_response_to_chat_response(completion_response)
+
+    @llm_chat_callback()
+    def stream_chat(
+        self, messages: Sequence[ChatMessage], **kwargs: Any
+    ) -> ChatResponseGen:
+        prompt = self._messages_to_prompt(messages)
+        completion_response = self.stream_complete(prompt, **kwargs)
+        return stream_completion_response_to_chat_response(completion_response)
+
+    @llm_completion_callback()
+    def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
+        self._generate_kwargs.update({"stream": False})
+        prompt = self._completion_to_prompt(prompt)
+
+        response = self._model(prompt=prompt, **self._generate_kwargs)
+
+        return CompletionResponse(text=response["choices"][0]["text"], raw=response)
+
+    @llm_completion_callback()
+    def stream_complete(self, prompt: str, **kwargs: Any) -> CompletionResponseGen:
+        self._generate_kwargs.update({"stream": True})
+        prompt = self._completion_to_prompt(prompt)
+
+        response_iter = self._model(prompt=prompt, **self._generate_kwargs)
+
+        def gen() -> CompletionResponseGen:
+            text = ""
+            for response in response_iter:
+                delta = response["choices"][0]["text"]
+                text += delta
+                yield CompletionResponse(delta=delta, text=text, raw=response)
+
+        return gen()
diff --git a/llama_index/llms/llama_utils.py b/llama_index/llms/llama_utils.py
index f15889664d..2b960e3488 100644
--- a/llama_index/llms/llama_utils.py
+++ b/llama_index/llms/llama_utils.py
@@ -7,14 +7,9 @@ B_INST, E_INST = "[INST]", "[/INST]"
 B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
 DEFAULT_SYSTEM_PROMPT = """\
 You are a helpful, respectful and honest assistant. \
-Always answer as helpfully as possible, while being safe.  \
-Your answers should not include any harmful, unethical, racist, sexist, toxic, \
-dangerous, or illegal content. Please ensure that your responses are socially \
-unbiased and positive in nature.
-
-If a question does not make any sense, or is not factually coherent, \
-explain why instead of answering something not correct. \
-If you don't know the answer to a question, please don't share false information.\
+Always answer as helpfully as possible and follow ALL given instructions. \
+Do not speculate or make up information. \
+Do not reference any given instructions or context. \
 """
 
 
diff --git a/llama_index/llms/openai.py b/llama_index/llms/openai.py
index 2ac4490ec0..6a529af884 100644
--- a/llama_index/llms/openai.py
+++ b/llama_index/llms/openai.py
@@ -39,8 +39,8 @@ from llama_index.llms.openai_utils import (
 class OpenAI(LLM):
     def __init__(
         self,
-        model: str = "text-davinci-003",
-        temperature: float = 0.0,
+        model: str = "gpt-3.5-turbo",
+        temperature: float = 0.1,
         max_tokens: Optional[int] = None,
         additional_kwargs: Optional[Dict[str, Any]] = None,
         max_retries: int = 10,
diff --git a/llama_index/llms/utils.py b/llama_index/llms/utils.py
index 8ed479da2a..bf0bbd5ee8 100644
--- a/llama_index/llms/utils.py
+++ b/llama_index/llms/utils.py
@@ -4,14 +4,47 @@ from langchain.base_language import BaseLanguageModel
 
 from llama_index.llms.base import LLM
 from llama_index.llms.langchain import LangChainLLM
+from llama_index.llms.llama_cpp import LlamaCPP
+from llama_index.llms.llama_utils import messages_to_prompt, completion_to_prompt
 from llama_index.llms.openai import OpenAI
 
-LLMType = Union[LLM, BaseLanguageModel]
+LLMType = Union[str, LLM, BaseLanguageModel]
 
 
 def resolve_llm(llm: Optional[LLMType] = None) -> LLM:
-    if isinstance(llm, BaseLanguageModel):
+    """Resolve LLM from string or LLM instance."""
+    if isinstance(llm, str):
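+        # string form is either "local" or "local:<path to a locally saved model>"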
+        splits = llm.split(":", 1)
+        is_local = splits[0]
+        model_path = splits[1] if len(splits) > 1 else None
+        if is_local != "local":
+            raise ValueError(
+                "llm must start with str 'local' or of type LLM or BaseLanguageModel"
+            )
+        return LlamaCPP(
+            model_path=model_path,
+            messages_to_prompt=messages_to_prompt,
+            completion_to_prompt=completion_to_prompt,
+            model_kwargs={"n_gpu_layers": 1},
+        )
+    elif isinstance(llm, BaseLanguageModel):
         # NOTE: if it's a langchain model, wrap it in a LangChainLLM
         return LangChainLLM(llm=llm)
 
-    return llm or OpenAI()
+    # return default OpenAI model. If it fails, return LlamaCPP
+    try:
+        return llm or OpenAI()
+    except ValueError:
+        print(
+            "******\n"
+            "Could not load OpenAI model. Using default LlamaCPP=llama2-13b-chat. "
+            "If you intended to use OpenAI, please check your API key."
+            "\n******"
+        )
+
+        # instantiate LlamaCPP with proper llama2-chat prompts
+        return LlamaCPP(
+            messages_to_prompt=messages_to_prompt,
+            completion_to_prompt=completion_to_prompt,
+            model_kwargs={"n_gpu_layers": 1},
+        )
diff --git a/llama_index/node_parser/__init__.py b/llama_index/node_parser/__init__.py
index 78a3a1fbf1..f310fe73c0 100644
--- a/llama_index/node_parser/__init__.py
+++ b/llama_index/node_parser/__init__.py
@@ -2,9 +2,11 @@
 
 from llama_index.node_parser.interface import NodeParser
 from llama_index.node_parser.simple import SimpleNodeParser
+from llama_index.node_parser.sentence_window import SentenceWindowNodeParser
 
 
 __all__ = [
     "SimpleNodeParser",
+    "SentenceWindowNodeParser",
     "NodeParser",
 ]
diff --git a/llama_index/node_parser/extractors/metadata_extractors.py b/llama_index/node_parser/extractors/metadata_extractors.py
index dc209e20a6..20606c23e2 100644
--- a/llama_index/node_parser/extractors/metadata_extractors.py
+++ b/llama_index/node_parser/extractors/metadata_extractors.py
@@ -25,6 +25,7 @@ import json
 from typing import List, Optional, Sequence, cast, Dict, Callable
 from functools import reduce
 
+from llama_index.llms.base import LLM
 from llama_index.llm_predictor.base import BaseLLMPredictor, LLMPredictor
 from llama_index.node_parser.interface import BaseExtractor
 from llama_index.prompts.base import Prompt
@@ -136,6 +137,8 @@ class TitleExtractor(MetadataFeatureExtractor):
 
     def __init__(
         self,
+        llm: Optional[LLM] = None,
+        # TODO: llm_predictor arg is deprecated
         llm_predictor: Optional[BaseLLMPredictor] = None,
         nodes: int = 5,
         node_template: str = DEFAULT_TITLE_NODE_TEMPLATE,
@@ -147,7 +150,7 @@ class TitleExtractor(MetadataFeatureExtractor):
         self._nodes = nodes
         self._node_template = node_template
         self._combine_template = combine_template
-        self._llm_predictor = llm_predictor or LLMPredictor()
+        self._llm_predictor = llm_predictor or LLMPredictor(llm=llm)
 
     def extract(self, nodes: Sequence[BaseNode]) -> List[Dict]:
         nodes_to_extract_title: List[BaseNode] = []
@@ -197,11 +200,13 @@ class KeywordExtractor(MetadataFeatureExtractor):
 
     def __init__(
         self,
+        llm: Optional[LLM] = None,
+        # TODO: llm_predictor arg is deprecated
         llm_predictor: Optional[BaseLLMPredictor] = None,
         keywords: int = 5,
     ) -> None:
         """Init params."""
-        self._llm_predictor = llm_predictor or LLMPredictor()
+        self._llm_predictor = llm_predictor or LLMPredictor(llm=llm)
         if keywords < 1:
             raise ValueError("num_keywords must be >= 1")
         self._keywords = keywords
@@ -240,6 +245,8 @@ class QuestionsAnsweredExtractor(MetadataFeatureExtractor):
 
     def __init__(
         self,
+        llm: Optional[LLM] = None,
+        # TODO: llm_predictor arg is deprecated
         llm_predictor: Optional[BaseLLMPredictor] = None,
         questions: int = 5,
         prompt_template: Optional[str] = None,
@@ -248,7 +255,7 @@ class QuestionsAnsweredExtractor(MetadataFeatureExtractor):
         """Init params."""
         if questions < 1:
             raise ValueError("questions must be >= 1")
-        self._llm_predictor = llm_predictor or LLMPredictor()
+        self._llm_predictor = llm_predictor or LLMPredictor(llm=llm)
         self._questions = questions
         self._prompt_template = prompt_template
         self._embedding_only = embedding_only
@@ -299,11 +306,13 @@ class SummaryExtractor(MetadataFeatureExtractor):
 
     def __init__(
         self,
+        llm: Optional[LLM] = None,
+        # TODO: llm_predictor arg is deprecated
         llm_predictor: Optional[BaseLLMPredictor] = None,
         summaries: List[str] = ["self"],
         prompt_template: str = DEFAULT_SUMMARY_EXTRACT_TEMPLATE,
     ):
-        self._llm_predictor = llm_predictor or LLMPredictor()
+        self._llm_predictor = llm_predictor or LLMPredictor(llm=llm)
         # validation
         if not all([s in ["self", "prev", "next"] for s in summaries]):
             raise ValueError("summaries must be one of ['self', 'prev', 'next']")
diff --git a/llama_index/node_parser/node_utils.py b/llama_index/node_parser/node_utils.py
index 558f862f7f..984d9e461f 100644
--- a/llama_index/node_parser/node_utils.py
+++ b/llama_index/node_parser/node_utils.py
@@ -13,72 +13,32 @@ from llama_index.schema import (
     NodeRelationship,
     TextNode,
 )
-from llama_index.text_splitter import TextSplit, TextSplitter, TokenTextSplitter
+from llama_index.text_splitter import TextSplitter
+from llama_index.text_splitter.types import MetadataAwareTextSplitter
 from llama_index.utils import truncate_text
 
 logger = logging.getLogger(__name__)
 
 
-def get_text_splits_from_document(
+def build_nodes_from_splits(
+    text_splits: List[str],
     document: BaseNode,
-    text_splitter: TextSplitter,
-    include_metadata: bool = True,
-) -> List[TextSplit]:
-    """Break the document into chunks with additional info."""
-    # TODO: clean up since this only exists due to the diff w LangChain's TextSplitter
-    if isinstance(text_splitter, TokenTextSplitter):
-        # use this to extract extra information about the chunks
-        text_splits = text_splitter.split_text_with_overlaps(
-            document.get_content(metadata_mode=MetadataMode.NONE),
-            metadata_str=document.get_metadata_str() if include_metadata else None,
-        )
-    else:
-        text_chunks = text_splitter.split_text(
-            document.get_content(),
-        )
-        text_splits = [TextSplit(text_chunk=text_chunk) for text_chunk in text_chunks]
-
-    return text_splits
-
-
-def get_nodes_from_document(
-    document: BaseNode,
-    text_splitter: TextSplitter,
     include_metadata: bool = True,
     include_prev_next_rel: bool = False,
 ) -> List[TextNode]:
-    """Get nodes from document."""
-    text_splits = get_text_splits_from_document(
-        document=document,
-        text_splitter=text_splitter,
-        include_metadata=include_metadata,
-    )
-
     nodes: List[TextNode] = []
-    index_counter = 0
-    for i, text_split in enumerate(text_splits):
-        text_chunk = text_split.text_chunk
+    for i, text_chunk in enumerate(text_splits):
         logger.debug(f"> Adding chunk: {truncate_text(text_chunk, 50)}")
-        start_char_idx = None
-        end_char_idx = None
-        if text_split.num_char_overlap is not None:
-            start_char_idx = index_counter - text_split.num_char_overlap
-            end_char_idx = index_counter - text_split.num_char_overlap + len(text_chunk)
-        index_counter += len(text_chunk) + 1
 
         node_metadata = {}
         if include_metadata:
             node_metadata = document.metadata
-            if text_split.metadata is not None:
-                node_metadata.update(text_split.metadata)
 
         if isinstance(document, ImageDocument):
             image_node = ImageNode(
                 text=text_chunk,
                 embedding=document.embedding,
                 metadata=node_metadata,
-                start_char_idx=start_char_idx,
-                end_char_idx=end_char_idx,
                 image=document.image,
                 relationships={
                     NodeRelationship.SOURCE: document.as_related_node_info()
@@ -89,8 +49,6 @@ def get_nodes_from_document(
             node = TextNode(
                 text=text_chunk,
                 embedding=document.embedding,
-                start_char_idx=start_char_idx,
-                end_char_idx=end_char_idx,
                 metadata=node_metadata,
                 excluded_embed_metadata_keys=document.excluded_embed_metadata_keys,
                 excluded_llm_metadata_keys=document.excluded_llm_metadata_keys,
@@ -118,3 +76,41 @@ def get_nodes_from_document(
                 ].as_related_node_info()
 
     return nodes
+
+
+def get_nodes_from_document(
+    document: BaseNode,
+    text_splitter: TextSplitter,
+    include_metadata: bool = True,
+    include_prev_next_rel: bool = False,
+) -> List[TextNode]:
+    """Get nodes from document."""
+    if include_metadata:
+        if isinstance(text_splitter, MetadataAwareTextSplitter):
+            text_splits = text_splitter.split_text_metadata_aware(
+                text=document.get_content(metadata_mode=MetadataMode.NONE),
+                metadata_str=document.get_metadata_str(),
+            )
+        else:
+            logger.warning(
+                f"include_metadata is set to True but {text_splitter} "
+                "is not metadata-aware."
+                "Node content length may exceed expected chunk size."
+                "Try lowering the chunk size or using a metadata-aware text splitter "
+                "if this is a problem."
+            )
+
+            text_splits = text_splitter.split_text(
+                document.get_content(metadata_mode=MetadataMode.NONE),
+            )
+    else:
+        text_splits = text_splitter.split_text(
+            document.get_content(metadata_mode=MetadataMode.NONE),
+        )
+
+    return build_nodes_from_splits(
+        text_splits,
+        document,
+        include_metadata=include_metadata,
+        include_prev_next_rel=include_prev_next_rel,
+    )
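
A small sketch of the refactored dispatch: metadata-aware splitters take the metadata-aware path, all others fall back to a plain split_text call (document contents are illustrative):

    from llama_index.node_parser.node_utils import get_nodes_from_document
    from llama_index.schema import Document
    from llama_index.text_splitter import SentenceSplitter

    doc = Document(text="First sentence. Second sentence.", metadata={"source": "example.txt"})
    nodes = get_nodes_from_document(
        document=doc,
        text_splitter=SentenceSplitter(chunk_size=512, chunk_overlap=20),  # metadata-aware
        include_metadata=True,
    )
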
diff --git a/llama_index/node_parser/sentence_window.py b/llama_index/node_parser/sentence_window.py
new file mode 100644
index 0000000000..7bb6f34316
--- /dev/null
+++ b/llama_index/node_parser/sentence_window.py
@@ -0,0 +1,147 @@
+"""Simple node parser."""
+from typing import List, Callable, Optional, Sequence
+
+from llama_index.callbacks.base import CallbackManager
+from llama_index.callbacks.schema import CBEventType, EventPayload
+from llama_index.node_parser.extractors.metadata_extractors import MetadataExtractor
+from llama_index.node_parser.interface import NodeParser
+from llama_index.node_parser.node_utils import build_nodes_from_splits
+from llama_index.schema import BaseNode, Document
+from llama_index.text_splitter.utils import split_by_sentence_tokenizer
+from llama_index.utils import get_tqdm_iterable
+
+
+DEFAULT_WINDOW_SIZE = 3
+DEFAULT_WINDOW_METADATA_KEY = "window"
+DEFAULT_OG_TEXT_METADATA_KEY = "original_text"
+
+
+class SentenceWindowNodeParser(NodeParser):
+    """Sentence window node parser.
+
+    Splits a document into Nodes, with each node being a sentence.
+    Each node contains a window from the surrounding sentences in the metadata.
+
+    Args:
+        sentence_splitter (Optional[Callable]): splits text into sentences
+        window_size (int): number of surrounding sentences to capture in the metadata window
+        window_metadata_key (str): metadata key used to store the window text
+        original_text_metadata_key (str): metadata key used to store the original sentence
+        include_metadata (bool): whether to include metadata in nodes
+        include_prev_next_rel (bool): whether to include prev/next relationships
+    """
+
+    def __init__(
+        self,
+        sentence_splitter: Optional[Callable[[str], List[str]]] = None,
+        window_size: int = DEFAULT_WINDOW_SIZE,
+        window_metadata_key: str = DEFAULT_WINDOW_METADATA_KEY,
+        original_text_metadata_key: str = DEFAULT_OG_TEXT_METADATA_KEY,
+        include_metadata: bool = True,
+        include_prev_next_rel: bool = True,
+        callback_manager: Optional[CallbackManager] = None,
+        metadata_extractor: Optional[MetadataExtractor] = None,
+    ) -> None:
+        """Init params."""
+        self.callback_manager = callback_manager or CallbackManager([])
+        self._sentence_splitter = sentence_splitter or split_by_sentence_tokenizer()
+        self._window_size = window_size
+        self._window_metadata_key = window_metadata_key
+        self._original_text_metadata_key = original_text_metadata_key
+
+        self._include_metadata = include_metadata
+        self._include_prev_next_rel = include_prev_next_rel
+        self._metadata_extractor = metadata_extractor
+
+    @classmethod
+    def from_defaults(
+        cls,
+        sentence_splitter: Optional[Callable[[str], List[str]]] = None,
+        window_size: int = DEFAULT_WINDOW_SIZE,
+        window_metadata_key: str = DEFAULT_WINDOW_METADATA_KEY,
+        original_text_metadata_key: str = DEFAULT_OG_TEXT_METADATA_KEY,
+        include_metadata: bool = True,
+        include_prev_next_rel: bool = True,
+        callback_manager: Optional[CallbackManager] = None,
+        metadata_extractor: Optional[MetadataExtractor] = None,
+    ) -> "SentenceWindowNodeParser":
+        callback_manager = callback_manager or CallbackManager([])
+
+        sentence_splitter = sentence_splitter or split_by_sentence_tokenizer()
+
+        return cls(
+            sentence_splitter=sentence_splitter,
+            window_size=window_size,
+            window_metadata_key=window_metadata_key,
+            original_text_metadata_key=original_text_metadata_key,
+            include_metadata=include_metadata,
+            include_prev_next_rel=include_prev_next_rel,
+            callback_manager=callback_manager,
+            metadata_extractor=metadata_extractor,
+        )
+
+    def get_nodes_from_documents(
+        self,
+        documents: Sequence[Document],
+        show_progress: bool = False,
+    ) -> List[BaseNode]:
+        """Parse document into nodes.
+
+        Args:
+            documents (Sequence[Document]): documents to parse
+            show_progress (bool): whether to show a progress bar while parsing
+
+        """
+        with self.callback_manager.event(
+            CBEventType.NODE_PARSING, payload={EventPayload.DOCUMENTS: documents}
+        ) as event:
+            all_nodes: List[BaseNode] = []
+            documents_with_progress = get_tqdm_iterable(
+                documents, show_progress, "Parsing documents into nodes"
+            )
+
+            for document in documents_with_progress:
+                self._sentence_splitter(document.text)
+                nodes = self.build_window_nodes_from_documents([document])
+                all_nodes.extend(nodes)
+
+            if self._metadata_extractor is not None:
+                self._metadata_extractor.process_nodes(all_nodes)
+
+            event.on_end(payload={EventPayload.NODES: all_nodes})
+
+        return all_nodes
+
+    def build_window_nodes_from_documents(
+        self, documents: Sequence[Document]
+    ) -> List[BaseNode]:
+        """Build window nodes from documents."""
+        all_nodes: List[BaseNode] = []
+        for doc in documents:
+            text = doc.text
+            text_splits = self._sentence_splitter(text)
+            nodes = build_nodes_from_splits(
+                text_splits, doc, include_prev_next_rel=True
+            )
+
+            # add window to each node
+            for i, node in enumerate(nodes):
+                window_nodes = nodes[
+                    max(0, i - self._window_size) : min(
+                        i + self._window_size, len(nodes)
+                    )
+                ]
+
+                node.metadata[self._window_metadata_key] = " ".join(
+                    [n.text for n in window_nodes]
+                )
+                node.metadata[self._original_text_metadata_key] = node.text
+
+                # exclude window metadata from embed and llm
+                node.excluded_embed_metadata_keys.extend(
+                    [self._window_metadata_key, self._original_text_metadata_key]
+                )
+                node.excluded_llm_metadata_keys.extend(
+                    [self._window_metadata_key, self._original_text_metadata_key]
+                )
+
+            all_nodes.extend(nodes)
+
+        return all_nodes
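
Usage sketch for the new parser (the sample text is illustrative and assumes the default nltk sentence tokenizer splits it into four sentences):

    from llama_index.node_parser import SentenceWindowNodeParser
    from llama_index.schema import Document

    node_parser = SentenceWindowNodeParser.from_defaults(
        window_size=3,
        window_metadata_key="window",
        original_text_metadata_key="original_text",
    )
    nodes = node_parser.get_nodes_from_documents(
        [Document(text="One sentence. Two sentence. Three sentence. Four sentence.")]
    )
    print(nodes[1].metadata["original_text"])  # just the second sentence
    print(nodes[1].metadata["window"])         # the surrounding sentences, joined
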
diff --git a/llama_index/node_parser/simple.py b/llama_index/node_parser/simple.py
index aa66579ea8..f60a3433f1 100644
--- a/llama_index/node_parser/simple.py
+++ b/llama_index/node_parser/simple.py
@@ -3,12 +3,11 @@ from typing import List, Optional, Sequence
 
 from llama_index.callbacks.base import CallbackManager
 from llama_index.callbacks.schema import CBEventType, EventPayload
-from llama_index.constants import DEFAULT_CHUNK_OVERLAP, DEFAULT_CHUNK_SIZE
 from llama_index.node_parser.extractors.metadata_extractors import MetadataExtractor
 from llama_index.node_parser.interface import NodeParser
 from llama_index.node_parser.node_utils import get_nodes_from_document
 from llama_index.schema import BaseNode, Document
-from llama_index.text_splitter import TextSplitter, TokenTextSplitter
+from llama_index.text_splitter import TextSplitter, get_default_text_splitter
 from llama_index.utils import get_tqdm_iterable
 
 
@@ -34,7 +33,7 @@ class SimpleNodeParser(NodeParser):
     ) -> None:
         """Init params."""
         self.callback_manager = callback_manager or CallbackManager([])
-        self._text_splitter = text_splitter or TokenTextSplitter(
+        self._text_splitter = text_splitter or get_default_text_splitter(
             callback_manager=self.callback_manager
         )
         self._include_metadata = include_metadata
@@ -52,18 +51,14 @@ class SimpleNodeParser(NodeParser):
         metadata_extractor: Optional[MetadataExtractor] = None,
     ) -> "SimpleNodeParser":
         callback_manager = callback_manager or CallbackManager([])
-        chunk_size = chunk_size or DEFAULT_CHUNK_SIZE
-        chunk_overlap = (
-            chunk_overlap if chunk_overlap is not None else DEFAULT_CHUNK_OVERLAP
-        )
 
-        token_text_splitter = TokenTextSplitter(
+        text_splitter = get_default_text_splitter(
             chunk_size=chunk_size,
             chunk_overlap=chunk_overlap,
             callback_manager=callback_manager,
         )
         return cls(
-            text_splitter=token_text_splitter,
+            text_splitter=text_splitter,
             include_metadata=include_metadata,
             include_prev_next_rel=include_prev_next_rel,
             callback_manager=callback_manager,
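
Minimal sketch: from_defaults keeps the same signature, but the default splitter now comes from get_default_text_splitter (sentence-aware) rather than a TokenTextSplitter; the sample text is illustrative:

    from llama_index.node_parser import SimpleNodeParser
    from llama_index.schema import Document

    node_parser = SimpleNodeParser.from_defaults(chunk_size=512, chunk_overlap=20)
    nodes = node_parser.get_nodes_from_documents([Document(text="Some text to chunk.")])
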
diff --git a/llama_index/prompts/chat_prompts.py b/llama_index/prompts/chat_prompts.py
index 2b538dd3a1..5f62d231bf 100644
--- a/llama_index/prompts/chat_prompts.py
+++ b/llama_index/prompts/chat_prompts.py
@@ -4,22 +4,84 @@ from llama_index.bridge.langchain import (
     AIMessagePromptTemplate,
     ChatPromptTemplate,
     HumanMessagePromptTemplate,
+    SystemMessagePromptTemplate,
+)
+
+from llama_index.prompts.prompts import (
+    QuestionAnswerPrompt,
+    SummaryPrompt,
+    RefinePrompt,
+    RefineTableContextPrompt,
+)
+
+# text qa prompt
+TEXT_QA_SYSTEM_PROMPT = SystemMessagePromptTemplate.from_template(
+    "You are an expert Q&A system that is trusted around the world.\n"
+    "Always answer the question using the provided context information, "
+    "and not prior knowledge.\n"
+    "Some rules to follow:\n"
+    "1. Never directly reference the given context in your answer.\n"
+    "2. Avoid statements like 'Based on the context, ...' or "
+    "'The context information ...' or anything along "
+    "those lines."
+)
+
+TEXT_QA_PROMPT_TMPL_MSGS = [
+    TEXT_QA_SYSTEM_PROMPT,
+    HumanMessagePromptTemplate.from_template(
+        "Context information is below.\n"
+        "---------------------\n"
+        "{context_str}\n"
+        "---------------------\n"
+        "Given the context information and not prior knowledge, "
+        "answer the question. If the answer is not in the context, inform "
+        "the user that you can't answer the question.\n"
+        "Question: {query_str}\n"
+        "Answer: "
+    ),
+]
+
+CHAT_TEXT_QA_PROMPT_LC = ChatPromptTemplate.from_messages(TEXT_QA_PROMPT_TMPL_MSGS)
+CHAT_TEXT_QA_PROMPT = QuestionAnswerPrompt.from_langchain_prompt(CHAT_TEXT_QA_PROMPT_LC)
+
+
+# Tree Summarize
+TREE_SUMMARIZE_PROMPT_TMPL_MSGS = [
+    TEXT_QA_SYSTEM_PROMPT,
+    HumanMessagePromptTemplate.from_template(
+        "Context information from multiple sources is below.\n"
+        "---------------------\n"
+        "{context_str}\n"
+        "---------------------\n"
+        "Given the information from multiple sources and not prior knowledge, "
+        "answer the question. If the answer is not in the context, inform "
+        "the user that you can't answer the question.\n"
+        "Question: {query_str}\n"
+        "Answer: "
+    ),
+]
+
+CHAT_TREE_SUMMARIZE_PROMPT_LC = ChatPromptTemplate.from_messages(
+    TREE_SUMMARIZE_PROMPT_TMPL_MSGS
+)
+CHAT_TREE_SUMMARIZE_PROMPT = SummaryPrompt.from_langchain_prompt(
+    CHAT_TREE_SUMMARIZE_PROMPT_LC
 )
 
-from llama_index.prompts.prompts import RefinePrompt, RefineTableContextPrompt
 
 # Refine Prompt
 CHAT_REFINE_PROMPT_TMPL_MSGS = [
     HumanMessagePromptTemplate.from_template(
-        "We have the opportunity to refine the above answer "
-        "(only if needed) with some more context below.\n"
-        "------------\n"
-        "{context_msg}\n"
-        "------------\n"
-        "Given the new context, refine the original answer to better "
-        "answer the question: {query_str}. "
-        "If the context isn't useful, output the original answer again.\n"
-        "Original Answer: {existing_answer}"
+        "You are an expert Q&A system that stricly operates in two modes"
+        "when refining existing answers:\n"
+        "1. **Rewrite** an original answer using the new context.\n"
+        "2. **Repeat** the original answer if the new context isn't useful.\n"
+        "Never reference the original answer or context directly in your answer.\n"
+        "When in doubt, just repeat the original answer."
+        "New Context: {context_msg}\n"
+        "Query: {query_str}\n"
+        "Original Answer: {existing_answer}\n"
+        "New Answer: "
     ),
 ]
 
diff --git a/llama_index/prompts/default_prompt_selectors.py b/llama_index/prompts/default_prompt_selectors.py
index 151ee558bc..17c0307f24 100644
--- a/llama_index/prompts/default_prompt_selectors.py
+++ b/llama_index/prompts/default_prompt_selectors.py
@@ -1,15 +1,41 @@
 """Prompt selectors."""
 from llama_index.prompts.chat_prompts import (
+    CHAT_TEXT_QA_PROMPT,
+    CHAT_TREE_SUMMARIZE_PROMPT,
     CHAT_REFINE_PROMPT,
     CHAT_REFINE_TABLE_CONTEXT_PROMPT,
 )
 from llama_index.prompts.default_prompts import (
+    DEFAULT_TEXT_QA_PROMPT,
+    DEFAULT_TREE_SUMMARIZE_PROMPT,
     DEFAULT_REFINE_PROMPT,
     DEFAULT_REFINE_TABLE_CONTEXT_PROMPT,
 )
 from llama_index.prompts.prompt_selector import PromptSelector, is_chat_model
 from llama_index.prompts.prompt_type import PromptType
-from llama_index.prompts.prompts import RefinePrompt, RefineTableContextPrompt
+from llama_index.prompts.prompts import (
+    QuestionAnswerPrompt,
+    RefinePrompt,
+    RefineTableContextPrompt,
+)
+
+DEFAULT_TEXT_QA_PROMPT_SEL_LC = PromptSelector(
+    default_prompt=DEFAULT_TEXT_QA_PROMPT.get_langchain_prompt(),
+    conditionals=[(is_chat_model, CHAT_TEXT_QA_PROMPT.get_langchain_prompt())],
+)
+DEFAULT_TEXT_QA_PROMPT_SEL = QuestionAnswerPrompt(
+    langchain_prompt_selector=DEFAULT_TEXT_QA_PROMPT_SEL_LC,
+    prompt_type=PromptType.QUESTION_ANSWER,
+)
+
+DEFAULT_TREE_SUMMARIZE_PROMPT_SEL_LC = PromptSelector(
+    default_prompt=DEFAULT_TREE_SUMMARIZE_PROMPT.get_langchain_prompt(),
+    conditionals=[(is_chat_model, CHAT_TREE_SUMMARIZE_PROMPT.get_langchain_prompt())],
+)
+DEFAULT_TREE_SUMMARIZE_PROMPT_SEL = QuestionAnswerPrompt(
+    langchain_prompt_selector=DEFAULT_TREE_SUMMARIZE_PROMPT_SEL_LC,
+    prompt_type=PromptType.SUMMARY,
+)
 
 DEFAULT_REFINE_PROMPT_SEL_LC = PromptSelector(
     default_prompt=DEFAULT_REFINE_PROMPT.get_langchain_prompt(),
diff --git a/llama_index/prompts/default_prompts.py b/llama_index/prompts/default_prompts.py
index 0bd685b6a9..3aecd25a6b 100644
--- a/llama_index/prompts/default_prompts.py
+++ b/llama_index/prompts/default_prompts.py
@@ -88,7 +88,8 @@ DEFAULT_REFINE_PROMPT_TMPL = (
     "------------\n"
     "Given the new context, refine the original answer to better "
     "answer the question. "
-    "If the context isn't useful, return the original answer."
+    "If the context isn't useful, return the original answer.\n"
+    "Refined Answer: "
 )
 DEFAULT_REFINE_PROMPT = Prompt(
     DEFAULT_REFINE_PROMPT_TMPL, prompt_type=PromptType.REFINE
@@ -101,12 +102,30 @@ DEFAULT_TEXT_QA_PROMPT_TMPL = (
     "{context_str}\n"
     "---------------------\n"
     "Given the context information and not prior knowledge, "
-    "answer the question: {query_str}\n"
+    "answer the question. If the answer is not in the context, inform "
+    "the user that you can't answer the question.\n"
+    "Question: {query_str}\n"
+    "Answer: "
 )
 DEFAULT_TEXT_QA_PROMPT = Prompt(
     DEFAULT_TEXT_QA_PROMPT_TMPL, prompt_type=PromptType.QUESTION_ANSWER
 )
 
+DEFAULT_TREE_SUMMARIZE_TMPL = (
+    "Context information from multiple sources is below.\n"
+    "---------------------\n"
+    "{context_str}\n"
+    "---------------------\n"
+    "Given the information from multiple sources and not prior knowledge, "
+    "answer the question. If the answer is not in the context, inform "
+    "the user that you can't answer the question.\n"
+    "Question: {query_str}\n"
+    "Answer: "
+)
+DEFAULT_TREE_SUMMARIZE_PROMPT = Prompt(
+    DEFAULT_TREE_SUMMARIZE_TMPL, prompt_type=PromptType.SUMMARY
+)
+
 
 ############################################
 # Keyword Table
diff --git a/llama_index/query_engine/citation_query_engine.py b/llama_index/query_engine/citation_query_engine.py
index c7a415c512..7cc3e7a427 100644
--- a/llama_index/query_engine/citation_query_engine.py
+++ b/llama_index/query_engine/citation_query_engine.py
@@ -1,4 +1,4 @@
-from typing import Any, List, Optional, Sequence, Union
+from typing import Any, List, Optional, Sequence
 
 from llama_index.callbacks.base import CallbackManager
 from llama_index.callbacks.schema import CBEventType, EventPayload
@@ -15,7 +15,8 @@ from llama_index.response_synthesizers import (
     get_response_synthesizer,
 )
 from llama_index.schema import NodeWithScore, TextNode
-from llama_index.text_splitter import SentenceSplitter, TokenTextSplitter
+from llama_index.text_splitter import get_default_text_splitter
+from llama_index.text_splitter.types import TextSplitter
 
 CITATION_QA_TEMPLATE = Prompt(
     "Please provide an answer based solely on the provided sources. "
@@ -71,8 +72,6 @@ CITATION_REFINE_TEMPLATE = Prompt(
 DEFAULT_CITATION_CHUNK_SIZE = 512
 DEFAULT_CITATION_CHUNK_OVERLAP = 20
 
-TextSplitterType = Union[SentenceSplitter, TokenTextSplitter]
-
 
 class CitationQueryEngine(BaseQueryEngine):
     """Citation query engine.
@@ -97,11 +96,11 @@ class CitationQueryEngine(BaseQueryEngine):
         response_synthesizer: Optional[BaseSynthesizer] = None,
         citation_chunk_size: int = DEFAULT_CITATION_CHUNK_SIZE,
         citation_chunk_overlap: int = DEFAULT_CITATION_CHUNK_OVERLAP,
-        text_splitter: Optional[TextSplitterType] = None,
+        text_splitter: Optional[TextSplitter] = None,
         node_postprocessors: Optional[List[BaseNodePostprocessor]] = None,
         callback_manager: Optional[CallbackManager] = None,
     ) -> None:
-        self.text_splitter = text_splitter or SentenceSplitter(
+        self.text_splitter = text_splitter or get_default_text_splitter(
             chunk_size=citation_chunk_size, chunk_overlap=citation_chunk_overlap
         )
         self._retriever = retriever
@@ -120,7 +119,7 @@ class CitationQueryEngine(BaseQueryEngine):
         response_synthesizer: Optional[BaseSynthesizer] = None,
         citation_chunk_size: int = DEFAULT_CITATION_CHUNK_SIZE,
         citation_chunk_overlap: int = DEFAULT_CITATION_CHUNK_OVERLAP,
-        text_splitter: Optional[TextSplitterType] = None,
+        text_splitter: Optional[TextSplitter] = None,
         citation_qa_template: Prompt = CITATION_QA_TEMPLATE,
         citation_refine_template: Prompt = CITATION_REFINE_TEMPLATE,
         retriever: Optional[BaseRetriever] = None,
@@ -140,7 +139,7 @@ class CitationQueryEngine(BaseQueryEngine):
                 Size of citation chunks, default=512. Useful for controlling
                 granularity of sources.
             citation_chunk_overlap (int): Overlap of citation nodes, default=20.
-            text_splitter (Optional[TextSplitterType]):
+            text_splitter (Optional[TextSplitter]):
                 A text splitter for creating citation source nodes. Default is
                 a SentenceSplitter.
             citation_qa_template (Prompt): Template for initial citation QA
@@ -182,25 +181,10 @@ class CitationQueryEngine(BaseQueryEngine):
         """Modify retrieved nodes to be granular sources."""
         new_nodes: List[NodeWithScore] = []
         for node in nodes:
-            splits = self.text_splitter.split_text_with_overlaps(
-                node.node.get_content()
-            )
-
-            start_offset = 0
-            if isinstance(node.node, TextNode) and node.node.start_char_idx is not None:
-                start_offset = node.node.start_char_idx
-
-            for split in splits:
-                text = f"Source {len(new_nodes)+1}:\n{split.text_chunk}\n"
-
-                # NOTE currently this does not take into account escaped chars
-                num_char_overlap = split.num_char_overlap or 0
-                chunk_len = len(split.text_chunk)
-
-                start_char_idx = start_offset - num_char_overlap
-                end_char_idx = start_offset - num_char_overlap + chunk_len
+            text_chunks = self.text_splitter.split_text(node.node.get_content())
 
-                start_offset += chunk_len + 1
+            for text_chunk in text_chunks:
+                text = f"Source {len(new_nodes)+1}:\n{text_chunk}\n"
 
                 new_nodes.append(
                     NodeWithScore(
@@ -208,8 +192,6 @@ class CitationQueryEngine(BaseQueryEngine):
                             text=text,
                             metadata=node.node.metadata or {},
                             relationships=node.node.relationships or {},
-                            start_char_idx=start_char_idx,
-                            end_char_idx=end_char_idx,
                         ),
                         score=node.score,
                     )
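
A hedged usage sketch (the data directory and question are placeholders, and an OpenAI key is assumed): any TextSplitter can now be supplied via text_splitter, with get_default_text_splitter used otherwise:

    from llama_index import SimpleDirectoryReader, VectorStoreIndex
    from llama_index.query_engine import CitationQueryEngine

    documents = SimpleDirectoryReader("./data").load_data()  # placeholder path
    index = VectorStoreIndex.from_documents(documents)
    query_engine = CitationQueryEngine.from_args(
        index,
        citation_chunk_size=512,
        citation_chunk_overlap=20,
    )
    response = query_engine.query("What does the dataset say about X?")
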
diff --git a/llama_index/query_engine/router_query_engine.py b/llama_index/query_engine/router_query_engine.py
index 807d97119b..a7d2a65f1f 100644
--- a/llama_index/query_engine/router_query_engine.py
+++ b/llama_index/query_engine/router_query_engine.py
@@ -8,10 +8,12 @@ from llama_index.indices.base_retriever import BaseRetriever
 from llama_index.indices.query.base import BaseQueryEngine
 from llama_index.indices.query.schema import QueryBundle
 from llama_index.indices.service_context import ServiceContext
-from llama_index.prompts.default_prompts import DEFAULT_TEXT_QA_PROMPT
+from llama_index.prompts.default_prompt_selectors import (
+    DEFAULT_TREE_SUMMARIZE_PROMPT_SEL,
+)
 from llama_index.response.schema import RESPONSE_TYPE, Response, StreamingResponse
 from llama_index.response_synthesizers import TreeSummarize
-from llama_index.selectors.llm_selectors import LLMMultiSelector, LLMSingleSelector
+from llama_index.selectors.utils import get_selector_from_context
 from llama_index.selectors.types import BaseSelector
 from llama_index.schema import BaseNode
 from llama_index.tools.query_engine import QueryEngineTool
@@ -86,7 +88,7 @@ class RouterQueryEngine(BaseQueryEngine):
         self._metadatas = [x.metadata for x in query_engine_tools]
         self._summarizer = summarizer or TreeSummarize(
             service_context=self.service_context,
-            text_qa_template=DEFAULT_TEXT_QA_PROMPT,
+            summary_template=DEFAULT_TREE_SUMMARIZE_PROMPT_SEL,
         )
 
         super().__init__(self.service_context.callback_manager)
@@ -100,10 +102,11 @@ class RouterQueryEngine(BaseQueryEngine):
         summarizer: Optional[TreeSummarize] = None,
         select_multi: bool = False,
     ) -> "RouterQueryEngine":
-        if selector is None and select_multi:
-            selector = LLMMultiSelector.from_defaults(service_context=service_context)
-        elif selector is None and not select_multi:
-            selector = LLMSingleSelector.from_defaults(service_context=service_context)
+        service_context = service_context or ServiceContext.from_defaults()
+
+        selector = selector or get_selector_from_context(
+            service_context, is_multi=select_multi
+        )
 
         assert selector is not None
 
@@ -269,7 +272,7 @@ class ToolRetrieverRouterQueryEngine(BaseQueryEngine):
         self.service_context = service_context or ServiceContext.from_defaults()
         self._summarizer = summarizer or TreeSummarize(
             service_context=self.service_context,
-            text_qa_template=DEFAULT_TEXT_QA_PROMPT,
+            summary_template=DEFAULT_TREE_SUMMARIZE_PROMPT_SEL,
         )
         self._retriever = retriever
 
diff --git a/llama_index/query_engine/sql_join_query_engine.py b/llama_index/query_engine/sql_join_query_engine.py
index 45676aca44..69d8d22883 100644
--- a/llama_index/query_engine/sql_join_query_engine.py
+++ b/llama_index/query_engine/sql_join_query_engine.py
@@ -1,7 +1,7 @@
 """SQL Join query engine."""
 
 from llama_index.bridge.langchain import print_text
-from typing import Optional, Dict, Callable
+from typing import Optional, Dict, Callable, Union
 from llama_index.indices.query.base import BaseQueryEngine
 from llama_index.indices.struct_store.sql_query import (
     BaseSQLTableQueryEngine,
@@ -11,7 +11,9 @@ from llama_index.indices.query.schema import QueryBundle
 from llama_index.response.schema import RESPONSE_TYPE, Response
 from llama_index.tools.query_engine import QueryEngineTool
 from llama_index.indices.service_context import ServiceContext
+from llama_index.selectors.utils import get_selector_from_context
 from llama_index.selectors.llm_selectors import LLMSingleSelector
+from llama_index.selectors.pydantic_selectors import PydanticSingleSelector
 from llama_index.prompts.base import Prompt
 from llama_index.indices.query.query_transform.base import BaseQueryTransform
 import logging
@@ -144,8 +146,9 @@ class SQLJoinQueryEngine(BaseQueryEngine):
 
     Args:
         sql_query_tool (QueryEngineTool): Query engine tool for SQL database.
-        other_query_tool (QueryEngineTool): Other query engine tool.
-        selector (Optional[LLMSingleSelector]): Selector to use.
+        other_query_tool (QueryEngineTool): Other query engine tool.
+        selector (Optional[Union[LLMSingleSelector, PydanticSingleSelector]]):
+            Selector to use.
         service_context (Optional[ServiceContext]): Service context to use.
         sql_join_synthesis_prompt (Optional[Prompt]): Prompt to use for SQL join
             synthesis.
@@ -161,7 +164,7 @@ class SQLJoinQueryEngine(BaseQueryEngine):
         self,
         sql_query_tool: QueryEngineTool,
         other_query_tool: QueryEngineTool,
-        selector: Optional[LLMSingleSelector] = None,
+        selector: Optional[Union[LLMSingleSelector, PydanticSingleSelector]] = None,
         service_context: Optional[ServiceContext] = None,
         sql_join_synthesis_prompt: Optional[Prompt] = None,
         sql_augment_query_transform: Optional[SQLAugmentQueryTransform] = None,
@@ -185,7 +188,12 @@ class SQLJoinQueryEngine(BaseQueryEngine):
 
         sql_query_engine = sql_query_tool.query_engine
         self._service_context = service_context or sql_query_engine.service_context
-        self._selector = selector or LLMSingleSelector.from_defaults()
+
+        self._selector = selector or get_selector_from_context(
+            self._service_context, is_multi=False
+        )
+        assert isinstance(self._selector, (LLMSingleSelector, PydanticSingleSelector))
+
         self._sql_join_synthesis_prompt = (
             sql_join_synthesis_prompt or DEFAULT_SQL_JOIN_SYNTHESIS_PROMPT
         )
diff --git a/llama_index/query_engine/sql_vector_query_engine.py b/llama_index/query_engine/sql_vector_query_engine.py
index ab7eb24c87..91120375b2 100644
--- a/llama_index/query_engine/sql_vector_query_engine.py
+++ b/llama_index/query_engine/sql_vector_query_engine.py
@@ -12,6 +12,7 @@ from llama_index.tools.query_engine import QueryEngineTool
 from llama_index.query_engine.retriever_query_engine import RetrieverQueryEngine
 from llama_index.indices.service_context import ServiceContext
 from llama_index.selectors.llm_selectors import LLMSingleSelector
+from llama_index.selectors.pydantic_selectors import PydanticSingleSelector
 from llama_index.prompts.base import Prompt
 import logging
 from llama_index.callbacks.base import CallbackManager
@@ -54,7 +55,8 @@ class SQLAutoVectorQueryEngine(SQLJoinQueryEngine):
     Args:
         sql_query_tool (QueryEngineTool): Query engine tool for SQL database.
         vector_query_tool (QueryEngineTool): Query engine tool for vector database.
-        selector (Optional[LLMSingleSelector]): Selector to use.
+        selector (Optional[Union[LLMSingleSelector, PydanticSingleSelector]]):
+            Selector to use.
         service_context (Optional[ServiceContext]): Service context to use.
         sql_vector_synthesis_prompt (Optional[Prompt]): Prompt to use for SQL vector
             synthesis.
@@ -70,7 +72,7 @@ class SQLAutoVectorQueryEngine(SQLJoinQueryEngine):
         self,
         sql_query_tool: QueryEngineTool,
         vector_query_tool: QueryEngineTool,
-        selector: Optional[LLMSingleSelector] = None,
+        selector: Optional[Union[LLMSingleSelector, PydanticSingleSelector]] = None,
         service_context: Optional[ServiceContext] = None,
         sql_vector_synthesis_prompt: Optional[Prompt] = None,
         sql_augment_query_transform: Optional[SQLAugmentQueryTransform] = None,
@@ -125,7 +127,7 @@ class SQLAutoVectorQueryEngine(SQLJoinQueryEngine):
         vector_auto_retriever: RetrieverQueryEngine,
         vector_tool_name: str,
         vector_tool_description: str,
-        selector: Optional[LLMSingleSelector] = None,
+        selector: Optional[Union[LLMSingleSelector, PydanticSingleSelector]] = None,
         **kwargs: Any,
     ) -> "SQLAutoVectorQueryEngine":
         """From SQL and vector query engines.
@@ -133,7 +135,8 @@ class SQLAutoVectorQueryEngine(SQLJoinQueryEngine):
         Args:
             sql_query_engine (BaseSQLTableQueryEngine): SQL query engine.
             vector_query_engine (VectorIndexAutoRetriever): Vector retriever.
-            selector (Optional[LLMSingleSelector]): Selector to use.
+            selector (Optional[Union[LLMSingleSelector, PydanticSingleSelector]]):
+                Selector to use.
 
         """
         sql_query_tool = QueryEngineTool.from_defaults(
diff --git a/llama_index/response_synthesizers/factory.py b/llama_index/response_synthesizers/factory.py
index 9daf567a18..2ef1f015f6 100644
--- a/llama_index/response_synthesizers/factory.py
+++ b/llama_index/response_synthesizers/factory.py
@@ -2,15 +2,17 @@ from typing import Optional
 
 from llama_index.callbacks.base import CallbackManager
 from llama_index.indices.service_context import ServiceContext
-from llama_index.prompts.default_prompt_selectors import DEFAULT_REFINE_PROMPT_SEL
-from llama_index.prompts.default_prompts import (
-    DEFAULT_SIMPLE_INPUT_PROMPT,
-    DEFAULT_TEXT_QA_PROMPT,
+from llama_index.prompts.default_prompt_selectors import (
+    DEFAULT_TEXT_QA_PROMPT_SEL,
+    DEFAULT_TREE_SUMMARIZE_PROMPT_SEL,
+    DEFAULT_REFINE_PROMPT_SEL,
 )
+from llama_index.prompts.default_prompts import DEFAULT_SIMPLE_INPUT_PROMPT
 from llama_index.prompts.prompts import (
     QuestionAnswerPrompt,
     RefinePrompt,
     SimpleInputPrompt,
+    SummaryPrompt,
 )
 from llama_index.response_synthesizers.accumulate import Accumulate
 from llama_index.response_synthesizers.base import BaseSynthesizer
@@ -30,6 +32,7 @@ def get_response_synthesizer(
     service_context: Optional[ServiceContext] = None,
     text_qa_template: Optional[QuestionAnswerPrompt] = None,
     refine_template: Optional[RefinePrompt] = None,
+    summary_template: Optional[SummaryPrompt] = None,
     simple_template: Optional[SimpleInputPrompt] = None,
     response_mode: ResponseMode = ResponseMode.COMPACT,
     callback_manager: Optional[CallbackManager] = None,
@@ -38,9 +41,10 @@ def get_response_synthesizer(
 ) -> BaseSynthesizer:
     """Get a response synthesizer."""
 
-    text_qa_template = text_qa_template or DEFAULT_TEXT_QA_PROMPT
+    text_qa_template = text_qa_template or DEFAULT_TEXT_QA_PROMPT_SEL
     refine_template = refine_template or DEFAULT_REFINE_PROMPT_SEL
     simple_template = simple_template or DEFAULT_SIMPLE_INPUT_PROMPT
+    summary_template = summary_template or DEFAULT_TREE_SUMMARIZE_PROMPT_SEL
 
     service_context = service_context or ServiceContext.from_defaults(
         callback_manager=callback_manager
@@ -63,7 +67,7 @@ def get_response_synthesizer(
     elif response_mode == ResponseMode.TREE_SUMMARIZE:
         return TreeSummarize(
             service_context=service_context,
-            text_qa_template=text_qa_template,
+            summary_template=summary_template,
             streaming=streaming,
             use_async=use_async,
         )
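
Sketch of the new summary_template hook (assuming ResponseMode is exported from llama_index.response_synthesizers; override summary_template only if the default tree-summarize selector isn't wanted):

    from llama_index.response_synthesizers import ResponseMode, get_response_synthesizer

    synthesizer = get_response_synthesizer(
        response_mode=ResponseMode.TREE_SUMMARIZE,
        # summary_template defaults to DEFAULT_TREE_SUMMARIZE_PROMPT_SEL
    )
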
diff --git a/llama_index/response_synthesizers/refine.py b/llama_index/response_synthesizers/refine.py
index db8f2fd390..0a9ea51b5c 100644
--- a/llama_index/response_synthesizers/refine.py
+++ b/llama_index/response_synthesizers/refine.py
@@ -3,9 +3,9 @@ from typing import Any, Generator, Optional, Sequence, cast
 
 from llama_index.indices.service_context import ServiceContext
 from llama_index.indices.utils import truncate_text
-from llama_index.prompts.default_prompt_selectors import DEFAULT_REFINE_PROMPT_SEL
-from llama_index.prompts.default_prompts import (
-    DEFAULT_TEXT_QA_PROMPT,
+from llama_index.prompts.default_prompt_selectors import (
+    DEFAULT_TEXT_QA_PROMPT_SEL,
+    DEFAULT_REFINE_PROMPT_SEL,
 )
 from llama_index.prompts.prompts import QuestionAnswerPrompt, RefinePrompt
 from llama_index.response.utils import get_response_text
@@ -27,7 +27,7 @@ class Refine(BaseSynthesizer):
         verbose: bool = False,
     ) -> None:
         super().__init__(service_context=service_context, streaming=streaming)
-        self._text_qa_template = text_qa_template or DEFAULT_TEXT_QA_PROMPT
+        self._text_qa_template = text_qa_template or DEFAULT_TEXT_QA_PROMPT_SEL
         self._refine_template = refine_template or DEFAULT_REFINE_PROMPT_SEL
         self._verbose = verbose
 
diff --git a/llama_index/response_synthesizers/simple_summarize.py b/llama_index/response_synthesizers/simple_summarize.py
index 936286ddfe..47352a554c 100644
--- a/llama_index/response_synthesizers/simple_summarize.py
+++ b/llama_index/response_synthesizers/simple_summarize.py
@@ -1,9 +1,7 @@
 from typing import Any, Generator, Optional, Sequence, cast
 
 from llama_index.indices.service_context import ServiceContext
-from llama_index.prompts.default_prompts import (
-    DEFAULT_TEXT_QA_PROMPT,
-)
+from llama_index.prompts.default_prompt_selectors import DEFAULT_TEXT_QA_PROMPT_SEL
 from llama_index.prompts.prompts import QuestionAnswerPrompt
 from llama_index.response_synthesizers.base import BaseSynthesizer
 from llama_index.types import RESPONSE_TEXT_TYPE
@@ -17,7 +15,7 @@ class SimpleSummarize(BaseSynthesizer):
         streaming: bool = False,
     ) -> None:
         super().__init__(service_context=service_context, streaming=streaming)
-        self._text_qa_template = text_qa_template or DEFAULT_TEXT_QA_PROMPT
+        self._text_qa_template = text_qa_template or DEFAULT_TEXT_QA_PROMPT_SEL
 
     async def aget_response(
         self,
diff --git a/llama_index/response_synthesizers/tree_summarize.py b/llama_index/response_synthesizers/tree_summarize.py
index 6e2cd16120..3b037317f7 100644
--- a/llama_index/response_synthesizers/tree_summarize.py
+++ b/llama_index/response_synthesizers/tree_summarize.py
@@ -3,9 +3,10 @@ from typing import Any, List, Optional, Sequence
 
 from llama_index.async_utils import run_async_tasks
 from llama_index.indices.service_context import ServiceContext
-from llama_index.prompts.default_prompts import DEFAULT_TEXT_QA_PROMPT
-from llama_index.prompts.prompt_type import PromptType
-from llama_index.prompts.prompts import QuestionAnswerPrompt, SummaryPrompt
+from llama_index.prompts.default_prompt_selectors import (
+    DEFAULT_TREE_SUMMARIZE_PROMPT_SEL,
+)
+from llama_index.prompts.prompts import SummaryPrompt
 from llama_index.response_synthesizers.base import BaseSynthesizer
 from llama_index.types import RESPONSE_TEXT_TYPE
 
@@ -25,14 +26,14 @@ class TreeSummarize(BaseSynthesizer):
 
     def __init__(
         self,
-        text_qa_template: Optional[QuestionAnswerPrompt] = None,
+        summary_template: Optional[SummaryPrompt] = None,
         service_context: Optional[ServiceContext] = None,
         streaming: bool = False,
         use_async: bool = False,
         verbose: bool = False,
     ) -> None:
         super().__init__(service_context=service_context, streaming=streaming)
-        self._text_qa_template = text_qa_template or DEFAULT_TEXT_QA_PROMPT
+        self._summary_template = summary_template or DEFAULT_TREE_SUMMARIZE_PROMPT_SEL
         self._use_async = use_async
         self._verbose = verbose
 
@@ -43,12 +44,7 @@ class TreeSummarize(BaseSynthesizer):
         **response_kwargs: Any,
     ) -> RESPONSE_TEXT_TYPE:
         """Get tree summarize response."""
-
-        text_qa_template = self._text_qa_template.partial_format(query_str=query_str)
-        summary_template = SummaryPrompt.from_prompt(
-            text_qa_template, prompt_type=PromptType.SUMMARY
-        )
-
+        summary_template = self._summary_template.partial_format(query_str=query_str)
         # repack text_chunks so that each chunk fills the context window
         text_chunks = self._service_context.prompt_helper.repack(
             summary_template, text_chunks=text_chunks
@@ -97,10 +93,7 @@ class TreeSummarize(BaseSynthesizer):
         **response_kwargs: Any,
     ) -> RESPONSE_TEXT_TYPE:
         """Get tree summarize response."""
-        text_qa_template = self._text_qa_template.partial_format(query_str=query_str)
-        summary_template = SummaryPrompt.from_prompt(
-            text_qa_template, prompt_type=PromptType.SUMMARY
-        )
+        summary_template = self._summary_template.partial_format(query_str=query_str)
         # repack text_chunks so that each chunk fills the context window
         text_chunks = self._service_context.prompt_helper.repack(
             summary_template, text_chunks=text_chunks
diff --git a/llama_index/retrievers/router_retriever.py b/llama_index/retrievers/router_retriever.py
index 4681fc3514..f14bf39559 100644
--- a/llama_index/retrievers/router_retriever.py
+++ b/llama_index/retrievers/router_retriever.py
@@ -9,7 +9,7 @@ from llama_index.callbacks.schema import CBEventType, EventPayload
 from llama_index.indices.base_retriever import BaseRetriever
 from llama_index.indices.query.schema import QueryBundle
 from llama_index.indices.service_context import ServiceContext
-from llama_index.selectors.llm_selectors import LLMMultiSelector, LLMSingleSelector
+from llama_index.selectors.utils import get_selector_from_context
 from llama_index.selectors.types import BaseSelector
 from llama_index.tools.retriever_tool import RetrieverTool
 
@@ -51,12 +51,10 @@ class RouterRetriever(BaseRetriever):
         selector: Optional[BaseSelector] = None,
         select_multi: bool = False,
     ) -> "RouterRetriever":
-        if selector is None and select_multi:
-            selector = LLMMultiSelector.from_defaults(service_context=service_context)
-        elif selector is None and not select_multi:
-            selector = LLMSingleSelector.from_defaults(service_context=service_context)
 
-        assert selector is not None
+        selector = selector or get_selector_from_context(
+            service_context or ServiceContext.from_defaults(), is_multi=select_multi
+        )
 
         return cls(
             selector,
diff --git a/llama_index/selectors/utils.py b/llama_index/selectors/utils.py
new file mode 100644
index 0000000000..198b886395
--- /dev/null
+++ b/llama_index/selectors/utils.py
@@ -0,0 +1,33 @@
+from typing import Optional
+
+from llama_index.indices.service_context import ServiceContext
+from llama_index.selectors.types import BaseSelector
+from llama_index.selectors.pydantic_selectors import (
+    PydanticMultiSelector,
+    PydanticSingleSelector,
+)
+from llama_index.selectors.llm_selectors import LLMMultiSelector, LLMSingleSelector
+
+
+def get_selector_from_context(
+    service_context: ServiceContext, is_multi: bool = False
+) -> BaseSelector:
+    """Get a selector from a service context. Prefers Pydantic selectors if possible."""
+    selector: Optional[BaseSelector] = None
+
+    if is_multi:
+        try:
+            llm = service_context.llm_predictor.llm
+            selector = PydanticMultiSelector.from_defaults(llm=llm)  # type: ignore
+        except ValueError:
+            selector = LLMMultiSelector.from_defaults(service_context=service_context)
+    else:
+        try:
+            llm = service_context.llm_predictor.llm
+            selector = PydanticSingleSelector.from_defaults(llm=llm)  # type: ignore
+        except ValueError:
+            selector = LLMSingleSelector.from_defaults(service_context=service_context)
+
+    assert selector is not None
+
+    return selector
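
A minimal sketch of the new helper:

    from llama_index.indices.service_context import ServiceContext
    from llama_index.selectors.utils import get_selector_from_context

    service_context = ServiceContext.from_defaults()
    # Pydantic (function-calling) selector when the LLM supports it, otherwise an LLM-based selector
    selector = get_selector_from_context(service_context, is_multi=False)
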
diff --git a/llama_index/text_splitter/__init__.py b/llama_index/text_splitter/__init__.py
new file mode 100644
index 0000000000..66030966ac
--- /dev/null
+++ b/llama_index/text_splitter/__init__.py
@@ -0,0 +1,34 @@
+from typing import Optional
+
+from llama_index.callbacks.base import CallbackManager
+from llama_index.constants import DEFAULT_CHUNK_OVERLAP, DEFAULT_CHUNK_SIZE
+from llama_index.text_splitter.code_splitter import CodeSplitter
+from llama_index.text_splitter.sentence_splitter import SentenceSplitter
+from llama_index.text_splitter.token_splitter import TokenTextSplitter
+from llama_index.text_splitter.types import TextSplitter
+
+
+def get_default_text_splitter(
+    chunk_size: Optional[int] = None,
+    chunk_overlap: Optional[int] = None,
+    callback_manager: Optional[CallbackManager] = None,
+) -> TextSplitter:
+    """Get default text splitter."""
+    chunk_size = chunk_size or DEFAULT_CHUNK_SIZE
+    chunk_overlap = (
+        chunk_overlap if chunk_overlap is not None else DEFAULT_CHUNK_OVERLAP
+    )
+
+    return SentenceSplitter(
+        chunk_size=chunk_size,
+        chunk_overlap=chunk_overlap,
+        callback_manager=callback_manager,
+    )
+
+
+__all__ = [
+    "TextSplitter",
+    "TokenTextSplitter",
+    "SentenceSplitter",
+    "CodeSplitter",
+]
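
Minimal sketch of the new default-splitter factory:

    from llama_index.text_splitter import get_default_text_splitter

    text_splitter = get_default_text_splitter(chunk_size=512, chunk_overlap=20)
    chunks = text_splitter.split_text("First sentence. Second sentence. Third sentence.")
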
diff --git a/llama_index/text_splitter/code_splitter.py b/llama_index/text_splitter/code_splitter.py
new file mode 100644
index 0000000000..4421e5e5d0
--- /dev/null
+++ b/llama_index/text_splitter/code_splitter.py
@@ -0,0 +1,92 @@
+"""Code splitter."""
+from typing import Any, List, Optional
+
+from llama_index.callbacks.base import CallbackManager
+from llama_index.callbacks.schema import CBEventType, EventPayload
+from llama_index.text_splitter.types import TextSplitter
+
+
+class CodeSplitter(TextSplitter):
+    """Split code using a AST parser.
+
+    Thank you to Kevin Lu / SweepAI for suggesting this elegant code splitting solution.
+    https://docs.sweep.dev/blogs/chunking-2m-files
+    """
+
+    def __init__(
+        self,
+        language: str,
+        chunk_lines: int = 40,
+        chunk_lines_overlap: int = 15,
+        max_chars: int = 1500,
+        callback_manager: Optional[CallbackManager] = None,
+    ):
+        self.language = language
+        self.chunk_lines = chunk_lines
+        self.chunk_lines_overlap = chunk_lines_overlap
+        self.max_chars = max_chars
+        self.callback_manager = callback_manager or CallbackManager([])
+
+    def _chunk_node(self, node: Any, text: str, last_end: int = 0) -> List[str]:
+        new_chunks = []
+        current_chunk = ""
+        for child in node.children:
+            if child.end_byte - child.start_byte > self.max_chars:
+                # Child is too big, recursively chunk the child
+                if len(current_chunk) > 0:
+                    new_chunks.append(current_chunk)
+                current_chunk = ""
+                new_chunks.extend(self._chunk_node(child, text, last_end))
+            elif (
+                len(current_chunk) + child.end_byte - child.start_byte > self.max_chars
+            ):
+                # Child would make the current chunk too big, so start a new chunk
+                new_chunks.append(current_chunk)
+                current_chunk = text[last_end : child.end_byte]
+            else:
+                current_chunk += text[last_end : child.end_byte]
+            last_end = child.end_byte
+        if len(current_chunk) > 0:
+            new_chunks.append(current_chunk)
+        return new_chunks
+
+    def split_text(self, text: str) -> List[str]:
+        """Split incoming code and return chunks using the AST."""
+        with self.callback_manager.event(
+            CBEventType.CHUNKING, payload={EventPayload.CHUNKS: [text]}
+        ) as event:
+            try:
+                import tree_sitter_languages
+            except ImportError:
+                raise ImportError(
+                    "Please install tree_sitter_languages to use CodeSplitter."
+                )
+
+            try:
+                parser = tree_sitter_languages.get_parser(self.language)
+            except Exception as e:
+                print(
+                    f"Could not get parser for language {self.language}. Check "
+                    "https://github.com/grantjenks/py-tree-sitter-languages#license "
+                    "for a list of valid languages."
+                )
+                raise e
+
+            tree = parser.parse(bytes(text, "utf-8"))
+
+            if (
+                not tree.root_node.children
+                or tree.root_node.children[0].type != "ERROR"
+            ):
+                chunks = [
+                    chunk.strip() for chunk in self._chunk_node(tree.root_node, text)
+                ]
+                event.on_end(
+                    payload={EventPayload.CHUNKS: chunks},
+                )
+
+                return chunks
+            else:
+                raise ValueError(f"Could not parse code with language {self.language}.")
+
+        # TODO: set up auto-language detection using something like https://github.com/yoeo/guesslang.
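
Usage sketch (assumes tree_sitter_languages is installed; the snippet being split is illustrative):

    from llama_index.text_splitter import CodeSplitter

    splitter = CodeSplitter(language="python", chunk_lines=40, chunk_lines_overlap=15, max_chars=1500)
    chunks = splitter.split_text("def hello() -> None:\n    print('hello world')\n")
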
diff --git a/llama_index/text_splitter/sentence_splitter.py b/llama_index/text_splitter/sentence_splitter.py
new file mode 100644
index 0000000000..718c476e63
--- /dev/null
+++ b/llama_index/text_splitter/sentence_splitter.py
@@ -0,0 +1,176 @@
+"""Sentence splitter."""
+from dataclasses import dataclass
+from typing import Callable, List, Optional
+
+from llama_index.callbacks.base import CallbackManager
+from llama_index.callbacks.schema import CBEventType, EventPayload
+from llama_index.constants import DEFAULT_CHUNK_SIZE
+from llama_index.text_splitter.types import MetadataAwareTextSplitter
+from llama_index.text_splitter.utils import (
+    split_by_char,
+    split_by_sentence_tokenizer,
+    split_by_regex,
+    split_by_sep,
+)
+from llama_index.utils import globals_helper
+
+
+@dataclass
+class _Split:
+    text: str  # the split text
+    is_sentence: bool  # save whether this is a full sentence
+
+
+class SentenceSplitter(MetadataAwareTextSplitter):
+    """_Split text with a preference for complete sentences.
+
+    In general, this class tries to keep sentences and paragraphs together. Therefore
+    compared to the original TokenTextSplitter, there are less likely to be
+    hanging sentences or parts of sentences at the end of the node chunk.
+    """
+
+    def __init__(
+        self,
+        separator: str = " ",
+        chunk_size: int = DEFAULT_CHUNK_SIZE,
+        chunk_overlap: int = 200,
+        tokenizer: Optional[Callable] = None,
+        paragraph_separator: str = "\n\n\n",
+        chunking_tokenizer_fn: Optional[Callable[[str], List[str]]] = None,
+        secondary_chunking_regex: str = "[^,.;。]+[,.;。]?",
+        callback_manager: Optional[CallbackManager] = None,
+    ):
+        """Initialize with parameters."""
+        if chunk_overlap > chunk_size:
+            raise ValueError(
+                f"Got a larger chunk overlap ({chunk_overlap}) than chunk size "
+                f"({chunk_size}), should be smaller."
+            )
+        self._chunk_size = chunk_size
+        self._chunk_overlap = chunk_overlap
+        self.tokenizer = tokenizer or globals_helper.tokenizer
+        self.callback_manager = callback_manager or CallbackManager([])
+
+        chunking_tokenizer_fn = chunking_tokenizer_fn or split_by_sentence_tokenizer()
+
+        self._split_fns = [
+            split_by_sep(paragraph_separator),
+            chunking_tokenizer_fn,
+        ]
+
+        self._sub_sentence_split_fns = [
+            split_by_regex(secondary_chunking_regex),
+            split_by_sep(separator),
+            split_by_char(),
+        ]
+
+    def split_text_metadata_aware(self, text: str, metadata_str: str) -> List[str]:
+        metadata_len = len(self.tokenizer(metadata_str))
+        effective_chunk_size = self._chunk_size - metadata_len
+        return self._split_text(text, chunk_size=effective_chunk_size)
+
+    def split_text(self, text: str) -> List[str]:
+        return self._split_text(text, chunk_size=self._chunk_size)
+
+    def _split_text(self, text: str, chunk_size: int) -> List[str]:
+        """
+        Split incoming text and return chunks with overlap size.
+
+        Has a preference for complete sentences, phrases, and minimal overlap.
+        """
+        if text == "":
+            return []
+
+        with self.callback_manager.event(
+            CBEventType.CHUNKING, payload={EventPayload.CHUNKS: [text]}
+        ) as event:
+
+            splits = self._split(text, chunk_size)
+            chunks = self._merge(splits, chunk_size)
+
+            event.on_end(payload={EventPayload.CHUNKS: chunks})
+
+        return chunks
+
+    def _split(self, text: str, chunk_size: int) -> List[_Split]:
+        """Break text into splits that are smaller than chunk size.
+
+        The order of splitting is:
+        1. split by paragraph separator
+        2. split by chunking tokenizer (default is nltk sentence tokenizer)
+        3. split by secondary chunking regex (default is "[^,.;。]+[,.;。]?")
+        4. split by default separator (" ")
+
+        """
+        if len(self.tokenizer(text)) <= chunk_size:
+            return [_Split(text, is_sentence=True)]
+
+        for split_fn in self._split_fns:
+            splits = split_fn(text)
+            if len(splits) > 1:
+                break
+
+        if len(splits) > 1:
+            is_sentence = True
+        else:
+            for split_fn in self._sub_sentence_split_fns:
+                splits = split_fn(text)
+                if len(splits) > 1:
+                    break
+            is_sentence = False
+
+        new_splits = []
+        for split in splits:
+            split_len = len(self.tokenizer(split))
+            if split_len <= chunk_size:
+                new_splits.append(_Split(split, is_sentence=is_sentence))
+            else:
+                # recursively split
+                new_splits.extend(self._split(split, chunk_size=chunk_size))
+        return new_splits
+
+    def _merge(self, splits: List[_Split], chunk_size: int) -> List[str]:
+        """Merge splits into chunks."""
+        chunks: List[str] = []
+        cur_chunk: List[str] = []
+        cur_tokens = 0
+        while len(splits) > 0:
+            cur_token = splits[0]
+            cur_len = len(self.tokenizer(cur_token.text))
+            if cur_len > chunk_size:
+                raise ValueError("Single token exceed chunk size")
+            if cur_tokens + cur_len > chunk_size:
+                chunks.append("".join(cur_chunk).strip())
+                cur_chunk = []
+                cur_tokens = 0
+            else:
+                if (
+                    cur_token.is_sentence
+                    or cur_tokens + cur_len < chunk_size - self._chunk_overlap
+                ):
+                    cur_tokens += cur_len
+                    cur_chunk.append(cur_token.text)
+                    splits.pop(0)
+                else:
+                    chunks.append("".join(cur_chunk).strip())
+                    cur_chunk = []
+                    cur_tokens = 0
+
+        # handle the last chunk
+        chunk = "".join(cur_chunk).strip()
+        if chunk:
+            chunks.append(chunk)
+
+        # run postprocessing to remove blank spaces
+        chunks = self._postprocess_chunks(chunks)
+
+        return chunks
+
+    def _postprocess_chunks(self, chunks: List[str]) -> List[str]:
+        """Post-process chunks."""
+        new_chunks = []
+        for doc in chunks:
+            if doc.replace(" ", "") == "":
+                continue
+            new_chunks.append(doc)
+        return new_chunks
diff --git a/llama_index/text_splitter/token_splitter.py b/llama_index/text_splitter/token_splitter.py
new file mode 100644
index 0000000000..0591de5ba6
--- /dev/null
+++ b/llama_index/text_splitter/token_splitter.py
@@ -0,0 +1,146 @@
+"""Token splitter."""
+import logging
+from typing import Callable, List, Optional
+
+from llama_index.callbacks.base import CallbackManager
+from llama_index.callbacks.schema import CBEventType, EventPayload
+from llama_index.constants import DEFAULT_CHUNK_OVERLAP, DEFAULT_CHUNK_SIZE
+from llama_index.text_splitter.types import MetadataAwareTextSplitter
+from llama_index.text_splitter.utils import split_by_char, split_by_sep
+from llama_index.utils import globals_helper
+
+_logger = logging.getLogger(__name__)
+
+# NOTE: this is the number of tokens we reserve for metadata formatting
+DEFAULT_METADATA_FORMAT_LEN = 2
+
+
+class TokenTextSplitter(MetadataAwareTextSplitter):
+    """Implementation of splitting text that looks at word tokens."""
+
+    def __init__(
+        self,
+        chunk_size: int = DEFAULT_CHUNK_SIZE,
+        chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
+        tokenizer: Optional[Callable] = None,
+        callback_manager: Optional[CallbackManager] = None,
+        separator: str = " ",
+        backup_separators: Optional[List[str]] = ["\n"],
+    ):
+        """Initialize with parameters."""
+        if chunk_overlap > chunk_size:
+            raise ValueError(
+                f"Got a larger chunk overlap ({chunk_overlap}) than chunk size "
+                f"({chunk_size}), should be smaller."
+            )
+        self._chunk_size = chunk_size
+        self._chunk_overlap = chunk_overlap
+        self.tokenizer = tokenizer or globals_helper.tokenizer
+        self.callback_manager = callback_manager or CallbackManager([])
+
+        all_seps = [separator] + (backup_separators or [])
+        self._split_fns = [split_by_sep(sep) for sep in all_seps] + [split_by_char()]
+
+    def split_text_metadata_aware(self, text: str, metadata_str: str) -> List[str]:
+        """Split text into chunks, reserving space required for metadata str."""
+        metadata_len = len(self.tokenizer(metadata_str)) + DEFAULT_METADATA_FORMAT_LEN
+        effective_chunk_size = self._chunk_size - metadata_len
+        return self._split_text(text, chunk_size=effective_chunk_size)
+
+    def split_text(self, text: str) -> List[str]:
+        """Split text into chunks."""
+        return self._split_text(text, chunk_size=self._chunk_size)
+
+    def _split_text(self, text: str, chunk_size: int) -> List[str]:
+        """Split text into chunks up to chunk_size."""
+        if text == "":
+            return []
+
+        with self.callback_manager.event(
+            CBEventType.CHUNKING, payload={EventPayload.CHUNKS: [text]}
+        ) as event:
+
+            splits = self._split(text, chunk_size)
+            chunks = self._merge(splits, chunk_size)
+
+            event.on_end(
+                payload={EventPayload.CHUNKS: chunks},
+            )
+
+        return chunks
+
+    def _split(self, text: str, chunk_size: int) -> List[str]:
+        """Break text into splits that are smaller than chunk size.
+
+        The order of splitting is:
+        1. split by separator
+        2. split by backup separators (if any)
+        3. split by characters
+
+        NOTE: the splits contain the separators.
+        """
+        if len(self.tokenizer(text)) <= chunk_size:
+            return [text]
+
+        for split_fn in self._split_fns:
+            splits = split_fn(text)
+            if len(splits) > 1:
+                break
+
+        new_splits = []
+        for split in splits:
+            split_len = len(self.tokenizer(split))
+            if split_len <= chunk_size:
+                new_splits.append(split)
+            else:
+                # recursively split
+                new_splits.extend(self._split(split, chunk_size=chunk_size))
+        return new_splits
+
+    def _merge(self, splits: List[str], chunk_size: int) -> List[str]:
+        """Merge splits into chunks.
+
+        The high-level idea is to keep adding splits to a chunk until we
+        exceed the chunk size, then we start a new chunk with overlap.
+
+        When we start a new chunk, we pop off the first element of the previous
+        chunk until the total length is less than the chunk size.
+        """
+        chunks: List[str] = []
+
+        cur_chunk: List[str] = []
+        cur_len = 0
+        for split in splits:
+            split_len = len(self.tokenizer(split))
+            if split_len > chunk_size:
+                _logger.warning(
+                    f"Got a split of size {split_len}, "
+                    f"larger than chunk size {chunk_size}."
+                )
+
+            # if we exceed the chunk size after adding the new split, then
+            # we need to end the current chunk and start a new one
+            if cur_len + split_len > chunk_size:
+                # end the previous chunk
+                chunk = "".join(cur_chunk).strip()
+                if chunk:
+                    chunks.append(chunk)
+
+                # start a new chunk with overlap
+                # keep popping off the first element of the previous chunk until:
+                #   1. the current chunk length is less than chunk overlap
+                #   2. the total length is less than chunk size
+                while cur_len > self._chunk_overlap or cur_len + split_len > chunk_size:
+                    # pop off the first element
+                    first_chunk = cur_chunk.pop(0)
+                    cur_len -= len(self.tokenizer(first_chunk))
+
+            cur_chunk.append(split)
+            cur_len += split_len
+
+        # handle the last chunk
+        chunk = "".join(cur_chunk).strip()
+        if chunk:
+            chunks.append(chunk)
+
+        return chunks
diff --git a/llama_index/text_splitter/types.py b/llama_index/text_splitter/types.py
new file mode 100644
index 0000000000..6da292feee
--- /dev/null
+++ b/llama_index/text_splitter/types.py
@@ -0,0 +1,16 @@
+"""Text splitter implementations."""
+from typing import List, Protocol, runtime_checkable
+
+
+class TextSplitter(Protocol):
+    def split_text(self, text: str) -> List[str]:
+        ...
+
+
+@runtime_checkable
+class MetadataAwareTextSplitter(Protocol):
+    def split_text(self, text: str) -> List[str]:
+        ...
+
+    def split_text_metadata_aware(self, text: str, metadata_str: str) -> List[str]:
+        ...
diff --git a/llama_index/text_splitter/utils.py b/llama_index/text_splitter/utils.py
new file mode 100644
index 0000000000..0025864df4
--- /dev/null
+++ b/llama_index/text_splitter/utils.py
@@ -0,0 +1,70 @@
+from typing import Callable, List
+
+from llama_index.text_splitter.types import TextSplitter
+
+
+def truncate_text(text: str, text_splitter: TextSplitter) -> str:
+    """Truncate text to fit within the chunk size."""
+    chunks = text_splitter.split_text(text)
+    return chunks[0]
+
+
+def split_text_keep_separator(text: str, separator: str) -> List[str]:
+    """Split text with separator and keep the separator at the end of each split."""
+    parts = text.split(separator)
+    result = [separator + s if i > 0 else s for i, s in enumerate(parts)]
+    result = [s for s in result if s]
+    return result
+
+
+def split_by_sep(sep: str, keep_sep: bool = True) -> Callable[[str], List[str]]:
+    """Split text by separator."""
+    if keep_sep:
+        return lambda text: split_text_keep_separator(text, sep)
+    else:
+        return lambda text: text.split(sep)
+
+
+def split_by_char() -> Callable[[str], List[str]]:
+    """Split text by character."""
+    return lambda text: list(text)
+
+
+def split_by_sentence_tokenizer() -> Callable[[str], List[str]]:
+    import nltk
+    import os
+    from llama_index.utils import get_cache_dir
+
+    cache_dir = get_cache_dir()
+    nltk_data_dir = os.environ.get("NLTK_DATA", cache_dir)
+
+    # update the nltk data path so that nltk can find the downloaded data
+    if nltk_data_dir not in nltk.data.path:
+        nltk.data.path.append(nltk_data_dir)
+
+    try:
+        nltk.data.find("tokenizers/punkt")
+    except LookupError:
+        nltk.download("punkt", download_dir=nltk_data_dir)
+
+    return nltk.sent_tokenize
+
+
+def split_by_regex(regex: str) -> Callable[[str], List[str]]:
+    """Split text by regex."""
+    import re
+
+    return lambda text: re.findall(regex, text)
+
+
+def split_by_phrase_regex() -> Callable[[str], List[str]]:
+    """Split text by phrase regex.
+
+    This regular expression splits sentences into phrases, where each phrase is
+    a sequence of one or more non-comma, non-period, and non-semicolon
+    characters, followed by an optional comma, period, or semicolon. The
+    trailing delimiter is kept at the end of each phrase rather than returned
+    as a separate item.
+    """
+    regex = "[^,.;。]+[,.;。]?"
+    return split_by_regex(regex)
diff --git a/llama_index/utils.py b/llama_index/utils.py
index 65a7de7345..75e34e0cb7 100644
--- a/llama_index/utils.py
+++ b/llama_index/utils.py
@@ -63,10 +63,19 @@ class GlobalsHelper:
                 raise ImportError(
                     "`nltk` package not found, please run `pip install nltk`"
                 )
+
+            cache_dir = get_cache_dir()
+            nltk_data_dir = os.environ.get("NLTK_DATA", cache_dir)
+
+            # update the nltk data path so that nltk can find the downloaded data
+            if nltk_data_dir not in nltk.data.path:
+                nltk.data.path.append(nltk_data_dir)
+
             try:
                 nltk.data.find("corpora/stopwords")
             except LookupError:
-                nltk_data_dir = os.environ.get("NLTK_DATA", None)
                 nltk.download("stopwords", download_dir=nltk_data_dir)
             self._stopwords = stopwords.words("english")
         return self._stopwords
@@ -239,15 +248,17 @@ def get_transformer_tokenizer_fn(model_name: str) -> Callable[[str], List[str]]:
     return tokenizer.tokenize
 
 
-def get_cache_dir() -> Path:
+def get_cache_dir() -> str:
     """Locate a platform-appropriate cache directory for llama_index,
     and create it if it doesn't yet exist
     """
+    # User override
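+    # e.g. (illustrative): setting LLAMA_INDEX_CACHE_DIR=/path/to/cache
+    # (any writable directory) redirects the cache there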
+    if "LLAMA_INDEX_CACHE_DIR" in os.environ:
+        path = Path(os.environ["LLAMA_INDEX_CACHE_DIR"])
+
     # Linux, Unix, AIX, etc.
-    if os.name == "posix" and sys.platform != "darwin":
-        # use ~/.cache if empty OR not set
-        base = os.path.expanduser("~/.cache")
-        path = Path(base, "llama_index")
+    elif os.name == "posix" and sys.platform != "darwin":
+        path = Path("/tmp/llama_index")
 
     # Mac OS
     elif sys.platform == "darwin":
@@ -262,7 +273,7 @@ def get_cache_dir() -> Path:
 
     if not os.path.exists(path):
         os.makedirs(path)
-    return path
+    return str(path)
 
 
 # Sample text from llama_index's readme
diff --git a/llama_index/vector_stores/zep.py b/llama_index/vector_stores/zep.py
index 58e1f260fd..88953b043c 100644
--- a/llama_index/vector_stores/zep.py
+++ b/llama_index/vector_stores/zep.py
@@ -225,7 +225,7 @@ class ZepVectorStore(VectorStore):
         nodes: List[TextNode] = []
 
         for d in results:
-            node = metadata_dict_to_node(d.metadata)
+            node = metadata_dict_to_node(d.metadata or {})
             node.set_content(d.content)
 
             nodes.append(node)
diff --git a/setup.py b/setup.py
index 194868bda7..9da20623d6 100644
--- a/setup.py
+++ b/setup.py
@@ -15,7 +15,7 @@ with open("README.md", "r", encoding="utf-8") as f:
 install_requires = [
     "tiktoken",
     "dataclasses_json",
-    "langchain>=0.0.218",
+    "langchain>=0.0.262",
     "sqlalchemy>=2.0.15",
     "numpy",
     "tenacity>=8.2.0,<9.0.0",
diff --git a/tests/conftest.py b/tests/conftest.py
index 4126cb3872..d766cbe014 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -10,16 +10,13 @@ from llama_index.indices.service_context import ServiceContext
 from llama_index.llm_predictor.base import LLMPredictor
 from llama_index.llms.base import LLMMetadata
 from llama_index.llms.mock import MockLLM
-from llama_index.text_splitter import TokenTextSplitter
+from llama_index.text_splitter import SentenceSplitter, TokenTextSplitter
 from tests.indices.vector_store.mock_services import MockEmbedding
 from tests.mock_utils.mock_predict import (
     patch_llmpredictor_apredict,
     patch_llmpredictor_predict,
 )
-from tests.mock_utils.mock_text_splitter import (
-    patch_token_splitter_newline,
-    patch_token_splitter_newline_with_overlaps,
-)
+from tests.mock_utils.mock_text_splitter import patch_token_splitter_newline
 
 # @pytest.fixture(autouse=True)
 # def no_networking(monkeypatch: pytest.MonkeyPatch) -> None:
@@ -36,11 +33,13 @@ def allow_networking(monkeypatch: pytest.MonkeyPatch) -> None:
 
 @pytest.fixture
 def patch_token_text_splitter(monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.setattr(SentenceSplitter, "split_text", patch_token_splitter_newline)
+    monkeypatch.setattr(
+        SentenceSplitter, "split_text_metadata_aware", patch_token_splitter_newline
+    )
     monkeypatch.setattr(TokenTextSplitter, "split_text", patch_token_splitter_newline)
     monkeypatch.setattr(
-        TokenTextSplitter,
-        "split_text_with_overlaps",
-        patch_token_splitter_newline_with_overlaps,
+        TokenTextSplitter, "split_text_metadata_aware", patch_token_splitter_newline
     )
 
 
diff --git a/tests/embeddings/test_utils.py b/tests/embeddings/test_utils.py
new file mode 100644
index 0000000000..f75da3d825
--- /dev/null
+++ b/tests/embeddings/test_utils.py
@@ -0,0 +1,42 @@
+from typing import Any, Dict
+from pytest import MonkeyPatch
+
+from llama_index.bridge.langchain import HuggingFaceBgeEmbeddings
+from llama_index.embeddings import OpenAIEmbedding, LangchainEmbedding
+from llama_index.embeddings.utils import resolve_embed_model
+
+
+def mock_hf_embeddings(*args: Any, **kwargs: Dict[str, Any]) -> Any:
+    """Mock HuggingFaceEmbeddings."""
+    return
+
+
+def mock_openai_embeddings(*args: Any, **kwargs: Dict[str, Any]) -> Any:
+    """Mock OpenAIEmbedding."""
+    return
+
+
+def test_resolve_embed_model(monkeypatch: MonkeyPatch) -> None:
+    monkeypatch.setattr(
+        "llama_index.bridge.langchain.HuggingFaceBgeEmbeddings.__init__",
+        mock_hf_embeddings,
+    )
+    monkeypatch.setattr(
+        "llama_index.embeddings.OpenAIEmbedding.__init__", mock_openai_embeddings
+    )
+
+    # Test None
+    embed_model = resolve_embed_model(None)
+    assert isinstance(embed_model, OpenAIEmbedding)
+
+    # Test str
+    embed_model = resolve_embed_model("local")
+    assert isinstance(embed_model, LangchainEmbedding)
+
+    # Test LCEmbeddings
+    embed_model = resolve_embed_model(HuggingFaceBgeEmbeddings())
+    assert isinstance(embed_model, LangchainEmbedding)
+
+    # Test BaseEmbedding
+    embed_model = resolve_embed_model(OpenAIEmbedding())
+    assert isinstance(embed_model, OpenAIEmbedding)
diff --git a/tests/indices/postprocessor/test_metadata_replacement.py b/tests/indices/postprocessor/test_metadata_replacement.py
new file mode 100644
index 0000000000..23ac8bb8ff
--- /dev/null
+++ b/tests/indices/postprocessor/test_metadata_replacement.py
@@ -0,0 +1,17 @@
+from llama_index.schema import TextNode, NodeWithScore
+from llama_index.indices.postprocessor import MetadataReplacementPostProcessor
+
+
+def test_metadata_replacement() -> None:
+    node = TextNode(
+        text="This is a test 1.", metadata={"key": "This is a another test."}
+    )
+
+    nodes = [NodeWithScore(node=node, score=1.0)]
+
+    postprocessor = MetadataReplacementPostProcessor(target_metadata_key="key")
+
+    nodes = postprocessor.postprocess_nodes(nodes)
+
+    assert len(nodes) == 1
+    assert nodes[0].node.get_content() == "This is another test."
diff --git a/tests/indices/response/test_response_builder.py b/tests/indices/response/test_response_builder.py
index 8e7f53c3d3..bb18561dcc 100644
--- a/tests/indices/response/test_response_builder.py
+++ b/tests/indices/response/test_response_builder.py
@@ -12,14 +12,7 @@ from llama_index.response_synthesizers import ResponseMode, get_response_synthes
 from llama_index.schema import Document
 from tests.indices.vector_store.mock_services import MockEmbedding
 from tests.mock_utils.mock_prompts import MOCK_REFINE_PROMPT, MOCK_TEXT_QA_PROMPT
-
-
-def mock_tokenizer(text: str) -> List[str]:
-    """Mock tokenizer."""
-    if text == "":
-        return []
-    tokens = text.split(" ")
-    return tokens
+from tests.mock_utils.mock_utils import mock_tokenizer
 
 
 def test_give_response(
diff --git a/tests/indices/response/test_tree_summarize.py b/tests/indices/response/test_tree_summarize.py
index 929050f45e..5f451dcd79 100644
--- a/tests/indices/response/test_tree_summarize.py
+++ b/tests/indices/response/test_tree_summarize.py
@@ -29,8 +29,10 @@ def mock_service_context_merge_chunks(
 
 
 def test_tree_summarize(mock_service_context_merge_chunks: ServiceContext) -> None:
-    mock_qa_prompt_tmpl = "{context_str}{query_str}"
-    mock_qa_prompt = Prompt(mock_qa_prompt_tmpl, prompt_type=PromptType.QUESTION_ANSWER)
+    mock_summary_prompt_tmpl = "{context_str}{query_str}"
+    mock_summary_prompt = Prompt(
+        mock_summary_prompt_tmpl, prompt_type=PromptType.SUMMARY
+    )
 
     query_str = "What is?"
     texts = [
@@ -43,7 +45,7 @@ def test_tree_summarize(mock_service_context_merge_chunks: ServiceContext) -> No
     # test sync
     tree_summarize = TreeSummarize(
         service_context=mock_service_context_merge_chunks,
-        text_qa_template=mock_qa_prompt,
+        summary_template=mock_summary_prompt,
     )
     response = tree_summarize.get_response(text_chunks=texts, query_str=query_str)
     assert str(response) == "Text chunk 1\nText chunk 2\nText chunk 3\nText chunk 4"
@@ -52,8 +54,10 @@ def test_tree_summarize(mock_service_context_merge_chunks: ServiceContext) -> No
 def test_tree_summarize_use_async(
     mock_service_context_merge_chunks: ServiceContext,
 ) -> None:
-    mock_qa_prompt_tmpl = "{context_str}{query_str}"
-    mock_qa_prompt = Prompt(mock_qa_prompt_tmpl, prompt_type=PromptType.QUESTION_ANSWER)
+    mock_summary_prompt_tmpl = "{context_str}{query_str}"
+    mock_summary_prompt = Prompt(
+        mock_summary_prompt_tmpl, prompt_type=PromptType.SUMMARY
+    )
 
     query_str = "What is?"
     texts = [
@@ -66,7 +70,7 @@ def test_tree_summarize_use_async(
     # test async
     tree_summarize = TreeSummarize(
         service_context=mock_service_context_merge_chunks,
-        text_qa_template=mock_qa_prompt,
+        summary_template=mock_summary_prompt,
         use_async=True,
     )
     response = tree_summarize.get_response(text_chunks=texts, query_str=query_str)
@@ -77,8 +81,10 @@ def test_tree_summarize_use_async(
 async def test_tree_summarize_async(
     mock_service_context_merge_chunks: ServiceContext,
 ) -> None:
-    mock_qa_prompt_tmpl = "{context_str}{query_str}"
-    mock_qa_prompt = Prompt(mock_qa_prompt_tmpl, prompt_type=PromptType.QUESTION_ANSWER)
+    mock_summary_prompt_tmpl = "{context_str}{query_str}"
+    mock_summary_prompt = Prompt(
+        mock_summary_prompt_tmpl, prompt_type=PromptType.SUMMARY
+    )
 
     query_str = "What is?"
     texts = [
@@ -91,7 +97,7 @@ async def test_tree_summarize_async(
     # test async
     tree_summarize = TreeSummarize(
         service_context=mock_service_context_merge_chunks,
-        text_qa_template=mock_qa_prompt,
+        summary_template=mock_summary_prompt,
     )
     response = await tree_summarize.aget_response(
         text_chunks=texts, query_str=query_str
diff --git a/tests/indices/test_node_utils.py b/tests/indices/test_node_utils.py
index fa7debd18b..fa66e4d8f5 100644
--- a/tests/indices/test_node_utils.py
+++ b/tests/indices/test_node_utils.py
@@ -3,36 +3,20 @@
 from typing import List
 
 import pytest
+import tiktoken
 
-from llama_index.node_parser.node_utils import TextSplit, get_nodes_from_document
+from llama_index.bridge.langchain import RecursiveCharacterTextSplitter
+from llama_index.node_parser.node_utils import get_nodes_from_document
 from llama_index.schema import Document, MetadataMode
 from llama_index.text_splitter import TokenTextSplitter
 
 
-class TokenTextSplitterWithMetadata(TokenTextSplitter):
-    """Text splitter which adds metadata to text splits."""
-
-    def _postprocess_splits(self, docs: List[TextSplit]) -> List[TextSplit]:
-        for doc in docs:
-            doc.metadata = {"test_splitter_key": "test_splitter_val"}
-
-        docs = super()._postprocess_splits(docs)
-
-        return docs
-
-
 @pytest.fixture
 def text_splitter() -> TokenTextSplitter:
     """Get text splitter."""
     return TokenTextSplitter(chunk_size=20, chunk_overlap=0)
 
 
-@pytest.fixture
-def text_splitter_with_metadata() -> TokenTextSplitterWithMetadata:
-    """Get text splitter which adds metadata."""
-    return TokenTextSplitterWithMetadata(chunk_size=20, chunk_overlap=0)
-
-
 @pytest.fixture
 def documents() -> List[Document]:
     """Get documents."""
@@ -77,7 +61,8 @@ def test_get_nodes_from_document_with_metadata(
     )
     assert len(nodes) == 3
     actual_chunk_sizes = [
-        len(text_splitter.tokenizer(node.get_content())) for node in nodes
+        len(text_splitter.tokenizer(node.get_content(metadata_mode=MetadataMode.ALL)))
+        for node in nodes
     ]
     assert all(
         chunk_size <= text_splitter._chunk_size for chunk_size in actual_chunk_sizes
@@ -90,27 +75,21 @@ def test_get_nodes_from_document_with_metadata(
     )
 
 
-def test_get_nodes_from_document_with_node_metadata(
+def test_get_nodes_from_document_langchain_compatible(
     documents: List[Document],
-    text_splitter_with_metadata: TokenTextSplitterWithMetadata,
 ) -> None:
-    """Test get nodes from document with text splits metadata"""
+    """Test get nodes from document have desired chunk size."""
+    tokenizer = tiktoken.get_encoding("gpt2").encode
+    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
+        chunk_size=20, chunk_overlap=0
+    )
     nodes = get_nodes_from_document(
         documents[0],
-        text_splitter_with_metadata,
-        include_metadata=True,
-    )
-    assert len(nodes) == 3
-    assert all(
-        [
-            "test_key: test_val" in n.get_content(metadata_mode=MetadataMode.ALL)
-            for n in nodes
-        ]
+        text_splitter,
+        include_metadata=False,
     )
+    assert len(nodes) == 2
+    actual_chunk_sizes = [len(tokenizer(node.get_content())) for node in nodes]
     assert all(
-        [
-            "test_splitter_key: test_splitter_val"
-            in n.get_content(metadata_mode=MetadataMode.ALL)
-            for n in nodes
-        ]
+        chunk_size <= text_splitter._chunk_size for chunk_size in actual_chunk_sizes
     )
diff --git a/tests/indices/test_prompt_helper.py b/tests/indices/test_prompt_helper.py
index 14fdef95c9..62dc0a2d5e 100644
--- a/tests/indices/test_prompt_helper.py
+++ b/tests/indices/test_prompt_helper.py
@@ -2,12 +2,12 @@
 from typing import cast
 
 from llama_index.bridge.langchain import PromptTemplate as LangchainPrompt
-
 from llama_index.indices.prompt_helper import PromptHelper
 from llama_index.indices.tree.utils import get_numbered_text_from_nodes
-from llama_index.prompts.utils import get_biggest_prompt, get_empty_prompt_txt
 from llama_index.prompts.base import Prompt
+from llama_index.prompts.utils import get_biggest_prompt, get_empty_prompt_txt
 from llama_index.schema import TextNode
+from llama_index.text_splitter.utils import truncate_text
 from tests.mock_utils.mock_utils import mock_tokenizer
 
 
@@ -61,7 +61,7 @@ def test_get_text_splitter() -> None:
     test_text = "Hello world foo Hello world bar"
     text_chunks = text_splitter.split_text(test_text)
     assert text_chunks == ["Hello world", "foo Hello", "world bar"]
-    truncated_text = text_splitter.truncate_text(test_text)
+    truncated_text = truncate_text(test_text, text_splitter)
     assert truncated_text == "Hello world"
 
     # test with chunk_size_limit
@@ -94,7 +94,7 @@ def test_get_text_splitter_partial() -> None:
     test_text = "Hello world foo Hello world bar"
     text_chunks = text_splitter.split_text(test_text)
     assert text_chunks == ["Hello world", "foo Hello", "world bar"]
-    truncated_text = text_splitter.truncate_text(test_text)
+    truncated_text = truncate_text(test_text, text_splitter)
     assert truncated_text == "Hello world"
 
     # test with partially formatting
@@ -110,7 +110,7 @@ def test_get_text_splitter_partial() -> None:
     test_text = "Hello world foo Hello world bar"
     text_chunks = text_splitter.split_text(test_text)
     assert text_chunks == ["Hello world", "foo Hello", "world bar"]
-    truncated_text = text_splitter.truncate_text(test_text)
+    truncated_text = truncate_text(test_text, text_splitter)
     assert truncated_text == "Hello world"
 
 
diff --git a/tests/mock_utils/mock_text_splitter.py b/tests/mock_utils/mock_text_splitter.py
index 6fb58d04a5..bc7d46ec0c 100644
--- a/tests/mock_utils/mock_text_splitter.py
+++ b/tests/mock_utils/mock_text_splitter.py
@@ -2,8 +2,6 @@
 
 from typing import Any, List, Optional
 
-from llama_index.text_splitter import TextSplit
-
 
 def patch_token_splitter_newline(
     self: Any, text: str, metadata_str: Optional[str] = None
@@ -14,16 +12,6 @@ def patch_token_splitter_newline(
     return text.split("\n")
 
 
-def patch_token_splitter_newline_with_overlaps(
-    self: Any, text: str, metadata_str: Optional[str]
-) -> List[TextSplit]:
-    """Mock token splitter by newline."""
-    if text == "":
-        return []
-    strings = text.split("\n")
-    return [TextSplit(string, 0) for string in strings]
-
-
 def mock_token_splitter_newline(
     text: str, metadata_str: Optional[str] = None
 ) -> List[str]:
@@ -31,13 +19,3 @@ def mock_token_splitter_newline(
     if text == "":
         return []
     return text.split("\n")
-
-
-def mock_token_splitter_newline_with_overlaps(
-    text: str, metadata_str: Optional[str]
-) -> List[TextSplit]:
-    """Mock token splitter by newline."""
-    if text == "":
-        return []
-    strings = text.split("\n")
-    return [TextSplit(string, 0) for string in strings]
diff --git a/tests/mock_utils/mock_utils.py b/tests/mock_utils/mock_utils.py
index 8fed074a21..0bf49242a7 100644
--- a/tests/mock_utils/mock_utils.py
+++ b/tests/mock_utils/mock_utils.py
@@ -1,5 +1,6 @@
 """Mock utils."""
 
+import re
 from typing import List, Optional, Set
 
 from llama_index.indices.keyword_table.utils import simple_extract_keywords
@@ -7,7 +8,7 @@ from llama_index.indices.keyword_table.utils import simple_extract_keywords
 
 def mock_tokenizer(text: str) -> List[str]:
     """Mock tokenizer."""
-    tokens = text.split(" ")
+    tokens = re.split(r"[ \n]", text)  # split by space or newline
     result = []
     for token in tokens:
         if token.strip() == "":
diff --git a/tests/node_parser/sentence_window.py b/tests/node_parser/sentence_window.py
new file mode 100644
index 0000000000..6d96e537ee
--- /dev/null
+++ b/tests/node_parser/sentence_window.py
@@ -0,0 +1,21 @@
+from llama_index.schema import Document
+from llama_index.node_parser.sentence_window import SentenceWindowNodeParser
+
+
+def test_split_and_window() -> None:
+    document = Document(text="This is a test 1. This is a test 2. This is a test 3.")
+
+    node_parser = SentenceWindowNodeParser.from_defaults()
+
+    nodes = node_parser.get_nodes_from_documents([document])
+
+    assert len(nodes) == 3
+    assert nodes[0].get_content() == "This is a test 1."
+    assert nodes[1].get_content() == "This is a test 2."
+    assert nodes[2].get_content() == "This is a test 3."
+
+    assert (
+        " ".join(nodes[0].metadata["window"])
+        == "This is a test 1. This is a test 2. Thius is a test 3."
+    )
+    assert nodes[0].metadata["original_text"] == "This is a test 1."
diff --git a/tests/test_text_splitter.py b/tests/test_text_splitter.py
deleted file mode 100644
index b5ef53bd89..0000000000
--- a/tests/test_text_splitter.py
+++ /dev/null
@@ -1,229 +0,0 @@
-"""Test text splitter."""
-import os
-
-from llama_index.text_splitter import CodeSplitter, SentenceSplitter, TokenTextSplitter
-
-
-def test_split_token() -> None:
-    """Test split normal token."""
-    # tiktoken will say length is ~5k
-    token = "foo bar"
-    text_splitter = TokenTextSplitter(chunk_size=1, chunk_overlap=0)
-    chunks = text_splitter.split_text(token)
-    assert chunks == ["foo", "bar"]
-
-    token = "foo bar hello world"
-    text_splitter = TokenTextSplitter(chunk_size=2, chunk_overlap=1)
-    chunks = text_splitter.split_text(token)
-    assert chunks == ["foo bar", "bar hello", "hello world"]
-
-
-def test_truncate_token() -> None:
-    """Test truncate normal token."""
-    # tiktoken will say length is ~5k
-    token = "foo bar"
-    text_splitter = TokenTextSplitter(chunk_size=1, chunk_overlap=0)
-    chunks = text_splitter.truncate_text(token)
-    assert chunks == "foo"
-
-
-def test_split_long_token() -> None:
-    """Test split a really long token."""
-    # tiktoken will say length is ~5k
-    token = "a" * 100
-    text_splitter = TokenTextSplitter(chunk_size=20, chunk_overlap=0)
-    chunks = text_splitter.split_text(token)
-    # each text chunk may have spaces, since we join splits by separator
-    assert "".join(chunks).replace(" ", "") == token
-
-    token = ("a" * 49) + "\n" + ("a" * 50)
-    text_splitter = TokenTextSplitter(chunk_size=20, chunk_overlap=0)
-    chunks = text_splitter.split_text(token)
-    assert len(chunks[0]) == 49
-    assert len(chunks[1]) == 50
-
-
-def test_split_with_metadata_str() -> None:
-    """Test split while taking into account chunk size used by metadata str."""
-    text = " ".join(["foo"] * 20)
-    metadata_str = "test_metadata_str"
-
-    text_splitter = TokenTextSplitter(chunk_size=20, chunk_overlap=0)
-    chunks = text_splitter.split_text(text)
-    assert len(chunks) == 1
-
-    text_splitter = TokenTextSplitter(chunk_size=20, chunk_overlap=0)
-    chunks = text_splitter.split_text(text, metadata_str=metadata_str)
-    assert len(chunks) == 2
-
-
-def test_split_diff_sentence_token() -> None:
-    """Test case of a string that will split differently."""
-    token_text_splitter = TokenTextSplitter(chunk_size=20, chunk_overlap=0)
-    sentence_text_splitter = SentenceSplitter(chunk_size=20, chunk_overlap=0)
-
-    text = " ".join(["foo"] * 15) + "\n\n\n" + " ".join(["bar"] * 15)
-    token_split = token_text_splitter.split_text(text)
-    sentence_split = sentence_text_splitter.split_text(text)
-    assert token_split[0] == " ".join(["foo"] * 15) + "\n\n\n" + " ".join(["bar"] * 3)
-    assert token_split[1] == " ".join(["bar"] * 12)
-    assert sentence_split[0] == " ".join(["foo"] * 15)
-    assert sentence_split[1] == " ".join(["bar"] * 15)
-
-
-def test_split_diff_sentence_token2() -> None:
-    """Test case of a string that will split differently."""
-    token_text_splitter = TokenTextSplitter(chunk_size=20, chunk_overlap=0)
-    sentence_text_splitter = SentenceSplitter(chunk_size=20, chunk_overlap=0)
-
-    text = " ".join(["foo"] * 15) + ". " + " ".join(["bar"] * 15)
-    token_split = token_text_splitter.split_text(text)
-    sentence_split = sentence_text_splitter.split_text(text)
-
-    assert token_split[0] == " ".join(["foo"] * 15) + ". " + " ".join(["bar"] * 4)
-    assert token_split[1] == " ".join(["bar"] * 11)
-    assert sentence_split[0] == " ".join(["foo"] * 15) + "."
-    assert sentence_split[1] == " ".join(["bar"] * 15)
-
-
-def test_python_code_splitter() -> None:
-    """Test case for code splitting using python"""
-
-    if "CI" in os.environ:
-        return
-
-    code_splitter = CodeSplitter(
-        language="python", chunk_lines=4, chunk_lines_overlap=1, max_chars=30
-    )
-
-    text = """\
-def foo():
-    print("bar")
-
-def baz():
-    print("bbq")"""
-
-    chunks = code_splitter.split_text(text)
-    assert chunks[0].startswith("def foo():")
-    assert chunks[1].startswith("def baz():")
-
-
-def test_typescript_code_splitter() -> None:
-    """Test case for code splitting using typescript"""
-
-    if "CI" in os.environ:
-        return
-
-    code_splitter = CodeSplitter(
-        language="typescript", chunk_lines=4, chunk_lines_overlap=1, max_chars=50
-    )
-
-    text = """\
-function foo() {
-    console.log("bar");
-}
-
-function baz() {
-    console.log("bbq");
-}"""
-
-    chunks = code_splitter.split_text(text)
-    assert chunks[0].startswith("function foo()")
-    assert chunks[1].startswith("function baz()")
-
-
-def test_html_code_splitter() -> None:
-    """Test case for code splitting using typescript"""
-
-    if "CI" in os.environ:
-        return
-
-    code_splitter = CodeSplitter(
-        language="html", chunk_lines=4, chunk_lines_overlap=1, max_chars=50
-    )
-
-    text = """\
-<!DOCTYPE html>
-<html>
-<head>
-    <title>My Example Page</title>
-</head>
-<body>
-    <h1>Welcome to My Example Page</h1>
-    <p>This is a basic HTML page example.</p>
-    <ul>
-        <li>Item 1</li>
-        <li>Item 2</li>
-        <li>Item 3</li>
-    </ul>
-    <img src="https://example.com/image.jpg" alt="Example Image">
-</body>
-</html>"""
-
-    chunks = code_splitter.split_text(text)
-    assert chunks[0].startswith("<!DOCTYPE html>")
-    assert chunks[1].startswith("<html>")
-    assert chunks[2].startswith("<head>")
-
-
-def test_tsx_code_splitter() -> None:
-    """Test case for code splitting using typescript"""
-
-    if "CI" in os.environ:
-        return
-
-    code_splitter = CodeSplitter(
-        language="typescript", chunk_lines=4, chunk_lines_overlap=1, max_chars=50
-    )
-
-    text = """\
-import React from 'react';
-
-interface Person {
-  name: string;
-  age: number;
-}
-
-const ExampleComponent: React.FC = () => {
-  const person: Person = {
-    name: 'John Doe',
-    age: 30,
-  };
-
-  return (
-    <div>
-      <h1>Hello, {person.name}!</h1>
-      <p>You are {person.age} years old.</p>
-    </div>
-  );
-};
-
-export default ExampleComponent;"""
-
-    chunks = code_splitter.split_text(text)
-    assert chunks[0].startswith("import React from 'react';")
-    assert chunks[1].startswith("interface Person")
-
-
-def test_cpp_code_splitter() -> None:
-    """Test case for code splitting using typescript"""
-
-    if "CI" in os.environ:
-        return
-
-    code_splitter = CodeSplitter(
-        language="cpp", chunk_lines=4, chunk_lines_overlap=1, max_chars=50
-    )
-
-    text = """\
-#include <iostream>
-
-int main() {
-    std::cout << "Hello, World!" << std::endl;
-    return 0;
-}"""
-
-    chunks = code_splitter.split_text(text)
-    assert chunks[0].startswith("#include <iostream>")
-    assert chunks[1].startswith("int main()")
-    assert chunks[2].startswith("{\n    std::cout")
diff --git a/tests/text_splitter/__init__.py b/tests/text_splitter/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tests/text_splitter/conftest.py b/tests/text_splitter/conftest.py
new file mode 100644
index 0000000000..faeb72bb8f
--- /dev/null
+++ b/tests/text_splitter/conftest.py
@@ -0,0 +1,52 @@
+import pytest
+
+
+@pytest.fixture
+def english_text() -> str:
+    return """\
+A Curious Beginning
+
+In a quaint little village, nestled deep within a lush, green valley, there lived a \
+curious young girl named Lily! She had sparkling blue eyes that glimmered like the \
+morning dew—yes, like tiny sapphires embedded in her face. And her golden hair flowed \
+like a cascade of sunlight, shimmering in the breeze.
+
+Embarking on Enchanted Journeys
+
+Every day, Lily would embark on new adventures; she was like a butterfly dancing on \
+the winds of curiosity. Exploring the Enchanting Forests that surrounded her home was \
+her favorite pastime. The trees seemed to whisper secrets to her, their leaves \
+rustling with ancient tales.
+"""
+
+
+@pytest.fixture
+def chinese_text() -> str:
+    return """\
+教育的重要性
+
+教育是人类社会发展的基石,也是培养人才、传承文化的重要途径。它不仅能够提升个体的知识水平,\
+还能塑造人的品格和价值观。因此,教育在我们的生活中扮演着不可或缺的角色。
+
+首先,教育有助于拓展我们的视野。通过学习,我们能够了解世界各地的文化、历史和科技进展。\
+这不仅丰富了我们的知识,还让我们更加开放和包容。教育使我们能够超越狭隘的个人观点,\
+理解不同群体的需求和想法,从而促进社会的和谐与发展。
+
+其次,教育培养了未来的领袖和专业人才。在现代社会,各行各业都需要经过专业的教育培训才能胜任。\
+教育系统为学生提供了系统的知识体系和技能,使他们能够在职场中脱颖而出。同时,教育也培养了创新能力和\
+问题解决能力,为社会的进步和创新奠定了基础。
+
+此外,教育有助于个人的成长和发展。通过学习,人们能够发展自己的才华和潜力,实现人生目标。教育不仅仅是课堂\
+上的知识,还包括了品德教育和社会交往的技巧。它教导我们如何与他人合作、沟通,并在逆境中坚持不懈。\
+这些都是人生中宝贵的财富,能够引导我们走向成功之路。
+
+总之,教育是我们个人和社会发展的支柱,它不仅丰富了我们的思想,还培养了我们的人才。我们应该珍视教育,\
+为其投入更多的资源和关注,以创造一个更加美好的未来。
+
+希望这篇文章对你有帮助!如果你有其他主题的需求,欢迎随时告诉我。\
+"""
+
+
+@pytest.fixture
+def contiguous_text() -> str:
+    return "abcde" * 200
diff --git a/tests/text_splitter/test_code_splitter.py b/tests/text_splitter/test_code_splitter.py
new file mode 100644
index 0000000000..709d45113c
--- /dev/null
+++ b/tests/text_splitter/test_code_splitter.py
@@ -0,0 +1,147 @@
+"""Test text splitter."""
+import os
+
+from llama_index.text_splitter import CodeSplitter
+
+
+def test_python_code_splitter() -> None:
+    """Test case for code splitting using python"""
+
+    if "CI" in os.environ:
+        return
+
+    code_splitter = CodeSplitter(
+        language="python", chunk_lines=4, chunk_lines_overlap=1, max_chars=30
+    )
+
+    text = """\
+def foo():
+    print("bar")
+
+def baz():
+    print("bbq")"""
+
+    chunks = code_splitter.split_text(text)
+    assert chunks[0].startswith("def foo():")
+    assert chunks[1].startswith("def baz():")
+
+
+def test_typescript_code_splitter() -> None:
+    """Test case for code splitting using typescript"""
+
+    if "CI" in os.environ:
+        return
+
+    code_splitter = CodeSplitter(
+        language="typescript", chunk_lines=4, chunk_lines_overlap=1, max_chars=50
+    )
+
+    text = """\
+function foo() {
+    console.log("bar");
+}
+
+function baz() {
+    console.log("bbq");
+}"""
+
+    chunks = code_splitter.split_text(text)
+    assert chunks[0].startswith("function foo()")
+    assert chunks[1].startswith("function baz()")
+
+
+def test_html_code_splitter() -> None:
+    """Test case for code splitting using typescript"""
+
+    if "CI" in os.environ:
+        return
+
+    code_splitter = CodeSplitter(
+        language="html", chunk_lines=4, chunk_lines_overlap=1, max_chars=50
+    )
+
+    text = """\
+<!DOCTYPE html>
+<html>
+<head>
+    <title>My Example Page</title>
+</head>
+<body>
+    <h1>Welcome to My Example Page</h1>
+    <p>This is a basic HTML page example.</p>
+    <ul>
+        <li>Item 1</li>
+        <li>Item 2</li>
+        <li>Item 3</li>
+    </ul>
+    <img src="https://example.com/image.jpg" alt="Example Image">
+</body>
+</html>"""
+
+    chunks = code_splitter.split_text(text)
+    assert chunks[0].startswith("<!DOCTYPE html>")
+    assert chunks[1].startswith("<html>")
+    assert chunks[2].startswith("<head>")
+
+
+def test_tsx_code_splitter() -> None:
+    """Test case for code splitting using typescript"""
+
+    if "CI" in os.environ:
+        return
+
+    code_splitter = CodeSplitter(
+        language="typescript", chunk_lines=4, chunk_lines_overlap=1, max_chars=50
+    )
+
+    text = """\
+import React from 'react';
+
+interface Person {
+  name: string;
+  age: number;
+}
+
+const ExampleComponent: React.FC = () => {
+  const person: Person = {
+    name: 'John Doe',
+    age: 30,
+  };
+
+  return (
+    <div>
+      <h1>Hello, {person.name}!</h1>
+      <p>You are {person.age} years old.</p>
+    </div>
+  );
+};
+
+export default ExampleComponent;"""
+
+    chunks = code_splitter.split_text(text)
+    assert chunks[0].startswith("import React from 'react';")
+    assert chunks[1].startswith("interface Person")
+
+
+def test_cpp_code_splitter() -> None:
+    """Test case for code splitting using typescript"""
+
+    if "CI" in os.environ:
+        return
+
+    code_splitter = CodeSplitter(
+        language="cpp", chunk_lines=4, chunk_lines_overlap=1, max_chars=50
+    )
+
+    text = """\
+#include <iostream>
+
+int main() {
+    std::cout << "Hello, World!" << std::endl;
+    return 0;
+}"""
+
+    chunks = code_splitter.split_text(text)
+    assert chunks[0].startswith("#include <iostream>")
+    assert chunks[1].startswith("int main()")
+    assert chunks[2].startswith("{\n    std::cout")
diff --git a/tests/text_splitter/test_sentence_splitter.py b/tests/text_splitter/test_sentence_splitter.py
new file mode 100644
index 0000000000..830baf702f
--- /dev/null
+++ b/tests/text_splitter/test_sentence_splitter.py
@@ -0,0 +1,55 @@
+import tiktoken
+
+from llama_index.text_splitter import SentenceSplitter
+
+
+def test_paragraphs() -> None:
+    """Test case of a string with multiple paragraphs."""
+    sentence_text_splitter = SentenceSplitter(chunk_size=20, chunk_overlap=0)
+
+    text = " ".join(["foo"] * 15) + "\n\n\n" + " ".join(["bar"] * 15)
+    sentence_split = sentence_text_splitter.split_text(text)
+    assert sentence_split[0] == " ".join(["foo"] * 15)
+    assert sentence_split[1] == " ".join(["bar"] * 15)
+
+
+def test_sentences() -> None:
+    """Test case of a string with multiple sentences."""
+    sentence_text_splitter = SentenceSplitter(chunk_size=20, chunk_overlap=0)
+
+    text = " ".join(["foo"] * 15) + ". " + " ".join(["bar"] * 15)
+    sentence_split = sentence_text_splitter.split_text(text)
+
+    assert sentence_split[0] == " ".join(["foo"] * 15) + "."
+    assert sentence_split[1] == " ".join(["bar"] * 15)
+
+
+def test_chinese_text(chinese_text: str) -> None:
+    splitter = SentenceSplitter(chunk_size=512, chunk_overlap=0)
+    chunks = splitter.split_text(chinese_text)
+    assert len(chunks) == 3
+
+
+def test_contiguous_text(contiguous_text: str) -> None:
+    # NOTE: with no sentence boundaries or separators to split on, the sentence
+    # splitter falls back to character-level splits
+    splitter = SentenceSplitter(chunk_size=100, chunk_overlap=0)
+    chunks = splitter.split_text(contiguous_text)
+    assert len(chunks) == 11
+
+
+def test_split_with_metadata(english_text: str) -> None:
+    chunk_size = 100
+    metadata_str = "word " * 50
+    tokenizer = tiktoken.get_encoding("gpt2")
+    splitter = SentenceSplitter(
+        chunk_size=chunk_size, chunk_overlap=0, tokenizer=tokenizer.encode
+    )
+
+    chunks = splitter.split_text(english_text)
+    assert len(chunks) == 2
+
+    chunks = splitter.split_text_metadata_aware(english_text, metadata_str=metadata_str)
+    assert len(chunks) == 4
+    for chunk in chunks:
+        node_content = chunk + metadata_str
+        assert len(tokenizer.encode(node_content)) <= 100
diff --git a/tests/text_splitter/test_token_splitter.py b/tests/text_splitter/test_token_splitter.py
new file mode 100644
index 0000000000..d0528e51b4
--- /dev/null
+++ b/tests/text_splitter/test_token_splitter.py
@@ -0,0 +1,74 @@
+"""Test text splitter."""
+import tiktoken
+
+from llama_index.text_splitter import TokenTextSplitter
+from llama_index.text_splitter.utils import truncate_text
+
+
+def test_split_token() -> None:
+    """Test split normal token."""
+    # short text that should split into one chunk per word at chunk_size=1
+    token = "foo bar"
+    text_splitter = TokenTextSplitter(chunk_size=1, chunk_overlap=0)
+    chunks = text_splitter.split_text(token)
+    assert chunks == ["foo", "bar"]
+
+    token = "foo bar hello world"
+    text_splitter = TokenTextSplitter(chunk_size=2, chunk_overlap=1)
+    chunks = text_splitter.split_text(token)
+    assert chunks == ["foo bar", "bar hello", "hello world"]
+
+
+def test_truncate_token() -> None:
+    """Test truncate normal token."""
+    # tiktoken will say length is ~5k
+    token = "foo bar"
+    text_splitter = TokenTextSplitter(chunk_size=1, chunk_overlap=0)
+    text = truncate_text(token, text_splitter)
+    assert text == "foo"
+
+
+def test_split_long_token() -> None:
+    """Test split a really long token."""
+    # a long token with no separators
+    token = "a" * 100
+    text_splitter = TokenTextSplitter(chunk_size=20, chunk_overlap=0)
+    chunks = text_splitter.split_text(token)
+    # each text chunk may have spaces, since we join splits by separator
+    assert "".join(chunks).replace(" ", "") == token
+
+    token = ("a" * 49) + "\n" + ("a" * 50)
+    text_splitter = TokenTextSplitter(chunk_size=20, chunk_overlap=0)
+    chunks = text_splitter.split_text(token)
+    assert len(chunks[0]) == 49
+    assert len(chunks[1]) == 50
+
+
+def test_split_chinese(chinese_text: str) -> None:
+    text_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=0)
+    chunks = text_splitter.split_text(chinese_text)
+    assert len(chunks) == 3
+
+
+def test_contiguous_text(contiguous_text: str) -> None:
+    splitter = TokenTextSplitter(chunk_size=100, chunk_overlap=0)
+    chunks = splitter.split_text(contiguous_text)
+    assert len(chunks) == 10
+
+
+def test_split_with_metadata(english_text: str) -> None:
+    chunk_size = 100
+    metadata_str = "word " * 50
+    tokenizer = tiktoken.get_encoding("gpt2")
+    splitter = TokenTextSplitter(
+        chunk_size=chunk_size, chunk_overlap=0, tokenizer=tokenizer.encode
+    )
+
+    chunks = splitter.split_text(english_text)
+    assert len(chunks) == 2
+
+    chunks = splitter.split_text_metadata_aware(english_text, metadata_str=metadata_str)
+    assert len(chunks) == 4
+    for chunk in chunks:
+        node_content = chunk + metadata_str
+        assert len(tokenizer.encode(node_content)) <= 100
-- 
GitLab