diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6f5327167b43d0ef868b988b5182c2a03f48cc84..cbc06b381ba85b5f9a1c6cbf3ee06064194ab100 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,6 +12,11 @@
 - Change more than one image input for Replicate Multi-modal models from error to warning (#9360)
 - Removed GPL-licensed `aiostream` dependency (#9403)
 
+### Breaking Changes
+
+- Updated the base `LLM` interface to match `LLMPredictor` (#9388)
+- Deprecated `LLMPredictor` (#9388)
+
 ## [0.9.13] - 2023-12-06
 
 ### New Features
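
For readers of the Breaking Changes entries above (#9388), a minimal sketch of what the updated base `LLM` interface looks like for callers, assuming the `OpenAI` LLM and the import paths touched elsewhere in this diff (`llama_index.llms.llm.LLM`, `llama_index.llms.types.ChatMessage`); the prompt and message text are illustrative only:

```python
from llama_index import PromptTemplate
from llama_index.llms import OpenAI
from llama_index.llms.llm import LLM  # base class, formerly imported from llama_index.llms.base
from llama_index.llms.types import ChatMessage, MessageRole

llm: LLM = OpenAI(model="gpt-3.5-turbo")

# Predictor-style templated completion now lives on the base LLM interface itself,
# so no LLMPredictor wrapper is needed.
text = llm.predict(PromptTemplate("Write a haiku about {topic}."), topic="refactoring")

# The chat interface is unchanged; only the import location of the message types moved.
reply = llm.chat([ChatMessage(role=MessageRole.USER, content="Hello!")])
```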
diff --git a/benchmarks/agent/agent_utils.py b/benchmarks/agent/agent_utils.py
index d35fa8635d2ade832772c0573c04c244b9afac22..74a862518911a10842dd109cafca3ebf3e0f40f6 100644
--- a/benchmarks/agent/agent_utils.py
+++ b/benchmarks/agent/agent_utils.py
@@ -3,8 +3,8 @@ from typing import Dict, List, Type
 from llama_index.agent import OpenAIAgent, ReActAgent
 from llama_index.agent.types import BaseAgent
 from llama_index.llms import Anthropic, OpenAI
-from llama_index.llms.base import LLM
 from llama_index.llms.llama_utils import messages_to_prompt
+from llama_index.llms.llm import LLM
 from llama_index.llms.replicate import Replicate
 
 OPENAI_MODELS = [
diff --git a/benchmarks/struct_indices/spider/evaluate.py b/benchmarks/struct_indices/spider/evaluate.py
index 0ebcf7d56a78b7895dcefcbb858a19ecb88b1bdf..a914d1a02619779b24457a594ea1870b17f76120 100644
--- a/benchmarks/struct_indices/spider/evaluate.py
+++ b/benchmarks/struct_indices/spider/evaluate.py
@@ -10,8 +10,8 @@ from spider_utils import create_indexes, load_examples
 from tqdm import tqdm
 
 from llama_index.indices.struct_store.sql import SQLQueryMode, SQLStructStoreIndex
-from llama_index.llms.base import ChatMessage, MessageRole
 from llama_index.llms.openai import OpenAI
+from llama_index.llms.types import ChatMessage, MessageRole
 from llama_index.response.schema import Response
 
 logging.getLogger("root").setLevel(logging.WARNING)
diff --git a/docs/api_reference/service_context.rst b/docs/api_reference/service_context.rst
index 772e2c9d322f3b633e30e81d63c951bc87dfd8a7..e0f88cc9187eac3a91fc5ec96057377802ec97a1 100644
--- a/docs/api_reference/service_context.rst
+++ b/docs/api_reference/service_context.rst
@@ -6,7 +6,7 @@ Service Context
 The service context container is a utility container for LlamaIndex
 index and query classes. The container contains the following
 objects that are commonly used for configuring every index and
-query, such as the LLMPredictor (for configuring the LLM),
+query, such as the LLM,
 the PromptHelper (for configuring input size/chunk size),
 the BaseEmbedding (for configuring the embedding model), and more.
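
A brief sketch of wiring those objects into a `ServiceContext`, assuming the `OpenAI` LLM and OpenAI embeddings used in the notebooks below; the parameter values are illustrative only:

```python
from llama_index import PromptHelper, ServiceContext
from llama_index.embeddings import OpenAIEmbedding
from llama_index.llms import OpenAI

# The LLM, prompt helper, and embedding model are the per-index/per-query knobs the
# container carries; anything not passed explicitly falls back to the library defaults.
service_context = ServiceContext.from_defaults(
    llm=OpenAI(model="gpt-3.5-turbo", temperature=0.1),
    prompt_helper=PromptHelper(context_window=4096, num_output=256),
    embed_model=OpenAIEmbedding(),
)
```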
 
diff --git a/docs/changes/deprecated_terms.md b/docs/changes/deprecated_terms.md
index b4e78f4971e1981b1d1858c638661815533632be..def9fc8ee0dbdd3bb7ce9832e922b78863bd5810 100644
--- a/docs/changes/deprecated_terms.md
+++ b/docs/changes/deprecated_terms.md
@@ -24,7 +24,7 @@ This has been renamed to `VectorStoreIndex`, but it is only a cosmetic change. P
 
 ## LLMPredictor
 
-The `LLMPredictor` object is no longer intended to be used by users. Instead, you can setup an LLM directly and pass it into the `ServiceContext`.
+The `LLMPredictor` object is no longer intended to be used by users. Instead, you can set up an LLM directly and pass it into the `ServiceContext`. The `LLM` class itself has similar attributes and methods to the `LLMPredictor`.
 
 - [LLMs in LlamaIndex](/module_guides/models/llms.md)
 - [Setting LLMs in the ServiceContext](/module_guides/supporting_modules/service_context.md)
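
A minimal before/after sketch of that migration, assuming the `OpenAI` LLM; the deprecated pattern shown in the comments is the one being replaced throughout this diff:

```python
from llama_index import ServiceContext
from llama_index.llms import OpenAI

llm = OpenAI(model="gpt-3.5-turbo")

# Deprecated: wrap the LLM in an LLMPredictor before handing it to the ServiceContext.
# from llama_index import LLMPredictor
# service_context = ServiceContext.from_defaults(llm_predictor=LLMPredictor(llm=llm))

# Current: pass the LLM directly; downstream code reads it from service_context.llm.
service_context = ServiceContext.from_defaults(llm=llm)
```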
diff --git a/docs/community/faq/llms.md b/docs/community/faq/llms.md
index 58aff0e0bab8942d673de74bd268a3d1c8dba834..4202f3380195ead96c6cbfc7b4dd2a0e0224340a 100644
--- a/docs/community/faq/llms.md
+++ b/docs/community/faq/llms.md
@@ -46,12 +46,12 @@ response = query_engine.query("Rest of your query... \nRespond in Italian")
 Alternatively:
 
 ```py
-from llama_index import LLMPredictor, ServiceContext
+from llama_index import ServiceContext
 from llama_index.llms import OpenAI
 
-llm_predictor = LLMPredictor(system_prompt="Always respond in Italian.")
+llm = OpenAI(system_prompt="Always respond in Italian.")
 
-service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)
+service_context = ServiceContext.from_defaults(llm=llm)
 
 query_engine = load_index_from_storage(
     storage_context, service_context=service_context
diff --git a/docs/community/integrations/deepeval.md b/docs/community/integrations/deepeval.md
index d927efd6c4b1c35b89a04313207f7510fbb6deaf..ed8bc8ecdd96d9af9c3686c9f97434a713e0aca9 100644
--- a/docs/community/integrations/deepeval.md
+++ b/docs/community/integrations/deepeval.md
@@ -58,7 +58,6 @@ from llama_index import (
     TreeIndex,
     VectorStoreIndex,
     SimpleDirectoryReader,
-    LLMPredictor,
     ServiceContext,
     Response,
 )
diff --git a/docs/examples/agent/openai_agent_query_plan.ipynb b/docs/examples/agent/openai_agent_query_plan.ipynb
index 383fa4f1bcab0b1efff7b0a3ee5ef19020af4781..e356c5a11192d424fd88fea754d53fd8326132c6 100644
--- a/docs/examples/agent/openai_agent_query_plan.ipynb
+++ b/docs/examples/agent/openai_agent_query_plan.ipynb
@@ -80,7 +80,6 @@
    "source": [
     "from llama_index import (\n",
     "    SimpleDirectoryReader,\n",
-    "    LLMPredictor,\n",
     "    ServiceContext,\n",
     "    GPTVectorStoreIndex,\n",
     ")\n",
diff --git a/docs/examples/callbacks/HoneyHiveLlamaIndexTracer.ipynb b/docs/examples/callbacks/HoneyHiveLlamaIndexTracer.ipynb
index bf81db607bed479fdc80e1ed92ca6c474afa30b0..2117dfef15e8938b65fbae0185eac044b986b0b2 100644
--- a/docs/examples/callbacks/HoneyHiveLlamaIndexTracer.ipynb
+++ b/docs/examples/callbacks/HoneyHiveLlamaIndexTracer.ipynb
@@ -118,16 +118,13 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from llama_index.callbacks import CallbackManager, CBEventType\n",
-    "from llama_index.callbacks import LlamaDebugHandler, WandbCallbackHandler\n",
+    "from llama_index.callbacks import CallbackManager\n",
+    "from llama_index.callbacks import LlamaDebugHandler\n",
     "from llama_index import (\n",
-    "    SummaryIndex,\n",
-    "    GPTTreeIndex,\n",
-    "    GPTVectorStoreIndex,\n",
+    "    VectorStoreIndex,\n",
     "    ServiceContext,\n",
     "    SimpleDirectoryReader,\n",
-    "    LLMPredictor,\n",
-    "    GPTSimpleKeywordTableIndex,\n",
+    "    SimpleKeywordTableIndex,\n",
     "    StorageContext,\n",
     ")\n",
     "from llama_index.indices.composability import ComposableGraph\n",
@@ -289,9 +286,7 @@
     }
    ],
    "source": [
-    "index = GPTVectorStoreIndex.from_documents(\n",
-    "    docs, service_context=service_context\n",
-    ")"
+    "index = VectorStoreIndex.from_documents(docs, service_context=service_context)"
    ]
   },
   {
@@ -421,7 +416,7 @@
    ],
    "source": [
     "# build NYC index\n",
-    "nyc_index = GPTVectorStoreIndex.from_documents(\n",
+    "nyc_index = VectorStoreIndex.from_documents(\n",
     "    nyc_documents,\n",
     "    service_context=service_context,\n",
     "    storage_context=storage_context,\n",
@@ -450,7 +445,7 @@
    ],
    "source": [
     "# build essay index\n",
-    "essay_index = GPTVectorStoreIndex.from_documents(\n",
+    "essay_index = VectorStoreIndex.from_documents(\n",
     "    essay_documents,\n",
     "    service_context=service_context,\n",
     "    storage_context=storage_context,\n",
@@ -529,7 +524,7 @@
     "from llama_index import StorageContext, load_graph_from_storage\n",
     "\n",
     "graph = ComposableGraph.from_indices(\n",
-    "    GPTSimpleKeywordTableIndex,\n",
+    "    SimpleKeywordTableIndex,\n",
     "    [nyc_index, essay_index],\n",
     "    index_summaries=[nyc_index_summary, essay_index_summary],\n",
     "    max_keywords_per_chunk=50,\n",
diff --git a/docs/examples/callbacks/WandbCallbackHandler.ipynb b/docs/examples/callbacks/WandbCallbackHandler.ipynb
index 664a545f35727597befd100f8da5dda20d007843..cfea37a36c18867d6c0c98ed993576617e104ddd 100644
--- a/docs/examples/callbacks/WandbCallbackHandler.ipynb
+++ b/docs/examples/callbacks/WandbCallbackHandler.ipynb
@@ -57,16 +57,13 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from llama_index.callbacks import CallbackManager, CBEventType\n",
+    "from llama_index.callbacks import CallbackManager\n",
     "from llama_index.callbacks import LlamaDebugHandler, WandbCallbackHandler\n",
     "from llama_index import (\n",
-    "    SummaryIndex,\n",
-    "    GPTTreeIndex,\n",
-    "    GPTVectorStoreIndex,\n",
+    "    VectorStoreIndex,\n",
     "    ServiceContext,\n",
     "    SimpleDirectoryReader,\n",
-    "    LLMPredictor,\n",
-    "    GPTSimpleKeywordTableIndex,\n",
+    "    SimpleKeywordTableIndex,\n",
     "    StorageContext,\n",
     ")\n",
     "from llama_index.indices.composability import ComposableGraph\n",
@@ -238,9 +235,7 @@
     }
    ],
    "source": [
-    "index = GPTVectorStoreIndex.from_documents(\n",
-    "    docs, service_context=service_context\n",
-    ")"
+    "index = VectorStoreIndex.from_documents(docs, service_context=service_context)"
    ]
   },
   {
@@ -457,7 +452,7 @@
    ],
    "source": [
     "# build NYC index\n",
-    "nyc_index = GPTVectorStoreIndex.from_documents(\n",
+    "nyc_index = VectorStoreIndex.from_documents(\n",
     "    nyc_documents,\n",
     "    service_context=service_context,\n",
     "    storage_context=storage_context,\n",
@@ -493,7 +488,7 @@
    ],
    "source": [
     "# build essay index\n",
-    "essay_index = GPTVectorStoreIndex.from_documents(\n",
+    "essay_index = VectorStoreIndex.from_documents(\n",
     "    essay_documents,\n",
     "    service_context=service_context,\n",
     "    storage_context=storage_context,\n",
@@ -572,7 +567,7 @@
     "from llama_index import StorageContext, load_graph_from_storage\n",
     "\n",
     "graph = ComposableGraph.from_indices(\n",
-    "    GPTSimpleKeywordTableIndex,\n",
+    "    SimpleKeywordTableIndex,\n",
     "    [nyc_index, essay_index],\n",
     "    index_summaries=[nyc_index_summary, essay_index_summary],\n",
     "    max_keywords_per_chunk=50,\n",
diff --git a/docs/examples/citation/pdf_page_reference.ipynb b/docs/examples/citation/pdf_page_reference.ipynb
index 2c25638451c40552fb5d6a40905d843c60c42b05..82d6fbb909cbf398686701e07908f5becaa0f1fc 100644
--- a/docs/examples/citation/pdf_page_reference.ipynb
+++ b/docs/examples/citation/pdf_page_reference.ipynb
@@ -57,7 +57,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from llama_index import LLMPredictor, ServiceContext\n",
+    "from llama_index import ServiceContext\n",
     "from llama_index.llms import OpenAI\n",
     "\n",
     "service_context = ServiceContext.from_defaults(\n",
diff --git a/docs/examples/composable_indices/city_analysis/City_Analysis-Decompose.ipynb b/docs/examples/composable_indices/city_analysis/City_Analysis-Decompose.ipynb
index d685ebd7ec30177587fe391acaf9b7d52be95719..0659a82858bd677f0d3d987922b0321bd754c002 100644
--- a/docs/examples/composable_indices/city_analysis/City_Analysis-Decompose.ipynb
+++ b/docs/examples/composable_indices/city_analysis/City_Analysis-Decompose.ipynb
@@ -309,7 +309,7 @@
     ")\n",
     "\n",
     "decompose_transform = DecomposeQueryTransform(\n",
-    "    service_context.llm_predictor, verbose=True\n",
+    "    service_context.llm, verbose=True\n",
     ")"
    ]
   },
diff --git a/docs/examples/composable_indices/city_analysis/City_Analysis-Unified-Query.ipynb b/docs/examples/composable_indices/city_analysis/City_Analysis-Unified-Query.ipynb
index 800cae7b7719b077f5bedf98eb4469658e70176f..c1a806320bface1941bce10aebdfc4a31a6f2602 100644
--- a/docs/examples/composable_indices/city_analysis/City_Analysis-Unified-Query.ipynb
+++ b/docs/examples/composable_indices/city_analysis/City_Analysis-Unified-Query.ipynb
@@ -371,11 +371,8 @@
     "from llama_index.indices.query.query_transform.base import (\n",
     "    DecomposeQueryTransform,\n",
     ")\n",
-    "from llama_index import LLMPredictor\n",
     "\n",
-    "decompose_transform = DecomposeQueryTransform(\n",
-    "    LLMPredictor(llm=chatgpt), verbose=True\n",
-    ")"
+    "decompose_transform = DecomposeQueryTransform(llm=chatgpt, verbose=True)"
    ]
   },
   {
diff --git a/docs/examples/composable_indices/city_analysis/City_Analysis.ipynb b/docs/examples/composable_indices/city_analysis/City_Analysis.ipynb
index 68255f85264710fb2bdbe4734a94994d65c41a5f..9f39f9a413b7e20bb74f7b05cd928f61e25436d3 100644
--- a/docs/examples/composable_indices/city_analysis/City_Analysis.ipynb
+++ b/docs/examples/composable_indices/city_analysis/City_Analysis.ipynb
@@ -206,7 +206,6 @@
     "    SimpleKeywordTableIndex,\n",
     "    SummaryIndex,\n",
     "    SimpleDirectoryReader,\n",
-    "    LLMPredictor,\n",
     "    ServiceContext,\n",
     ")\n",
     "from llama_index.llms import OpenAI\n",
diff --git a/docs/examples/composable_indices/city_analysis/PineconeDemo-CityAnalysis.ipynb b/docs/examples/composable_indices/city_analysis/PineconeDemo-CityAnalysis.ipynb
index bc4e37269da6170c7d8eaa74de5735edb3d4158f..4a6e00c072e71896f8b3ee3f8899478ed203d30a 100644
--- a/docs/examples/composable_indices/city_analysis/PineconeDemo-CityAnalysis.ipynb
+++ b/docs/examples/composable_indices/city_analysis/PineconeDemo-CityAnalysis.ipynb
@@ -67,7 +67,6 @@
     "    VectorStoreIndex,\n",
     "    SimpleKeywordTableIndex,\n",
     "    SimpleDirectoryReader,\n",
-    "    LLMPredictor,\n",
     "    ServiceContext,\n",
     ")\n",
     "from llama_index.vector_stores import PineconeVectorStore\n",
diff --git a/docs/examples/composable_indices/financial_data_analysis/DeepLakeDemo-FinancialData.ipynb b/docs/examples/composable_indices/financial_data_analysis/DeepLakeDemo-FinancialData.ipynb
index 1c3bf18483d62ecc3408774e791f333ddd904aca..a955f829bcdec5f640b2ffd7e4d081923930ce27 100644
--- a/docs/examples/composable_indices/financial_data_analysis/DeepLakeDemo-FinancialData.ipynb
+++ b/docs/examples/composable_indices/financial_data_analysis/DeepLakeDemo-FinancialData.ipynb
@@ -152,7 +152,6 @@
     "    VectorStoreIndex,\n",
     "    SimpleKeywordTableIndex,\n",
     "    SimpleDirectoryReader,\n",
-    "    LLMPredictor,\n",
     "    ServiceContext,\n",
     "    download_loader,\n",
     "    Document,\n",
@@ -818,7 +817,7 @@
     ")\n",
     "\n",
     "decompose_transform = DecomposeQueryTransform(\n",
-    "    service_context.llm_predictor, verbose=True\n",
+    "    service_context.llm, verbose=True\n",
     ")"
    ]
   },
@@ -879,7 +878,7 @@
     ")\n",
     "\n",
     "decompose_transform = DecomposeQueryTransform(\n",
-    "    service_context.llm_predictor, verbose=True\n",
+    "    service_context.llm, verbose=True\n",
     ")"
    ]
   },
diff --git a/docs/examples/customization/llms/SimpleIndexDemo-ChatGPT.ipynb b/docs/examples/customization/llms/SimpleIndexDemo-ChatGPT.ipynb
index da846a6389064b1cf8fa6b5d91cdb97867e08dc6..565b5cfefb6197ac286679f0398a6dd41b465d58 100644
--- a/docs/examples/customization/llms/SimpleIndexDemo-ChatGPT.ipynb
+++ b/docs/examples/customization/llms/SimpleIndexDemo-ChatGPT.ipynb
@@ -51,7 +51,6 @@
     "from llama_index import (\n",
     "    VectorStoreIndex,\n",
     "    SimpleDirectoryReader,\n",
-    "    LLMPredictor,\n",
     "    ServiceContext,\n",
     ")\n",
     "from llama_index.llms import OpenAI\n",
diff --git a/docs/examples/discover_llamaindex/document_management/discord_dumps/help_channel_dump_05_25_23.json b/docs/examples/discover_llamaindex/document_management/discord_dumps/help_channel_dump_05_25_23.json
index 54bb7a9d3add139e7824b1f7fc440ac2fc23d66b..70b35ad13949f927e37826ee48e089c9f1119960 100644
--- a/docs/examples/discover_llamaindex/document_management/discord_dumps/help_channel_dump_05_25_23.json
+++ b/docs/examples/discover_llamaindex/document_management/discord_dumps/help_channel_dump_05_25_23.json
@@ -79388,7 +79388,7 @@
       "timestampEdited": null,
       "callEndedTimestamp": null,
       "isPinned": false,
-      "content": "`\n        for cur_text_chunk in text_chunks:\n            if not self._streaming:\n                (\n                    response,\n                    formatted_prompt,\n                ) = self._service_context.llm_predictor.predict(\n                    refine_template,\n                    context_msg=cur_text_chunk,\n                )\n            else:\n                response, formatted_prompt = self._service_context.llm_predictor.stream(\n                    refine_template,\n                    context_msg=cur_text_chunk,\n                )\n            self._log_prompt_and_response(\n                formatted_prompt, response, log_prefix=\"Refined\"\n            )\n`\nA code snippet in refine_response_single seems to have no effect on the subsequent llm query after being split into cur_text_chunk. It appears that the response is entirely based on the result of the last text chunk, and previous chunks are essentially discarded. I feel that there may be some issues with this logic, perhaps it's just my understanding problem. I hope you can clarify this for me, thank you.",
+      "content": "`\n        for cur_text_chunk in text_chunks:\n            if not self._streaming:\n                (\n                    response,\n                    formatted_prompt,\n                ) = self._service_context.llm.predict(\n                    refine_template,\n                    context_msg=cur_text_chunk,\n                )\n            else:\n                response, formatted_prompt = self._service_context.llm.stream(\n                    refine_template,\n                    context_msg=cur_text_chunk,\n                )\n            self._log_prompt_and_response(\n                formatted_prompt, response, log_prefix=\"Refined\"\n            )\n`\nA code snippet in refine_response_single seems to have no effect on the subsequent llm query after being split into cur_text_chunk. It appears that the response is entirely based on the result of the last text chunk, and previous chunks are essentially discarded. I feel that there may be some issues with this logic, perhaps it's just my understanding problem. I hope you can clarify this for me, thank you.",
       "author": {
         "id": "937548610885791806",
         "name": "noequal",
@@ -105504,7 +105504,7 @@
       "timestampEdited": null,
       "callEndedTimestamp": null,
       "isPinned": false,
-      "content": "`index._service_context.llm_predictor.last_token_usage()`\n\n`index._service_context.embed_model.last_token_usage()`",
+      "content": "`index._service_context.llm.last_token_usage()`\n\n`index._service_context.embed_model.last_token_usage()`",
       "author": {
         "id": "334536717648265216",
         "name": "Logan M",
diff --git a/docs/examples/discover_llamaindex/document_management/discord_dumps/help_channel_dump_06_02_23.json b/docs/examples/discover_llamaindex/document_management/discord_dumps/help_channel_dump_06_02_23.json
index c29bb79f5dc40c080cf3357716dc1d2ded221eaa..b59cd12b6715ea5d32f494b708805af83ac2eff3 100644
--- a/docs/examples/discover_llamaindex/document_management/discord_dumps/help_channel_dump_06_02_23.json
+++ b/docs/examples/discover_llamaindex/document_management/discord_dumps/help_channel_dump_06_02_23.json
@@ -79388,7 +79388,7 @@
       "timestampEdited": null,
       "callEndedTimestamp": null,
       "isPinned": false,
-      "content": "`\n        for cur_text_chunk in text_chunks:\n            if not self._streaming:\n                (\n                    response,\n                    formatted_prompt,\n                ) = self._service_context.llm_predictor.predict(\n                    refine_template,\n                    context_msg=cur_text_chunk,\n                )\n            else:\n                response, formatted_prompt = self._service_context.llm_predictor.stream(\n                    refine_template,\n                    context_msg=cur_text_chunk,\n                )\n            self._log_prompt_and_response(\n                formatted_prompt, response, log_prefix=\"Refined\"\n            )\n`\nA code snippet in refine_response_single seems to have no effect on the subsequent llm query after being split into cur_text_chunk. It appears that the response is entirely based on the result of the last text chunk, and previous chunks are essentially discarded. I feel that there may be some issues with this logic, perhaps it's just my understanding problem. I hope you can clarify this for me, thank you.",
+      "content": "`\n        for cur_text_chunk in text_chunks:\n            if not self._streaming:\n                (\n                    response,\n                    formatted_prompt,\n                ) = self._service_context.llm.predict(\n                    refine_template,\n                    context_msg=cur_text_chunk,\n                )\n            else:\n                response, formatted_prompt = self._service_context.llm.stream(\n                    refine_template,\n                    context_msg=cur_text_chunk,\n                )\n            self._log_prompt_and_response(\n                formatted_prompt, response, log_prefix=\"Refined\"\n            )\n`\nA code snippet in refine_response_single seems to have no effect on the subsequent llm query after being split into cur_text_chunk. It appears that the response is entirely based on the result of the last text chunk, and previous chunks are essentially discarded. I feel that there may be some issues with this logic, perhaps it's just my understanding problem. I hope you can clarify this for me, thank you.",
       "author": {
         "id": "937548610885791806",
         "name": "noequal",
@@ -105504,7 +105504,7 @@
       "timestampEdited": null,
       "callEndedTimestamp": null,
       "isPinned": false,
-      "content": "`index._service_context.llm_predictor.last_token_usage()`\n\n`index._service_context.embed_model.last_token_usage()`",
+      "content": "`index._service_context.llm.last_token_usage()`\n\n`index._service_context.embed_model.last_token_usage()`",
       "author": {
         "id": "334536717648265216",
         "name": "Logan M",
diff --git a/docs/examples/docstore/DocstoreDemo.ipynb b/docs/examples/docstore/DocstoreDemo.ipynb
index 788c2a397048691cd1309d6f6969fa47d9ede2c1..df8eb228919f9f26498622c5f8d63b7fa04d048e 100644
--- a/docs/examples/docstore/DocstoreDemo.ipynb
+++ b/docs/examples/docstore/DocstoreDemo.ipynb
@@ -59,7 +59,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from llama_index import SimpleDirectoryReader, ServiceContext, LLMPredictor\n",
+    "from llama_index import SimpleDirectoryReader, ServiceContext\n",
     "from llama_index import VectorStoreIndex, SummaryIndex, SimpleKeywordTableIndex\n",
     "from llama_index.composability import ComposableGraph\n",
     "from llama_index.llms import OpenAI"
diff --git a/docs/examples/docstore/FirestoreDemo.ipynb b/docs/examples/docstore/FirestoreDemo.ipynb
index f2c9a9343aa87d0f90c0697043a93d6da177313f..dfbfaddbe4272ed88e764876efc7ca17c23266e5 100644
--- a/docs/examples/docstore/FirestoreDemo.ipynb
+++ b/docs/examples/docstore/FirestoreDemo.ipynb
@@ -56,7 +56,6 @@
     "from llama_index import (\n",
     "    SimpleDirectoryReader,\n",
     "    ServiceContext,\n",
-    "    LLMPredictor,\n",
     "    StorageContext,\n",
     ")\n",
     "from llama_index import VectorStoreIndex, SummaryIndex, SimpleKeywordTableIndex\n",
diff --git a/docs/examples/docstore/MongoDocstoreDemo.ipynb b/docs/examples/docstore/MongoDocstoreDemo.ipynb
index 2f45117a545e54a147cb2ad557291e6c40f4e682..e4f5cc855593e8417e801fc1710428692d6544dd 100644
--- a/docs/examples/docstore/MongoDocstoreDemo.ipynb
+++ b/docs/examples/docstore/MongoDocstoreDemo.ipynb
@@ -63,7 +63,6 @@
     "from llama_index import (\n",
     "    SimpleDirectoryReader,\n",
     "    ServiceContext,\n",
-    "    LLMPredictor,\n",
     "    StorageContext,\n",
     ")\n",
     "from llama_index import VectorStoreIndex, SummaryIndex, SimpleKeywordTableIndex\n",
diff --git a/docs/examples/docstore/RedisDocstoreIndexStoreDemo.ipynb b/docs/examples/docstore/RedisDocstoreIndexStoreDemo.ipynb
index 3ed734ba028dfae732cf3c02f3bc892fc76b2778..74b9a196a7f3978ad7995909bc018b5f099710b4 100644
--- a/docs/examples/docstore/RedisDocstoreIndexStoreDemo.ipynb
+++ b/docs/examples/docstore/RedisDocstoreIndexStoreDemo.ipynb
@@ -91,7 +91,6 @@
     "from llama_index import (\n",
     "    SimpleDirectoryReader,\n",
     "    ServiceContext,\n",
-    "    LLMPredictor,\n",
     "    StorageContext,\n",
     ")\n",
     "from llama_index import VectorStoreIndex, SummaryIndex, SimpleKeywordTableIndex\n",
diff --git a/docs/examples/evaluation/Deepeval.ipynb b/docs/examples/evaluation/Deepeval.ipynb
index fe3a1ba316ddf0d6d1c973f62befc398e0787d41..17a525f6aa9d90fdee39add7b6ecc68c9711ec0f 100644
--- a/docs/examples/evaluation/Deepeval.ipynb
+++ b/docs/examples/evaluation/Deepeval.ipynb
@@ -100,7 +100,6 @@
     "    TreeIndex,\n",
     "    VectorStoreIndex,\n",
     "    SimpleDirectoryReader,\n",
-    "    LLMPredictor,\n",
     "    ServiceContext,\n",
     "    Response,\n",
     ")\n",
diff --git a/docs/examples/evaluation/QuestionGeneration.ipynb b/docs/examples/evaluation/QuestionGeneration.ipynb
index b723dc20fe20d8c832860095c304ea0b29cc35b2..05a2371349df2eca57f4847654964dede0f04308 100644
--- a/docs/examples/evaluation/QuestionGeneration.ipynb
+++ b/docs/examples/evaluation/QuestionGeneration.ipynb
@@ -64,7 +64,6 @@
     "    SimpleDirectoryReader,\n",
     "    VectorStoreIndex,\n",
     "    ServiceContext,\n",
-    "    LLMPredictor,\n",
     "    Response,\n",
     ")\n",
     "from llama_index.llms import OpenAI"
diff --git a/docs/examples/evaluation/batch_eval.ipynb b/docs/examples/evaluation/batch_eval.ipynb
index 1ce5842cb9156662cba733a2ce200a8997bb6315..8e3123ebe31faf47eca100744848c363bdf2d74c 100644
--- a/docs/examples/evaluation/batch_eval.ipynb
+++ b/docs/examples/evaluation/batch_eval.ipynb
@@ -35,8 +35,8 @@
     "import os\n",
     "import openai\n",
     "\n",
-    "os.environ[\"OPENAI_API_KEY\"] = \"YOUR_API_KEY\"\n",
-    "openai.api_key = os.environ[\"OPENAI_API_KEY\"]"
+    "os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"\n",
+    "# openai.api_key = os.environ[\"OPENAI_API_KEY\"]"
    ]
   },
   {
@@ -124,12 +124,126 @@
     "First, we can generate some questions and then run evaluation on them."
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "976e0a93",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Requirement already satisfied: spacy in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (3.7.2)\n",
+      "Requirement already satisfied: datasets in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (2.15.0)\n",
+      "Requirement already satisfied: span-marker in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (1.5.0)\n",
+      "Requirement already satisfied: scikit-learn in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (1.3.2)\n",
+      "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from spacy) (3.0.12)\n",
+      "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from spacy) (1.0.5)\n",
+      "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from spacy) (1.0.10)\n",
+      "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from spacy) (2.0.8)\n",
+      "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from spacy) (3.0.9)\n",
+      "Requirement already satisfied: thinc<8.3.0,>=8.1.8 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from spacy) (8.2.1)\n",
+      "Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from spacy) (1.1.2)\n",
+      "Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from spacy) (2.4.8)\n",
+      "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from spacy) (2.0.10)\n",
+      "Requirement already satisfied: weasel<0.4.0,>=0.1.0 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from spacy) (0.3.4)\n",
+      "Requirement already satisfied: typer<0.10.0,>=0.3.0 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from spacy) (0.9.0)\n",
+      "Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from spacy) (6.4.0)\n",
+      "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from spacy) (4.66.1)\n",
+      "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from spacy) (2.31.0)\n",
+      "Requirement already satisfied: pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from spacy) (1.10.12)\n",
+      "Requirement already satisfied: jinja2 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from spacy) (3.1.2)\n",
+      "Requirement already satisfied: setuptools in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from spacy) (69.0.2)\n",
+      "Requirement already satisfied: packaging>=20.0 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from spacy) (23.2)\n",
+      "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from spacy) (3.3.0)\n",
+      "Requirement already satisfied: numpy>=1.19.0 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from spacy) (1.24.4)\n",
+      "Requirement already satisfied: pyarrow>=8.0.0 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from datasets) (14.0.1)\n",
+      "Requirement already satisfied: pyarrow-hotfix in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from datasets) (0.6)\n",
+      "Requirement already satisfied: dill<0.3.8,>=0.3.0 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from datasets) (0.3.7)\n",
+      "Requirement already satisfied: pandas in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from datasets) (2.0.3)\n",
+      "Requirement already satisfied: xxhash in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from datasets) (3.4.1)\n",
+      "Requirement already satisfied: multiprocess in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from datasets) (0.70.15)\n",
+      "Requirement already satisfied: fsspec[http]<=2023.10.0,>=2023.1.0 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from datasets) (2023.10.0)\n",
+      "Requirement already satisfied: aiohttp in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from datasets) (3.9.1)\n",
+      "Requirement already satisfied: huggingface-hub>=0.18.0 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from datasets) (0.19.4)\n",
+      "Requirement already satisfied: pyyaml>=5.1 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from datasets) (6.0.1)\n",
+      "Requirement already satisfied: torch in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from span-marker) (2.1.1)\n",
+      "Requirement already satisfied: accelerate in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from span-marker) (0.25.0)\n",
+      "Requirement already satisfied: transformers>=4.19.0 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from span-marker) (4.35.2)\n",
+      "Requirement already satisfied: evaluate in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from span-marker) (0.4.1)\n",
+      "Requirement already satisfied: seqeval in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from span-marker) (1.2.2)\n",
+      "Requirement already satisfied: scipy>=1.5.0 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from scikit-learn) (1.11.4)\n",
+      "Requirement already satisfied: joblib>=1.1.1 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from scikit-learn) (1.3.2)\n",
+      "Requirement already satisfied: threadpoolctl>=2.0.0 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from scikit-learn) (3.2.0)\n",
+      "Requirement already satisfied: attrs>=17.3.0 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from aiohttp->datasets) (23.1.0)\n",
+      "Requirement already satisfied: multidict<7.0,>=4.5 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from aiohttp->datasets) (6.0.4)\n",
+      "Requirement already satisfied: yarl<2.0,>=1.0 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from aiohttp->datasets) (1.9.3)\n",
+      "Requirement already satisfied: frozenlist>=1.1.1 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from aiohttp->datasets) (1.4.0)\n",
+      "Requirement already satisfied: aiosignal>=1.1.2 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from aiohttp->datasets) (1.3.1)\n",
+      "Requirement already satisfied: filelock in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from huggingface-hub>=0.18.0->datasets) (3.13.1)\n",
+      "Requirement already satisfied: typing-extensions>=3.7.4.3 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from huggingface-hub>=0.18.0->datasets) (4.8.0)\n",
+      "Requirement already satisfied: charset-normalizer<4,>=2 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from requests<3.0.0,>=2.13.0->spacy) (3.3.2)\n",
+      "Requirement already satisfied: idna<4,>=2.5 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from requests<3.0.0,>=2.13.0->spacy) (3.6)\n",
+      "Requirement already satisfied: urllib3<3,>=1.21.1 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from requests<3.0.0,>=2.13.0->spacy) (1.26.18)\n",
+      "Requirement already satisfied: certifi>=2017.4.17 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from requests<3.0.0,>=2.13.0->spacy) (2023.11.17)\n",
+      "Requirement already satisfied: blis<0.8.0,>=0.7.8 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from thinc<8.3.0,>=8.1.8->spacy) (0.7.11)\n",
+      "Requirement already satisfied: confection<1.0.0,>=0.0.1 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from thinc<8.3.0,>=8.1.8->spacy) (0.1.4)\n",
+      "Requirement already satisfied: regex!=2019.12.17 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from transformers>=4.19.0->span-marker) (2023.10.3)\n",
+      "Requirement already satisfied: tokenizers<0.19,>=0.14 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from transformers>=4.19.0->span-marker) (0.15.0)\n",
+      "Requirement already satisfied: safetensors>=0.3.1 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from transformers>=4.19.0->span-marker) (0.4.1)\n",
+      "Requirement already satisfied: click<9.0.0,>=7.1.1 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from typer<0.10.0,>=0.3.0->spacy) (8.1.7)\n",
+      "Requirement already satisfied: cloudpathlib<0.17.0,>=0.7.0 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from weasel<0.4.0,>=0.1.0->spacy) (0.16.0)\n",
+      "Requirement already satisfied: psutil in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from accelerate->span-marker) (5.9.6)\n",
+      "Requirement already satisfied: sympy in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from torch->span-marker) (1.12)\n",
+      "Requirement already satisfied: networkx in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from torch->span-marker) (3.2.1)\n",
+      "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.1.105 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from torch->span-marker) (12.1.105)\n",
+      "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.1.105 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from torch->span-marker) (12.1.105)\n",
+      "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.1.105 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from torch->span-marker) (12.1.105)\n",
+      "Requirement already satisfied: nvidia-cudnn-cu12==8.9.2.26 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from torch->span-marker) (8.9.2.26)\n",
+      "Requirement already satisfied: nvidia-cublas-cu12==12.1.3.1 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from torch->span-marker) (12.1.3.1)\n",
+      "Requirement already satisfied: nvidia-cufft-cu12==11.0.2.54 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from torch->span-marker) (11.0.2.54)\n",
+      "Requirement already satisfied: nvidia-curand-cu12==10.3.2.106 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from torch->span-marker) (10.3.2.106)\n",
+      "Requirement already satisfied: nvidia-cusolver-cu12==11.4.5.107 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from torch->span-marker) (11.4.5.107)\n",
+      "Requirement already satisfied: nvidia-cusparse-cu12==12.1.0.106 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from torch->span-marker) (12.1.0.106)\n",
+      "Requirement already satisfied: nvidia-nccl-cu12==2.18.1 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from torch->span-marker) (2.18.1)\n",
+      "Requirement already satisfied: nvidia-nvtx-cu12==12.1.105 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from torch->span-marker) (12.1.105)\n",
+      "Requirement already satisfied: triton==2.1.0 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from torch->span-marker) (2.1.0)\n",
+      "Requirement already satisfied: nvidia-nvjitlink-cu12 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from nvidia-cusolver-cu12==11.4.5.107->torch->span-marker) (12.3.101)\n",
+      "Requirement already satisfied: responses<0.19 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from evaluate->span-marker) (0.18.0)\n",
+      "Requirement already satisfied: MarkupSafe>=2.0 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from jinja2->spacy) (2.1.3)\n",
+      "Requirement already satisfied: python-dateutil>=2.8.2 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from pandas->datasets) (2.8.2)\n",
+      "Requirement already satisfied: pytz>=2020.1 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from pandas->datasets) (2023.3.post1)\n",
+      "Requirement already satisfied: tzdata>=2022.1 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from pandas->datasets) (2023.3)\n",
+      "Requirement already satisfied: six>=1.5 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)\n",
+      "Requirement already satisfied: mpmath>=0.19 in /home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages (from sympy->torch->span-marker) (1.3.0)\n",
+      "\n",
+      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.3.1\u001b[0m\n",
+      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
+     ]
+    }
+   ],
+   "source": [
+    "!pip install spacy datasets span-marker scikit-learn"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
    "id": "e31e10e6",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/loganm/llama_index_proper/llama_index/llama_index/evaluation/dataset_generation.py:187: DeprecationWarning: Call to deprecated class DatasetGenerator. (Deprecated in favor of `RagDatasetGenerator` which should be used instead.)\n",
+      "  return cls(\n",
+      "/home/loganm/llama_index_proper/llama_index/llama_index/evaluation/dataset_generation.py:282: DeprecationWarning: Call to deprecated class QueryResponseDataset. (Deprecated in favor of `LabelledRagDataset` which should be used instead.)\n",
+      "  return QueryResponseDataset(queries=queries, responses=responses_dict)\n"
+     ]
+    }
+   ],
    "source": [
     "from llama_index.evaluation import DatasetGenerator\n",
     "\n",
@@ -137,7 +251,7 @@
     "    documents, service_context=service_context\n",
     ")\n",
     "\n",
-    "questions = dataset_generator.generate_questions_from_nodes(num=25)"
+    "qas = dataset_generator.generate_dataset_from_nodes(num=3)"
    ]
   },
   {
@@ -165,7 +279,7 @@
     ")\n",
     "\n",
     "eval_results = await runner.aevaluate_queries(\n",
-    "    vector_index.as_query_engine(), queries=questions\n",
+    "    vector_index.as_query_engine(), queries=qas.questions\n",
     ")\n",
     "\n",
     "# If we had ground-truth answers, we could also include the correctness evaluator like below.\n",
@@ -174,17 +288,35 @@
     "#\n",
     "\n",
     "# runner = BatchEvalRunner(\n",
-    "#   {'faithfulness': faithfulness_gpt4, 'relevancy': relevancy_gpt4, 'correctness': correctness_gpt4},\n",
-    "#   workers=8,\n",
+    "#     {\"correctness\": correctness_gpt4},\n",
+    "#     workers=8,\n",
     "# )\n",
-    "#\n",
+    "\n",
     "# eval_results = await runner.aevaluate_queries(\n",
-    "#   vector_index.as_query_engine(),\n",
-    "#   queries=questions,\n",
-    "#   query_kwargs={'question': {'reference': 'ground-truth answer', ...}}\n",
+    "#     vector_index.as_query_engine(),\n",
+    "#     queries=qas.queries,\n",
+    "#     reference=[qr[1] for qr in qas.qr_pairs],\n",
     "# )"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0eff6823",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "3\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(len([qr for qr in qas.qr_pairs]))"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "b256b98c",
@@ -203,22 +335,22 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "dict_keys(['faithfulness', 'relevancy'])\n",
-      "dict_keys(['query', 'contexts', 'response', 'passing', 'feedback', 'score'])\n",
-      "True\n",
-      "The population of New York City as of 2020 is 8,804,190.\n",
-      "[\"== Demographics ==\\n\\nNew York City is the most populous city in the United States, with 8,804,190 residents incorporating more immigration into the city than outmigration since the 2010 United States census. More than twice as many people live in New York City as compared to Los Angeles, the second-most populous U.S. city; and New York has more than three times the population of Chicago, the third-most populous U.S. city. New York City gained more residents between 2010 and 2020 (629,000) than any other U.S. city, and a greater amount than the total sum of the gains over the same decade of the next four largest U.S. cities, Los Angeles, Chicago, Houston, and Phoenix, Arizona combined. New York City's population is about 44% of New York State's population, and about 39% of the population of the New York metropolitan area. The majority of New York City residents in 2020 (5,141,538, or 58.4%) were living on Long Island, in Brooklyn, or in Queens. The New York City metropolitan statistical area, has the largest foreign-born population of any metropolitan region in the world. The New York region continues to be by far the leading metropolitan gateway for legal immigrants admitted into the United States, substantially exceeding the combined totals of Los Angeles and Miami.\\n\\n\\n=== Population density ===\\n\\nIn 2020, the city had an estimated population density of 29,302.37 inhabitants per square mile (11,313.71/km2), rendering it the nation's most densely populated of all larger municipalities (those with more than 100,000 residents), with several small cities (of fewer than 100,000) in adjacent Hudson County, New Jersey having greater density, as per the 2010 census. Geographically co-extensive with New York County, the borough of Manhattan's 2017 population density of 72,918 inhabitants per square mile (28,154/km2) makes it the highest of any county in the United States and higher than the density of any individual American city. The next three densest counties in the United States, placing second through fourth, are also New York boroughs: Brooklyn, the Bronx, and Queens respectively.\", \"New York, often called New York City or NYC, is the most populous city in the United States. With a 2020 population of 8,804,190 distributed over 300.46 square miles (778.2 km2), New York City is the most densely populated major city in the United States and more than twice as populous as Los Angeles, the nation's second-largest city. New York City is located at the southern tip of New York State. It constitutes the geographical and demographic center of both the Northeast megalopolis and the New York metropolitan area, the largest metropolitan area in the U.S. by both population and urban area. With over 20.1 million people in its metropolitan statistical area and 23.5 million in its combined statistical area as of 2020, New York is one of the world's most populous megacities, and over 58 million people live within 250 mi (400 km) of the city. New York City is a global cultural, financial, entertainment, and media center with a significant influence on commerce, health care and life sciences, research, technology, education, politics, tourism, dining, art, fashion, and sports. 
Home to the headquarters of the United Nations, New York is an important center for international diplomacy, and is sometimes described as the capital of the world.Situated on one of the world's largest natural harbors and extending into the Atlantic Ocean, New York City comprises five boroughs, each of which is coextensive with a respective county of the state of New York. The five boroughs, which were created in 1898 when local governments were consolidated into a single municipal entity, are: Brooklyn (in Kings County), Queens (in Queens County), Manhattan (in New York County), The Bronx (in Bronx County), and Staten Island (in Richmond County).As of 2021, the New York metropolitan area is the largest metropolitan economy in the world with a gross metropolitan product of over $2.4 trillion. If the New York metropolitan area were a sovereign state, it would have the eighth-largest economy in the world. New York City is an established safe haven for global investors. New York is home to the highest number of billionaires, individuals of ultra-high net worth (greater than US$30 million), and millionaires of any city in the world.\\nThe city and its metropolitan area constitute the premier gateway for legal immigration to the United States.\"]\n"
+      "dict_keys(['correctness'])\n",
+      "dict_keys(['query', 'contexts', 'response', 'passing', 'feedback', 'score', 'pairwise_source'])\n",
+      "False\n",
+      "The context information does not provide any information related to the query. Therefore, I cannot provide an answer based on the given context.\n",
+      "None\n"
      ]
     }
    ],
    "source": [
     "print(eval_results.keys())\n",
     "\n",
-    "print(eval_results[\"faithfulness\"][0].dict().keys())\n",
+    "print(eval_results[\"correctness\"][0].dict().keys())\n",
     "\n",
-    "print(eval_results[\"faithfulness\"][0].passing)\n",
-    "print(eval_results[\"faithfulness\"][0].response)\n",
-    "print(eval_results[\"faithfulness\"][0].contexts)"
+    "print(eval_results[\"correctness\"][0].passing)\n",
+    "print(eval_results[\"correctness\"][0].response)\n",
+    "print(eval_results[\"correctness\"][0].contexts)"
    ]
   },
   {
@@ -257,12 +389,12 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "faithfulness Score: 1.0\n"
+      "correctness Score: 0.0\n"
      ]
     }
    ],
    "source": [
-    "score = get_eval_results(\"faithfulness\", eval_results)"
+    "score = get_eval_results(\"correctness\", eval_results)"
    ]
   },
   {
@@ -286,9 +418,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "llama_index_v2",
+   "display_name": "llama-index-4a-wkI5X-py3.11",
    "language": "python",
-   "name": "llama_index_v2"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
diff --git a/docs/examples/evaluation/relevancy_eval.ipynb b/docs/examples/evaluation/relevancy_eval.ipynb
index e44df8e7847630a37a448d8b9df53229d42fbc17..652e9783a9b58d66d846f54b069f0d8c0790c6be 100644
--- a/docs/examples/evaluation/relevancy_eval.ipynb
+++ b/docs/examples/evaluation/relevancy_eval.ipynb
@@ -36,7 +36,6 @@
     "    TreeIndex,\n",
     "    VectorStoreIndex,\n",
     "    SimpleDirectoryReader,\n",
-    "    LLMPredictor,\n",
     "    ServiceContext,\n",
     "    Response,\n",
     ")\n",
diff --git a/docs/examples/finetuning/gradient/gradient_structured.ipynb b/docs/examples/finetuning/gradient/gradient_structured.ipynb
index fbb581771fe95aaee69604bddab6bdb66e760464..efee83cbfd2a963bd272e335e204c01f38f5e6fd 100644
--- a/docs/examples/finetuning/gradient/gradient_structured.ipynb
+++ b/docs/examples/finetuning/gradient/gradient_structured.ipynb
@@ -103,7 +103,7 @@
     "    is_chat_model=True,\n",
     ")\n",
     "# HACK: set chat model\n",
-    "# from llama_index.llms.base import LLMMetadata\n",
+    "# from llama_index.llms.types import LLMMetadata\n",
     "# gradient_llm.metadata = LLMMetadata(\n",
     "#     context_window=1024,\n",
     "#     num_output=gradient_llm.max_tokens or 20,\n",
diff --git a/docs/examples/finetuning/llm_judge/correctness/finetune_llm_judge_single_grading_correctness.ipynb b/docs/examples/finetuning/llm_judge/correctness/finetune_llm_judge_single_grading_correctness.ipynb
index bc383124b5efe1d60ecf4889a4e6af4cc7fd7b5d..26334e8bff0108b51040a72a56ee342275ae8291 100644
--- a/docs/examples/finetuning/llm_judge/correctness/finetune_llm_judge_single_grading_correctness.ipynb
+++ b/docs/examples/finetuning/llm_judge/correctness/finetune_llm_judge_single_grading_correctness.ipynb
@@ -257,14 +257,13 @@
     "    RetrieverQueryEngine,\n",
     ")\n",
     "from llama_index.llms import HuggingFaceInferenceAPI\n",
-    "from llama_index.llm_predictor import LLMPredictor\n",
     "\n",
     "llm = HuggingFaceInferenceAPI(\n",
     "    model_name=\"meta-llama/Llama-2-7b-chat-hf\",\n",
     "    context_window=2048,  # to use refine\n",
     "    token=HUGGING_FACE_TOKEN,\n",
     ")\n",
-    "context = ServiceContext.from_defaults(llm_predictor=LLMPredictor(llm=llm))\n",
+    "context = ServiceContext.from_defaults(llm=llm)\n",
     "query_engine = RetrieverQueryEngine.from_args(\n",
     "    retriever=the_retriever, service_context=context\n",
     ")"
diff --git a/docs/examples/finetuning/llm_judge/pairwise/finetune_llm_judge.ipynb b/docs/examples/finetuning/llm_judge/pairwise/finetune_llm_judge.ipynb
index c72f69131b74064869beb39739887e2aa478ad99..542413c8fd992c9b192d89b759ccb366322dcfbe 100644
--- a/docs/examples/finetuning/llm_judge/pairwise/finetune_llm_judge.ipynb
+++ b/docs/examples/finetuning/llm_judge/pairwise/finetune_llm_judge.ipynb
@@ -402,7 +402,6 @@
     "    RetrieverQueryEngine,\n",
     ")\n",
     "from llama_index.llms import HuggingFaceInferenceAPI\n",
-    "from llama_index.llm_predictor import LLMPredictor\n",
     "\n",
     "\n",
     "def create_query_engine(\n",
@@ -416,7 +415,7 @@
     "        context_window=2048,  # to use refine\n",
     "        token=HUGGING_FACE_TOKEN,\n",
     "    )\n",
-    "    context = ServiceContext.from_defaults(llm_predictor=LLMPredictor(llm=llm))\n",
+    "    context = ServiceContext.from_defaults(llm=llm)\n",
     "    return RetrieverQueryEngine.from_args(\n",
     "        retriever=retriever, service_context=context\n",
     "    )"
diff --git a/docs/examples/index_structs/knowledge_graph/KnowledgeGraphDemo.ipynb b/docs/examples/index_structs/knowledge_graph/KnowledgeGraphDemo.ipynb
index bb238dec752d906e9e8f22c8a960ac2f6e2cc515..b467fb951a5a5ad99351e4c5a63db6ca10535ba5 100644
--- a/docs/examples/index_structs/knowledge_graph/KnowledgeGraphDemo.ipynb
+++ b/docs/examples/index_structs/knowledge_graph/KnowledgeGraphDemo.ipynb
@@ -76,7 +76,6 @@
    "source": [
     "from llama_index import (\n",
     "    SimpleDirectoryReader,\n",
-    "    LLMPredictor,\n",
     "    ServiceContext,\n",
     "    KnowledgeGraphIndex,\n",
     ")\n",
diff --git a/docs/examples/index_structs/knowledge_graph/NebulaGraphKGIndexDemo.ipynb b/docs/examples/index_structs/knowledge_graph/NebulaGraphKGIndexDemo.ipynb
index 68fa6426c8fe23e0a287547cff472b0df6608213..d64cd170e1782baadb32e4360b4d99d63243210f 100644
--- a/docs/examples/index_structs/knowledge_graph/NebulaGraphKGIndexDemo.ipynb
+++ b/docs/examples/index_structs/knowledge_graph/NebulaGraphKGIndexDemo.ipynb
@@ -51,7 +51,6 @@
     "    VectorStoreIndex,\n",
     "    SimpleDirectoryReader,\n",
     "    KnowledgeGraphIndex,\n",
-    "    LLMPredictor,\n",
     "    ServiceContext,\n",
     ")\n",
     "\n",
@@ -83,7 +82,6 @@
     "    api_base=openai.api_base,\n",
     "    api_version=openai.api_version,\n",
     ")\n",
-    "llm_predictor = LLMPredictor(llm=llm)\n",
     "\n",
     "# You need to deploy your own embedding model as well as your own chat completion model\n",
     "embedding_model = OpenAIEmbedding(\n",
@@ -96,7 +94,7 @@
     ")\n",
     "\n",
     "service_context = ServiceContext.from_defaults(\n",
-    "    llm_predictor=llm_predictor,\n",
+    "    llm=llm,\n",
     "    embed_model=embedding_model,\n",
     ")"
    ]
@@ -128,7 +126,6 @@
    "source": [
     "from llama_index import (\n",
     "    KnowledgeGraphIndex,\n",
-    "    LLMPredictor,\n",
     "    ServiceContext,\n",
     "    SimpleDirectoryReader,\n",
     ")\n",
diff --git a/docs/examples/index_structs/knowledge_graph/Neo4jKGIndexDemo.ipynb b/docs/examples/index_structs/knowledge_graph/Neo4jKGIndexDemo.ipynb
index 82272d9a40158b3bd47ff04d00e5370d91d60722..ad4710ad315562953b9a24360062bffcbd22ee0b 100644
--- a/docs/examples/index_structs/knowledge_graph/Neo4jKGIndexDemo.ipynb
+++ b/docs/examples/index_structs/knowledge_graph/Neo4jKGIndexDemo.ipynb
@@ -50,7 +50,6 @@
     "    VectorStoreIndex,\n",
     "    SimpleDirectoryReader,\n",
     "    KnowledgeGraphIndex,\n",
-    "    LLMPredictor,\n",
     "    ServiceContext,\n",
     ")\n",
     "\n",
@@ -81,7 +80,6 @@
     "        \"api_version\": openai.api_version,\n",
     "    },\n",
     ")\n",
-    "llm_predictor = LLMPredictor(llm=llm)\n",
     "\n",
     "# You need to deploy your own embedding model as well as your own chat completion model\n",
     "embedding_llm = OpenAIEmbedding(\n",
@@ -94,7 +92,7 @@
     ")\n",
     "\n",
     "service_context = ServiceContext.from_defaults(\n",
-    "    llm_predictor=llm_predictor,\n",
+    "    llm=llm,\n",
     "    embed_model=embedding_llm,\n",
     ")"
    ]
@@ -124,7 +122,6 @@
    "source": [
     "from llama_index import (\n",
     "    KnowledgeGraphIndex,\n",
-    "    LLMPredictor,\n",
     "    ServiceContext,\n",
     "    SimpleDirectoryReader,\n",
     ")\n",
diff --git a/docs/examples/llm/Konko.ipynb b/docs/examples/llm/Konko.ipynb
index 5259a1552a1c6a4f24c877d83a5e3c189adc3c10..d1bb6b0c7e2d575d2932929f73924184eb2c1d56 100644
--- a/docs/examples/llm/Konko.ipynb
+++ b/docs/examples/llm/Konko.ipynb
@@ -44,7 +44,7 @@
    "outputs": [],
    "source": [
     "from llama_index.llms import Konko\n",
-    "from llama_index.llms.base import ChatMessage"
+    "from llama_index.llms import ChatMessage"
    ]
   },
   {
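A minimal sketch of the relocated import used in this notebook and the ones that follow: `ChatMessage` (and `MessageRole`) now come from `llama_index.llms` rather than `llama_index.llms.base`. The message contents below are illustrative only.

```python
# ChatMessage is now re-exported from llama_index.llms; llama_index.llms.base is no longer imported
from llama_index.llms import ChatMessage, MessageRole

# illustrative message; any role/content pair works
message = ChatMessage(role=MessageRole.USER, content="Tell me a joke.")
print(message)
```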
diff --git a/docs/examples/llm/anyscale.ipynb b/docs/examples/llm/anyscale.ipynb
index 7b12d0c98b09da68bcb2b6f1c53e1306c7cdde8f..8ee0f60818c2b1a1ae7a2b9df3c3ad7470f824f3 100644
--- a/docs/examples/llm/anyscale.ipynb
+++ b/docs/examples/llm/anyscale.ipynb
@@ -42,7 +42,7 @@
    "outputs": [],
    "source": [
     "from llama_index.llms import Anyscale\n",
-    "from llama_index.llms.base import ChatMessage"
+    "from llama_index.llms import ChatMessage"
    ]
   },
   {
diff --git a/docs/examples/llm/everlyai.ipynb b/docs/examples/llm/everlyai.ipynb
index b99f50eb437c074d440331dd1238d50c5d68f29b..3c5062d2da921cbf135c76447d27d47ba3a6c59d 100644
--- a/docs/examples/llm/everlyai.ipynb
+++ b/docs/examples/llm/everlyai.ipynb
@@ -44,7 +44,7 @@
    "outputs": [],
    "source": [
     "from llama_index.llms import EverlyAI\n",
-    "from llama_index.llms.base import ChatMessage"
+    "from llama_index.llms import ChatMessage"
    ]
   },
   {
diff --git a/docs/examples/llm/litellm.ipynb b/docs/examples/llm/litellm.ipynb
index d40af5d4fd974406b85e51e4bb2b74cf1a1a5295..2049334b60c4689dba5ed8dee79e270f5a076993 100755
--- a/docs/examples/llm/litellm.ipynb
+++ b/docs/examples/llm/litellm.ipynb
@@ -55,7 +55,6 @@
    "source": [
     "import os\n",
     "from llama_index.llms import LiteLLM, ChatMessage\n",
-    "from llama_index.llms.base import \n",
     "\n",
     "# set env variable\n",
     "os.environ[\"OPENAI_API_KEY\"] = \"your-api-key\"\n",
diff --git a/docs/examples/llm/llm_predictor.ipynb b/docs/examples/llm/llm_predictor.ipynb
index bc6fb4cdd6614859ee63b8d85e5378d88b15ba25..dc352872d604c9062bedcda9c0017848aef18057 100644
--- a/docs/examples/llm/llm_predictor.ipynb
+++ b/docs/examples/llm/llm_predictor.ipynb
@@ -50,7 +50,7 @@
    "outputs": [],
    "source": [
     "from langchain.chat_models import ChatAnyscale, ChatOpenAI\n",
-    "from llama_index import LLMPredictor\n",
+    "from llama_index.llms import LangChainLLM\n",
     "from llama_index.prompts import PromptTemplate"
    ]
   },
@@ -61,7 +61,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "llm_predictor = LLMPredictor(ChatOpenAI())"
+    "llm = LangChainLLM(ChatOpenAI())"
    ]
   },
   {
@@ -71,7 +71,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "stream = await llm_predictor.astream(PromptTemplate(\"Hi, write a short story\"))"
+    "stream = await llm.astream(PromptTemplate(\"Hi, write a short story\"))"
    ]
   },
   {
@@ -127,7 +127,7 @@
    "outputs": [],
    "source": [
     "## Test with ChatAnyscale\n",
-    "llm_predictor = LLMPredictor(ChatAnyscale())"
+    "llm = LangChainLLM(ChatAnyscale())"
    ]
   },
   {
@@ -145,7 +145,7 @@
     }
    ],
    "source": [
-    "stream = llm_predictor.stream(\n",
+    "stream = llm.stream(\n",
     "    PromptTemplate(\"Hi, Which NFL team have most Super Bowl wins\")\n",
     ")\n",
     "for token in stream:\n",
@@ -167,8 +167,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from llama_index.llms import OpenAI\n",
-    "from llama_index import LLMPredictor"
+    "from llama_index.llms import OpenAI"
    ]
   },
   {
@@ -178,7 +177,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "llm_predictor = LLMPredictor(OpenAI())"
+    "llm = OpenAI()"
    ]
   },
   {
@@ -188,7 +187,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "stream = await llm_predictor.astream(\"Hi, write a short story\")"
+    "stream = await llm.astream(\"Hi, write a short story\")"
    ]
   },
   {
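A condensed sketch of what the updated `llm_predictor.ipynb` above now demonstrates: a LangChain chat model is wrapped in `LangChainLLM`, and streaming is called on the LLM itself rather than on an `LLMPredictor`. Assumes `OPENAI_API_KEY` is set in the environment.

```python
from langchain.chat_models import ChatOpenAI

from llama_index.llms import LangChainLLM
from llama_index.prompts import PromptTemplate

# wrap a LangChain chat model so it exposes the LlamaIndex LLM interface
llm = LangChainLLM(ChatOpenAI())

# stream() is now a method on the LLM; native LLMs such as OpenAI() expose the same interface
stream = llm.stream(PromptTemplate("Hi, which NFL team has the most Super Bowl wins?"))
for token in stream:
    print(token, end="")
```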
diff --git a/docs/examples/llm/monsterapi.ipynb b/docs/examples/llm/monsterapi.ipynb
index cc295e7f86dd2e90b36f19b45144f645548c7357..a38aea8154a14a569c145e048eb0b3e22fc26189 100644
--- a/docs/examples/llm/monsterapi.ipynb
+++ b/docs/examples/llm/monsterapi.ipynb
@@ -160,7 +160,7 @@
     }
    ],
    "source": [
-    "from llama_index.llms.base import ChatMessage\n",
+    "from llama_index.llms import ChatMessage\n",
     "\n",
     "# Construct mock Chat history\n",
     "history_message = ChatMessage(\n",
diff --git a/docs/examples/llm/openllm.ipynb b/docs/examples/llm/openllm.ipynb
index 5076857c00c82926823878139e44a8ba090ef996..290a23c3fe1bf26f886c86fdf997b1a1f807becf 100644
--- a/docs/examples/llm/openllm.ipynb
+++ b/docs/examples/llm/openllm.ipynb
@@ -87,7 +87,7 @@
     "from typing import List, Optional\n",
     "\n",
     "from llama_index.llms import OpenLLM, OpenLLMAPI\n",
-    "from llama_index.llms.base import ChatMessage"
+    "from llama_index.llms import ChatMessage"
    ]
   },
   {
diff --git a/docs/examples/llm/perplexity.ipynb b/docs/examples/llm/perplexity.ipynb
index 868ec878bdd0ca5e92e30feda83257988862119d..5ca5ba5bb128c873be49e874fe77e1f58c92aadc 100644
--- a/docs/examples/llm/perplexity.ipynb
+++ b/docs/examples/llm/perplexity.ipynb
@@ -65,7 +65,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from llama_index.llms.base import ChatMessage\n",
+    "from llama_index.llms import ChatMessage\n",
     "\n",
     "messages_dict = [\n",
     "    {\"role\": \"system\", \"content\": \"Be precise and concise.\"},\n",
diff --git a/docs/examples/llm/rungpt.ipynb b/docs/examples/llm/rungpt.ipynb
index 922d7b153a6ed4f23bbd21bcaf718b41a7ed654c..4122438a095b2c5b585ed88ce12a37b5a2f81385 100644
--- a/docs/examples/llm/rungpt.ipynb
+++ b/docs/examples/llm/rungpt.ipynb
@@ -122,7 +122,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from llama_index.llms.base import ChatMessage, MessageRole\n",
+    "from llama_index.llms import ChatMessage, MessageRole\n",
     "from llama_index.llms.rungpt import RunGptLLM\n",
     "\n",
     "messages = [\n",
diff --git a/docs/examples/llm/vertex.ipynb b/docs/examples/llm/vertex.ipynb
index 6384f7ba0d5eb7aa513753f80030be087264b010..297550d6dcb2d497d2ed90853fb2f97ff5449d25 100644
--- a/docs/examples/llm/vertex.ipynb
+++ b/docs/examples/llm/vertex.ipynb
@@ -60,7 +60,7 @@
    ],
    "source": [
     "from llama_index.llms.vertex import Vertex\n",
-    "from llama_index.llms.base import ChatMessage, MessageRole\n",
+    "from llama_index.llms import ChatMessage, MessageRole\n",
     "\n",
     "llm = Vertex(model=\"text-bison\", temperature=0, additional_kwargs={})\n",
     "llm.complete(\"Hello this is a sample text\").text"
diff --git a/docs/examples/llm/vllm.ipynb b/docs/examples/llm/vllm.ipynb
index 4ece58e24c5ac5fe37d2be1c1553c759aed44a22..8cda81b3b3dda46bf4d316f9a798cfb8844f293f 100644
--- a/docs/examples/llm/vllm.ipynb
+++ b/docs/examples/llm/vllm.ipynb
@@ -491,7 +491,7 @@
    "outputs": [],
    "source": [
     "from llama_index.llms.vllm import VllmServer\n",
-    "from llama_index.llms.base import ChatMessage"
+    "from llama_index.llms import ChatMessage"
    ]
   },
   {
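The notebooks above (Konko through vLLM) all make the same import change; as a short usage sketch, the relocated `ChatMessage` is passed to an LLM's `chat()` method. The model choice and message contents here are illustrative.

```python
from llama_index.llms import ChatMessage, MessageRole, OpenAI

llm = OpenAI()  # any LlamaIndex LLM with a chat endpoint works the same way
messages = [
    ChatMessage(role=MessageRole.SYSTEM, content="Be precise and concise."),
    ChatMessage(role=MessageRole.USER, content="Tell me a joke."),
]
response = llm.chat(messages)
print(response)
```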
diff --git a/docs/examples/low_level/response_synthesis.ipynb b/docs/examples/low_level/response_synthesis.ipynb
index 99e068809ef0a403265d61ab027b5e807c95bb8d..feb1c7a6047ce30348d0f72278d37338e77b2d9d 100644
--- a/docs/examples/low_level/response_synthesis.ipynb
+++ b/docs/examples/low_level/response_synthesis.ipynb
@@ -926,7 +926,7 @@
    "outputs": [],
    "source": [
     "from llama_index.retrievers import BaseRetriever\n",
-    "from llama_index.llms.base import LLM\n",
+    "from llama_index.llms.llm import LLM\n",
     "from dataclasses import dataclass\n",
     "from typing import Optional, List\n",
     "\n",
diff --git a/docs/examples/node_postprocessor/LLMReranker-Gatsby.ipynb b/docs/examples/node_postprocessor/LLMReranker-Gatsby.ipynb
index 7e9e6c61304a9e6185768d99a6e84144df0910af..4089b1063cd04f752384ee2f726e770ee50e8632 100644
--- a/docs/examples/node_postprocessor/LLMReranker-Gatsby.ipynb
+++ b/docs/examples/node_postprocessor/LLMReranker-Gatsby.ipynb
@@ -41,7 +41,6 @@
     "    VectorStoreIndex,\n",
     "    SimpleDirectoryReader,\n",
     "    ServiceContext,\n",
-    "    LLMPredictor,\n",
     ")\n",
     "from llama_index.postprocessor import LLMRerank\n",
     "from llama_index.llms import OpenAI\n",
diff --git a/docs/examples/node_postprocessor/LLMReranker-Lyft-10k.ipynb b/docs/examples/node_postprocessor/LLMReranker-Lyft-10k.ipynb
index 00aec9e230a25eb0fec8cf5222a45bec03e4a3c4..c4e1d08bbd15d900d841b85f869a207b0ea4c7ed 100644
--- a/docs/examples/node_postprocessor/LLMReranker-Lyft-10k.ipynb
+++ b/docs/examples/node_postprocessor/LLMReranker-Lyft-10k.ipynb
@@ -50,7 +50,6 @@
     "    VectorStoreIndex,\n",
     "    SimpleDirectoryReader,\n",
     "    ServiceContext,\n",
-    "    LLMPredictor,\n",
     ")\n",
     "from llama_index.postprocessor import LLMRerank\n",
     "\n",
diff --git a/docs/examples/output_parsing/GuardrailsDemo.ipynb b/docs/examples/output_parsing/GuardrailsDemo.ipynb
index e0e0ed10b41fc8c18e7e3d00d58ca5ffbf540516..5d32d49ca10f32aee2445a46aef1b44317615724 100644
--- a/docs/examples/output_parsing/GuardrailsDemo.ipynb
+++ b/docs/examples/output_parsing/GuardrailsDemo.ipynb
@@ -26,6 +26,16 @@
     "If you're opening this Notebook on colab, you will probably need to install LlamaIndex 🦙."
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e716f66f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install guardrails-ai"
+   ]
+  },
   {
    "attachments": {},
    "cell_type": "markdown",
@@ -40,7 +50,27 @@
    "execution_count": null,
    "id": "649bea0c",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Will not apply HSTS. The HSTS database must be a regular and non-world-writable file.\n",
+      "ERROR: could not open HSTS store at '/home/loganm/.wget-hsts'. HSTS will be disabled.\n",
+      "--2023-12-11 10:18:02--  https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt\n",
+      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...\n",
+      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n",
+      "HTTP request sent, awaiting response... 200 OK\n",
+      "Length: 75042 (73K) [text/plain]\n",
+      "Saving to: ‘data/paul_graham/paul_graham_essay.txt’\n",
+      "\n",
+      "data/paul_graham/pa 100%[===================>]  73.28K  --.-KB/s    in 0.04s   \n",
+      "\n",
+      "2023-12-11 10:18:02 (1.70 MB/s) - ‘data/paul_graham/paul_graham_essay.txt’ saved [75042/75042]\n",
+      "\n"
+     ]
+    }
+   ],
    "source": [
     "!mkdir -p 'data/paul_graham/'\n",
     "!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'"
@@ -59,7 +89,35 @@
    "execution_count": null,
    "id": "690a6918-7c75-4f95-9ccc-d2c4a1fe00d7",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "ename": "AttributeError",
+     "evalue": "module 'openai' has no attribute 'error'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
+      "\u001b[1;32m/home/loganm/llama_index_proper/llama_index/docs/examples/output_parsing/GuardrailsDemo.ipynb Cell 8\u001b[0m line \u001b[0;36m7\n\u001b[1;32m      <a href='vscode-notebook-cell://wsl%2Bubuntu/home/loganm/llama_index_proper/llama_index/docs/examples/output_parsing/GuardrailsDemo.ipynb#W6sdnNjb2RlLXJlbW90ZQ%3D%3D?line=3'>4</a>\u001b[0m logging\u001b[39m.\u001b[39mbasicConfig(stream\u001b[39m=\u001b[39msys\u001b[39m.\u001b[39mstdout, level\u001b[39m=\u001b[39mlogging\u001b[39m.\u001b[39mINFO)\n\u001b[1;32m      <a href='vscode-notebook-cell://wsl%2Bubuntu/home/loganm/llama_index_proper/llama_index/docs/examples/output_parsing/GuardrailsDemo.ipynb#W6sdnNjb2RlLXJlbW90ZQ%3D%3D?line=4'>5</a>\u001b[0m logging\u001b[39m.\u001b[39mgetLogger()\u001b[39m.\u001b[39maddHandler(logging\u001b[39m.\u001b[39mStreamHandler(stream\u001b[39m=\u001b[39msys\u001b[39m.\u001b[39mstdout))\n\u001b[0;32m----> <a href='vscode-notebook-cell://wsl%2Bubuntu/home/loganm/llama_index_proper/llama_index/docs/examples/output_parsing/GuardrailsDemo.ipynb#W6sdnNjb2RlLXJlbW90ZQ%3D%3D?line=6'>7</a>\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mllama_index\u001b[39;00m \u001b[39mimport\u001b[39;00m VectorStoreIndex, SimpleDirectoryReader\n\u001b[1;32m      <a href='vscode-notebook-cell://wsl%2Bubuntu/home/loganm/llama_index_proper/llama_index/docs/examples/output_parsing/GuardrailsDemo.ipynb#W6sdnNjb2RlLXJlbW90ZQ%3D%3D?line=7'>8</a>\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mIPython\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mdisplay\u001b[39;00m \u001b[39mimport\u001b[39;00m Markdown, display\n\u001b[1;32m     <a href='vscode-notebook-cell://wsl%2Bubuntu/home/loganm/llama_index_proper/llama_index/docs/examples/output_parsing/GuardrailsDemo.ipynb#W6sdnNjb2RlLXJlbW90ZQ%3D%3D?line=9'>10</a>\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mos\u001b[39;00m\n",
+      "File \u001b[0;32m~/llama_index_proper/llama_index/llama_index/__init__.py:21\u001b[0m\n\u001b[1;32m     17\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mllama_index\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39membeddings\u001b[39;00m \u001b[39mimport\u001b[39;00m OpenAIEmbedding\n\u001b[1;32m     19\u001b[0m \u001b[39m# indices\u001b[39;00m\n\u001b[1;32m     20\u001b[0m \u001b[39m# loading\u001b[39;00m\n\u001b[0;32m---> 21\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mllama_index\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mindices\u001b[39;00m \u001b[39mimport\u001b[39;00m (\n\u001b[1;32m     22\u001b[0m     ComposableGraph,\n\u001b[1;32m     23\u001b[0m     DocumentSummaryIndex,\n\u001b[1;32m     24\u001b[0m     GPTDocumentSummaryIndex,\n\u001b[1;32m     25\u001b[0m     GPTKeywordTableIndex,\n\u001b[1;32m     26\u001b[0m     GPTKnowledgeGraphIndex,\n\u001b[1;32m     27\u001b[0m     GPTListIndex,\n\u001b[1;32m     28\u001b[0m     GPTRAKEKeywordTableIndex,\n\u001b[1;32m     29\u001b[0m     GPTSimpleKeywordTableIndex,\n\u001b[1;32m     30\u001b[0m     GPTTreeIndex,\n\u001b[1;32m     31\u001b[0m     GPTVectorStoreIndex,\n\u001b[1;32m     32\u001b[0m     KeywordTableIndex,\n\u001b[1;32m     33\u001b[0m     KnowledgeGraphIndex,\n\u001b[1;32m     34\u001b[0m     ListIndex,\n\u001b[1;32m     35\u001b[0m     RAKEKeywordTableIndex,\n\u001b[1;32m     36\u001b[0m     SimpleKeywordTableIndex,\n\u001b[1;32m     37\u001b[0m     SummaryIndex,\n\u001b[1;32m     38\u001b[0m     TreeIndex,\n\u001b[1;32m     39\u001b[0m     VectorStoreIndex,\n\u001b[1;32m     40\u001b[0m     load_graph_from_storage,\n\u001b[1;32m     41\u001b[0m     load_index_from_storage,\n\u001b[1;32m     42\u001b[0m     load_indices_from_storage,\n\u001b[1;32m     43\u001b[0m )\n\u001b[1;32m     45\u001b[0m \u001b[39m# structured\u001b[39;00m\n\u001b[1;32m     46\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mllama_index\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mindices\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mcommon\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mstruct_store\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mbase\u001b[39;00m \u001b[39mimport\u001b[39;00m SQLDocumentContextBuilder\n",
+      "File \u001b[0;32m~/llama_index_proper/llama_index/llama_index/indices/__init__.py:29\u001b[0m\n\u001b[1;32m     27\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mllama_index\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mindices\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mlist\u001b[39;00m \u001b[39mimport\u001b[39;00m GPTListIndex, ListIndex, SummaryIndex\n\u001b[1;32m     28\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mllama_index\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mindices\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mlist\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mbase\u001b[39;00m \u001b[39mimport\u001b[39;00m GPTListIndex, ListIndex, SummaryIndex\n\u001b[0;32m---> 29\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mllama_index\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mindices\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mloading\u001b[39;00m \u001b[39mimport\u001b[39;00m (\n\u001b[1;32m     30\u001b[0m     load_graph_from_storage,\n\u001b[1;32m     31\u001b[0m     load_index_from_storage,\n\u001b[1;32m     32\u001b[0m     load_indices_from_storage,\n\u001b[1;32m     33\u001b[0m )\n\u001b[1;32m     34\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mllama_index\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mindices\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mmanaged\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mvectara\u001b[39;00m \u001b[39mimport\u001b[39;00m VectaraIndex\n\u001b[1;32m     35\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mllama_index\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mindices\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mmulti_modal\u001b[39;00m \u001b[39mimport\u001b[39;00m MultiModalVectorStoreIndex\n",
+      "File \u001b[0;32m~/llama_index_proper/llama_index/llama_index/indices/loading.py:6\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mllama_index\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mindices\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mbase\u001b[39;00m \u001b[39mimport\u001b[39;00m BaseIndex\n\u001b[1;32m      5\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mllama_index\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mindices\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mcomposability\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mgraph\u001b[39;00m \u001b[39mimport\u001b[39;00m ComposableGraph\n\u001b[0;32m----> 6\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mllama_index\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mindices\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mregistry\u001b[39;00m \u001b[39mimport\u001b[39;00m INDEX_STRUCT_TYPE_TO_INDEX_CLASS\n\u001b[1;32m      7\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mllama_index\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mstorage\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mstorage_context\u001b[39;00m \u001b[39mimport\u001b[39;00m StorageContext\n\u001b[1;32m      9\u001b[0m logger \u001b[39m=\u001b[39m logging\u001b[39m.\u001b[39mgetLogger(\u001b[39m__name__\u001b[39m)\n",
+      "File \u001b[0;32m~/llama_index_proper/llama_index/llama_index/indices/registry.py:12\u001b[0m\n\u001b[1;32m     10\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mllama_index\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mindices\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mknowledge_graph\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mbase\u001b[39;00m \u001b[39mimport\u001b[39;00m KnowledgeGraphIndex\n\u001b[1;32m     11\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mllama_index\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mindices\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mlist\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mbase\u001b[39;00m \u001b[39mimport\u001b[39;00m SummaryIndex\n\u001b[0;32m---> 12\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mllama_index\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mindices\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mmulti_modal\u001b[39;00m \u001b[39mimport\u001b[39;00m MultiModalVectorStoreIndex\n\u001b[1;32m     13\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mllama_index\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mindices\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mstruct_store\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mpandas\u001b[39;00m \u001b[39mimport\u001b[39;00m PandasIndex\n\u001b[1;32m     14\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mllama_index\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mindices\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mstruct_store\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39msql\u001b[39;00m \u001b[39mimport\u001b[39;00m SQLStructStoreIndex\n",
+      "File \u001b[0;32m~/llama_index_proper/llama_index/llama_index/indices/multi_modal/__init__.py:3\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[39m\"\"\"Vector-store based data structures.\"\"\"\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mllama_index\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mindices\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mmulti_modal\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mbase\u001b[39;00m \u001b[39mimport\u001b[39;00m MultiModalVectorStoreIndex\n\u001b[1;32m      4\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mllama_index\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mindices\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mmulti_modal\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mretriever\u001b[39;00m \u001b[39mimport\u001b[39;00m MultiModalVectorIndexRetriever\n\u001b[1;32m      6\u001b[0m __all__ \u001b[39m=\u001b[39m [\n\u001b[1;32m      7\u001b[0m     \u001b[39m\"\u001b[39m\u001b[39mMultiModalVectorStoreIndex\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m      8\u001b[0m     \u001b[39m\"\u001b[39m\u001b[39mMultiModalVectorIndexRetriever\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m      9\u001b[0m ]\n",
+      "File \u001b[0;32m~/llama_index_proper/llama_index/llama_index/indices/multi_modal/base.py:19\u001b[0m\n\u001b[1;32m     12\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mllama_index\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39membeddings\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mutils\u001b[39;00m \u001b[39mimport\u001b[39;00m EmbedType, resolve_embed_model\n\u001b[1;32m     13\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mllama_index\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mindices\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mutils\u001b[39;00m \u001b[39mimport\u001b[39;00m (\n\u001b[1;32m     14\u001b[0m     async_embed_image_nodes,\n\u001b[1;32m     15\u001b[0m     async_embed_nodes,\n\u001b[1;32m     16\u001b[0m     embed_image_nodes,\n\u001b[1;32m     17\u001b[0m     embed_nodes,\n\u001b[1;32m     18\u001b[0m )\n\u001b[0;32m---> 19\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mllama_index\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mindices\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mvector_store\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mbase\u001b[39;00m \u001b[39mimport\u001b[39;00m VectorStoreIndex\n\u001b[1;32m     20\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mllama_index\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mschema\u001b[39;00m \u001b[39mimport\u001b[39;00m BaseNode, ImageNode\n\u001b[1;32m     21\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mllama_index\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mservice_context\u001b[39;00m \u001b[39mimport\u001b[39;00m ServiceContext\n",
+      "File \u001b[0;32m~/llama_index_proper/llama_index/llama_index/indices/vector_store/__init__.py:4\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[39m\"\"\"Vector-store based data structures.\"\"\"\u001b[39;00m\n\u001b[1;32m      3\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mllama_index\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mindices\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mvector_store\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mbase\u001b[39;00m \u001b[39mimport\u001b[39;00m GPTVectorStoreIndex, VectorStoreIndex\n\u001b[0;32m----> 4\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mllama_index\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mindices\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mvector_store\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mretrievers\u001b[39;00m \u001b[39mimport\u001b[39;00m (\n\u001b[1;32m      5\u001b[0m     VectorIndexAutoRetriever,\n\u001b[1;32m      6\u001b[0m     VectorIndexRetriever,\n\u001b[1;32m      7\u001b[0m )\n\u001b[1;32m      9\u001b[0m __all__ \u001b[39m=\u001b[39m [\n\u001b[1;32m     10\u001b[0m     \u001b[39m\"\u001b[39m\u001b[39mVectorStoreIndex\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m     11\u001b[0m     \u001b[39m\"\u001b[39m\u001b[39mVectorIndexRetriever\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m     14\u001b[0m     \u001b[39m\"\u001b[39m\u001b[39mGPTVectorStoreIndex\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m     15\u001b[0m ]\n",
+      "File \u001b[0;32m~/llama_index_proper/llama_index/llama_index/indices/vector_store/retrievers/__init__.py:4\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mllama_index\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mindices\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mvector_store\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mretrievers\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mretriever\u001b[39;00m \u001b[39mimport\u001b[39;00m (  \u001b[39m# noqa: I001\u001b[39;00m\n\u001b[1;32m      2\u001b[0m     VectorIndexRetriever,\n\u001b[1;32m      3\u001b[0m )\n\u001b[0;32m----> 4\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mllama_index\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mindices\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mvector_store\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mretrievers\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mauto_retriever\u001b[39;00m \u001b[39mimport\u001b[39;00m (\n\u001b[1;32m      5\u001b[0m     VectorIndexAutoRetriever,\n\u001b[1;32m      6\u001b[0m )\n\u001b[1;32m      8\u001b[0m __all__ \u001b[39m=\u001b[39m [\n\u001b[1;32m      9\u001b[0m     \u001b[39m\"\u001b[39m\u001b[39mVectorIndexRetriever\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m     10\u001b[0m     \u001b[39m\"\u001b[39m\u001b[39mVectorIndexAutoRetriever\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m     11\u001b[0m ]\n",
+      "File \u001b[0;32m~/llama_index_proper/llama_index/llama_index/indices/vector_store/retrievers/auto_retriever/__init__.py:1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mllama_index\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mindices\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mvector_store\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mretrievers\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mauto_retriever\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mauto_retriever\u001b[39;00m \u001b[39mimport\u001b[39;00m (\n\u001b[1;32m      2\u001b[0m     VectorIndexAutoRetriever,\n\u001b[1;32m      3\u001b[0m )\n\u001b[1;32m      5\u001b[0m __all__ \u001b[39m=\u001b[39m [\n\u001b[1;32m      6\u001b[0m     \u001b[39m\"\u001b[39m\u001b[39mVectorIndexAutoRetriever\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m      7\u001b[0m ]\n",
+      "File \u001b[0;32m~/llama_index_proper/llama_index/llama_index/indices/vector_store/retrievers/auto_retriever/auto_retriever.py:9\u001b[0m\n\u001b[1;32m      7\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mllama_index\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mindices\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mvector_store\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mbase\u001b[39;00m \u001b[39mimport\u001b[39;00m VectorStoreIndex\n\u001b[1;32m      8\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mllama_index\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mindices\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mvector_store\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mretrievers\u001b[39;00m \u001b[39mimport\u001b[39;00m VectorIndexRetriever\n\u001b[0;32m----> 9\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mllama_index\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mindices\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mvector_store\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mretrievers\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mauto_retriever\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39moutput_parser\u001b[39;00m \u001b[39mimport\u001b[39;00m (\n\u001b[1;32m     10\u001b[0m     VectorStoreQueryOutputParser,\n\u001b[1;32m     11\u001b[0m )\n\u001b[1;32m     12\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mllama_index\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mindices\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mvector_store\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mretrievers\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mauto_retriever\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mprompts\u001b[39;00m \u001b[39mimport\u001b[39;00m (\n\u001b[1;32m     13\u001b[0m     DEFAULT_VECTOR_STORE_QUERY_PROMPT_TMPL,\n\u001b[1;32m     14\u001b[0m     VectorStoreQueryPrompt,\n\u001b[1;32m     15\u001b[0m )\n\u001b[1;32m     16\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mllama_index\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39moutput_parsers\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mbase\u001b[39;00m \u001b[39mimport\u001b[39;00m OutputParserException, StructuredOutput\n",
+      "File \u001b[0;32m~/llama_index_proper/llama_index/llama_index/indices/vector_store/retrievers/auto_retriever/output_parser.py:3\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mtyping\u001b[39;00m \u001b[39mimport\u001b[39;00m Any\n\u001b[0;32m----> 3\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mllama_index\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39moutput_parsers\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mbase\u001b[39;00m \u001b[39mimport\u001b[39;00m StructuredOutput\n\u001b[1;32m      4\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mllama_index\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39moutput_parsers\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mutils\u001b[39;00m \u001b[39mimport\u001b[39;00m parse_json_markdown\n\u001b[1;32m      5\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mllama_index\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mtypes\u001b[39;00m \u001b[39mimport\u001b[39;00m BaseOutputParser\n",
+      "File \u001b[0;32m~/llama_index_proper/llama_index/llama_index/output_parsers/__init__.py:3\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[39m\"\"\"Output parsers.\"\"\"\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mllama_index\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39moutput_parsers\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mguardrails\u001b[39;00m \u001b[39mimport\u001b[39;00m GuardrailsOutputParser\n\u001b[1;32m      4\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mllama_index\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39moutput_parsers\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mlangchain\u001b[39;00m \u001b[39mimport\u001b[39;00m LangchainOutputParser\n\u001b[1;32m      5\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mllama_index\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39moutput_parsers\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mpydantic\u001b[39;00m \u001b[39mimport\u001b[39;00m PydanticOutputParser\n",
+      "File \u001b[0;32m~/llama_index_proper/llama_index/llama_index/output_parsers/guardrails.py:9\u001b[0m\n\u001b[1;32m      6\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mdeprecated\u001b[39;00m \u001b[39mimport\u001b[39;00m deprecated\n\u001b[1;32m      8\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m----> 9\u001b[0m     \u001b[39mfrom\u001b[39;00m \u001b[39mguardrails\u001b[39;00m \u001b[39mimport\u001b[39;00m Guard\n\u001b[1;32m     10\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mImportError\u001b[39;00m:\n\u001b[1;32m     11\u001b[0m     Guard \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n",
+      "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages/guardrails/__init__.py:3\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[39m# Set up __init__.py so that users can do from guardrails import Response, Schema, etc.\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mguardrails\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mguard\u001b[39;00m \u001b[39mimport\u001b[39;00m Guard\n\u001b[1;32m      4\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mguardrails\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mllm_providers\u001b[39;00m \u001b[39mimport\u001b[39;00m PromptCallableBase\n\u001b[1;32m      5\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mguardrails\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mlogging_utils\u001b[39;00m \u001b[39mimport\u001b[39;00m configure_logging\n",
+      "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages/guardrails/guard.py:10\u001b[0m\n\u001b[1;32m      7\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39meliot\u001b[39;00m \u001b[39mimport\u001b[39;00m add_destinations, start_action\n\u001b[1;32m      8\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mpydantic\u001b[39;00m \u001b[39mimport\u001b[39;00m BaseModel\n\u001b[0;32m---> 10\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mguardrails\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mllm_providers\u001b[39;00m \u001b[39mimport\u001b[39;00m get_async_llm_ask, get_llm_ask\n\u001b[1;32m     11\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mguardrails\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mprompt\u001b[39;00m \u001b[39mimport\u001b[39;00m Instructions, Prompt\n\u001b[1;32m     12\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mguardrails\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mrail\u001b[39;00m \u001b[39mimport\u001b[39;00m Rail\n",
+      "File \u001b[0;32m~/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages/guardrails/llm_providers.py:24\u001b[0m\n\u001b[1;32m     19\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mImportError\u001b[39;00m:\n\u001b[1;32m     20\u001b[0m     cohere \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n\u001b[1;32m     23\u001b[0m OPENAI_RETRYABLE_ERRORS \u001b[39m=\u001b[39m [\n\u001b[0;32m---> 24\u001b[0m     openai\u001b[39m.\u001b[39;49merror\u001b[39m.\u001b[39mAPIConnectionError,\n\u001b[1;32m     25\u001b[0m     openai\u001b[39m.\u001b[39merror\u001b[39m.\u001b[39mAPIError,\n\u001b[1;32m     26\u001b[0m     openai\u001b[39m.\u001b[39merror\u001b[39m.\u001b[39mTryAgain,\n\u001b[1;32m     27\u001b[0m     openai\u001b[39m.\u001b[39merror\u001b[39m.\u001b[39mTimeout,\n\u001b[1;32m     28\u001b[0m     openai\u001b[39m.\u001b[39merror\u001b[39m.\u001b[39mRateLimitError,\n\u001b[1;32m     29\u001b[0m     openai\u001b[39m.\u001b[39merror\u001b[39m.\u001b[39mServiceUnavailableError,\n\u001b[1;32m     30\u001b[0m ]\n\u001b[1;32m     31\u001b[0m RETRYABLE_ERRORS \u001b[39m=\u001b[39m \u001b[39mtuple\u001b[39m(OPENAI_RETRYABLE_ERRORS)\n\u001b[1;32m     34\u001b[0m \u001b[39mclass\u001b[39;00m \u001b[39mPromptCallableException\u001b[39;00m(\u001b[39mException\u001b[39;00m):\n",
+      "\u001b[0;31mAttributeError\u001b[0m: module 'openai' has no attribute 'error'"
+     ]
+    }
+   ],
    "source": [
     "import logging\n",
     "import sys\n",
@@ -70,9 +128,9 @@
     "from llama_index import VectorStoreIndex, SimpleDirectoryReader\n",
     "from IPython.display import Markdown, display\n",
     "\n",
-    "import openai\n",
+    "import os\n",
     "\n",
-    "openai.api_key = \"<YOUR_OPENAI_API_KEY>\""
+    "os.environ[\"OPENAI_API_KEY\"] = \"sk-...\""
    ]
   },
   {
@@ -122,18 +180,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from llama_index.output_parsers import GuardrailsOutputParser\n",
-    "from llama_index.llm_predictor import StructuredLLMPredictor"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "057139d2-09e8-4b8d-83a1-a2356a1475a8",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "llm_predictor = StructuredLLMPredictor()"
+    "from llama_index.output_parsers import GuardrailsOutputParser"
    ]
   },
   {
@@ -144,20 +191,6 @@
     "**Define custom QA and Refine Prompts**\n"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "2833d086-d240-4798-b3c5-a83ac4593b0e",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from llama_index.prompts import PromptTemplate\n",
-    "from llama_index.prompts.default_prompts import (\n",
-    "    DEFAULT_TEXT_QA_PROMPT_TMPL,\n",
-    "    DEFAULT_REFINE_PROMPT_TMPL,\n",
-    ")"
-   ]
-  },
   {
    "cell_type": "markdown",
    "id": "dba8513e",
@@ -216,28 +249,16 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "from llama_index.llms import OpenAI\n",
+    "\n",
     "# Create a guard object\n",
     "guard = gd.Guard.from_pydantic(output_class=BulletPoints, prompt=prompt)\n",
     "\n",
     "# Create output parse object\n",
-    "output_parser = GuardrailsOutputParser(guard, llm=llm_predictor.llm)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a9b440d4-6fb4-46e6-973f-44207b432d3f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# NOTE: we use the same output parser for both prompts, though you can choose to use different parsers\n",
-    "# NOTE: here we add formatting instructions to the prompts.\n",
-    "\n",
-    "fmt_qa_tmpl = output_parser.format(DEFAULT_TEXT_QA_PROMPT_TMPL)\n",
-    "fmt_refine_tmpl = output_parser.format(DEFAULT_REFINE_PROMPT_TMPL)\n",
+    "output_parser = GuardrailsOutputParser(guard, llm=OpenAI())\n",
     "\n",
-    "qa_prompt = PromptTemplate(fmt_qa_tmpl, output_parser=output_parser)\n",
-    "refine_prompt = PromptTemplate(fmt_refine_tmpl, output_parser=output_parser)"
+    "# attach the output parser to an LLM\n",
+    "llm = OpenAI(output_parser=output_parser)"
    ]
   },
   {
@@ -281,7 +302,12 @@
     }
    ],
    "source": [
+    "from llama_index.prompts.default_prompts import (\n",
+    "    DEFAULT_TEXT_QA_PROMPT_TMPL,\n",
+    ")\n",
+    "\n",
     "# take a look at the new QA template!\n",
+    "fmt_qa_tmpl = output_parser.format(DEFAULT_TEXT_QA_PROMPT_TMPL)\n",
     "print(fmt_qa_tmpl)"
    ]
   },
@@ -311,10 +337,12 @@
     }
    ],
    "source": [
+    "from llama_index import ServiceContext\n",
+    "\n",
+    "ctx = ServiceContext.from_defaults(llm=llm)\n",
+    "\n",
     "query_engine = index.as_query_engine(\n",
-    "    text_qa_template=qa_prompt,\n",
-    "    refine_template=refine_prompt,\n",
-    "    llm_predictor=llm_predictor,\n",
+    "    service_context=ctx,\n",
     ")\n",
     "response = query_engine.query(\n",
     "    \"What are the three items the author did growing up?\",\n",
diff --git a/docs/examples/output_parsing/LangchainOutputParserDemo.ipynb b/docs/examples/output_parsing/LangchainOutputParserDemo.ipynb
index 9c7ee102313e06c3fc1517cb632a3c327865e29e..c37509016478d63e0d03dbecdedb57c85c2ce007 100644
--- a/docs/examples/output_parsing/LangchainOutputParserDemo.ipynb
+++ b/docs/examples/output_parsing/LangchainOutputParserDemo.ipynb
@@ -32,7 +32,27 @@
    "execution_count": null,
    "id": "b9635dc3",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Will not apply HSTS. The HSTS database must be a regular and non-world-writable file.\n",
+      "ERROR: could not open HSTS store at '/home/loganm/.wget-hsts'. HSTS will be disabled.\n",
+      "--2023-12-11 10:24:04--  https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt\n",
+      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...\n",
+      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n",
+      "HTTP request sent, awaiting response... 200 OK\n",
+      "Length: 75042 (73K) [text/plain]\n",
+      "Saving to: ‘data/paul_graham/paul_graham_essay.txt’\n",
+      "\n",
+      "data/paul_graham/pa 100%[===================>]  73.28K  --.-KB/s    in 0.04s   \n",
+      "\n",
+      "2023-12-11 10:24:04 (1.74 MB/s) - ‘data/paul_graham/paul_graham_essay.txt’ saved [75042/75042]\n",
+      "\n"
+     ]
+    }
+   ],
    "source": [
     "!mkdir -p 'data/paul_graham/'\n",
     "!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'"
@@ -60,7 +80,11 @@
     "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))\n",
     "\n",
     "from llama_index import VectorStoreIndex, SimpleDirectoryReader\n",
-    "from IPython.display import Markdown, display"
+    "from IPython.display import Markdown, display\n",
+    "\n",
+    "import os\n",
+    "\n",
+    "os.environ[\"OPENAI_API_KEY\"] = \"sk-...\""
    ]
   },
   {
@@ -84,10 +108,12 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "INFO:llama_index.token_counter.token_counter:> [build_index_from_documents] Total LLM token usage: 0 tokens\n",
-      "> [build_index_from_documents] Total LLM token usage: 0 tokens\n",
-      "INFO:llama_index.token_counter.token_counter:> [build_index_from_documents] Total embedding token usage: 18579 tokens\n",
-      "> [build_index_from_documents] Total embedding token usage: 18579 tokens\n"
+      "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"\n",
+      "HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"\n",
+      "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"\n",
+      "HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"\n",
+      "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"\n",
+      "HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"\n"
      ]
     }
    ],
@@ -111,20 +137,9 @@
    "outputs": [],
    "source": [
     "from llama_index.output_parsers import LangchainOutputParser\n",
-    "from llama_index.llm_predictor import StructuredLLMPredictor\n",
     "from langchain.output_parsers import StructuredOutputParser, ResponseSchema"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "057139d2-09e8-4b8d-83a1-a2356a1475a8",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "llm_predictor = StructuredLLMPredictor()"
-   ]
-  },
   {
    "cell_type": "markdown",
    "id": "bc25edf7-9343-4e82-a3f1-eec4281a9371",
@@ -133,20 +148,6 @@
     "**Define custom QA and Refine Prompts**"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "2833d086-d240-4798-b3c5-a83ac4593b0e",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from llama_index.prompts import PromptTemplate\n",
-    "from llama_index.prompts.default_prompts import (\n",
-    "    DEFAULT_TEXT_QA_PROMPT_TMPL,\n",
-    "    DEFAULT_REFINE_PROMPT_TMPL,\n",
-    ")"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -181,23 +182,6 @@
     "output_parser = LangchainOutputParser(lc_output_parser)"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a9b440d4-6fb4-46e6-973f-44207b432d3f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# NOTE: we use the same output parser for both prompts, though you can choose to use different parsers\n",
-    "# NOTE: here we add formatting instructions to the prompts.\n",
-    "\n",
-    "fmt_qa_tmpl = output_parser.format(DEFAULT_TEXT_QA_PROMPT_TMPL)\n",
-    "fmt_refine_tmpl = output_parser.format(DEFAULT_REFINE_PROMPT_TMPL)\n",
-    "\n",
-    "qa_prompt = PromptTemplate(fmt_qa_tmpl, output_parser=output_parser)\n",
-    "refine_prompt = PromptTemplate(fmt_refine_tmpl, output_parser=output_parser)"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -208,14 +192,15 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Context information is below. \n",
+      "Context information is below.\n",
       "---------------------\n",
       "{context_str}\n",
       "---------------------\n",
-      "Given the context information and not prior knowledge, answer the question: {query_str}\n",
+      "Given the context information and not prior knowledge, answer the query.\n",
+      "Query: {query_str}\n",
+      "Answer: \n",
       "\n",
-      "\n",
-      "The output should be a markdown code snippet formatted in the following schema:\n",
+      "The output should be a markdown code snippet formatted in the following schema, including the leading and trailing \"```json\" and \"```\":\n",
       "\n",
       "```json\n",
       "{{\n",
@@ -227,7 +212,12 @@
     }
    ],
    "source": [
+    "from llama_index.prompts.default_prompts import (\n",
+    "    DEFAULT_TEXT_QA_PROMPT_TMPL,\n",
+    ")\n",
+    "\n",
     "# take a look at the new QA template!\n",
+    "fmt_qa_tmpl = output_parser.format(DEFAULT_TEXT_QA_PROMPT_TMPL)\n",
     "print(fmt_qa_tmpl)"
    ]
   },
@@ -245,40 +235,32 @@
    "id": "fb9cdf43-0f31-4c36-869b-df9fa50aebdb",
    "metadata": {},
    "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 609 tokens\n"
-     ]
-    },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "> [query] Total LLM token usage: 609 tokens\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 11 tokens\n"
+      "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"\n",
+      "HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"\n"
      ]
     },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "> [query] Total embedding token usage: 11 tokens\n"
+      "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
+      "HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n"
      ]
     }
    ],
    "source": [
+    "from llama_index import ServiceContext\n",
+    "from llama_index.llms import OpenAI\n",
+    "\n",
+    "llm = OpenAI(output_parser=output_parser)\n",
+    "ctx = ServiceContext.from_defaults(llm=llm)\n",
+    "\n",
     "query_engine = index.as_query_engine(\n",
-    "    text_qa_template=qa_prompt,\n",
-    "    refine_template=refine_prompt,\n",
-    "    llm_predictor=llm_predictor,\n",
+    "    service_context=ctx,\n",
     ")\n",
     "response = query_engine.query(\n",
     "    \"What are a few things the author did growing up?\",\n",
@@ -295,7 +277,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "{'Education': 'Before college, the author wrote short stories and experimented with programming on an IBM 1401.', 'Work': 'The author worked on writing and programming outside of school.'}\n"
+      "{'Education': 'The author did not plan to study programming in college, but initially planned to study philosophy.', 'Work': 'Growing up, the author worked on writing short stories and programming. They wrote simple games, a program to predict rocket heights, and a word processor.'}\n"
      ]
     }
    ],
@@ -306,9 +288,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "llama_index",
+   "display_name": "llama-index-4a-wkI5X-py3.11",
    "language": "python",
-   "name": "llama_index"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
diff --git a/docs/examples/output_parsing/evaporate_program.ipynb b/docs/examples/output_parsing/evaporate_program.ipynb
index be9dbafcd18076cf4f3df8115f02004330758792..a2e68be329d912a7144988ced5dc89843a32b30b 100644
--- a/docs/examples/output_parsing/evaporate_program.ipynb
+++ b/docs/examples/output_parsing/evaporate_program.ipynb
@@ -60,7 +60,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from llama_index import SimpleDirectoryReader, ServiceContext, LLMPredictor\n",
+    "from llama_index import SimpleDirectoryReader, ServiceContext\n",
     "from llama_index.program.predefined import (\n",
     "    DFEvaporateProgram,\n",
     "    EvaporateExtractor,\n",
diff --git a/docs/examples/output_parsing/guidance_sub_question.ipynb b/docs/examples/output_parsing/guidance_sub_question.ipynb
index 39e63135d38b1973309c4f19662c96d4771401ed..9220523f93c071253e6ae34865b06bae4e2f7acf 100644
--- a/docs/examples/output_parsing/guidance_sub_question.ipynb
+++ b/docs/examples/output_parsing/guidance_sub_question.ipynb
@@ -204,7 +204,6 @@
    "source": [
     "from llama_index import (\n",
     "    SimpleDirectoryReader,\n",
-    "    LLMPredictor,\n",
     "    ServiceContext,\n",
     "    VectorStoreIndex,\n",
     ")\n",
diff --git a/docs/examples/query_engine/JointQASummary.ipynb b/docs/examples/query_engine/JointQASummary.ipynb
index a4e1fd48b85ca143009a8012479bc5ad4feef986..68952479f277f081080a1a61e37265d93745f95b 100644
--- a/docs/examples/query_engine/JointQASummary.ipynb
+++ b/docs/examples/query_engine/JointQASummary.ipynb
@@ -71,7 +71,7 @@
     "from llama_index.composability.joint_qa_summary import (\n",
     "    QASummaryQueryEngineBuilder,\n",
     ")\n",
-    "from llama_index import SimpleDirectoryReader, ServiceContext, LLMPredictor\n",
+    "from llama_index import SimpleDirectoryReader, ServiceContext\n",
     "from llama_index.response.notebook_utils import display_response\n",
     "from llama_index.llms import OpenAI"
    ]
diff --git a/docs/examples/query_engine/SQLAutoVectorQueryEngine.ipynb b/docs/examples/query_engine/SQLAutoVectorQueryEngine.ipynb
index 33c692a01dcc80a9b3bca895d5f7544f91b68be1..0c78a1528904b6d2f079bf84982d66b8d27fefd9 100644
--- a/docs/examples/query_engine/SQLAutoVectorQueryEngine.ipynb
+++ b/docs/examples/query_engine/SQLAutoVectorQueryEngine.ipynb
@@ -170,7 +170,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from llama_index import ServiceContext, LLMPredictor\n",
+    "from llama_index import ServiceContext\n",
     "from llama_index.storage import StorageContext\n",
     "from llama_index.vector_stores import PineconeVectorStore\n",
     "from llama_index.node_parser import TokenTextSplitter\n",
diff --git a/docs/examples/query_engine/SQLJoinQueryEngine.ipynb b/docs/examples/query_engine/SQLJoinQueryEngine.ipynb
index 07d19dc598f879496022143e546d6eeb2bfd52bc..dff62f347f47425c69d132ab1953f4d5b2a02d86 100644
--- a/docs/examples/query_engine/SQLJoinQueryEngine.ipynb
+++ b/docs/examples/query_engine/SQLJoinQueryEngine.ipynb
@@ -137,7 +137,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from llama_index import ServiceContext, LLMPredictor\n",
+    "from llama_index import ServiceContext\n",
     "from llama_index.storage import StorageContext\n",
     "from llama_index.vector_stores import PineconeVectorStore\n",
     "from llama_index.node_parser import TokenTextSplitter\n",
diff --git a/docs/examples/query_engine/citation_query_engine.ipynb b/docs/examples/query_engine/citation_query_engine.ipynb
index 034359448db9a55d1a9e35d4e6f03b19e5089b63..57b0b196c9a9d698409f922ce6ef40039895306a 100644
--- a/docs/examples/query_engine/citation_query_engine.ipynb
+++ b/docs/examples/query_engine/citation_query_engine.ipynb
@@ -73,7 +73,6 @@
     "    SimpleDirectoryReader,\n",
     "    StorageContext,\n",
     "    load_index_from_storage,\n",
-    "    LLMPredictor,\n",
     "    ServiceContext,\n",
     ")"
    ]
diff --git a/docs/examples/query_engine/flare_query_engine.ipynb b/docs/examples/query_engine/flare_query_engine.ipynb
index 983273bf4331f5f377efc04a8aa75713cdb6da8c..85f4cc106cfa2cc0f4a8f0a73b72938d9ff4b563 100644
--- a/docs/examples/query_engine/flare_query_engine.ipynb
+++ b/docs/examples/query_engine/flare_query_engine.ipynb
@@ -62,7 +62,6 @@
     "    VectorStoreIndex,\n",
     "    SimpleDirectoryReader,\n",
     "    StorageContext,\n",
-    "    LLMPredictor,\n",
     "    ServiceContext,\n",
     ")"
    ]
diff --git a/docs/examples/query_engine/knowledge_graph_query_engine.ipynb b/docs/examples/query_engine/knowledge_graph_query_engine.ipynb
index e25557e9bb520717db73f83be5342a1f402277c2..912f121ac07232c6c59d4d825ab733b2e3ae6649 100644
--- a/docs/examples/query_engine/knowledge_graph_query_engine.ipynb
+++ b/docs/examples/query_engine/knowledge_graph_query_engine.ipynb
@@ -80,7 +80,6 @@
     "\n",
     "from llama_index import (\n",
     "    KnowledgeGraphIndex,\n",
-    "    LLMPredictor,\n",
     "    ServiceContext,\n",
     "    SimpleDirectoryReader,\n",
     ")\n",
@@ -113,7 +112,6 @@
     "    VectorStoreIndex,\n",
     "    SimpleDirectoryReader,\n",
     "    KnowledgeGraphIndex,\n",
-    "    LLMPredictor,\n",
     "    ServiceContext,\n",
     ")\n",
     "\n",
diff --git a/docs/examples/query_engine/knowledge_graph_rag_query_engine.ipynb b/docs/examples/query_engine/knowledge_graph_rag_query_engine.ipynb
index afca05e20c4899312f1afb107b5d976f84c55bb5..8e710bd14a09cbd3b0e39658a85c4125b90ae5e7 100644
--- a/docs/examples/query_engine/knowledge_graph_rag_query_engine.ipynb
+++ b/docs/examples/query_engine/knowledge_graph_rag_query_engine.ipynb
@@ -88,7 +88,6 @@
     "\n",
     "from llama_index import (\n",
     "    KnowledgeGraphIndex,\n",
-    "    LLMPredictor,\n",
     "    ServiceContext,\n",
     "    SimpleDirectoryReader,\n",
     ")\n",
@@ -121,7 +120,6 @@
     "    VectorStoreIndex,\n",
     "    SimpleDirectoryReader,\n",
     "    KnowledgeGraphIndex,\n",
-    "    LLMPredictor,\n",
     "    ServiceContext,\n",
     ")\n",
     "\n",
diff --git a/docs/examples/query_engine/pdf_tables/recursive_retriever.ipynb b/docs/examples/query_engine/pdf_tables/recursive_retriever.ipynb
index 511a283197b49c2482570280da1990dcccf87c46..77039150dd996ea71f5953740922c06e4428ad0c 100644
--- a/docs/examples/query_engine/pdf_tables/recursive_retriever.ipynb
+++ b/docs/examples/query_engine/pdf_tables/recursive_retriever.ipynb
@@ -34,7 +34,7 @@
     "from llama_index import Document, SummaryIndex\n",
     "\n",
     "# https://en.wikipedia.org/wiki/The_World%27s_Billionaires\n",
-    "from llama_index import VectorStoreIndex, ServiceContext, LLMPredictor\n",
+    "from llama_index import VectorStoreIndex, ServiceContext\n",
     "from llama_index.query_engine import PandasQueryEngine, RetrieverQueryEngine\n",
     "from llama_index.retrievers import RecursiveRetriever\n",
     "from llama_index.schema import IndexNode\n",
diff --git a/docs/examples/query_transformations/SimpleIndexDemo-multistep.ipynb b/docs/examples/query_transformations/SimpleIndexDemo-multistep.ipynb
index 15404199350c3d8f28db71f95604ddc7e2c1a8b8..18b69a28a7bc605e6696e42e7fa125facc020574 100644
--- a/docs/examples/query_transformations/SimpleIndexDemo-multistep.ipynb
+++ b/docs/examples/query_transformations/SimpleIndexDemo-multistep.ipynb
@@ -82,7 +82,6 @@
     "from llama_index import (\n",
     "    VectorStoreIndex,\n",
     "    SimpleDirectoryReader,\n",
-    "    LLMPredictor,\n",
     "    ServiceContext,\n",
     ")\n",
     "from llama_index.llms import OpenAI\n",
@@ -96,11 +95,11 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# LLM Predictor (gpt-3)\n",
+    "# LLM (gpt-3)\n",
     "gpt3 = OpenAI(temperature=0, model=\"text-davinci-003\")\n",
     "service_context_gpt3 = ServiceContext.from_defaults(llm=gpt3)\n",
     "\n",
-    "# LLMPredictor (gpt-4)\n",
+    "# LLM (gpt-4)\n",
     "gpt4 = OpenAI(temperature=0, model=\"gpt-4\")\n",
     "service_context_gpt4 = ServiceContext.from_defaults(llm=gpt4)"
    ]
@@ -145,16 +144,13 @@
     "from llama_index.indices.query.query_transform.base import (\n",
     "    StepDecomposeQueryTransform,\n",
     ")\n",
-    "from llama_index import LLMPredictor\n",
     "\n",
     "# gpt-4\n",
-    "step_decompose_transform = StepDecomposeQueryTransform(\n",
-    "    LLMPredictor(llm=gpt4), verbose=True\n",
-    ")\n",
+    "step_decompose_transform = StepDecomposeQueryTransform(llm=gpt4, verbose=True)\n",
     "\n",
     "# gpt-3\n",
     "step_decompose_transform_gpt3 = StepDecomposeQueryTransform(\n",
-    "    LLMPredictor(llm=gpt3), verbose=True\n",
+    "    llm=gpt3, verbose=True\n",
     ")"
    ]
   },
diff --git a/docs/examples/usecases/City_Analysis-Decompose-KeywordTable.ipynb b/docs/examples/usecases/City_Analysis-Decompose-KeywordTable.ipynb
index 463cc42a8e1143def210e7f2138fc531bc7f0e56..53d547b3b371a5c44ea24dffed6c848673c99578 100644
--- a/docs/examples/usecases/City_Analysis-Decompose-KeywordTable.ipynb
+++ b/docs/examples/usecases/City_Analysis-Decompose-KeywordTable.ipynb
@@ -80,7 +80,6 @@
     "    SimpleKeywordTableIndex,\n",
     "    SummaryIndex,\n",
     "    SimpleDirectoryReader,\n",
-    "    LLMPredictor,\n",
     "    ServiceContext,\n",
     ")\n",
     "import requests"
diff --git a/docs/examples/vector_stores/SimpleIndexDemoMMR.ipynb b/docs/examples/vector_stores/SimpleIndexDemoMMR.ipynb
index 96efbb40d24c96728ca6f36c7ce2ab792136b5fc..58e2f570e6f1cf3d74add588d1fa501483466271 100644
--- a/docs/examples/vector_stores/SimpleIndexDemoMMR.ipynb
+++ b/docs/examples/vector_stores/SimpleIndexDemoMMR.ipynb
@@ -192,7 +192,6 @@
     "    VectorStoreIndex,\n",
     "    SimpleDirectoryReader,\n",
     "    ServiceContext,\n",
-    "    LLMPredictor,\n",
     ")\n",
     "from llama_index.response.notebook_utils import display_source_node\n",
     "from llama_index.llms import OpenAI\n",
diff --git a/docs/module_guides/models/llms/usage_custom.md b/docs/module_guides/models/llms/usage_custom.md
index 75acbd1f9c4a69c1b496b5f9fe1b6fbb8f2f39a0..103ebe1f986046f9077adc3649a88472db85bb87 100644
--- a/docs/module_guides/models/llms/usage_custom.md
+++ b/docs/module_guides/models/llms/usage_custom.md
@@ -24,7 +24,6 @@ you may also plug in any LLM shown on Langchain's
 from llama_index import (
     KeywordTableIndex,
     SimpleDirectoryReader,
-    LLMPredictor,
     ServiceContext,
 )
 from llama_index.llms import OpenAI
diff --git a/docs/module_guides/querying/output_parser.md b/docs/module_guides/querying/output_parser.md
index 60fc812f637f77c3d7d8e5d1389373049b0fd2ff..72256747389ef10a00ca49df1049b5aba702272b 100644
--- a/docs/module_guides/querying/output_parser.md
+++ b/docs/module_guides/querying/output_parser.md
@@ -13,22 +13,12 @@ Guardrails is an open-source Python package for specification/validation/correct
 ```python
 from llama_index import VectorStoreIndex, SimpleDirectoryReader
 from llama_index.output_parsers import GuardrailsOutputParser
-from llama_index.llm_predictor import StructuredLLMPredictor
-from llama_index.prompts import PromptTemplate
-from llama_index.prompts.default_prompts import (
-    DEFAULT_TEXT_QA_PROMPT_TMPL,
-    DEFAULT_REFINE_PROMPT_TMPL,
-)
+from llama_index.llms import OpenAI
 
 
 # load documents, build index
 documents = SimpleDirectoryReader("../paul_graham_essay/data").load_data()
 index = VectorStoreIndex(documents, chunk_size=512)
-llm_predictor = StructuredLLMPredictor()
-
-
-# specify StructuredLLMPredictor
-# this is a special LLMPredictor that allows for structured outputs
 
 # define query / output spec
 rail_spec = """
@@ -59,22 +49,18 @@ Query string here.
 
 # define output parser
 output_parser = GuardrailsOutputParser.from_rail_string(
-    rail_spec, llm=llm_predictor.llm
+    rail_spec, llm=OpenAI()
 )
 
-# format each prompt with output parser instructions
-fmt_qa_tmpl = output_parser.format(DEFAULT_TEXT_QA_PROMPT_TMPL)
-fmt_refine_tmpl = output_parser.format(DEFAULT_REFINE_PROMPT_TMPL)
-
-qa_prompt = PromptTemplate(fmt_qa_tmpl, output_parser=output_parser)
-refine_prompt = PromptTemplate(fmt_refine_tmpl, output_parser=output_parser)
+# Attach output parser to LLM
+llm = OpenAI(output_parser=output_parser)
 
 # obtain a structured response
-query_engine = index.as_query_engine(
-    service_context=ServiceContext.from_defaults(llm_predictor=llm_predictor),
-    text_qa_template=qa_prompt,
-    refine_template=refine_prompt,
-)
+from llama_index import ServiceContext
+
+ctx = ServiceContext.from_defaults(llm=llm)
+
+query_engine = index.as_query_engine(service_context=ctx)
 response = query_engine.query(
     "What are the three items the author did growing up?",
 )
@@ -94,19 +80,13 @@ Langchain also offers output parsing modules that you can use within LlamaIndex.
 ```python
 from llama_index import VectorStoreIndex, SimpleDirectoryReader
 from llama_index.output_parsers import LangchainOutputParser
-from llama_index.llm_predictor import StructuredLLMPredictor
-from llama_index.prompts import PromptTemplate
-from llama_index.prompts.default_prompts import (
-    DEFAULT_TEXT_QA_PROMPT_TMPL,
-    DEFAULT_REFINE_PROMPT_TMPL,
-)
+from llama_index.llms import OpenAI
 from langchain.output_parsers import StructuredOutputParser, ResponseSchema
 
 
 # load documents, build index
 documents = SimpleDirectoryReader("../paul_graham_essay/data").load_data()
 index = VectorStoreIndex.from_documents(documents)
-llm_predictor = StructuredLLMPredictor()
 
 # define output schema
 response_schemas = [
@@ -126,18 +106,15 @@ lc_output_parser = StructuredOutputParser.from_response_schemas(
 )
 output_parser = LangchainOutputParser(lc_output_parser)
 
-# format each prompt with output parser instructions
-fmt_qa_tmpl = output_parser.format(DEFAULT_TEXT_QA_PROMPT_TMPL)
-fmt_refine_tmpl = output_parser.format(DEFAULT_REFINE_PROMPT_TMPL)
-qa_prompt = PromptTemplate(fmt_qa_tmpl, output_parser=output_parser)
-refine_prompt = PromptTemplate(fmt_refine_tmpl, output_parser=output_parser)
-
-# query index
-query_engine = index.as_query_engine(
-    service_context=ServiceContext.from_defaults(llm_predictor=llm_predictor),
-    text_qa_template=qa_prompt,
-    refine_template=refine_prompt,
-)
+# Attach output parser to LLM
+llm = OpenAI(output_parser=output_parser)
+
+# obtain a structured response
+from llama_index import ServiceContext
+
+ctx = ServiceContext.from_defaults(llm=llm)
+
+query_engine = index.as_query_engine(service_context=ctx)
 response = query_engine.query(
     "What are a few things the author did growing up?",
 )
diff --git a/docs/module_guides/supporting_modules/service_context.md b/docs/module_guides/supporting_modules/service_context.md
index 1f36eca0773e7d92b7787a502f75e1d1427c21bd..eff6ea8bf4efddc4b30625a9506c2e54a091868f 100644
--- a/docs/module_guides/supporting_modules/service_context.md
+++ b/docs/module_guides/supporting_modules/service_context.md
@@ -69,7 +69,6 @@ Here's a complete example that sets up all objects using their default settings:
 ```python
 from llama_index import (
     ServiceContext,
-    LLMPredictor,
     OpenAIEmbedding,
     PromptHelper,
 )
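For reference, a minimal sketch of the setup this example now implies once `LLMPredictor` is dropped (the model name and settings below are illustrative, not taken from the diff):

```python
# Build a ServiceContext from an LLM directly; no LLMPredictor wrapper is needed.
from llama_index import OpenAIEmbedding, ServiceContext
from llama_index.llms import OpenAI

llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)  # illustrative model choice
embed_model = OpenAIEmbedding()

service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)
```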
diff --git a/docs/optimizing/advanced_retrieval/query_transformations.md b/docs/optimizing/advanced_retrieval/query_transformations.md
index c559fe2fad55e1c67af3436c1e1b97e11c1ffc9f..d10af98c3859dfa19057ea05cc288cae96a33de3 100644
--- a/docs/optimizing/advanced_retrieval/query_transformations.md
+++ b/docs/optimizing/advanced_retrieval/query_transformations.md
@@ -66,14 +66,12 @@ Here's a corresponding example code snippet over a composed graph.
 
 ```python
 # Setting: a summary index composed over multiple vector indices
-# llm_predictor_chatgpt corresponds to the ChatGPT LLM interface
+# llm_chatgpt corresponds to the ChatGPT LLM interface
 from llama_index.indices.query.query_transform.base import (
     DecomposeQueryTransform,
 )
 
-decompose_transform = DecomposeQueryTransform(
-    llm_predictor_chatgpt, verbose=True
-)
+decompose_transform = DecomposeQueryTransform(llm_chatgpt, verbose=True)
 
 # initialize indexes and graph
 ...
@@ -117,9 +115,7 @@ from llama_index.indices.query.query_transform.base import (
 )
 
 # gpt-4
-step_decompose_transform = StepDecomposeQueryTransform(
-    llm_predictor, verbose=True
-)
+step_decompose_transform = StepDecomposeQueryTransform(llm, verbose=True)
 
 query_engine = index.as_query_engine()
 query_engine = MultiStepQueryEngine(
diff --git a/docs/understanding/evaluating/cost_analysis/root.md b/docs/understanding/evaluating/cost_analysis/root.md
index 119e91435ee01e18763b6fbb5a9681680c08516a..fe0cac0ef043cde1db0795a3678417a899bec51d 100644
--- a/docs/understanding/evaluating/cost_analysis/root.md
+++ b/docs/understanding/evaluating/cost_analysis/root.md
@@ -81,7 +81,7 @@ You may also predict the token usage of embedding calls with `MockEmbedding`.
 from llama_index import ServiceContext, set_global_service_context
 from llama_index import MockEmbedding
 
-# specify a MockLLMPredictor
+# specify a MockEmbedding
 embed_model = MockEmbedding(embed_dim=1536)
 
 service_context = ServiceContext.from_defaults(embed_model=embed_model)
diff --git a/docs/understanding/putting_it_all_together/apps/fullstack_with_delphic.md b/docs/understanding/putting_it_all_together/apps/fullstack_with_delphic.md
index 1ddea915affb43da414829cd07e35f9e5bfed5a6..f4ff376c40c06003f402fa32e1fef712b4c6c515 100644
--- a/docs/understanding/putting_it_all_together/apps/fullstack_with_delphic.md
+++ b/docs/understanding/putting_it_all_together/apps/fullstack_with_delphic.md
@@ -329,7 +329,7 @@ async def receive(self, text_data):
 To load the collection model, the `load_collection_model` function is used, which can be found
 in [`delphic/utils/collections.py`](https://github.com/JSv4/Delphic/blob/main/delphic/utils/collections.py). This
 function retrieves the collection object with the given collection ID, checks if a JSON file for the collection model
-exists, and if not, creates one. Then, it sets up the `LLMPredictor` and `ServiceContext` before loading
+exists, and if not, creates one. Then, it sets up the `LLM` and `ServiceContext` before loading
 the `VectorStoreIndex` using the cache file.
 
 ```python
diff --git a/docs/understanding/putting_it_all_together/q_and_a.md b/docs/understanding/putting_it_all_together/q_and_a.md
index ac6bee7ffcaa872e493c4acc2f48e3239030a00b..942c968459154940bf51d5934b9a7239fba2a3fd 100644
--- a/docs/understanding/putting_it_all_together/q_and_a.md
+++ b/docs/understanding/putting_it_all_together/q_and_a.md
@@ -140,7 +140,7 @@ from llama_index.indices.query.query_transform.base import (
 )
 
 decompose_transform = DecomposeQueryTransform(
-    service_context.llm_predictor, verbose=True
+    service_context.llm, verbose=True
 )
 ```
 
diff --git a/docs/understanding/putting_it_all_together/q_and_a/terms_definitions_tutorial.md b/docs/understanding/putting_it_all_together/q_and_a/terms_definitions_tutorial.md
index 36f4789d3fd7947620e54615dc17d030ec23962e..d4ceb765264413194c64a63f21cf4c79af7b3b5d 100644
--- a/docs/understanding/putting_it_all_together/q_and_a/terms_definitions_tutorial.md
+++ b/docs/understanding/putting_it_all_together/q_and_a/terms_definitions_tutorial.md
@@ -87,7 +87,6 @@ We can add the following functions to both initialize our LLM, as well as use it
 from llama_index import (
     Document,
     SummaryIndex,
-    LLMPredictor,
     ServiceContext,
     load_index_from_storage,
 )
@@ -358,7 +357,7 @@ from llama_index.prompts import (
     ChatPromptTemplate,
 )
 from llama_index.prompts.utils import is_chat_model
-from llama_index.llms.base import ChatMessage, MessageRole
+from llama_index.llms import ChatMessage, MessageRole
 
 # Text QA templates
 DEFAULT_TEXT_QA_PROMPT_TMPL = (
diff --git a/docs/understanding/putting_it_all_together/q_and_a/unified_query.md b/docs/understanding/putting_it_all_together/q_and_a/unified_query.md
index 8de57a374640ad66a0d2997bc15b89f18c2a469b..17f82eed993edf1a7633b8e2278434f6e7ec0fc4 100644
--- a/docs/understanding/putting_it_all_together/q_and_a/unified_query.md
+++ b/docs/understanding/putting_it_all_together/q_and_a/unified_query.md
@@ -151,14 +151,11 @@ An example is shown below.
 
 ```python
 # define decompose_transform
-from llama_index import LLMPredictor
 from llama_index.indices.query.query_transform.base import (
     DecomposeQueryTransform,
 )
 
-decompose_transform = DecomposeQueryTransform(
-    LLMPredictor(llm=llm_gpt4), verbose=True
-)
+decompose_transform = DecomposeQueryTransform(llm=llm_gpt4, verbose=True)
 
 # define custom query engines
 from llama_index.query_engine.transform_query_engine import (
diff --git a/examples/async/AsyncComposableIndicesSEC.ipynb b/examples/async/AsyncComposableIndicesSEC.ipynb
index acbae9f749c93d75361d8266f262ffd28e77c1bb..b8275b6e8a1ce0f1e48e9240c89c9aea2a3ddc27 100644
--- a/examples/async/AsyncComposableIndicesSEC.ipynb
+++ b/examples/async/AsyncComposableIndicesSEC.ipynb
@@ -143,7 +143,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from llama_index import SummaryIndex, LLMPredictor\n",
+    "from llama_index import SummaryIndex\n",
     "from llama_index.llms import OpenAI\n",
     "from llama_index.composability import ComposableGraph"
    ]
diff --git a/examples/async/AsyncLLMPredictorDemo.ipynb b/examples/async/AsyncLLMPredictorDemo.ipynb
index 3d66da8f4969c9dbf29568bbf69c90ea08a858ef..09a8f458e16225c49083ad228c7185c29d1f72ef 100644
--- a/examples/async/AsyncLLMPredictorDemo.ipynb
+++ b/examples/async/AsyncLLMPredictorDemo.ipynb
@@ -5,7 +5,7 @@
    "id": "43cea4f8",
    "metadata": {},
    "source": [
-    "# Async LLMPredictor Demo"
+    "# Async LLM Demo"
    ]
   },
   {
@@ -15,7 +15,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from llama_index.langchain_helpers.chain_wrapper import LLMPredictor\n",
+    "from llama_index.llms import OpenAI\n",
     "from llama_index.prompts.default_prompts import DEFAULT_SUMMARY_PROMPT\n",
     "import asyncio\n",
     "import time"
@@ -44,7 +44,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "llm = LLMPredictor()"
+    "llm = OpenAI()"
    ]
   },
   {
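Based on the notebook's imports, a condensed sketch of the sync-vs-async comparison it is set up for (the chunk texts and timing scaffold are placeholders):

```python
# Compare the blocking predict() call with apredict() fanned out via asyncio.gather.
import asyncio
import time

from llama_index.llms import OpenAI
from llama_index.prompts.default_prompts import DEFAULT_SUMMARY_PROMPT

llm = OpenAI()
chunks = ["chunk one ...", "chunk two ...", "chunk three ..."]  # placeholder texts

start = time.time()
sync_summaries = [
    llm.predict(DEFAULT_SUMMARY_PROMPT, context_str=chunk) for chunk in chunks
]
print(f"sync:  {time.time() - start:.2f}s")


async def summarize_all() -> list:
    tasks = [llm.apredict(DEFAULT_SUMMARY_PROMPT, context_str=chunk) for chunk in chunks]
    return await asyncio.gather(*tasks)


start = time.time()
async_summaries = asyncio.run(summarize_all())
print(f"async: {time.time() - start:.2f}s")
```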
diff --git a/examples/experimental/Evaporate.ipynb b/examples/experimental/Evaporate.ipynb
index b09d543fac7df4a382a39c1d3428fc3005f28470..ea1f892c34a2c4e0571dc606db3a01918b642b09 100644
--- a/examples/experimental/Evaporate.ipynb
+++ b/examples/experimental/Evaporate.ipynb
@@ -16,7 +16,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from llama_index import SimpleDirectoryReader, ServiceContext, LLMPredictor\n",
+    "from llama_index import SimpleDirectoryReader, ServiceContext\n",
     "from llama_index.experimental.evaporate import EvaporateExtractor\n",
     "from llama_index.llms import OpenAI\n",
     "import requests"
diff --git a/examples/paul_graham_essay/GPT4Comparison.ipynb b/examples/paul_graham_essay/GPT4Comparison.ipynb
index 710996ed164ad4508014e4648a24b5b941d58366..fae5c47be3b61749ebca41b390185ceb8facc032 100644
--- a/examples/paul_graham_essay/GPT4Comparison.ipynb
+++ b/examples/paul_graham_essay/GPT4Comparison.ipynb
@@ -10,7 +10,6 @@
     "from llama_index import (\n",
     "    SummaryIndex,\n",
     "    SimpleDirectoryReader,\n",
-    "    LLMPredictor,\n",
     "    ServiceContext,\n",
     ")\n",
     "from llama_index.response.notebook_utils import display_response\n",
diff --git a/examples/test_wiki/TestNYC-Benchmark-GPT4.ipynb b/examples/test_wiki/TestNYC-Benchmark-GPT4.ipynb
index 40ec9c491cfa6ce36436544a12fd5d0544b1f763..ac3d7ab364c46ff39b807ecba5aa42e409aff9ce 100644
--- a/examples/test_wiki/TestNYC-Benchmark-GPT4.ipynb
+++ b/examples/test_wiki/TestNYC-Benchmark-GPT4.ipynb
@@ -52,14 +52,13 @@
     "from llama_index import (\n",
     "    TreeIndex,\n",
     "    SimpleDirectoryReader,\n",
-    "    LLMPredictor,\n",
     "    VectorStoreIndex,\n",
     "    SummaryIndex,\n",
     "    PromptTemplate,\n",
     "    ServiceContext,\n",
     ")\n",
     "from llama_index.indices.base import BaseIndex\n",
-    "from llama_index.llms.base import LLM\n",
+    "from llama_index.llms.llm import LLM\n",
     "from llama_index.llms import OpenAI\n",
     "from llama_index.response.schema import Response\n",
     "import pandas as pd\n",
@@ -346,14 +345,12 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def analyze_outcome_llm_single(\n",
-    "    outcome: TestOutcome, llm_predictor: LLMPredictor\n",
-    ") -> Tuple[bool, bool]:\n",
+    "def analyze_outcome_llm_single(outcome: TestOutcome, llm: LLM) -> Tuple[bool, bool]:\n",
     "    try:\n",
     "        source_text = outcome.response.source_nodes[0].text\n",
     "    except:\n",
     "        source_text = \"Failed to retrieve any context\"\n",
-    "    result_str, _ = llm_predictor.predict(\n",
+    "    result_str, _ = llm.predict(\n",
     "        DEFAULT_EVAL_PROMPT,\n",
     "        query_str=outcome.test.query,\n",
     "        context_str=source_text,\n",
@@ -363,13 +360,11 @@
     "    return is_answer_correct, is_context_relevant, result_str\n",
     "\n",
     "\n",
-    "def analyze_outcome_llm(\n",
-    "    outcomes: List[TestOutcome], llm_predictor: LLMPredictor\n",
-    ") -> None:\n",
+    "def analyze_outcome_llm(outcomes: List[TestOutcome], llm: LLM) -> None:\n",
     "    rows = []\n",
     "    for outcome in outcomes:\n",
     "        is_correct_response, is_correct_source, result_str = analyze_outcome_llm_single(\n",
-    "            outcome, llm_predictor\n",
+    "            outcome, llm\n",
     "        )\n",
     "        row = [outcome.test.query, is_correct_response, is_correct_source, result_str]\n",
     "        rows.append(row)\n",
@@ -434,7 +429,7 @@
    "id": "5b2e7fdd",
    "metadata": {},
    "source": [
-    "# Create LLMPredictors"
+    "# Create LLMs"
    ]
   },
   {
diff --git a/examples/test_wiki/TestNYC-Tree-GPT4.ipynb b/examples/test_wiki/TestNYC-Tree-GPT4.ipynb
index 7f8bc21a1fc6a605cc4daa13eadbb4e5042913f8..a940438438cac09e66c8d9cead948dbc8db6b457 100644
--- a/examples/test_wiki/TestNYC-Tree-GPT4.ipynb
+++ b/examples/test_wiki/TestNYC-Tree-GPT4.ipynb
@@ -70,7 +70,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from llama_index import TreeIndex, SimpleDirectoryReader, LLMPredictor, ServiceContext\n",
+    "from llama_index import TreeIndex, SimpleDirectoryReader, ServiceContext\n",
     "from llama_index.logger import LlamaLogger\n",
     "from llama_index.llms import OpenAI"
    ]
diff --git a/experimental/classifier/utils.py b/experimental/classifier/utils.py
index 43b989de80aa00f265a7261a9ed334719f1f76fb..877dfd7adb34c37871cfc33857d190c29e5ca9cb 100644
--- a/experimental/classifier/utils.py
+++ b/experimental/classifier/utils.py
@@ -8,7 +8,7 @@ import pandas as pd
 from sklearn.model_selection import train_test_split
 
 from llama_index.indices.utils import extract_numbers_given_response
-from llama_index.llm_predictor import LLMPredictor
+from llama_index.llms import OpenAI
 from llama_index.prompts import BasePromptTemplate, PromptTemplate
 
 
@@ -77,13 +77,11 @@ def get_eval_preds(
     train_prompt: BasePromptTemplate, train_str: str, eval_df: pd.DataFrame, n: int = 20
 ) -> List:
     """Get eval preds."""
-    llm_predictor = LLMPredictor()
+    llm = OpenAI()
     eval_preds = []
     for i in range(n):
         eval_str = get_sorted_dict_str(eval_df.iloc[i].to_dict())
-        response = llm_predictor.predict(
-            train_prompt, train_str=train_str, eval_str=eval_str
-        )
+        response = llm.predict(train_prompt, train_str=train_str, eval_str=eval_str)
         pred = extract_float_given_response(response)
         print(f"Getting preds: {i}/{n}: {pred}")
         if pred is None:
diff --git a/experimental/cli/configuration.py b/experimental/cli/configuration.py
index a78cde46bf9b788bf18359df17a570addf6c5060..33a72e240d2dbe125a463286f239e99cf16b01c2 100644
--- a/experimental/cli/configuration.py
+++ b/experimental/cli/configuration.py
@@ -13,7 +13,7 @@ from llama_index.indices import SimpleKeywordTableIndex
 from llama_index.indices.base import BaseIndex
 from llama_index.indices.loading import load_index_from_storage
 from llama_index.llm_predictor import StructuredLLMPredictor
-from llama_index.llms.base import LLM
+from llama_index.llms.llm import LLM
 from llama_index.llms.openai import OpenAI
 from llama_index.storage.storage_context import StorageContext
 
diff --git a/llama_index/agent/context_retriever_agent.py b/llama_index/agent/context_retriever_agent.py
index f2a463b14b1a9053f025e60e292d369610af1100..c54253ac32aead37f304f7c8bf164da59d711fe5 100644
--- a/llama_index/agent/context_retriever_agent.py
+++ b/llama_index/agent/context_retriever_agent.py
@@ -12,9 +12,10 @@ from llama_index.chat_engine.types import (
     AgentChatResponse,
 )
 from llama_index.core import BaseRetriever
-from llama_index.llms.base import LLM, ChatMessage
+from llama_index.llms.llm import LLM
 from llama_index.llms.openai import OpenAI
 from llama_index.llms.openai_utils import is_function_calling_model
+from llama_index.llms.types import ChatMessage
 from llama_index.memory import BaseMemory, ChatMemoryBuffer
 from llama_index.prompts import PromptTemplate
 from llama_index.schema import NodeWithScore
diff --git a/llama_index/agent/openai_agent.py b/llama_index/agent/openai_agent.py
index 2ea23ad605c37b93dde4b77067adb8518b5a4fd9..9f7dd54fb306e168112a629b031e255a58c8278f 100644
--- a/llama_index/agent/openai_agent.py
+++ b/llama_index/agent/openai_agent.py
@@ -18,9 +18,10 @@ from llama_index.chat_engine.types import (
     ChatResponseMode,
     StreamingAgentChatResponse,
 )
-from llama_index.llms.base import LLM, ChatMessage, ChatResponse, MessageRole
+from llama_index.llms.llm import LLM
 from llama_index.llms.openai import OpenAI
 from llama_index.llms.openai_utils import OpenAIToolCall
+from llama_index.llms.types import ChatMessage, ChatResponse, MessageRole
 from llama_index.memory import BaseMemory, ChatMemoryBuffer
 from llama_index.objects.base import ObjectRetriever
 from llama_index.tools import BaseTool, ToolOutput, adapt_to_async_tool
diff --git a/llama_index/agent/openai_assistant_agent.py b/llama_index/agent/openai_assistant_agent.py
index d38f4cec54a761d1eb6e89b7061b9f072f15d10f..81015b1b885e3e0f5e6311118725f7fa9d79b2bc 100644
--- a/llama_index/agent/openai_assistant_agent.py
+++ b/llama_index/agent/openai_assistant_agent.py
@@ -18,7 +18,7 @@ from llama_index.chat_engine.types import (
     ChatResponseMode,
     StreamingAgentChatResponse,
 )
-from llama_index.llms.base import ChatMessage, MessageRole
+from llama_index.llms.types import ChatMessage, MessageRole
 from llama_index.tools import BaseTool, ToolOutput
 
 logger = logging.getLogger(__name__)
diff --git a/llama_index/agent/react/base.py b/llama_index/agent/react/base.py
index 18759c6c6a1275c0f053e5cb21b82b62869405ae..84c484d37e39c915443ca21c99e2e599877311e9 100644
--- a/llama_index/agent/react/base.py
+++ b/llama_index/agent/react/base.py
@@ -30,8 +30,9 @@ from llama_index.callbacks import (
     trace_method,
 )
 from llama_index.chat_engine.types import AgentChatResponse, StreamingAgentChatResponse
-from llama_index.llms.base import LLM, ChatMessage, ChatResponse, MessageRole
+from llama_index.llms.llm import LLM
 from llama_index.llms.openai import OpenAI
+from llama_index.llms.types import ChatMessage, ChatResponse, MessageRole
 from llama_index.memory.chat_memory_buffer import ChatMemoryBuffer
 from llama_index.memory.types import BaseMemory
 from llama_index.objects.base import ObjectRetriever
diff --git a/llama_index/agent/react/formatter.py b/llama_index/agent/react/formatter.py
index e1733846b3229f5509da9e100e0dee159b1c5a1d..ab39d29fe5f40b53280dedcfd6f621443cce4af8 100644
--- a/llama_index/agent/react/formatter.py
+++ b/llama_index/agent/react/formatter.py
@@ -6,7 +6,7 @@ from typing import List, Optional, Sequence
 from llama_index.agent.react.prompts import REACT_CHAT_SYSTEM_HEADER
 from llama_index.agent.react.types import BaseReasoningStep, ObservationReasoningStep
 from llama_index.bridge.pydantic import BaseModel
-from llama_index.llms.base import ChatMessage, MessageRole
+from llama_index.llms.types import ChatMessage, MessageRole
 from llama_index.tools import BaseTool
 
 
diff --git a/llama_index/agent/types.py b/llama_index/agent/types.py
index 6595cf2bfa46b56ca20f7cef55b48e6ddb41853a..422516796c67965ceeb8ebaa387684b7a56877d6 100644
--- a/llama_index/agent/types.py
+++ b/llama_index/agent/types.py
@@ -4,7 +4,7 @@ from typing import List, Optional
 from llama_index.callbacks import trace_method
 from llama_index.chat_engine.types import BaseChatEngine, StreamingAgentChatResponse
 from llama_index.core import BaseQueryEngine
-from llama_index.llms.base import ChatMessage
+from llama_index.llms.types import ChatMessage
 from llama_index.prompts.mixin import PromptDictType, PromptMixinType
 from llama_index.response.schema import RESPONSE_TYPE, Response
 from llama_index.schema import QueryBundle
diff --git a/llama_index/callbacks/finetuning_handler.py b/llama_index/callbacks/finetuning_handler.py
index 811c7be8189a20b4c2ff3de5175ffc9c72b4aa21..577e1fe104b54b4677b33675d76e4950d3999cb0 100644
--- a/llama_index/callbacks/finetuning_handler.py
+++ b/llama_index/callbacks/finetuning_handler.py
@@ -35,7 +35,7 @@ class BaseFinetuningHandler(BaseCallbackHandler):
         **kwargs: Any,
     ) -> str:
         """Run when an event starts and return id of event."""
-        from llama_index.llms.base import ChatMessage, MessageRole
+        from llama_index.llms.types import ChatMessage, MessageRole
 
         if event_type == CBEventType.LLM:
             cur_messages = []
@@ -68,7 +68,7 @@ class BaseFinetuningHandler(BaseCallbackHandler):
         **kwargs: Any,
     ) -> None:
         """Run when an event ends."""
-        from llama_index.llms.base import ChatMessage, MessageRole
+        from llama_index.llms.types import ChatMessage, MessageRole
 
         if (
             event_type == CBEventType.LLM
diff --git a/llama_index/chat_engine/condense_plus_context.py b/llama_index/chat_engine/condense_plus_context.py
index 0399af7d2b050922358a77f0bd7d832ac125fa00..bc3b14ab4ec7975c7ee42213326d823eef2de227 100644
--- a/llama_index/chat_engine/condense_plus_context.py
+++ b/llama_index/chat_engine/condense_plus_context.py
@@ -13,9 +13,9 @@ from llama_index.chat_engine.types import (
 from llama_index.indices.base_retriever import BaseRetriever
 from llama_index.indices.query.schema import QueryBundle
 from llama_index.indices.service_context import ServiceContext
-from llama_index.llm_predictor.base import LLMPredictor
-from llama_index.llms.base import LLM, ChatMessage, MessageRole
 from llama_index.llms.generic_utils import messages_to_history_str
+from llama_index.llms.llm import LLM
+from llama_index.llms.types import ChatMessage, MessageRole
 from llama_index.memory import BaseMemory, ChatMemoryBuffer
 from llama_index.postprocessor.types import BaseNodePostprocessor
 from llama_index.prompts.base import PromptTemplate
@@ -60,7 +60,6 @@ class CondensePlusContextChatEngine(BaseChatEngine):
         self,
         retriever: BaseRetriever,
         llm: LLM,
-        llm_predictor: LLMPredictor,
         memory: BaseMemory,
         context_prompt: Optional[str] = None,
         condense_prompt: Optional[str] = None,
@@ -72,7 +71,6 @@ class CondensePlusContextChatEngine(BaseChatEngine):
     ):
         self._retriever = retriever
         self._llm = llm
-        self._llm_predictor = llm_predictor
         self._memory = memory
         self._context_prompt_template = (
             context_prompt or DEFAULT_CONTEXT_PROMPT_TEMPLATE
@@ -106,10 +104,7 @@ class CondensePlusContextChatEngine(BaseChatEngine):
     ) -> "CondensePlusContextChatEngine":
         """Initialize a CondensePlusContextChatEngine from default parameters."""
         service_context = service_context or ServiceContext.from_defaults()
-        if not isinstance(service_context.llm_predictor, LLMPredictor):
-            raise ValueError("llm_predictor must be a LLMPredictor instance")
-        llm_predictor = service_context.llm_predictor
-        llm = llm_predictor.llm
+        llm = service_context.llm
         chat_history = chat_history or []
         memory = memory or ChatMemoryBuffer.from_defaults(
             chat_history=chat_history, token_limit=llm.metadata.context_window - 256
@@ -118,7 +113,6 @@ class CondensePlusContextChatEngine(BaseChatEngine):
         return cls(
             retriever=retriever,
             llm=llm,
-            llm_predictor=llm_predictor,
             memory=memory,
             context_prompt=context_prompt,
             condense_prompt=condense_prompt,
@@ -139,7 +133,7 @@ class CondensePlusContextChatEngine(BaseChatEngine):
         chat_history_str = messages_to_history_str(chat_history)
         logger.debug(chat_history_str)
 
-        return self._llm_predictor.predict(
+        return self._llm.predict(
             self._condense_prompt_template,
             question=latest_message,
             chat_history=chat_history_str,
@@ -155,7 +149,7 @@ class CondensePlusContextChatEngine(BaseChatEngine):
         chat_history_str = messages_to_history_str(chat_history)
         logger.debug(chat_history_str)
 
-        return await self._llm_predictor.apredict(
+        return await self._llm.apredict(
             self._condense_prompt_template,
             question=latest_message,
             chat_history=chat_history_str,
diff --git a/llama_index/chat_engine/condense_question.py b/llama_index/chat_engine/condense_question.py
index e8c9500701acfadf217034193471f4442e5f2483..ef8f2f19a13a7f164cf9b45b3f4dd2c08f23bcac 100644
--- a/llama_index/chat_engine/condense_question.py
+++ b/llama_index/chat_engine/condense_question.py
@@ -10,9 +10,9 @@ from llama_index.chat_engine.types import (
 )
 from llama_index.chat_engine.utils import response_gen_from_query_engine
 from llama_index.core import BaseQueryEngine
-from llama_index.llm_predictor.base import LLMPredictor
-from llama_index.llms.base import ChatMessage, MessageRole
+from llama_index.llm_predictor.base import LLMPredictorType
 from llama_index.llms.generic_utils import messages_to_history_str
+from llama_index.llms.types import ChatMessage, MessageRole
 from llama_index.memory import BaseMemory, ChatMemoryBuffer
 from llama_index.prompts.base import BasePromptTemplate, PromptTemplate
 from llama_index.response.schema import RESPONSE_TYPE, StreamingResponse
@@ -51,14 +51,14 @@ class CondenseQuestionChatEngine(BaseChatEngine):
         query_engine: BaseQueryEngine,
         condense_question_prompt: BasePromptTemplate,
         memory: BaseMemory,
-        service_context: ServiceContext,
+        llm: LLMPredictorType,
         verbose: bool = False,
         callback_manager: Optional[CallbackManager] = None,
     ) -> None:
         self._query_engine = query_engine
         self._condense_question_prompt = condense_question_prompt
         self._memory = memory
-        self._service_context = service_context
+        self._llm = llm
         self._verbose = verbose
         self.callback_manager = callback_manager or CallbackManager([])
 
@@ -80,9 +80,7 @@ class CondenseQuestionChatEngine(BaseChatEngine):
         condense_question_prompt = condense_question_prompt or DEFAULT_PROMPT
 
         service_context = service_context or ServiceContext.from_defaults()
-        if not isinstance(service_context.llm_predictor, LLMPredictor):
-            raise ValueError("llm_predictor must be a LLMPredictor instance")
-        llm = service_context.llm_predictor.llm
+        llm = service_context.llm
 
         chat_history = chat_history or []
         memory = memory or memory_cls.from_defaults(chat_history=chat_history, llm=llm)
@@ -100,7 +98,7 @@ class CondenseQuestionChatEngine(BaseChatEngine):
             query_engine,
             condense_question_prompt,
             memory,
-            service_context,
+            llm,
             verbose=verbose,
             callback_manager=service_context.callback_manager,
         )
@@ -114,7 +112,7 @@ class CondenseQuestionChatEngine(BaseChatEngine):
         chat_history_str = messages_to_history_str(chat_history)
         logger.debug(chat_history_str)
 
-        return self._service_context.llm_predictor.predict(
+        return self._llm.predict(
             self._condense_question_prompt,
             question=last_message,
             chat_history=chat_history_str,
@@ -129,7 +127,7 @@ class CondenseQuestionChatEngine(BaseChatEngine):
         chat_history_str = messages_to_history_str(chat_history)
         logger.debug(chat_history_str)
 
-        return await self._service_context.llm_predictor.apredict(
+        return await self._llm.apredict(
             self._condense_question_prompt,
             question=last_message,
             chat_history=chat_history_str,
diff --git a/llama_index/chat_engine/context.py b/llama_index/chat_engine/context.py
index 9758df918800506db139fba7a4c22448c0c8a90a..04b76f136372f1894261a79c7c5757d1827a3238 100644
--- a/llama_index/chat_engine/context.py
+++ b/llama_index/chat_engine/context.py
@@ -10,8 +10,8 @@ from llama_index.chat_engine.types import (
     ToolOutput,
 )
 from llama_index.core import BaseRetriever
-from llama_index.llm_predictor.base import LLMPredictor
-from llama_index.llms.base import LLM, ChatMessage, MessageRole
+from llama_index.llms.llm import LLM
+from llama_index.llms.types import ChatMessage, MessageRole
 from llama_index.memory import BaseMemory, ChatMemoryBuffer
 from llama_index.postprocessor.types import BaseNodePostprocessor
 from llama_index.schema import MetadataMode, NodeWithScore, QueryBundle
@@ -68,9 +68,7 @@ class ContextChatEngine(BaseChatEngine):
     ) -> "ContextChatEngine":
         """Initialize a ContextChatEngine from default parameters."""
         service_context = service_context or ServiceContext.from_defaults()
-        if not isinstance(service_context.llm_predictor, LLMPredictor):
-            raise ValueError("llm_predictor must be a LLMPredictor instance")
-        llm = service_context.llm_predictor.llm
+        llm = service_context.llm
 
         chat_history = chat_history or []
         memory = memory or ChatMemoryBuffer.from_defaults(
diff --git a/llama_index/chat_engine/simple.py b/llama_index/chat_engine/simple.py
index 3109de021902c8509fddae33c8e7ee5c774bee75..4e95aeb5dafd1e438f85a97dc77a9bcd0fabaf8e 100644
--- a/llama_index/chat_engine/simple.py
+++ b/llama_index/chat_engine/simple.py
@@ -8,8 +8,8 @@ from llama_index.chat_engine.types import (
     BaseChatEngine,
     StreamingAgentChatResponse,
 )
-from llama_index.llm_predictor.base import LLMPredictor
-from llama_index.llms.base import LLM, ChatMessage
+from llama_index.llms.llm import LLM
+from llama_index.llms.types import ChatMessage
 from llama_index.memory import BaseMemory, ChatMemoryBuffer
 from llama_index.service_context import ServiceContext
 
@@ -46,9 +46,7 @@ class SimpleChatEngine(BaseChatEngine):
     ) -> "SimpleChatEngine":
         """Initialize a SimpleChatEngine from default parameters."""
         service_context = service_context or ServiceContext.from_defaults()
-        if not isinstance(service_context.llm_predictor, LLMPredictor):
-            raise ValueError("llm_predictor must be a LLMPredictor instance")
-        llm = service_context.llm_predictor.llm
+        llm = service_context.llm
 
         chat_history = chat_history or []
         memory = memory or memory_cls.from_defaults(chat_history=chat_history, llm=llm)
diff --git a/llama_index/chat_engine/types.py b/llama_index/chat_engine/types.py
index 19c21e193750baa19b19ab6a18a1dd65e242b250..64ccad79983598a4c2b9816f5979b9b3d1c295fc 100644
--- a/llama_index/chat_engine/types.py
+++ b/llama_index/chat_engine/types.py
@@ -7,7 +7,7 @@ from enum import Enum
 from threading import Event
 from typing import AsyncGenerator, Generator, List, Optional, Union
 
-from llama_index.llms.base import ChatMessage, ChatResponseAsyncGen, ChatResponseGen
+from llama_index.llms.types import ChatMessage, ChatResponseAsyncGen, ChatResponseGen
 from llama_index.memory import BaseMemory
 from llama_index.response.schema import Response, StreamingResponse
 from llama_index.schema import NodeWithScore
diff --git a/llama_index/chat_engine/utils.py b/llama_index/chat_engine/utils.py
index 44e25bac3490ecde7a3c3a28bbaf93dafa8218a1..b33e8ff6be61342482b30aae5bccc6dda0235bbc 100644
--- a/llama_index/chat_engine/utils.py
+++ b/llama_index/chat_engine/utils.py
@@ -1,4 +1,4 @@
-from llama_index.llms.base import (
+from llama_index.llms.types import (
     ChatMessage,
     ChatResponse,
     ChatResponseGen,
diff --git a/llama_index/evaluation/correctness.py b/llama_index/evaluation/correctness.py
index 5079f682d5eb5962272160c98342c14936ec37b1..7afe9dd662d3fada1fdd1992e322c175259c4565 100644
--- a/llama_index/evaluation/correctness.py
+++ b/llama_index/evaluation/correctness.py
@@ -126,7 +126,7 @@ class CorrectnessEvaluator(BaseEvaluator):
             print(query, response, reference, flush=True)
             raise ValueError("query, response, and reference must be provided")
 
-        eval_response = await self._service_context.llm_predictor.apredict(
+        eval_response = await self._service_context.llm.apredict(
             prompt=self._eval_template,
             query=query,
             generated_answer=response,
diff --git a/llama_index/evaluation/guideline.py b/llama_index/evaluation/guideline.py
index 67946011c180198a17b715b9a307508b0e38265e..71d8105c0cb067afb6ac322ac63095b1c795be4c 100644
--- a/llama_index/evaluation/guideline.py
+++ b/llama_index/evaluation/guideline.py
@@ -102,7 +102,7 @@ class GuidelineEvaluator(BaseEvaluator):
 
         await asyncio.sleep(sleep_time_in_seconds)
 
-        eval_response = await self._service_context.llm_predictor.apredict(
+        eval_response = await self._service_context.llm.apredict(
             self._eval_template,
             query=query,
             response=response,
diff --git a/llama_index/evaluation/pairwise.py b/llama_index/evaluation/pairwise.py
index 885b33579596ed6503e67356c16d05b27422ce4c..a271db4057282c208e8154f36431117733df7608 100644
--- a/llama_index/evaluation/pairwise.py
+++ b/llama_index/evaluation/pairwise.py
@@ -117,7 +117,7 @@ class PairwiseComparisonEvaluator(BaseEvaluator):
         reference: Optional[str],
     ) -> EvaluationResult:
         """Get evaluation result."""
-        eval_response = await self._service_context.llm_predictor.apredict(
+        eval_response = await self._service_context.llm.apredict(
             prompt=self._eval_template,
             query=query,
             answer_1=response,
diff --git a/llama_index/extractors/interface.py b/llama_index/extractors/interface.py
index 44c11c05d7518c13b6e88582d16e954ff2524063..375e4bf27d45e4c5c748b26b63e0a3c5bccac185 100644
--- a/llama_index/extractors/interface.py
+++ b/llama_index/extractors/interface.py
@@ -54,7 +54,14 @@ class BaseExtractor(TransformComponent):
             from llama_index.llm_predictor.loading import load_predictor
 
             llm_predictor = load_predictor(llm_predictor)
-        data["llm_predictor"] = llm_predictor
+            data["llm_predictor"] = llm_predictor
+
+        llm = data.get("llm", None)
+        if llm:
+            from llama_index.llms.loading import load_llm
+
+            llm = load_llm(llm)
+            data["llm"] = llm
 
         return cls(**data)
 
diff --git a/llama_index/extractors/metadata_extractors.py b/llama_index/extractors/metadata_extractors.py
index 1184f9771bfe09668b7cd439e7756e4deffbf3d5..a259250bf8918bb18d0bbc6d9105e8756f6bf2e0 100644
--- a/llama_index/extractors/metadata_extractors.py
+++ b/llama_index/extractors/metadata_extractors.py
@@ -25,8 +25,9 @@ from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, cast
 from llama_index.async_utils import DEFAULT_NUM_WORKERS, run_jobs
 from llama_index.bridge.pydantic import Field, PrivateAttr
 from llama_index.extractors.interface import BaseExtractor
-from llama_index.llm_predictor.base import LLMPredictor
-from llama_index.llms.base import LLM
+from llama_index.llm_predictor.base import LLMPredictorType
+from llama_index.llms.llm import LLM
+from llama_index.llms.utils import resolve_llm
 from llama_index.prompts import PromptTemplate
 from llama_index.schema import BaseNode, TextNode
 from llama_index.types import BasePydanticProgram
@@ -47,7 +48,7 @@ class TitleExtractor(BaseExtractor):
     metadata field.
 
     Args:
-        llm_predictor (Optional[LLMPredictor]): LLM predictor
+        llm (Optional[LLM]): LLM
         nodes (int): number of nodes from front to use for title extraction
         node_template (str): template for node-level title clues extraction
         combine_template (str): template for combining node-level clues into
@@ -55,9 +56,7 @@ class TitleExtractor(BaseExtractor):
     """
 
     is_text_node_only: bool = False  # can work for mixture of text and non-text nodes
-    llm_predictor: LLMPredictor = Field(
-        description="The LLMPredictor to use for generation."
-    )
+    llm: LLMPredictorType = Field(description="The LLM to use for generation.")
     nodes: int = Field(
         default=5,
         description="The number of nodes to extract titles from.",
@@ -76,7 +75,7 @@ class TitleExtractor(BaseExtractor):
         self,
         llm: Optional[LLM] = None,
         # TODO: llm_predictor arg is deprecated
-        llm_predictor: Optional[LLMPredictor] = None,
+        llm_predictor: Optional[LLMPredictorType] = None,
         nodes: int = 5,
         node_template: str = DEFAULT_TITLE_NODE_TEMPLATE,
         combine_template: str = DEFAULT_TITLE_COMBINE_TEMPLATE,
@@ -87,13 +86,8 @@ class TitleExtractor(BaseExtractor):
         if nodes < 1:
             raise ValueError("num_nodes must be >= 1")
 
-        if llm is not None:
-            llm_predictor = LLMPredictor(llm=llm)
-        elif llm_predictor is None and llm is None:
-            llm_predictor = LLMPredictor()
-
         super().__init__(
-            llm_predictor=llm_predictor,
+            llm=llm or llm_predictor or resolve_llm("default"),
             nodes=nodes,
             node_template=node_template,
             combine_template=combine_template,
@@ -120,7 +114,7 @@ class TitleExtractor(BaseExtractor):
             return []
 
         title_jobs = [
-            self.llm_predictor.apredict(
+            self.llm.apredict(
                 PromptTemplate(template=self.node_template),
                 context_str=cast(TextNode, node).text,
             )
@@ -135,7 +129,7 @@ class TitleExtractor(BaseExtractor):
                 lambda x, y: x + "," + y, title_candidates[1:], title_candidates[0]
             )
 
-            title = await self.llm_predictor.apredict(
+            title = await self.llm.apredict(
                 PromptTemplate(template=self.combine_template),
                 context_str=titles,
             )
@@ -152,13 +146,11 @@ class KeywordExtractor(BaseExtractor):
     `excerpt_keywords` metadata field.
 
     Args:
-        llm_predictor (Optional[LLMPredictor]): LLM predictor
+        llm (Optional[LLM]): LLM
         keywords (int): number of keywords to extract
     """
 
-    llm_predictor: LLMPredictor = Field(
-        description="The LLMPredictor to use for generation."
-    )
+    llm: LLMPredictorType = Field(description="The LLM to use for generation.")
     keywords: int = Field(
         default=5, description="The number of keywords to extract.", gt=0
     )
@@ -167,7 +159,7 @@ class KeywordExtractor(BaseExtractor):
         self,
         llm: Optional[LLM] = None,
         # TODO: llm_predictor arg is deprecated
-        llm_predictor: Optional[LLMPredictor] = None,
+        llm_predictor: Optional[LLMPredictorType] = None,
         keywords: int = 5,
         num_workers: int = DEFAULT_NUM_WORKERS,
         **kwargs: Any,
@@ -176,13 +168,8 @@ class KeywordExtractor(BaseExtractor):
         if keywords < 1:
             raise ValueError("num_keywords must be >= 1")
 
-        if llm is not None:
-            llm_predictor = LLMPredictor(llm=llm)
-        elif llm_predictor is None and llm is None:
-            llm_predictor = LLMPredictor()
-
         super().__init__(
-            llm_predictor=llm_predictor,
+            llm=llm or llm_predictor or resolve_llm("default"),
             keywords=keywords,
             num_workers=num_workers,
             **kwargs,
@@ -198,7 +185,7 @@ class KeywordExtractor(BaseExtractor):
             return {}
 
         # TODO: figure out a good way to allow users to customize keyword template
-        keywords = await self.llm_predictor.apredict(
+        keywords = await self.llm.apredict(
             PromptTemplate(
                 template=f"""\
 {{context_str}}. Give {self.keywords} unique keywords for this \
@@ -242,15 +229,13 @@ class QuestionsAnsweredExtractor(BaseExtractor):
     Extracts `questions_this_excerpt_can_answer` metadata field.
 
     Args:
-        llm_predictor (Optional[LLMPredictor]): LLM predictor
+        llm (Optional[LLM]): LLM
         questions (int): number of questions to extract
         prompt_template (str): template for question extraction,
         embedding_only (bool): whether to use embedding only
     """
 
-    llm_predictor: LLMPredictor = Field(
-        description="The LLMPredictor to use for generation."
-    )
+    llm: LLMPredictorType = Field(description="The LLM to use for generation.")
     questions: int = Field(
         default=5,
         description="The number of questions to generate.",
@@ -268,7 +253,7 @@ class QuestionsAnsweredExtractor(BaseExtractor):
         self,
         llm: Optional[LLM] = None,
         # TODO: llm_predictor arg is deprecated
-        llm_predictor: Optional[LLMPredictor] = None,
+        llm_predictor: Optional[LLMPredictorType] = None,
         questions: int = 5,
         prompt_template: str = DEFAULT_QUESTION_GEN_TMPL,
         embedding_only: bool = True,
@@ -279,13 +264,8 @@ class QuestionsAnsweredExtractor(BaseExtractor):
         if questions < 1:
             raise ValueError("questions must be >= 1")
 
-        if llm is not None:
-            llm_predictor = LLMPredictor(llm=llm)
-        elif llm_predictor is None and llm is None:
-            llm_predictor = LLMPredictor()
-
         super().__init__(
-            llm_predictor=llm_predictor,
+            llm=llm or llm_predictor or resolve_llm("default"),
             questions=questions,
             prompt_template=prompt_template,
             embedding_only=embedding_only,
@@ -304,7 +284,7 @@ class QuestionsAnsweredExtractor(BaseExtractor):
 
         context_str = node.get_content(metadata_mode=self.metadata_mode)
         prompt = PromptTemplate(template=self.prompt_template)
-        questions = await self.llm_predictor.apredict(
+        questions = await self.llm.apredict(
             prompt, num_questions=self.questions, context_str=context_str
         )
 
@@ -338,14 +318,12 @@ class SummaryExtractor(BaseExtractor):
     metadata fields.
 
     Args:
-        llm_predictor (Optional[LLMPredictor]): LLM predictor
+        llm (Optional[LLM]): LLM
         summaries (List[str]): list of summaries to extract: 'self', 'prev', 'next'
         prompt_template (str): template for summary extraction
     """
 
-    llm_predictor: LLMPredictor = Field(
-        description="The LLMPredictor to use for generation."
-    )
+    llm: LLMPredictorType = Field(description="The LLM to use for generation.")
     summaries: List[str] = Field(
         description="List of summaries to extract: 'self', 'prev', 'next'"
     )
@@ -362,17 +340,12 @@ class SummaryExtractor(BaseExtractor):
         self,
         llm: Optional[LLM] = None,
         # TODO: llm_predictor arg is deprecated
-        llm_predictor: Optional[LLMPredictor] = None,
+        llm_predictor: Optional[LLMPredictorType] = None,
         summaries: List[str] = ["self"],
         prompt_template: str = DEFAULT_SUMMARY_EXTRACT_TEMPLATE,
         num_workers: int = DEFAULT_NUM_WORKERS,
         **kwargs: Any,
     ):
-        if llm is not None:
-            llm_predictor = LLMPredictor(llm=llm)
-        elif llm_predictor is None and llm is None:
-            llm_predictor = LLMPredictor()
-
         # validation
         if not all(s in ["self", "prev", "next"] for s in summaries):
             raise ValueError("summaries must be one of ['self', 'prev', 'next']")
@@ -381,7 +354,7 @@ class SummaryExtractor(BaseExtractor):
         self._next_summary = "next" in summaries
 
         super().__init__(
-            llm_predictor=llm_predictor,
+            llm=llm or llm_predictor or resolve_llm("default"),
             summaries=summaries,
             prompt_template=prompt_template,
             num_workers=num_workers,
@@ -398,7 +371,7 @@ class SummaryExtractor(BaseExtractor):
             return ""
 
         context_str = node.get_content(metadata_mode=self.metadata_mode)
-        summary = await self.llm_predictor.apredict(
+        summary = await self.llm.apredict(
             PromptTemplate(template=self.prompt_template), context_str=context_str
         )
 
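A short usage sketch of the updated extractor constructors, which now take an `llm` argument directly (the import path matches this file; the model choice and extractor settings are illustrative):

```python
# Extractors accept an LLM instead of an LLMPredictor.
from llama_index.extractors.metadata_extractors import (
    QuestionsAnsweredExtractor,
    TitleExtractor,
)
from llama_index.llms import OpenAI

llm = OpenAI(model="gpt-3.5-turbo")  # illustrative model choice
extractors = [
    TitleExtractor(llm=llm, nodes=5),
    QuestionsAnsweredExtractor(llm=llm, questions=3),
]
```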
diff --git a/llama_index/finetuning/cross_encoders/dataset_gen.py b/llama_index/finetuning/cross_encoders/dataset_gen.py
index 3abb04f38306afef1b9cef7c6a08c9c685fbdeee..a594c221ab8ca540fb906e780b9a4c0236d58c03 100644
--- a/llama_index/finetuning/cross_encoders/dataset_gen.py
+++ b/llama_index/finetuning/cross_encoders/dataset_gen.py
@@ -8,7 +8,7 @@ from tqdm.auto import tqdm
 
 from llama_index import VectorStoreIndex
 from llama_index.llms import ChatMessage, OpenAI
-from llama_index.llms.base import LLM
+from llama_index.llms.llm import LLM
 from llama_index.node_parser import TokenTextSplitter
 from llama_index.schema import Document, MetadataMode
 
diff --git a/llama_index/finetuning/openai/base.py b/llama_index/finetuning/openai/base.py
index 8d16936041afe80a09acebbe29e299cac1585f7b..9ea0a3e7856952309e7329e473061bdf15fa3041 100644
--- a/llama_index/finetuning/openai/base.py
+++ b/llama_index/finetuning/openai/base.py
@@ -13,7 +13,7 @@ from llama_index.callbacks import OpenAIFineTuningHandler
 from llama_index.finetuning.openai.validate_json import validate_json
 from llama_index.finetuning.types import BaseLLMFinetuneEngine
 from llama_index.llms import OpenAI
-from llama_index.llms.base import LLM
+from llama_index.llms.llm import LLM
 
 logger = logging.getLogger(__name__)
 
diff --git a/llama_index/finetuning/types.py b/llama_index/finetuning/types.py
index 730027da51a90b2bd47437379836dd05880bbe06..bc2106cd64b7572e373ce5bd78742bede83f6a17 100644
--- a/llama_index/finetuning/types.py
+++ b/llama_index/finetuning/types.py
@@ -4,7 +4,7 @@ from abc import ABC, abstractmethod
 from typing import Any
 
 from llama_index.embeddings.base import BaseEmbedding
-from llama_index.llms.base import LLM
+from llama_index.llms.llm import LLM
 from llama_index.postprocessor import CohereRerank, SentenceTransformerRerank
 
 
diff --git a/llama_index/indices/base.py b/llama_index/indices/base.py
index c13f95471b62cfb216259a9a2be34d23b23ec21c..be79007aca4f86ce4783b743c1b5e15871689909 100644
--- a/llama_index/indices/base.py
+++ b/llama_index/indices/base.py
@@ -27,7 +27,7 @@ class BaseIndex(Generic[IS], ABC):
         nodes (List[Node]): List of nodes to index
         show_progress (bool): Whether to show tqdm progress bars. Defaults to False.
         service_context (ServiceContext): Service context container (contains
-            components like LLMPredictor, PromptHelper, etc.).
+            components like LLM, Embeddings, etc.).
 
     """
 
diff --git a/llama_index/indices/common/struct_store/base.py b/llama_index/indices/common/struct_store/base.py
index 4437ec1009691286b9c7a0920930b5beba3fdcb8..b594d7dbf24a4156e406c679196101a3da03e8e5 100644
--- a/llama_index/indices/common/struct_store/base.py
+++ b/llama_index/indices/common/struct_store/base.py
@@ -6,7 +6,7 @@ from typing import Any, Callable, Dict, List, Optional, Sequence, cast
 
 from llama_index.callbacks.schema import CBEventType, EventPayload
 from llama_index.data_structs.table import StructDatapoint
-from llama_index.llm_predictor.base import BaseLLMPredictor
+from llama_index.llm_predictor.base import LLMPredictorType
 from llama_index.node_parser.interface import TextSplitter
 from llama_index.prompts import BasePromptTemplate
 from llama_index.prompts.default_prompt_selectors import (
@@ -135,12 +135,12 @@ class BaseStructDatapointExtractor:
 
     def __init__(
         self,
-        llm_predictor: BaseLLMPredictor,
+        llm: LLMPredictorType,
         schema_extract_prompt: BasePromptTemplate,
         output_parser: OUTPUT_PARSER_TYPE,
     ) -> None:
         """Initialize params."""
-        self._llm_predictor = llm_predictor
+        self._llm = llm
         self._schema_extract_prompt = schema_extract_prompt
         self._output_parser = output_parser
 
@@ -195,7 +195,7 @@ class BaseStructDatapointExtractor:
             logger.info(f"> Adding chunk {i}: {fmt_text_chunk}")
             # if embedding specified in document, pass it to the Node
             schema_text = self._get_schema_text()
-            response_str = self._llm_predictor.predict(
+            response_str = self._llm.predict(
                 self._schema_extract_prompt,
                 text=text_chunk,
                 schema=schema_text,
diff --git a/llama_index/indices/common/struct_store/sql.py b/llama_index/indices/common/struct_store/sql.py
index 939bc12a43e8a4de4116f5ef81d0a1e7dc8eb403..781a337757aabbc4b72e198d6d661206d0e88c4d 100644
--- a/llama_index/indices/common/struct_store/sql.py
+++ b/llama_index/indices/common/struct_store/sql.py
@@ -9,7 +9,7 @@ from llama_index.indices.common.struct_store.base import (
     OUTPUT_PARSER_TYPE,
     BaseStructDatapointExtractor,
 )
-from llama_index.llm_predictor.base import BaseLLMPredictor
+from llama_index.llm_predictor.base import LLMPredictorType
 from llama_index.prompts import BasePromptTemplate
 from llama_index.utilities.sql_wrapper import SQLDatabase
 
@@ -19,7 +19,7 @@ class SQLStructDatapointExtractor(BaseStructDatapointExtractor):
 
     def __init__(
         self,
-        llm_predictor: BaseLLMPredictor,
+        llm: LLMPredictorType,
         schema_extract_prompt: BasePromptTemplate,
         output_parser: OUTPUT_PARSER_TYPE,
         sql_database: SQLDatabase,
@@ -28,7 +28,7 @@ class SQLStructDatapointExtractor(BaseStructDatapointExtractor):
         ref_doc_id_column: Optional[str] = None,
     ) -> None:
         """Initialize params."""
-        super().__init__(llm_predictor, schema_extract_prompt, output_parser)
+        super().__init__(llm, schema_extract_prompt, output_parser)
         self._sql_database = sql_database
         # currently the user must specify a table info
         if table_name is None and table is None:
diff --git a/llama_index/indices/common_tree/base.py b/llama_index/indices/common_tree/base.py
index f43986b1c0e21ec0f14c6d01ae8ead9ea658f83f..b0a7acb663ce6c8f44333742d85b6eae0c95fb16 100644
--- a/llama_index/indices/common_tree/base.py
+++ b/llama_index/indices/common_tree/base.py
@@ -149,7 +149,7 @@ class GPTTreeIndexBuilder:
         ) as event:
             if self._use_async:
                 tasks = [
-                    self._service_context.llm_predictor.apredict(
+                    self._service_context.llm.apredict(
                         self.summary_prompt, context_str=text_chunk
                     )
                     for text_chunk in text_chunks
@@ -167,7 +167,7 @@ class GPTTreeIndexBuilder:
                     desc="Generating summaries",
                 )
                 summaries = [
-                    self._service_context.llm_predictor.predict(
+                    self._service_context.llm.predict(
                         self.summary_prompt, context_str=text_chunk
                     )
                     for text_chunk in text_chunks_progress
@@ -217,7 +217,7 @@ class GPTTreeIndexBuilder:
                 desc="Generating summaries",
             )
             tasks = [
-                self._service_context.llm_predictor.apredict(
+                self._service_context.llm.apredict(
                     self.summary_prompt, context_str=text_chunk
                 )
                 for text_chunk in text_chunks_progress
diff --git a/llama_index/indices/document_summary/retrievers.py b/llama_index/indices/document_summary/retrievers.py
index 5c1752216599a24dbe8834c910f468c3c67c9e23..fda5c8a845f730d88172d0dd4f0c6dd9e0c889a0 100644
--- a/llama_index/indices/document_summary/retrievers.py
+++ b/llama_index/indices/document_summary/retrievers.py
@@ -80,7 +80,7 @@ class DocumentSummaryIndexLLMRetriever(BaseRetriever):
             query_str = query_bundle.query_str
             fmt_batch_str = self._format_node_batch_fn(summary_nodes)
             # call each batch independently
-            raw_response = self._service_context.llm_predictor.predict(
+            raw_response = self._service_context.llm.predict(
                 self._choice_select_prompt,
                 context_str=fmt_batch_str,
                 query_str=query_str,
diff --git a/llama_index/indices/keyword_table/base.py b/llama_index/indices/keyword_table/base.py
index fb6b17156bf1b144f804a5ce347ff7fdc99a9517..02a031d674a8d5b6537532887da6ca258db4714d 100644
--- a/llama_index/indices/keyword_table/base.py
+++ b/llama_index/indices/keyword_table/base.py
@@ -219,7 +219,7 @@ class KeywordTableIndex(BaseKeywordTableIndex):
 
     def _extract_keywords(self, text: str) -> Set[str]:
         """Extract keywords from text."""
-        response = self._service_context.llm_predictor.predict(
+        response = self._service_context.llm.predict(
             self.keyword_extract_template,
             text=text,
         )
@@ -227,7 +227,7 @@ class KeywordTableIndex(BaseKeywordTableIndex):
 
     async def _async_extract_keywords(self, text: str) -> Set[str]:
         """Extract keywords from text."""
-        response = await self._service_context.llm_predictor.apredict(
+        response = await self._service_context.llm.apredict(
             self.keyword_extract_template,
             text=text,
         )
diff --git a/llama_index/indices/keyword_table/retrievers.py b/llama_index/indices/keyword_table/retrievers.py
index af216c3550312b7ea7f38a965290f81d1b2f8eb0..0d687b2feaf6c792f81c76862857ee03e57c6f40 100644
--- a/llama_index/indices/keyword_table/retrievers.py
+++ b/llama_index/indices/keyword_table/retrievers.py
@@ -118,7 +118,7 @@ class KeywordTableGPTRetriever(BaseKeywordTableRetriever):
 
     def _get_keywords(self, query_str: str) -> List[str]:
         """Extract keywords."""
-        response = self._service_context.llm_predictor.predict(
+        response = self._service_context.llm.predict(
             self.query_keyword_extract_template,
             max_keywords=self.max_keywords_per_query,
             question=query_str,
diff --git a/llama_index/indices/knowledge_graph/base.py b/llama_index/indices/knowledge_graph/base.py
index faccad78339362af91f23c99546aae4be2e992f6..00cc76d836d32d0a00d4ad72450237fa46a5499c 100644
--- a/llama_index/indices/knowledge_graph/base.py
+++ b/llama_index/indices/knowledge_graph/base.py
@@ -119,7 +119,7 @@ class KnowledgeGraphIndex(BaseIndex[KG]):
 
     def _llm_extract_triplets(self, text: str) -> List[Tuple[str, str, str]]:
         """Extract keywords from text."""
-        response = self._service_context.llm_predictor.predict(
+        response = self._service_context.llm.predict(
             self.kg_triple_extract_template,
             text=text,
         )
diff --git a/llama_index/indices/knowledge_graph/retrievers.py b/llama_index/indices/knowledge_graph/retrievers.py
index 72fe5efe5683620883a3af8436233162c460646f..9ac9cc064e8874c4f22b0c3bbb624b7e5633ce22 100644
--- a/llama_index/indices/knowledge_graph/retrievers.py
+++ b/llama_index/indices/knowledge_graph/retrievers.py
@@ -124,7 +124,7 @@ class KGTableRetriever(BaseRetriever):
 
     def _get_keywords(self, query_str: str) -> List[str]:
         """Extract keywords."""
-        response = self._service_context.llm_predictor.predict(
+        response = self._service_context.llm.predict(
             self.query_keyword_extract_template,
             max_keywords=self.max_keywords_per_query,
             question=query_str,
@@ -524,7 +524,7 @@ class KnowledgeGraphRAGRetriever(BaseRetriever):
         if handle_fn is not None:
             enitities_fn = handle_fn(query_str)
         if handle_llm_prompt_template is not None:
-            response = self._service_context.llm_predictor.predict(
+            response = self._service_context.llm.predict(
                 handle_llm_prompt_template,
                 max_keywords=max_items,
                 question=query_str,
@@ -574,7 +574,7 @@ class KnowledgeGraphRAGRetriever(BaseRetriever):
         if handle_fn is not None:
             enitities_fn = handle_fn(query_str)
         if handle_llm_prompt_template is not None:
-            response = await self._service_context.llm_predictor.apredict(
+            response = await self._service_context.llm.apredict(
                 handle_llm_prompt_template,
                 max_keywords=max_items,
                 question=query_str,
diff --git a/llama_index/indices/list/retrievers.py b/llama_index/indices/list/retrievers.py
index c3f6dd6e963129936f426d7556c7759586f5de73..4f92ee0763fd4e895b1705356efc30eb183a5d18 100644
--- a/llama_index/indices/list/retrievers.py
+++ b/llama_index/indices/list/retrievers.py
@@ -178,7 +178,7 @@ class SummaryIndexLLMRetriever(BaseRetriever):
             query_str = query_bundle.query_str
             fmt_batch_str = self._format_node_batch_fn(nodes_batch)
             # call each batch independently
-            raw_response = self._service_context.llm_predictor.predict(
+            raw_response = self._service_context.llm.predict(
                 self._choice_select_prompt,
                 context_str=fmt_batch_str,
                 query_str=query_str,
diff --git a/llama_index/indices/prompt_helper.py b/llama_index/indices/prompt_helper.py
index aedab1ba52da0853db42d8f4a26ac9210913ffa0..f2c5ca9efbfbfe2443b11fa38fc8c7a8469966c9 100644
--- a/llama_index/indices/prompt_helper.py
+++ b/llama_index/indices/prompt_helper.py
@@ -16,7 +16,8 @@ from typing import Callable, List, Optional, Sequence
 from llama_index.bridge.pydantic import Field, PrivateAttr
 from llama_index.constants import DEFAULT_CONTEXT_WINDOW, DEFAULT_NUM_OUTPUTS
 from llama_index.llm_predictor.base import LLMMetadata
-from llama_index.llms.base import LLM, ChatMessage
+from llama_index.llms.llm import LLM
+from llama_index.llms.types import ChatMessage
 from llama_index.node_parser.text.token import TokenTextSplitter
 from llama_index.node_parser.text.utils import truncate_text
 from llama_index.prompts import (
diff --git a/llama_index/indices/query/query_transform/base.py b/llama_index/indices/query/query_transform/base.py
index 6313cde6ee2d4f997f0e43aa027a39957f2b2733..bb4ee668feae253fabd94603fc3e51c7c1d310a1 100644
--- a/llama_index/indices/query/query_transform/base.py
+++ b/llama_index/indices/query/query_transform/base.py
@@ -12,8 +12,8 @@ from llama_index.indices.query.query_transform.prompts import (
     ImageOutputQueryTransformPrompt,
     StepDecomposeQueryTransformPrompt,
 )
-from llama_index.llm_predictor import LLMPredictor
-from llama_index.llm_predictor.base import BaseLLMPredictor
+from llama_index.llm_predictor.base import LLMPredictorType
+from llama_index.llms.utils import resolve_llm
 from llama_index.prompts import BasePromptTemplate
 from llama_index.prompts.default_prompts import DEFAULT_HYDE_PROMPT
 from llama_index.prompts.mixin import PromptDictType, PromptMixin, PromptMixinType
@@ -98,14 +98,14 @@ class HyDEQueryTransform(BaseQueryTransform):
 
     def __init__(
         self,
-        llm_predictor: Optional[BaseLLMPredictor] = None,
+        llm: Optional[LLMPredictorType] = None,
         hyde_prompt: Optional[BasePromptTemplate] = None,
         include_original: bool = True,
     ) -> None:
         """Initialize HyDEQueryTransform.
 
         Args:
-            llm_predictor (Optional[LLMPredictor]): LLM for generating
+            llm (Optional[LLM]): LLM for generating
                 hypothetical documents
             hyde_prompt (Optional[BasePromptTemplate]): Custom prompt for HyDE
             include_original (bool): Whether to include original query
@@ -113,7 +113,7 @@ class HyDEQueryTransform(BaseQueryTransform):
         """
         super().__init__()
 
-        self._llm_predictor = llm_predictor or LLMPredictor()
+        self._llm = llm or resolve_llm("default")
         self._hyde_prompt = hyde_prompt or DEFAULT_HYDE_PROMPT
         self._include_original = include_original
 
@@ -130,9 +130,7 @@ class HyDEQueryTransform(BaseQueryTransform):
         """Run query transform."""
         # TODO: support generating multiple hypothetical docs
         query_str = query_bundle.query_str
-        hypothetical_doc = self._llm_predictor.predict(
-            self._hyde_prompt, context_str=query_str
-        )
+        hypothetical_doc = self._llm.predict(self._hyde_prompt, context_str=query_str)
         embedding_strs = [hypothetical_doc]
         if self._include_original:
             embedding_strs.extend(query_bundle.embedding_strs)
@@ -149,20 +147,20 @@ class DecomposeQueryTransform(BaseQueryTransform):
     Performs a single step transformation.
 
     Args:
-        llm_predictor (Optional[LLMPredictor]): LLM for generating
+        llm (Optional[LLM]): LLM for generating
             hypothetical documents
 
     """
 
     def __init__(
         self,
-        llm_predictor: Optional[BaseLLMPredictor] = None,
+        llm: Optional[LLMPredictorType] = None,
         decompose_query_prompt: Optional[DecomposeQueryTransformPrompt] = None,
         verbose: bool = False,
     ) -> None:
         """Init params."""
         super().__init__()
-        self._llm_predictor = llm_predictor or LLMPredictor()
+        self._llm = llm or resolve_llm("default")
         self._decompose_query_prompt = (
             decompose_query_prompt or DEFAULT_DECOMPOSE_QUERY_TRANSFORM_PROMPT
         )
@@ -185,7 +183,7 @@ class DecomposeQueryTransform(BaseQueryTransform):
         # given the text from the index, we can use the query bundle to generate
         # a new query bundle
         query_str = query_bundle.query_str
-        new_query_str = self._llm_predictor.predict(
+        new_query_str = self._llm.predict(
             self._decompose_query_prompt,
             query_str=query_str,
             context_str=index_summary,
@@ -251,20 +249,20 @@ class StepDecomposeQueryTransform(BaseQueryTransform):
     NOTE: doesn't work yet.
 
     Args:
-        llm_predictor (Optional[LLMPredictor]): LLM for generating
+        llm (Optional[LLM]): LLM for generating
             hypothetical documents
 
     """
 
     def __init__(
         self,
-        llm_predictor: Optional[BaseLLMPredictor] = None,
+        llm: Optional[LLMPredictorType] = None,
         step_decompose_query_prompt: Optional[StepDecomposeQueryTransformPrompt] = None,
         verbose: bool = False,
     ) -> None:
         """Init params."""
         super().__init__()
-        self._llm_predictor = llm_predictor or LLMPredictor()
+        self._llm = llm or resolve_llm("default")
         self._step_decompose_query_prompt = (
             step_decompose_query_prompt or DEFAULT_STEP_DECOMPOSE_QUERY_TRANSFORM_PROMPT
         )
@@ -291,7 +289,7 @@ class StepDecomposeQueryTransform(BaseQueryTransform):
         # given the text from the index, we can use the query bundle to generate
         # a new query bundle
         query_str = query_bundle.query_str
-        new_query_str = self._llm_predictor.predict(
+        new_query_str = self._llm.predict(
             self._step_decompose_query_prompt,
             prev_reasoning=fmt_prev_reasoning,
             query_str=query_str,
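With the new signatures above, the query transforms take an `llm` directly and fall back to `resolve_llm("default")` when it is omitted. A minimal sketch, assuming default OpenAI credentials and the transform's `run()` helper; the query string is illustrative:

```python
from llama_index.indices.query.query_transform.base import HyDEQueryTransform
from llama_index.llms import OpenAI

hyde = HyDEQueryTransform(llm=OpenAI(model="gpt-3.5-turbo"), include_original=True)
# run() returns a QueryBundle whose embedding_strs include the hypothetical document
query_bundle = hyde.run("What did the author do growing up?")
```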
diff --git a/llama_index/indices/query/query_transform/feedback_transform.py b/llama_index/indices/query/query_transform/feedback_transform.py
index 0e8342b054550f17162612aa8a1194f15eebe9ce..950f284514027f9a3f85e267829d64ed50ad9be2 100644
--- a/llama_index/indices/query/query_transform/feedback_transform.py
+++ b/llama_index/indices/query/query_transform/feedback_transform.py
@@ -3,8 +3,8 @@ from typing import Dict, Optional
 
 from llama_index.evaluation.base import Evaluation
 from llama_index.indices.query.query_transform.base import BaseQueryTransform
-from llama_index.llm_predictor import LLMPredictor
-from llama_index.llm_predictor.base import BaseLLMPredictor
+from llama_index.llm_predictor.base import LLMPredictorType
+from llama_index.llms.utils import resolve_llm
 from llama_index.prompts.base import BasePromptTemplate, PromptTemplate
 from llama_index.prompts.mixin import PromptDictType
 from llama_index.schema import QueryBundle
@@ -30,7 +30,7 @@ class FeedbackQueryTransformation(BaseQueryTransform):
 
     Args:
         eval(Evaluation): An evaluation object.
-        llm_predictor(BaseLLMPredictor): An LLM predictor.
+        llm(LLM): An LLM.
         resynthesize_query(bool): Whether to resynthesize the query.
         resynthesis_prompt(BasePromptTemplate): A prompt for resynthesizing the query.
 
@@ -38,12 +38,12 @@ class FeedbackQueryTransformation(BaseQueryTransform):
 
     def __init__(
         self,
-        llm_predictor: Optional[BaseLLMPredictor] = None,
+        llm: Optional[LLMPredictorType] = None,
         resynthesize_query: bool = False,
         resynthesis_prompt: Optional[BasePromptTemplate] = None,
     ) -> None:
         super().__init__()
-        self.llm_predictor = llm_predictor or LLMPredictor()
+        self.llm = llm or resolve_llm("default")
         self.should_resynthesize_query = resynthesize_query
         self.resynthesis_prompt = resynthesis_prompt or DEFAULT_RESYNTHESIS_PROMPT
 
@@ -106,7 +106,7 @@ class FeedbackQueryTransformation(BaseQueryTransform):
         if feedback is None:
             return query_str
         else:
-            new_query_str = self.llm_predictor.predict(
+            new_query_str = self.llm.predict(
                 self.resynthesis_prompt,
                 query_str=query_str,
                 response=response,
diff --git a/llama_index/indices/struct_store/json_query.py b/llama_index/indices/struct_store/json_query.py
index 943d6b68ff847751ef2055ec37001df1a816a34c..353aff77edbcc1f730bbb7bb7e500b8ebbf3d424 100644
--- a/llama_index/indices/struct_store/json_query.py
+++ b/llama_index/indices/struct_store/json_query.py
@@ -136,7 +136,7 @@ class JSONQueryEngine(BaseQueryEngine):
         """Answer a query."""
         schema = self._get_schema_context()
 
-        json_path_response_str = self._service_context.llm_predictor.predict(
+        json_path_response_str = self._service_context.llm.predict(
             self._json_path_prompt,
             schema=schema,
             query_str=query_bundle.query_str,
@@ -157,7 +157,7 @@ class JSONQueryEngine(BaseQueryEngine):
             print_text(f"> JSONPath Output: {json_path_output}\n")
 
         if self._synthesize_response:
-            response_str = self._service_context.llm_predictor.predict(
+            response_str = self._service_context.llm.predict(
                 self._response_synthesis_prompt,
                 query_str=query_bundle.query_str,
                 json_schema=self._json_schema,
@@ -176,7 +176,7 @@ class JSONQueryEngine(BaseQueryEngine):
     async def _aquery(self, query_bundle: QueryBundle) -> Response:
         schema = self._get_schema_context()
 
-        json_path_response_str = await self._service_context.llm_predictor.apredict(
+        json_path_response_str = await self._service_context.llm.apredict(
             self._json_path_prompt,
             schema=schema,
             query_str=query_bundle.query_str,
@@ -197,7 +197,7 @@ class JSONQueryEngine(BaseQueryEngine):
             print_text(f"> JSONPath Output: {json_path_output}\n")
 
         if self._synthesize_response:
-            response_str = await self._service_context.llm_predictor.apredict(
+            response_str = await self._service_context.llm.apredict(
                 self._response_synthesis_prompt,
                 query_str=query_bundle.query_str,
                 json_schema=self._json_schema,
diff --git a/llama_index/indices/struct_store/sql.py b/llama_index/indices/struct_store/sql.py
index 32ca4425a0ddb19f2f6e30c58b52aa7ac883102b..f59127669c94573dd69801521308f13c9a98b34b 100644
--- a/llama_index/indices/struct_store/sql.py
+++ b/llama_index/indices/struct_store/sql.py
@@ -107,7 +107,7 @@ class SQLStructStoreIndex(BaseStructStoreIndex[SQLStructTable]):
             return index_struct
         else:
             data_extractor = SQLStructDatapointExtractor(
-                self._service_context.llm_predictor,
+                self._service_context.llm,
                 self.schema_extract_prompt,
                 self.output_parser,
                 self.sql_database,
@@ -127,7 +127,7 @@ class SQLStructStoreIndex(BaseStructStoreIndex[SQLStructTable]):
     def _insert(self, nodes: Sequence[BaseNode], **insert_kwargs: Any) -> None:
         """Insert a document."""
         data_extractor = SQLStructDatapointExtractor(
-            self._service_context.llm_predictor,
+            self._service_context.llm,
             self.schema_extract_prompt,
             self.output_parser,
             self.sql_database,
diff --git a/llama_index/indices/struct_store/sql_query.py b/llama_index/indices/struct_store/sql_query.py
index a543d18d4946c1f33ddb1786c08b6e3cc637ab42..2b57021c1599e937804785c959b2d331f46cb5bf 100644
--- a/llama_index/indices/struct_store/sql_query.py
+++ b/llama_index/indices/struct_store/sql_query.py
@@ -192,7 +192,7 @@ class NLStructStoreQueryEngine(BaseQueryEngine):
         table_desc_str = self._get_table_context(query_bundle)
         logger.info(f"> Table desc str: {table_desc_str}")
 
-        response_str = self._service_context.llm_predictor.predict(
+        response_str = self._service_context.llm.predict(
             self._text_to_sql_prompt,
             query_str=query_bundle.query_str,
             schema=table_desc_str,
@@ -207,7 +207,7 @@ class NLStructStoreQueryEngine(BaseQueryEngine):
         metadata["sql_query"] = sql_query_str
 
         if self._synthesize_response:
-            response_str = self._service_context.llm_predictor.predict(
+            response_str = self._service_context.llm.predict(
                 self._response_synthesis_prompt,
                 query_str=query_bundle.query_str,
                 sql_query=sql_query_str,
@@ -223,7 +223,7 @@ class NLStructStoreQueryEngine(BaseQueryEngine):
         table_desc_str = self._get_table_context(query_bundle)
         logger.info(f"> Table desc str: {table_desc_str}")
 
-        response_str = await self._service_context.llm_predictor.apredict(
+        response_str = await self._service_context.llm.apredict(
             self._text_to_sql_prompt,
             query_str=query_bundle.query_str,
             schema=table_desc_str,
diff --git a/llama_index/indices/struct_store/sql_retriever.py b/llama_index/indices/struct_store/sql_retriever.py
index 6ae50a0f5b736ad88eaf16357aac9b5d2e8625b7..68ebfc66ef5e75e1bcc413cc6f8eb474b4907802 100644
--- a/llama_index/indices/struct_store/sql_retriever.py
+++ b/llama_index/indices/struct_store/sql_retriever.py
@@ -265,7 +265,7 @@ class NLSQLRetriever(BaseRetriever, PromptMixin):
         table_desc_str = self._get_table_context(query_bundle)
         logger.info(f"> Table desc str: {table_desc_str}")
 
-        response_str = self._service_context.llm_predictor.predict(
+        response_str = self._service_context.llm.predict(
             self._text_to_sql_prompt,
             query_str=query_bundle.query_str,
             schema=table_desc_str,
@@ -303,7 +303,7 @@ class NLSQLRetriever(BaseRetriever, PromptMixin):
         table_desc_str = self._get_table_context(query_bundle)
         logger.info(f"> Table desc str: {table_desc_str}")
 
-        response_str = await self._service_context.llm_predictor.apredict(
+        response_str = await self._service_context.llm.apredict(
             self._text_to_sql_prompt,
             query_str=query_bundle.query_str,
             schema=table_desc_str,
diff --git a/llama_index/indices/tree/inserter.py b/llama_index/indices/tree/inserter.py
index 1e8eb526e246cfdc398f14b7d4874d04f761104a..9b99b0a99da977894c8a80aeca500e47a80c16a8 100644
--- a/llama_index/indices/tree/inserter.py
+++ b/llama_index/indices/tree/inserter.py
@@ -75,7 +75,7 @@ class TreeIndexInserter:
             )
             text_chunk1 = "\n".join(truncated_chunks)
 
-            summary1 = self._service_context.llm_predictor.predict(
+            summary1 = self._service_context.llm.predict(
                 self.summary_prompt, context_str=text_chunk1
             )
             node1 = TextNode(text=summary1)
@@ -88,7 +88,7 @@ class TreeIndexInserter:
                 ],
             )
             text_chunk2 = "\n".join(truncated_chunks)
-            summary2 = self._service_context.llm_predictor.predict(
+            summary2 = self._service_context.llm.predict(
                 self.summary_prompt, context_str=text_chunk2
             )
             node2 = TextNode(text=summary2)
@@ -134,7 +134,7 @@ class TreeIndexInserter:
             numbered_text = get_numbered_text_from_nodes(
                 cur_graph_node_list, text_splitter=text_splitter
             )
-            response = self._service_context.llm_predictor.predict(
+            response = self._service_context.llm.predict(
                 self.insert_prompt,
                 new_chunk_text=node.get_content(metadata_mode=MetadataMode.LLM),
                 num_chunks=len(cur_graph_node_list),
@@ -166,7 +166,7 @@ class TreeIndexInserter:
                 ],
             )
             text_chunk = "\n".join(truncated_chunks)
-            new_summary = self._service_context.llm_predictor.predict(
+            new_summary = self._service_context.llm.predict(
                 self.summary_prompt, context_str=text_chunk
             )
 
diff --git a/llama_index/indices/tree/select_leaf_retriever.py b/llama_index/indices/tree/select_leaf_retriever.py
index 8606f614991be2ac2839da03b6d8b93602e7a6aa..a61a3e5ae99a158dcc345f914ac5c1025cc2b21e 100644
--- a/llama_index/indices/tree/select_leaf_retriever.py
+++ b/llama_index/indices/tree/select_leaf_retriever.py
@@ -129,7 +129,7 @@ class TreeSelectLeafRetriever(BaseRetriever):
             return cur_response
         else:
             context_msg = selected_node.get_content(metadata_mode=MetadataMode.LLM)
-            cur_response = self._service_context.llm_predictor.predict(
+            cur_response = self._service_context.llm.predict(
                 self._refine_template,
                 query_str=query_str,
                 existing_answer=prev_response,
@@ -172,7 +172,7 @@ class TreeSelectLeafRetriever(BaseRetriever):
                 cur_node_list, text_splitter=text_splitter
             )
 
-            response = self._service_context.llm_predictor.predict(
+            response = self._service_context.llm.predict(
                 query_template,
                 context_list=numbered_node_text,
             )
@@ -193,7 +193,7 @@ class TreeSelectLeafRetriever(BaseRetriever):
                 cur_node_list, text_splitter=text_splitter
             )
 
-            response = self._service_context.llm_predictor.predict(
+            response = self._service_context.llm.predict(
                 query_template_multiple,
                 context_list=numbered_node_text,
             )
@@ -290,7 +290,7 @@ class TreeSelectLeafRetriever(BaseRetriever):
                 cur_node_list, text_splitter=text_splitter
             )
 
-            response = self._service_context.llm_predictor.predict(
+            response = self._service_context.llm.predict(
                 query_template,
                 context_list=numbered_node_text,
             )
@@ -311,7 +311,7 @@ class TreeSelectLeafRetriever(BaseRetriever):
                 cur_node_list, text_splitter=text_splitter
             )
 
-            response = self._service_context.llm_predictor.predict(
+            response = self._service_context.llm.predict(
                 query_template_multiple,
                 context_list=numbered_node_text,
             )
diff --git a/llama_index/indices/vector_store/retrievers/auto_retriever/auto_retriever.py b/llama_index/indices/vector_store/retrievers/auto_retriever/auto_retriever.py
index 4e25b5ca866271b4f596e5dd63362b7f0193c47c..e8e8e33789f0ad00a75cd5ebe62f7927f13f1a4c 100644
--- a/llama_index/indices/vector_store/retrievers/auto_retriever/auto_retriever.py
+++ b/llama_index/indices/vector_store/retrievers/auto_retriever/auto_retriever.py
@@ -40,7 +40,7 @@ class VectorIndexAutoRetriever(BaseRetriever):
             parameters.
         prompt_template_str: custom prompt template string for LLM.
             Uses default template string if None.
-        service_context: service context containing reference to LLMPredictor.
+        service_context: service context containing a reference to an LLM.
             Uses service context from index be default if None.
         similarity_top_k (int): number of top k results to return.
         max_top_k (int):
@@ -89,7 +89,7 @@ class VectorIndexAutoRetriever(BaseRetriever):
         schema_str = VectorStoreQuerySpec.schema_json(indent=4)
 
         # call LLM
-        output = self._service_context.llm_predictor.predict(
+        output = self._service_context.llm.predict(
             self._prompt,
             schema_str=schema_str,
             info_str=info_str,
diff --git a/llama_index/llm_predictor/base.py b/llama_index/llm_predictor/base.py
index 79444c07d79d9935430af1bb09eb039362bf4801..d807d221f6df74806b2366ee991adb344c61f12c 100644
--- a/llama_index/llm_predictor/base.py
+++ b/llama_index/llm_predictor/base.py
@@ -3,20 +3,25 @@
 import logging
 from abc import ABC, abstractmethod
 from collections import ChainMap
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Union
 
 from typing_extensions import Self
 
 from llama_index.bridge.pydantic import BaseModel, PrivateAttr
 from llama_index.callbacks.base import CallbackManager
 from llama_index.callbacks.schema import CBEventType, EventPayload
-from llama_index.llm_predictor.utils import (
+from llama_index.llms.llm import (
+    LLM,
     astream_chat_response_to_tokens,
     astream_completion_response_to_tokens,
     stream_chat_response_to_tokens,
     stream_completion_response_to_tokens,
 )
-from llama_index.llms.base import LLM, ChatMessage, LLMMetadata, MessageRole
+from llama_index.llms.types import (
+    ChatMessage,
+    LLMMetadata,
+    MessageRole,
+)
 from llama_index.llms.utils import LLMType, resolve_llm
 from llama_index.prompts.base import BasePromptTemplate, PromptTemplate
 from llama_index.schema import BaseComponent
@@ -326,3 +331,6 @@ class LLMPredictor(BaseLLMPredictor):
                 *messages,
             ]
         return messages
+
+
+LLMPredictorType = Union[LLMPredictor, LLM]
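`LLMPredictorType` is the union used at the call sites updated in this patch, so they accept either a bare `LLM` or the deprecated `LLMPredictor` wrapper. A minimal sketch of what that means for downstream code; the `summarize` helper and its prompt are hypothetical:

```python
from llama_index.llm_predictor import LLMPredictor
from llama_index.llm_predictor.base import LLMPredictorType
from llama_index.llms import OpenAI
from llama_index.prompts import PromptTemplate

def summarize(llm: LLMPredictorType, text: str) -> str:
    # both sides of the union expose the same predict() signature
    return llm.predict(PromptTemplate("Summarize: {text}"), text=text)

summarize(OpenAI(), "some text")                      # new-style LLM
summarize(LLMPredictor(llm=OpenAI()), "some text")    # legacy wrapper still accepted
```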
diff --git a/llama_index/llm_predictor/mock.py b/llama_index/llm_predictor/mock.py
index 7ddaf99f4d0f52351ac0cd04440c7613c6b767ed..d3a971f18ec54175e8a6ff272d8134fe12bb7347 100644
--- a/llama_index/llm_predictor/mock.py
+++ b/llama_index/llm_predictor/mock.py
@@ -1,11 +1,14 @@
 """Mock LLM Predictor."""
 from typing import Any, Dict
 
+from deprecated import deprecated
+
 from llama_index.bridge.pydantic import Field, PrivateAttr
 from llama_index.callbacks.base import CallbackManager
 from llama_index.constants import DEFAULT_NUM_OUTPUTS
 from llama_index.llm_predictor.base import BaseLLMPredictor
-from llama_index.llms.base import LLM, LLMMetadata
+from llama_index.llms.llm import LLM
+from llama_index.llms.types import LLMMetadata
 from llama_index.prompts.base import BasePromptTemplate
 from llama_index.prompts.prompt_type import PromptType
 from llama_index.token_counter.utils import (
@@ -82,6 +85,7 @@ def _mock_knowledge_graph_triplet_extract(prompt_args: Dict, max_triplets: int)
     )
 
 
+@deprecated("MockLLMPredictor is deprecated. Use MockLLM instead.")
 class MockLLMPredictor(BaseLLMPredictor):
     """Mock LLM Predictor."""
 
diff --git a/llama_index/llm_predictor/structured.py b/llama_index/llm_predictor/structured.py
index d9f86ce20202dd750e276c92a222931b82383401..a3e07303a683710fb1d46cf8a1d1f9b9e5a5ebf7 100644
--- a/llama_index/llm_predictor/structured.py
+++ b/llama_index/llm_predictor/structured.py
@@ -4,6 +4,8 @@
 import logging
 from typing import Any, Optional
 
+from deprecated import deprecated
+
 from llama_index.llm_predictor.base import LLMPredictor
 from llama_index.prompts.base import BasePromptTemplate
 from llama_index.types import TokenGen
@@ -11,6 +13,7 @@ from llama_index.types import TokenGen
 logger = logging.getLogger(__name__)
 
 
+@deprecated("StructuredLLMPredictor is deprecated. Use llm.structured_predict().")
 class StructuredLLMPredictor(LLMPredictor):
     """Structured LLM predictor class.
 
diff --git a/llama_index/llm_predictor/utils.py b/llama_index/llm_predictor/utils.py
deleted file mode 100644
index 6c35125fbba113e5ca95fd15de1895de7e6781a7..0000000000000000000000000000000000000000
--- a/llama_index/llm_predictor/utils.py
+++ /dev/null
@@ -1,55 +0,0 @@
-from llama_index.llms.base import (
-    ChatResponseAsyncGen,
-    ChatResponseGen,
-    CompletionResponseAsyncGen,
-    CompletionResponseGen,
-)
-from llama_index.types import TokenAsyncGen, TokenGen
-
-
-def stream_completion_response_to_tokens(
-    completion_response_gen: CompletionResponseGen,
-) -> TokenGen:
-    """Convert a stream completion response to a stream of tokens."""
-
-    def gen() -> TokenGen:
-        for response in completion_response_gen:
-            yield response.delta or ""
-
-    return gen()
-
-
-def stream_chat_response_to_tokens(
-    chat_response_gen: ChatResponseGen,
-) -> TokenGen:
-    """Convert a stream completion response to a stream of tokens."""
-
-    def gen() -> TokenGen:
-        for response in chat_response_gen:
-            yield response.delta or ""
-
-    return gen()
-
-
-async def astream_completion_response_to_tokens(
-    completion_response_gen: CompletionResponseAsyncGen,
-) -> TokenAsyncGen:
-    """Convert a stream completion response to a stream of tokens."""
-
-    async def gen() -> TokenAsyncGen:
-        async for response in completion_response_gen:
-            yield response.delta or ""
-
-    return gen()
-
-
-async def astream_chat_response_to_tokens(
-    chat_response_gen: ChatResponseAsyncGen,
-) -> TokenAsyncGen:
-    """Convert a stream completion response to a stream of tokens."""
-
-    async def gen() -> TokenAsyncGen:
-        async for response in chat_response_gen:
-            yield response.delta or ""
-
-    return gen()
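The token-streaming helpers deleted here are the same ones the `llm_predictor/base.py` hunk above now imports from `llama_index.llms.llm`, so callers only need a path update:

```python
from llama_index.llms.llm import (
    astream_chat_response_to_tokens,
    astream_completion_response_to_tokens,
    stream_chat_response_to_tokens,
    stream_completion_response_to_tokens,
)
```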
diff --git a/llama_index/llm_predictor/vellum/predictor.py b/llama_index/llm_predictor/vellum/predictor.py
index f3ebf2063153bdbb4c4b1bd69c7a905dd2e08f86..7c3287c49b25b548cf639084a6e30a058294dc34 100644
--- a/llama_index/llm_predictor/vellum/predictor.py
+++ b/llama_index/llm_predictor/vellum/predictor.py
@@ -2,6 +2,8 @@ from __future__ import annotations
 
 from typing import Any, Tuple, cast
 
+from deprecated import deprecated
+
 from llama_index.bridge.pydantic import PrivateAttr
 from llama_index.callbacks import CallbackManager
 from llama_index.callbacks.schema import CBEventType, EventPayload
@@ -16,6 +18,7 @@ from llama_index.prompts import BasePromptTemplate
 from llama_index.types import TokenAsyncGen, TokenGen
 
 
+@deprecated("VellumPredictor is deprecated and will be removed in a future release.")
 class VellumPredictor(BaseLLMPredictor):
     _callback_manager: CallbackManager = PrivateAttr(default_factory=CallbackManager)
 
diff --git a/llama_index/llms/__init__.py b/llama_index/llms/__init__.py
index 870e1cf2ff5ed8c6b5b517e269ac15a807598769..901c0d24b12db4149585a231f178e0be14f76dc1 100644
--- a/llama_index/llms/__init__.py
+++ b/llama_index/llms/__init__.py
@@ -2,18 +2,6 @@ from llama_index.llms.ai21 import AI21
 from llama_index.llms.anthropic import Anthropic
 from llama_index.llms.anyscale import Anyscale
 from llama_index.llms.azure_openai import AzureOpenAI
-from llama_index.llms.base import (
-    LLM,
-    ChatMessage,
-    ChatResponse,
-    ChatResponseAsyncGen,
-    ChatResponseGen,
-    CompletionResponse,
-    CompletionResponseAsyncGen,
-    CompletionResponseGen,
-    LLMMetadata,
-    MessageRole,
-)
 from llama_index.llms.bedrock import Bedrock
 from llama_index.llms.clarifai import Clarifai
 from llama_index.llms.cohere import Cohere
@@ -25,6 +13,7 @@ from llama_index.llms.konko import Konko
 from llama_index.llms.langchain import LangChainLLM
 from llama_index.llms.litellm import LiteLLM
 from llama_index.llms.llama_cpp import LlamaCPP
+from llama_index.llms.llm import LLM
 from llama_index.llms.localai import LOCALAI_DEFAULTS, LocalAI
 from llama_index.llms.mock import MockLLM
 from llama_index.llms.monsterapi import MonsterLLM
@@ -37,6 +26,17 @@ from llama_index.llms.perplexity import Perplexity
 from llama_index.llms.portkey import Portkey
 from llama_index.llms.predibase import PredibaseLLM
 from llama_index.llms.replicate import Replicate
+from llama_index.llms.types import (
+    ChatMessage,
+    ChatResponse,
+    ChatResponseAsyncGen,
+    ChatResponseGen,
+    CompletionResponse,
+    CompletionResponseAsyncGen,
+    CompletionResponseGen,
+    LLMMetadata,
+    MessageRole,
+)
 from llama_index.llms.vertex import Vertex
 from llama_index.llms.vllm import Vllm, VllmServer
 from llama_index.llms.watsonx import WatsonX
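After this reshuffle the package root still re-exports the `LLM` base class alongside the message and response datatypes, so user-facing imports keep working. A minimal sketch, assuming OpenAI credentials are configured:

```python
from llama_index.llms import LLM, ChatMessage, MessageRole, OpenAI

llm: LLM = OpenAI(model="gpt-3.5-turbo")
messages = [
    ChatMessage(role=MessageRole.SYSTEM, content="You are a pirate."),
    ChatMessage(role=MessageRole.USER, content="Say hello."),
]
print(llm.chat(messages).message.content)
```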
diff --git a/llama_index/llms/ai21.py b/llama_index/llms/ai21.py
index 305aadbcaf11c2642a48947180ea203feb65c7c6..0ed8216b691737a7a1c619621c8dd459391901f7 100644
--- a/llama_index/llms/ai21.py
+++ b/llama_index/llms/ai21.py
@@ -1,23 +1,23 @@
-from typing import Any, Dict, Optional, Sequence
+from typing import Any, Callable, Dict, Optional, Sequence
 
 from llama_index.bridge.pydantic import Field, PrivateAttr
 from llama_index.callbacks import CallbackManager
 from llama_index.llms.ai21_utils import ai21_model_to_context_size
-from llama_index.llms.base import (
+from llama_index.llms.base import llm_chat_callback, llm_completion_callback
+from llama_index.llms.custom import CustomLLM
+from llama_index.llms.generic_utils import (
+    completion_to_chat_decorator,
+    get_from_param_or_env,
+)
+from llama_index.llms.types import (
     ChatMessage,
     ChatResponse,
     ChatResponseGen,
     CompletionResponse,
     CompletionResponseGen,
     LLMMetadata,
-    llm_chat_callback,
-    llm_completion_callback,
-)
-from llama_index.llms.custom import CustomLLM
-from llama_index.llms.generic_utils import (
-    completion_to_chat_decorator,
-    get_from_param_or_env,
 )
+from llama_index.types import BaseOutputParser, PydanticProgramMode
 
 
 class AI21(CustomLLM):
@@ -41,6 +41,11 @@ class AI21(CustomLLM):
         temperature: Optional[float] = 0.1,
         additional_kwargs: Optional[Dict[str, Any]] = None,
         callback_manager: Optional[CallbackManager] = None,
+        system_prompt: Optional[str] = None,
+        messages_to_prompt: Optional[Callable[[Sequence[ChatMessage]], str]] = None,
+        completion_to_prompt: Optional[Callable[[str], str]] = None,
+        pydantic_program_mode: PydanticProgramMode = PydanticProgramMode.DEFAULT,
+        output_parser: Optional[BaseOutputParser] = None,
     ) -> None:
         """Initialize params."""
         try:
@@ -63,6 +68,11 @@ class AI21(CustomLLM):
             temperature=temperature,
             additional_kwargs=additional_kwargs,
             callback_manager=callback_manager,
+            system_prompt=system_prompt,
+            messages_to_prompt=messages_to_prompt,
+            completion_to_prompt=completion_to_prompt,
+            pydantic_program_mode=pydantic_program_mode,
+            output_parser=output_parser,
         )
 
     @classmethod
diff --git a/llama_index/llms/anthropic.py b/llama_index/llms/anthropic.py
index 0cf10ad7c83150ad7accc73bacd122e8d1bcba9a..86ceff3a58b42413fcc9a7ed6fc00fa732dbd3a5 100644
--- a/llama_index/llms/anthropic.py
+++ b/llama_index/llms/anthropic.py
@@ -1,4 +1,4 @@
-from typing import Any, Dict, Optional, Sequence
+from typing import Any, Callable, Dict, Optional, Sequence
 
 from llama_index.bridge.pydantic import Field, PrivateAttr
 from llama_index.callbacks import CallbackManager
@@ -8,7 +8,17 @@ from llama_index.llms.anthropic_utils import (
     messages_to_anthropic_prompt,
 )
 from llama_index.llms.base import (
-    LLM,
+    llm_chat_callback,
+    llm_completion_callback,
+)
+from llama_index.llms.generic_utils import (
+    achat_to_completion_decorator,
+    astream_chat_to_completion_decorator,
+    chat_to_completion_decorator,
+    stream_chat_to_completion_decorator,
+)
+from llama_index.llms.llm import LLM
+from llama_index.llms.types import (
     ChatMessage,
     ChatResponse,
     ChatResponseAsyncGen,
@@ -18,15 +28,8 @@ from llama_index.llms.base import (
     CompletionResponseGen,
     LLMMetadata,
     MessageRole,
-    llm_chat_callback,
-    llm_completion_callback,
-)
-from llama_index.llms.generic_utils import (
-    achat_to_completion_decorator,
-    astream_chat_to_completion_decorator,
-    chat_to_completion_decorator,
-    stream_chat_to_completion_decorator,
 )
+from llama_index.types import BaseOutputParser, PydanticProgramMode
 
 DEFAULT_ANTHROPIC_MODEL = "claude-2"
 DEFAULT_ANTHROPIC_MAX_TOKENS = 512
@@ -73,6 +76,11 @@ class Anthropic(LLM):
         api_key: Optional[str] = None,
         additional_kwargs: Optional[Dict[str, Any]] = None,
         callback_manager: Optional[CallbackManager] = None,
+        system_prompt: Optional[str] = None,
+        messages_to_prompt: Optional[Callable[[Sequence[ChatMessage]], str]] = None,
+        completion_to_prompt: Optional[Callable[[str], str]] = None,
+        pydantic_program_mode: PydanticProgramMode = PydanticProgramMode.DEFAULT,
+        output_parser: Optional[BaseOutputParser] = None,
     ) -> None:
         try:
             import anthropic
@@ -101,6 +109,11 @@ class Anthropic(LLM):
             max_retries=max_retries,
             model=model,
             callback_manager=callback_manager,
+            system_prompt=system_prompt,
+            messages_to_prompt=messages_to_prompt,
+            completion_to_prompt=completion_to_prompt,
+            pydantic_program_mode=pydantic_program_mode,
+            output_parser=output_parser,
         )
 
     @classmethod
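The constructor changes in this block thread the new base-class options (`system_prompt`, `messages_to_prompt`, `completion_to_prompt`, `pydantic_program_mode`, `output_parser`) through to `LLM.__init__`. A minimal sketch using the Anthropic defaults shown above, assuming credentials are set in the environment; the prompts are illustrative:

```python
from llama_index.llms import Anthropic

llm = Anthropic(
    model="claude-2",  # DEFAULT_ANTHROPIC_MODEL above
    system_prompt="You are a terse assistant.",
    completion_to_prompt=lambda prompt: f"{prompt}\nAnswer in one sentence.",
)
print(llm.complete("What is retrieval-augmented generation?").text)
```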
diff --git a/llama_index/llms/anthropic_utils.py b/llama_index/llms/anthropic_utils.py
index 7ee85c684d5baf16bd3874aaeef4670fe9041472..f0904bd73aace13ec7bc758aefb0463062183fb6 100644
--- a/llama_index/llms/anthropic_utils.py
+++ b/llama_index/llms/anthropic_utils.py
@@ -1,6 +1,6 @@
 from typing import Dict, Sequence
 
-from llama_index.llms.base import ChatMessage, MessageRole
+from llama_index.llms.types import ChatMessage, MessageRole
 
 HUMAN_PREFIX = "\n\nHuman:"
 ASSISTANT_PREFIX = "\n\nAssistant:"
diff --git a/llama_index/llms/anyscale.py b/llama_index/llms/anyscale.py
index 714aa86858bff4a4b62ef1d4ce0816a9b8c6be9f..d9404326d11dd3fc58374bbc50544908c2152ad3 100644
--- a/llama_index/llms/anyscale.py
+++ b/llama_index/llms/anyscale.py
@@ -1,15 +1,14 @@
-from typing import Any, Dict, Optional
+from typing import Any, Callable, Dict, Optional, Sequence
 
 from llama_index.callbacks import CallbackManager
 from llama_index.constants import DEFAULT_NUM_OUTPUTS, DEFAULT_TEMPERATURE
 from llama_index.llms.anyscale_utils import (
     anyscale_modelname_to_contextsize,
 )
-from llama_index.llms.base import (
-    LLMMetadata,
-)
 from llama_index.llms.generic_utils import get_from_param_or_env
 from llama_index.llms.openai import OpenAI
+from llama_index.llms.types import ChatMessage, LLMMetadata
+from llama_index.types import BaseOutputParser, PydanticProgramMode
 
 DEFAULT_API_BASE = "https://api.endpoints.anyscale.com/v1"
 DEFAULT_MODEL = "meta-llama/Llama-2-70b-chat-hf"
@@ -26,6 +25,11 @@ class Anyscale(OpenAI):
         api_base: Optional[str] = DEFAULT_API_BASE,
         api_key: Optional[str] = None,
         callback_manager: Optional[CallbackManager] = None,
+        system_prompt: Optional[str] = None,
+        messages_to_prompt: Optional[Callable[[Sequence[ChatMessage]], str]] = None,
+        completion_to_prompt: Optional[Callable[[str], str]] = None,
+        pydantic_program_mode: PydanticProgramMode = PydanticProgramMode.DEFAULT,
+        output_parser: Optional[BaseOutputParser] = None,
     ) -> None:
         additional_kwargs = additional_kwargs or {}
         callback_manager = callback_manager or CallbackManager([])
@@ -42,6 +46,11 @@ class Anyscale(OpenAI):
             additional_kwargs=additional_kwargs,
             max_retries=max_retries,
             callback_manager=callback_manager,
+            system_prompt=system_prompt,
+            messages_to_prompt=messages_to_prompt,
+            completion_to_prompt=completion_to_prompt,
+            pydantic_program_mode=pydantic_program_mode,
+            output_parser=output_parser,
         )
 
     @classmethod
diff --git a/llama_index/llms/anyscale_utils.py b/llama_index/llms/anyscale_utils.py
index 6c7953c8c220a020075da40480cda92423a3c9c4..334f2c2591cea178700d12b61b722c354a28aa0d 100644
--- a/llama_index/llms/anyscale_utils.py
+++ b/llama_index/llms/anyscale_utils.py
@@ -1,6 +1,6 @@
 from typing import Any, Dict, List, Sequence
 
-from llama_index.llms.base import ChatMessage, MessageRole
+from llama_index.llms.types import ChatMessage, MessageRole
 
 LLAMA_MODELS = {
     "meta-llama/Llama-2-7b-chat-hf": 4096,
diff --git a/llama_index/llms/azure_openai.py b/llama_index/llms/azure_openai.py
index 5b7a2c2920dce07a35c011fa38f5a076954fe59d..0ffa3d32f6a62a1c5038563f10382d0e9b7866f3 100644
--- a/llama_index/llms/azure_openai.py
+++ b/llama_index/llms/azure_openai.py
@@ -1,4 +1,4 @@
-from typing import Any, Dict, Optional
+from typing import Any, Callable, Dict, Optional, Sequence
 
 import httpx
 from openai import AsyncAzureOpenAI
@@ -12,6 +12,8 @@ from llama_index.llms.openai_utils import (
     refresh_openai_azuread_token,
     resolve_from_aliases,
 )
+from llama_index.llms.types import ChatMessage
+from llama_index.types import BaseOutputParser, PydanticProgramMode
 
 
 class AzureOpenAI(OpenAI):
@@ -77,6 +79,12 @@ class AzureOpenAI(OpenAI):
         deployment: Optional[str] = None,
         # custom httpx client
         http_client: Optional[httpx.Client] = None,
+        # base class
+        system_prompt: Optional[str] = None,
+        messages_to_prompt: Optional[Callable[[Sequence[ChatMessage]], str]] = None,
+        completion_to_prompt: Optional[Callable[[str], str]] = None,
+        pydantic_program_mode: PydanticProgramMode = PydanticProgramMode.DEFAULT,
+        output_parser: Optional[BaseOutputParser] = None,
         **kwargs: Any,
     ) -> None:
         engine = resolve_from_aliases(
@@ -109,6 +117,11 @@ class AzureOpenAI(OpenAI):
             use_azure_ad=use_azure_ad,
             api_version=api_version,
             callback_manager=callback_manager,
+            system_prompt=system_prompt,
+            messages_to_prompt=messages_to_prompt,
+            completion_to_prompt=completion_to_prompt,
+            pydantic_program_mode=pydantic_program_mode,
+            output_parser=output_parser,
             **kwargs,
         )
 
diff --git a/llama_index/llms/base.py b/llama_index/llms/base.py
index 8aee9c90f95339382ba77d7f142fe9486beb085d..734143046f16ca3d73b2ec6aab845773d72e6c7d 100644
--- a/llama_index/llms/base.py
+++ b/llama_index/llms/base.py
@@ -1,120 +1,30 @@
 import asyncio
 from abc import abstractmethod
 from contextlib import contextmanager
-from enum import Enum
-from typing import Any, AsyncGenerator, Callable, Generator, Optional, Sequence, cast
-
-from llama_index.bridge.pydantic import BaseModel, Field, validator
+from typing import (
+    Any,
+    AsyncGenerator,
+    Callable,
+    Generator,
+    Sequence,
+    cast,
+)
+
+from llama_index.bridge.pydantic import Field, validator
 from llama_index.callbacks import CallbackManager, CBEventType, EventPayload
-from llama_index.constants import DEFAULT_CONTEXT_WINDOW, DEFAULT_NUM_OUTPUTS
+from llama_index.llms.types import (
+    ChatMessage,
+    ChatResponse,
+    ChatResponseAsyncGen,
+    ChatResponseGen,
+    CompletionResponse,
+    CompletionResponseAsyncGen,
+    CompletionResponseGen,
+    LLMMetadata,
+)
 from llama_index.schema import BaseComponent
 
 
-class MessageRole(str, Enum):
-    """Message role."""
-
-    SYSTEM = "system"
-    USER = "user"
-    ASSISTANT = "assistant"
-    FUNCTION = "function"
-    TOOL = "tool"
-
-
-# ===== Generic Model Input - Chat =====
-class ChatMessage(BaseModel):
-    """Chat message."""
-
-    role: MessageRole = MessageRole.USER
-    content: Optional[Any] = ""
-    additional_kwargs: dict = Field(default_factory=dict)
-
-    def __str__(self) -> str:
-        return f"{self.role.value}: {self.content}"
-
-
-# ===== Generic Model Output - Chat =====
-class ChatResponse(BaseModel):
-    """Chat response."""
-
-    message: ChatMessage
-    raw: Optional[dict] = None
-    delta: Optional[str] = None
-    additional_kwargs: dict = Field(default_factory=dict)
-
-    def __str__(self) -> str:
-        return str(self.message)
-
-
-ChatResponseGen = Generator[ChatResponse, None, None]
-ChatResponseAsyncGen = AsyncGenerator[ChatResponse, None]
-
-
-# ===== Generic Model Output - Completion =====
-class CompletionResponse(BaseModel):
-    """
-    Completion response.
-
-    Fields:
-        text: Text content of the response if not streaming, or if streaming,
-            the current extent of streamed text.
-        additional_kwargs: Additional information on the response(i.e. token
-            counts, function calling information).
-        raw: Optional raw JSON that was parsed to populate text, if relevant.
-        delta: New text that just streamed in (only relevant when streaming).
-    """
-
-    text: str
-    additional_kwargs: dict = Field(default_factory=dict)
-    raw: Optional[dict] = None
-    delta: Optional[str] = None
-
-    def __str__(self) -> str:
-        return self.text
-
-
-CompletionResponseGen = Generator[CompletionResponse, None, None]
-CompletionResponseAsyncGen = AsyncGenerator[CompletionResponse, None]
-
-
-class LLMMetadata(BaseModel):
-    context_window: int = Field(
-        default=DEFAULT_CONTEXT_WINDOW,
-        description=(
-            "Total number of tokens the model can be input and output for one response."
-        ),
-    )
-    num_output: int = Field(
-        default=DEFAULT_NUM_OUTPUTS,
-        description="Number of tokens the model can output when generating a response.",
-    )
-    is_chat_model: bool = Field(
-        default=False,
-        description=(
-            "Set True if the model exposes a chat interface (i.e. can be passed a"
-            " sequence of messages, rather than text), like OpenAI's"
-            " /v1/chat/completions endpoint."
-        ),
-    )
-    is_function_calling_model: bool = Field(
-        default=False,
-        # SEE: https://openai.com/blog/function-calling-and-other-api-updates
-        description=(
-            "Set True if the model supports function calling messages, similar to"
-            " OpenAI's function calling API. For example, converting 'Email Anya to"
-            " see if she wants to get coffee next Friday' to a function call like"
-            " `send_email(to: string, body: string)`."
-        ),
-    )
-    model_name: str = Field(
-        default="unknown",
-        description=(
-            "The model's name used for logging, testing, and sanity checking. For some"
-            " models this can be automatically discerned. For other models, like"
-            " locally loaded models, this must be manually specified."
-        ),
-    )
-
-
 def llm_chat_callback() -> Callable:
     def wrap(f: Callable) -> Callable:
         @contextmanager
@@ -366,7 +276,7 @@ def llm_completion_callback() -> Callable:
     return wrap
 
 
-class LLM(BaseComponent):
+class BaseLLM(BaseComponent):
     """LLM interface."""
 
     callback_manager: CallbackManager = Field(
diff --git a/llama_index/llms/bedrock.py b/llama_index/llms/bedrock.py
index e92961f33dee60abcaf4876a640c7ec0f4237704..327fc7d485e0328b1f9efd0ccbc3b93c26fcf51e 100644
--- a/llama_index/llms/bedrock.py
+++ b/llama_index/llms/bedrock.py
@@ -1,18 +1,9 @@
 import json
-from typing import Any, Dict, Optional, Sequence
+from typing import Any, Callable, Dict, Optional, Sequence
 
 from llama_index.bridge.pydantic import Field, PrivateAttr
 from llama_index.callbacks import CallbackManager
 from llama_index.llms.base import (
-    LLM,
-    ChatMessage,
-    ChatResponse,
-    ChatResponseAsyncGen,
-    ChatResponseGen,
-    CompletionResponse,
-    CompletionResponseAsyncGen,
-    CompletionResponseGen,
-    LLMMetadata,
     llm_chat_callback,
     llm_completion_callback,
 )
@@ -26,6 +17,18 @@ from llama_index.llms.bedrock_utils import (
     get_text_from_response,
     stream_completion_to_chat_decorator,
 )
+from llama_index.llms.llm import LLM
+from llama_index.llms.types import (
+    ChatMessage,
+    ChatResponse,
+    ChatResponseAsyncGen,
+    ChatResponseGen,
+    CompletionResponse,
+    CompletionResponseAsyncGen,
+    CompletionResponseGen,
+    LLMMetadata,
+)
+from llama_index.types import BaseOutputParser, PydanticProgramMode
 
 
 class Bedrock(LLM):
@@ -70,6 +73,11 @@ class Bedrock(LLM):
         max_retries: Optional[int] = 10,
         additional_kwargs: Optional[Dict[str, Any]] = None,
         callback_manager: Optional[CallbackManager] = None,
+        system_prompt: Optional[str] = None,
+        messages_to_prompt: Optional[Callable[[Sequence[ChatMessage]], str]] = None,
+        completion_to_prompt: Optional[Callable[[str], str]] = None,
+        pydantic_program_mode: PydanticProgramMode = PydanticProgramMode.DEFAULT,
+        output_parser: Optional[BaseOutputParser] = None,
     ) -> None:
         if context_size is None and model not in BEDROCK_FOUNDATION_LLMS:
             raise ValueError(
@@ -124,6 +132,11 @@ class Bedrock(LLM):
             max_retries=max_retries,
             additional_kwargs=additional_kwargs,
             callback_manager=callback_manager,
+            system_prompt=system_prompt,
+            messages_to_prompt=messages_to_prompt,
+            completion_to_prompt=completion_to_prompt,
+            pydantic_program_mode=pydantic_program_mode,
+            output_parser=output_parser,
         )
 
     @classmethod
@@ -136,7 +149,7 @@ class Bedrock(LLM):
         return LLMMetadata(
             context_window=self.context_size,
             num_output=self.max_tokens,
-            is_chat_model=True,
+            is_chat_model=False,
             model_name=self.model,
         )
 
diff --git a/llama_index/llms/bedrock_utils.py b/llama_index/llms/bedrock_utils.py
index c6389c37e772aea6216a3b57599e9afcbe9ab25e..9bc756049cdb90de600f363be5c44e815ddea9c7 100644
--- a/llama_index/llms/bedrock_utils.py
+++ b/llama_index/llms/bedrock_utils.py
@@ -9,7 +9,11 @@ from tenacity import (
     wait_exponential,
 )
 
-from llama_index.llms.base import (
+from llama_index.llms.generic_utils import (
+    completion_response_to_chat_response,
+    stream_completion_response_to_chat_response,
+)
+from llama_index.llms.types import (
     ChatMessage,
     ChatResponse,
     ChatResponseGen,
@@ -17,10 +21,6 @@ from llama_index.llms.base import (
     CompletionResponseGen,
     MessageRole,
 )
-from llama_index.llms.generic_utils import (
-    completion_response_to_chat_response,
-    stream_completion_response_to_chat_response,
-)
 
 HUMAN_PREFIX = "\n\nHuman:"
 ASSISTANT_PREFIX = "\n\nAssistant:"
diff --git a/llama_index/llms/clarifai.py b/llama_index/llms/clarifai.py
index aa4c23a5f84bc2382698058801b499431af72643..28e5d3dad4a7a134ac19a54e07a678387b48626a 100644
--- a/llama_index/llms/clarifai.py
+++ b/llama_index/llms/clarifai.py
@@ -1,9 +1,13 @@
-from typing import Any, Dict, Optional, Sequence
+from typing import Any, Callable, Dict, Optional, Sequence
 
 from llama_index.bridge.pydantic import Field, PrivateAttr
 from llama_index.callbacks import CallbackManager
 from llama_index.llms.base import (
-    LLM,
+    llm_chat_callback,
+    llm_completion_callback,
+)
+from llama_index.llms.llm import LLM
+from llama_index.llms.types import (
     ChatMessage,
     ChatResponse,
     ChatResponseAsyncGen,
@@ -12,9 +16,8 @@ from llama_index.llms.base import (
     CompletionResponseAsyncGen,
     CompletionResponseGen,
     LLMMetadata,
-    llm_chat_callback,
-    llm_completion_callback,
 )
+from llama_index.types import BaseOutputParser, PydanticProgramMode
 
 EXAMPLE_URL = "https://clarifai.com/anthropic/completion/models/claude-v2"
 
@@ -41,6 +44,11 @@ class Clarifai(LLM):
         max_tokens: int = 512,
         additional_kwargs: Optional[Dict[str, Any]] = None,
         callback_manager: Optional[CallbackManager] = None,
+        system_prompt: Optional[str] = None,
+        messages_to_prompt: Optional[Callable[[Sequence[ChatMessage]], str]] = None,
+        completion_to_prompt: Optional[Callable[[str], str]] = None,
+        pydantic_program_mode: PydanticProgramMode = PydanticProgramMode.DEFAULT,
+        output_parser: Optional[BaseOutputParser] = None,
     ):
         try:
             from clarifai.client.model import Model
@@ -81,6 +89,11 @@ class Clarifai(LLM):
             additional_kwargs=additional_kwargs,
             callback_manager=callback_manager,
             model_name=model_name,
+            system_prompt=system_prompt,
+            messages_to_prompt=messages_to_prompt,
+            completion_to_prompt=completion_to_prompt,
+            pydantic_program_mode=pydantic_program_mode,
+            output_parser=output_parser,
         )
 
     @classmethod
diff --git a/llama_index/llms/cohere.py b/llama_index/llms/cohere.py
index e5de8acd40327053ce6867b3e51bc1807953df23..2383a2eae0ed85a370058e8c72f6739c9484b02b 100644
--- a/llama_index/llms/cohere.py
+++ b/llama_index/llms/cohere.py
@@ -1,19 +1,9 @@
 import warnings
-from typing import Any, Dict, Optional, Sequence
+from typing import Any, Callable, Dict, Optional, Sequence
 
 from llama_index.bridge.pydantic import Field, PrivateAttr
 from llama_index.callbacks import CallbackManager
 from llama_index.llms.base import (
-    LLM,
-    ChatMessage,
-    ChatResponse,
-    ChatResponseAsyncGen,
-    ChatResponseGen,
-    CompletionResponse,
-    CompletionResponseAsyncGen,
-    CompletionResponseGen,
-    LLMMetadata,
-    MessageRole,
     llm_chat_callback,
     llm_completion_callback,
 )
@@ -24,6 +14,19 @@ from llama_index.llms.cohere_utils import (
     completion_with_retry,
     messages_to_cohere_history,
 )
+from llama_index.llms.llm import LLM
+from llama_index.llms.types import (
+    ChatMessage,
+    ChatResponse,
+    ChatResponseAsyncGen,
+    ChatResponseGen,
+    CompletionResponse,
+    CompletionResponseAsyncGen,
+    CompletionResponseGen,
+    LLMMetadata,
+    MessageRole,
+)
+from llama_index.types import BaseOutputParser, PydanticProgramMode
 
 
 class Cohere(LLM):
@@ -50,6 +53,11 @@ class Cohere(LLM):
         api_key: Optional[str] = None,
         additional_kwargs: Optional[Dict[str, Any]] = None,
         callback_manager: Optional[CallbackManager] = None,
+        system_prompt: Optional[str] = None,
+        messages_to_prompt: Optional[Callable[[Sequence[ChatMessage]], str]] = None,
+        completion_to_prompt: Optional[Callable[[str], str]] = None,
+        pydantic_program_mode: PydanticProgramMode = PydanticProgramMode.DEFAULT,
+        output_parser: Optional[BaseOutputParser] = None,
     ) -> None:
         try:
             import cohere
@@ -72,6 +80,11 @@ class Cohere(LLM):
             model=model,
             callback_manager=callback_manager,
             max_tokens=max_tokens,
+            system_prompt=system_prompt,
+            messages_to_prompt=messages_to_prompt,
+            completion_to_prompt=completion_to_prompt,
+            pydantic_program_mode=pydantic_program_mode,
+            output_parser=output_parser,
         )
 
     @classmethod
diff --git a/llama_index/llms/cohere_utils.py b/llama_index/llms/cohere_utils.py
index 183564be2098e8e457807422f76f003360dc546b..292102f51016c9c92e48b3d437270d7f7c4f2cdb 100644
--- a/llama_index/llms/cohere_utils.py
+++ b/llama_index/llms/cohere_utils.py
@@ -9,7 +9,7 @@ from tenacity import (
     wait_exponential,
 )
 
-from llama_index.llms.base import ChatMessage
+from llama_index.llms.types import ChatMessage
 
 COMMAND_MODELS = {
     "command": 4096,
diff --git a/llama_index/llms/custom.py b/llama_index/llms/custom.py
index 90a70dbee6e8d25a98b04385706d9731c39b9ddd..48eee2aee4368fbc1e8447660ed247f4fb4ea477 100644
--- a/llama_index/llms/custom.py
+++ b/llama_index/llms/custom.py
@@ -1,13 +1,6 @@
 from typing import Any, Sequence
 
 from llama_index.llms.base import (
-    LLM,
-    ChatMessage,
-    ChatResponse,
-    ChatResponseAsyncGen,
-    ChatResponseGen,
-    CompletionResponse,
-    CompletionResponseAsyncGen,
     llm_chat_callback,
     llm_completion_callback,
 )
@@ -15,13 +8,22 @@ from llama_index.llms.generic_utils import (
     completion_to_chat_decorator,
     stream_completion_to_chat_decorator,
 )
+from llama_index.llms.llm import LLM
+from llama_index.llms.types import (
+    ChatMessage,
+    ChatResponse,
+    ChatResponseAsyncGen,
+    ChatResponseGen,
+    CompletionResponse,
+    CompletionResponseAsyncGen,
+)
 
 
 class CustomLLM(LLM):
     """Simple abstract base class for custom LLMs.
 
-    Subclasses must implement the `__init__`, `complete`,
-        `stream_complete`, and `metadata` methods.
+    Subclasses must implement the `__init__`, `_complete`,
+        `_stream_complete`, and `metadata` methods.
     """
 
     @llm_chat_callback()
diff --git a/llama_index/llms/everlyai.py b/llama_index/llms/everlyai.py
index 1ff6404b5935574e09738051dc0d6244e9686566..708b801db46579e53729b24c4a7ec084a2f4d196 100644
--- a/llama_index/llms/everlyai.py
+++ b/llama_index/llms/everlyai.py
@@ -1,11 +1,12 @@
-from typing import Any, Dict, Optional
+from typing import Any, Callable, Dict, Optional, Sequence
 
 from llama_index.callbacks import CallbackManager
 from llama_index.constants import DEFAULT_NUM_OUTPUTS, DEFAULT_TEMPERATURE
-from llama_index.llms.base import LLMMetadata
 from llama_index.llms.everlyai_utils import everlyai_modelname_to_contextsize
 from llama_index.llms.generic_utils import get_from_param_or_env
 from llama_index.llms.openai import OpenAI
+from llama_index.llms.types import ChatMessage, LLMMetadata
+from llama_index.types import BaseOutputParser, PydanticProgramMode
 
 EVERLYAI_API_BASE = "https://everlyai.xyz/hosted"
 DEFAULT_MODEL = "meta-llama/Llama-2-7b-chat-hf"
@@ -21,6 +22,11 @@ class EverlyAI(OpenAI):
         max_retries: int = 10,
         api_key: Optional[str] = None,
         callback_manager: Optional[CallbackManager] = None,
+        system_prompt: Optional[str] = None,
+        messages_to_prompt: Optional[Callable[[Sequence[ChatMessage]], str]] = None,
+        completion_to_prompt: Optional[Callable[[str], str]] = None,
+        pydantic_program_mode: PydanticProgramMode = PydanticProgramMode.DEFAULT,
+        output_parser: Optional[BaseOutputParser] = None,
     ) -> None:
         additional_kwargs = additional_kwargs or {}
         callback_manager = callback_manager or CallbackManager([])
@@ -36,6 +42,11 @@ class EverlyAI(OpenAI):
             additional_kwargs=additional_kwargs,
             max_retries=max_retries,
             callback_manager=callback_manager,
+            system_prompt=system_prompt,
+            messages_to_prompt=messages_to_prompt,
+            completion_to_prompt=completion_to_prompt,
+            pydantic_program_mode=pydantic_program_mode,
+            output_parser=output_parser,
         )
 
     @classmethod
diff --git a/llama_index/llms/generic_utils.py b/llama_index/llms/generic_utils.py
index fd501dc12d6004a695744e7542ad9eb2e0090929..3ad12c0c2bda0a17666808b8db949a1fea547df6 100644
--- a/llama_index/llms/generic_utils.py
+++ b/llama_index/llms/generic_utils.py
@@ -1,7 +1,7 @@
 import os
 from typing import Any, Awaitable, Callable, List, Optional, Sequence
 
-from llama_index.llms.base import (
+from llama_index.llms.types import (
     ChatMessage,
     ChatResponse,
     ChatResponseAsyncGen,
diff --git a/llama_index/llms/gradient.py b/llama_index/llms/gradient.py
index cadcdcf6ef3120e5e3acb339f49f296189da5e3f..c2091523beff2d5fd9edfe803d473e43f0485c08 100644
--- a/llama_index/llms/gradient.py
+++ b/llama_index/llms/gradient.py
@@ -1,17 +1,19 @@
-from typing import Any, Optional
+from typing import Any, Callable, Optional, Sequence
 
 from typing_extensions import override
 
 from llama_index.bridge.pydantic import Field, PrivateAttr
 from llama_index.callbacks import CallbackManager
 from llama_index.constants import DEFAULT_NUM_OUTPUTS
-from llama_index.llms import (
+from llama_index.llms.base import llm_completion_callback
+from llama_index.llms.custom import CustomLLM
+from llama_index.llms.types import (
+    ChatMessage,
     CompletionResponse,
     CompletionResponseGen,
-    CustomLLM,
     LLMMetadata,
 )
-from llama_index.llms.base import llm_completion_callback
+from llama_index.types import BaseOutputParser, PydanticProgramMode
 
 
 class _BaseGradientLLM(CustomLLM):
@@ -49,6 +51,11 @@ class _BaseGradientLLM(CustomLLM):
         workspace_id: Optional[str] = None,
         callback_manager: Optional[CallbackManager] = None,
         is_chat_model: bool = False,
+        system_prompt: Optional[str] = None,
+        messages_to_prompt: Optional[Callable[[Sequence[ChatMessage]], str]] = None,
+        completion_to_prompt: Optional[Callable[[str], str]] = None,
+        pydantic_program_mode: PydanticProgramMode = PydanticProgramMode.DEFAULT,
+        output_parser: Optional[BaseOutputParser] = None,
         **kwargs: Any,
     ) -> None:
         super().__init__(
@@ -58,6 +65,11 @@ class _BaseGradientLLM(CustomLLM):
             workspace_id=workspace_id,
             callback_manager=callback_manager,
             is_chat_model=is_chat_model,
+            system_prompt=system_prompt,
+            messages_to_prompt=messages_to_prompt,
+            completion_to_prompt=completion_to_prompt,
+            pydantic_program_mode=pydantic_program_mode,
+            output_parser=output_parser,
             **kwargs,
         )
         try:
diff --git a/llama_index/llms/huggingface.py b/llama_index/llms/huggingface.py
index fb1fc67b8e72a566cf52785902760a6374730a5c..acd9fe9ea080c68a9a3a94bd246f83bb9c62e1ad 100644
--- a/llama_index/llms/huggingface.py
+++ b/llama_index/llms/huggingface.py
@@ -8,16 +8,7 @@ from llama_index.constants import (
     DEFAULT_CONTEXT_WINDOW,
     DEFAULT_NUM_OUTPUTS,
 )
-from llama_index.llms import ChatResponseAsyncGen, CompletionResponseAsyncGen
 from llama_index.llms.base import (
-    LLM,
-    ChatMessage,
-    ChatResponse,
-    ChatResponseGen,
-    CompletionResponse,
-    CompletionResponseGen,
-    LLMMetadata,
-    MessageRole,
     llm_chat_callback,
     llm_completion_callback,
 )
@@ -29,7 +20,19 @@ from llama_index.llms.generic_utils import (
 from llama_index.llms.generic_utils import (
     messages_to_prompt as generic_messages_to_prompt,
 )
+from llama_index.llms.types import (
+    ChatMessage,
+    ChatResponse,
+    ChatResponseAsyncGen,
+    ChatResponseGen,
+    CompletionResponse,
+    CompletionResponseAsyncGen,
+    CompletionResponseGen,
+    LLMMetadata,
+    MessageRole,
+)
 from llama_index.prompts.base import PromptTemplate
+from llama_index.types import BaseOutputParser, PydanticProgramMode
 
 DEFAULT_HUGGINGFACE_MODEL = "StabilityAI/stablelm-tuned-alpha-3b"
 if TYPE_CHECKING:
@@ -73,8 +76,8 @@ class HuggingFaceLLM(CustomLLM):
             "The model card on HuggingFace should specify if this is needed."
         ),
     )
-    query_wrapper_prompt: str = Field(
-        default="{query_str}",
+    query_wrapper_prompt: PromptTemplate = Field(
+        default=PromptTemplate("{query_str}"),
         description=(
             "The query wrapper prompt, containing the query placeholder. "
             "The model card on HuggingFace should specify if this is needed. "
@@ -129,13 +132,11 @@ class HuggingFaceLLM(CustomLLM):
     _model: Any = PrivateAttr()
     _tokenizer: Any = PrivateAttr()
     _stopping_criteria: Any = PrivateAttr()
-    _messages_to_prompt: Callable = PrivateAttr()
 
     def __init__(
         self,
         context_window: int = DEFAULT_CONTEXT_WINDOW,
         max_new_tokens: int = DEFAULT_NUM_OUTPUTS,
-        system_prompt: str = "",
         query_wrapper_prompt: Union[str, PromptTemplate] = "{query_str}",
         tokenizer_name: str = DEFAULT_HUGGINGFACE_MODEL,
         model_name: str = DEFAULT_HUGGINGFACE_MODEL,
@@ -148,8 +149,12 @@ class HuggingFaceLLM(CustomLLM):
         model_kwargs: Optional[dict] = None,
         generate_kwargs: Optional[dict] = None,
         is_chat_model: Optional[bool] = False,
-        messages_to_prompt: Optional[Callable] = None,
         callback_manager: Optional[CallbackManager] = None,
+        system_prompt: Optional[str] = None,
+        messages_to_prompt: Optional[Callable[[Sequence[ChatMessage]], str]] = None,
+        completion_to_prompt: Optional[Callable[[str], str]] = None,
+        pydantic_program_mode: PydanticProgramMode = PydanticProgramMode.DEFAULT,
+        output_parser: Optional[BaseOutputParser] = None,
     ) -> None:
         """Initialize params."""
         try:
@@ -215,8 +220,8 @@ class HuggingFaceLLM(CustomLLM):
 
         self._stopping_criteria = StoppingCriteriaList([StopOnTokens()])
 
-        if isinstance(query_wrapper_prompt, PromptTemplate):
-            query_wrapper_prompt = query_wrapper_prompt.template
+        if isinstance(query_wrapper_prompt, str):
+            query_wrapper_prompt = PromptTemplate(query_wrapper_prompt)
 
         self._messages_to_prompt = (
             messages_to_prompt or self._tokenizer_messages_to_prompt
@@ -225,7 +230,6 @@ class HuggingFaceLLM(CustomLLM):
         super().__init__(
             context_window=context_window,
             max_new_tokens=max_new_tokens,
-            system_prompt=system_prompt,
             query_wrapper_prompt=query_wrapper_prompt,
             tokenizer_name=tokenizer_name,
             model_name=model_name,
@@ -237,6 +241,11 @@ class HuggingFaceLLM(CustomLLM):
             generate_kwargs=generate_kwargs or {},
             is_chat_model=is_chat_model,
             callback_manager=callback_manager,
+            system_prompt=system_prompt,
+            messages_to_prompt=messages_to_prompt,
+            completion_to_prompt=completion_to_prompt,
+            pydantic_program_mode=pydantic_program_mode,
+            output_parser=output_parser,
         )
 
     @classmethod
@@ -386,7 +395,7 @@ def chat_messages_to_conversational_kwargs(
     return kwargs
 
 
-class HuggingFaceInferenceAPI(LLM):
+class HuggingFaceInferenceAPI(CustomLLM):
     """
     Wrapper on the Hugging Face's Inference API.
 
diff --git a/llama_index/llms/konko.py b/llama_index/llms/konko.py
index 10af19d63fc730bd4c315d4060e11a5dc50ad1e3..ecb0562585fe80a02ee86ff5b71e51d05bd03123 100644
--- a/llama_index/llms/konko.py
+++ b/llama_index/llms/konko.py
@@ -3,19 +3,7 @@ from typing import Any, Awaitable, Callable, Dict, Optional, Sequence
 from llama_index.bridge.pydantic import Field
 from llama_index.callbacks import CallbackManager
 from llama_index.constants import DEFAULT_NUM_OUTPUTS, DEFAULT_TEMPERATURE
-from llama_index.llms.base import (
-    LLM,
-    ChatMessage,
-    ChatResponse,
-    ChatResponseAsyncGen,
-    ChatResponseGen,
-    CompletionResponse,
-    CompletionResponseAsyncGen,
-    CompletionResponseGen,
-    LLMMetadata,
-    llm_chat_callback,
-    llm_completion_callback,
-)
+from llama_index.llms.base import llm_chat_callback, llm_completion_callback
 from llama_index.llms.generic_utils import (
     achat_to_completion_decorator,
     acompletion_to_chat_decorator,
@@ -35,6 +23,18 @@ from llama_index.llms.konko_utils import (
     resolve_konko_credentials,
     to_openai_message_dicts,
 )
+from llama_index.llms.llm import LLM
+from llama_index.llms.types import (
+    ChatMessage,
+    ChatResponse,
+    ChatResponseAsyncGen,
+    ChatResponseGen,
+    CompletionResponse,
+    CompletionResponseAsyncGen,
+    CompletionResponseGen,
+    LLMMetadata,
+)
+from llama_index.types import BaseOutputParser, PydanticProgramMode
 
 DEFAULT_KONKO_MODEL = "meta-llama/Llama-2-13b-chat-hf"
 
@@ -80,6 +80,11 @@ class Konko(LLM):
         api_base: Optional[str] = None,
         api_version: Optional[str] = None,
         callback_manager: Optional[CallbackManager] = None,
+        system_prompt: Optional[str] = None,
+        messages_to_prompt: Optional[Callable[[Sequence[ChatMessage]], str]] = None,
+        completion_to_prompt: Optional[Callable[[str], str]] = None,
+        pydantic_program_mode: PydanticProgramMode = PydanticProgramMode.DEFAULT,
+        output_parser: Optional[BaseOutputParser] = None,
         **kwargs: Any,
     ) -> None:
         additional_kwargs = additional_kwargs or {}
@@ -110,6 +115,11 @@ class Konko(LLM):
             api_type=api_type,
             api_version=api_version,
             api_base=api_base,
+            system_prompt=system_prompt,
+            messages_to_prompt=messages_to_prompt,
+            completion_to_prompt=completion_to_prompt,
+            pydantic_program_mode=pydantic_program_mode,
+            output_parser=output_parser,
             **kwargs,
         )
 
diff --git a/llama_index/llms/konko_utils.py b/llama_index/llms/konko_utils.py
index 6859f969908489e0306bc7c28e0b2555330d178a..a097aab4cb9f97d9a64a173174de225c34ab2d30 100644
--- a/llama_index/llms/konko_utils.py
+++ b/llama_index/llms/konko_utils.py
@@ -11,8 +11,8 @@ from tenacity import (
 )
 
 from llama_index.bridge.pydantic import BaseModel
-from llama_index.llms.base import ChatMessage
 from llama_index.llms.generic_utils import get_from_param_or_env
+from llama_index.llms.types import ChatMessage
 
 DEFAULT_KONKO_API_TYPE = "open_ai"
 DEFAULT_KONKO_API_BASE = "https://api.konko.ai/v1"
diff --git a/llama_index/llms/langchain.py b/llama_index/llms/langchain.py
index 145f60a5f93a4b1ffb264850ba82430d2c3f07ca..b0f49ae9dee82ac43ffc6d7dbfed5d4f53e2d5de 100644
--- a/llama_index/llms/langchain.py
+++ b/llama_index/llms/langchain.py
@@ -1,13 +1,14 @@
 from threading import Thread
-from typing import TYPE_CHECKING, Any, Generator, Optional, Sequence
+from typing import TYPE_CHECKING, Any, Callable, Generator, Optional, Sequence
 
 if TYPE_CHECKING:
     from langchain.base_language import BaseLanguageModel
 
 from llama_index.bridge.pydantic import PrivateAttr
 from llama_index.callbacks import CallbackManager
-from llama_index.llms.base import (
-    LLM,
+from llama_index.llms.base import llm_chat_callback, llm_completion_callback
+from llama_index.llms.llm import LLM
+from llama_index.llms.types import (
     ChatMessage,
     ChatResponse,
     ChatResponseAsyncGen,
@@ -16,9 +17,8 @@ from llama_index.llms.base import (
     CompletionResponseAsyncGen,
     CompletionResponseGen,
     LLMMetadata,
-    llm_chat_callback,
-    llm_completion_callback,
 )
+from llama_index.types import BaseOutputParser, PydanticProgramMode
 
 
 class LangChainLLM(LLM):
@@ -30,9 +30,21 @@ class LangChainLLM(LLM):
         self,
         llm: "BaseLanguageModel",
         callback_manager: Optional[CallbackManager] = None,
+        system_prompt: Optional[str] = None,
+        messages_to_prompt: Optional[Callable[[Sequence[ChatMessage]], str]] = None,
+        completion_to_prompt: Optional[Callable[[str], str]] = None,
+        pydantic_program_mode: PydanticProgramMode = PydanticProgramMode.DEFAULT,
+        output_parser: Optional[BaseOutputParser] = None,
     ) -> None:
         self._llm = llm
-        super().__init__(callback_manager=callback_manager)
+        super().__init__(
+            callback_manager=callback_manager,
+            system_prompt=system_prompt,
+            messages_to_prompt=messages_to_prompt,
+            completion_to_prompt=completion_to_prompt,
+            pydantic_program_mode=pydantic_program_mode,
+            output_parser=output_parser,
+        )
 
     @classmethod
     def class_name(cls) -> str:
diff --git a/llama_index/llms/langchain_utils.py b/llama_index/llms/langchain_utils.py
index ef47ff9a4c520935c117ddcf125d03108e416f14..accc029b082b3b74e0ddac719d50acc1a553ab5c 100644
--- a/llama_index/llms/langchain_utils.py
+++ b/llama_index/llms/langchain_utils.py
@@ -16,8 +16,8 @@ from llama_index.bridge.langchain import (
 from llama_index.bridge.langchain import BaseMessage as LCMessage
 from llama_index.constants import AI21_J2_CONTEXT_WINDOW, COHERE_CONTEXT_WINDOW
 from llama_index.llms.anyscale_utils import anyscale_modelname_to_contextsize
-from llama_index.llms.base import ChatMessage, LLMMetadata, MessageRole
 from llama_index.llms.openai_utils import openai_modelname_to_contextsize
+from llama_index.llms.types import ChatMessage, LLMMetadata, MessageRole
 
 
 def is_chat_model(llm: BaseLanguageModel) -> bool:
diff --git a/llama_index/llms/litellm.py b/llama_index/llms/litellm.py
index b0acaaa8e0f905f7e51c17c0def22fc280f78b1a..e1524c630676120776337f5edd1e3dd59615a724 100644
--- a/llama_index/llms/litellm.py
+++ b/llama_index/llms/litellm.py
@@ -3,19 +3,7 @@ from typing import Any, Awaitable, Callable, Dict, Optional, Sequence
 from llama_index.bridge.pydantic import Field
 from llama_index.callbacks import CallbackManager
 from llama_index.constants import DEFAULT_TEMPERATURE
-from llama_index.llms.base import (
-    LLM,
-    ChatMessage,
-    ChatResponse,
-    ChatResponseAsyncGen,
-    ChatResponseGen,
-    CompletionResponse,
-    CompletionResponseAsyncGen,
-    CompletionResponseGen,
-    LLMMetadata,
-    llm_chat_callback,
-    llm_completion_callback,
-)
+from llama_index.llms.base import llm_chat_callback, llm_completion_callback
 from llama_index.llms.generic_utils import (
     achat_to_completion_decorator,
     acompletion_to_chat_decorator,
@@ -35,6 +23,18 @@ from llama_index.llms.litellm_utils import (
     to_openai_message_dicts,
     validate_litellm_api_key,
 )
+from llama_index.llms.llm import LLM
+from llama_index.llms.types import (
+    ChatMessage,
+    ChatResponse,
+    ChatResponseAsyncGen,
+    ChatResponseGen,
+    CompletionResponse,
+    CompletionResponseAsyncGen,
+    CompletionResponseGen,
+    LLMMetadata,
+)
+from llama_index.types import BaseOutputParser, PydanticProgramMode
 
 DEFAULT_LITELLM_MODEL = "gpt-3.5-turbo"
 
@@ -77,6 +77,11 @@ class LiteLLM(LLM):
         api_type: Optional[str] = None,
         api_base: Optional[str] = None,
         callback_manager: Optional[CallbackManager] = None,
+        system_prompt: Optional[str] = None,
+        messages_to_prompt: Optional[Callable[[Sequence[ChatMessage]], str]] = None,
+        completion_to_prompt: Optional[Callable[[str], str]] = None,
+        pydantic_program_mode: PydanticProgramMode = PydanticProgramMode.DEFAULT,
+        output_parser: Optional[BaseOutputParser] = None,
         **kwargs: Any,
     ) -> None:
         if "custom_llm_provider" in kwargs:
@@ -103,6 +108,11 @@ class LiteLLM(LLM):
             additional_kwargs=additional_kwargs,
             max_retries=max_retries,
             callback_manager=callback_manager,
+            system_prompt=system_prompt,
+            messages_to_prompt=messages_to_prompt,
+            completion_to_prompt=completion_to_prompt,
+            pydantic_program_mode=pydantic_program_mode,
+            output_parser=output_parser,
             **kwargs,
         )
 
diff --git a/llama_index/llms/litellm_utils.py b/llama_index/llms/litellm_utils.py
index 16963679463d510d2d79570194de08e8cc3cd202..2af40dab67e42e501a5f6aacb42df85f14fc0f79 100644
--- a/llama_index/llms/litellm_utils.py
+++ b/llama_index/llms/litellm_utils.py
@@ -11,7 +11,7 @@ from tenacity import (
 )
 
 from llama_index.bridge.pydantic import BaseModel
-from llama_index.llms.base import ChatMessage
+from llama_index.llms.types import ChatMessage
 
 MISSING_API_KEY_ERROR_MESSAGE = """No API key found for LLM.
 E.g. to use openai Please set the OPENAI_API_KEY environment variable or \
diff --git a/llama_index/llms/llama_api.py b/llama_index/llms/llama_api.py
index b8d879de0adb1af194930155667bed2a31cc5081..9f7e07e13731553ca1a97d4a0c2b39a2cfc36d04 100644
--- a/llama_index/llms/llama_api.py
+++ b/llama_index/llms/llama_api.py
@@ -1,24 +1,24 @@
-from typing import Any, Dict, Optional, Sequence
+from typing import Any, Callable, Dict, Optional, Sequence
 
 from llama_index.bridge.pydantic import Field, PrivateAttr
 from llama_index.callbacks import CallbackManager
 from llama_index.constants import DEFAULT_NUM_OUTPUTS
-from llama_index.llms.base import (
+from llama_index.llms.base import llm_chat_callback, llm_completion_callback
+from llama_index.llms.custom import CustomLLM
+from llama_index.llms.generic_utils import chat_to_completion_decorator
+from llama_index.llms.openai_utils import (
+    from_openai_message_dict,
+    to_openai_message_dicts,
+)
+from llama_index.llms.types import (
     ChatMessage,
     ChatResponse,
     ChatResponseGen,
     CompletionResponse,
     CompletionResponseGen,
     LLMMetadata,
-    llm_chat_callback,
-    llm_completion_callback,
-)
-from llama_index.llms.custom import CustomLLM
-from llama_index.llms.generic_utils import chat_to_completion_decorator
-from llama_index.llms.openai_utils import (
-    from_openai_message_dict,
-    to_openai_message_dicts,
 )
+from llama_index.types import BaseOutputParser, PydanticProgramMode
 
 
 class LlamaAPI(CustomLLM):
@@ -39,6 +39,11 @@ class LlamaAPI(CustomLLM):
         additional_kwargs: Optional[Dict[str, Any]] = None,
         api_key: Optional[str] = None,
         callback_manager: Optional[CallbackManager] = None,
+        system_prompt: Optional[str] = None,
+        messages_to_prompt: Optional[Callable[[Sequence[ChatMessage]], str]] = None,
+        completion_to_prompt: Optional[Callable[[str], str]] = None,
+        pydantic_program_mode: PydanticProgramMode = PydanticProgramMode.DEFAULT,
+        output_parser: Optional[BaseOutputParser] = None,
     ) -> None:
         try:
             from llamaapi import LlamaAPI as Client
@@ -56,6 +61,11 @@ class LlamaAPI(CustomLLM):
             max_tokens=max_tokens,
             additional_kwargs=additional_kwargs or {},
             callback_manager=callback_manager,
+            system_prompt=system_prompt,
+            messages_to_prompt=messages_to_prompt,
+            completion_to_prompt=completion_to_prompt,
+            pydantic_program_mode=pydantic_program_mode,
+            output_parser=output_parser,
         )
 
     @classmethod
diff --git a/llama_index/llms/llama_cpp.py b/llama_index/llms/llama_cpp.py
index 1ff5edec52ce5e8f323459df990677608063e0b2..7ab0bd1cda5fec8574ed8f6f105ed02f4463b67f 100644
--- a/llama_index/llms/llama_cpp.py
+++ b/llama_index/llms/llama_cpp.py
@@ -11,24 +11,21 @@ from llama_index.constants import (
     DEFAULT_NUM_OUTPUTS,
     DEFAULT_TEMPERATURE,
 )
-from llama_index.llms.base import (
+from llama_index.llms.base import llm_chat_callback, llm_completion_callback
+from llama_index.llms.custom import CustomLLM
+from llama_index.llms.generic_utils import (
+    completion_response_to_chat_response,
+    stream_completion_response_to_chat_response,
+)
+from llama_index.llms.types import (
     ChatMessage,
     ChatResponse,
     ChatResponseGen,
     CompletionResponse,
     CompletionResponseGen,
     LLMMetadata,
-    llm_chat_callback,
-    llm_completion_callback,
-)
-from llama_index.llms.custom import CustomLLM
-from llama_index.llms.generic_utils import (
-    completion_response_to_chat_response,
-    stream_completion_response_to_chat_response,
-)
-from llama_index.llms.generic_utils import (
-    messages_to_prompt as generic_messages_to_prompt,
 )
+from llama_index.types import BaseOutputParser, PydanticProgramMode
 from llama_index.utils import get_cache_dir
 
 DEFAULT_LLAMA_CPP_GGML_MODEL = (
@@ -65,12 +62,6 @@ class LlamaCPP(CustomLLM):
         description="The maximum number of context tokens for the model.",
         gt=0,
     )
-    messages_to_prompt: Callable = Field(
-        description="The function to convert messages to a prompt.", exclude=True
-    )
-    completion_to_prompt: Callable = Field(
-        description="The function to convert a completion to a prompt.", exclude=True
-    )
     generate_kwargs: Dict[str, Any] = Field(
         default_factory=dict, description="Kwargs used for generation."
     )
@@ -91,12 +82,15 @@ class LlamaCPP(CustomLLM):
         temperature: float = DEFAULT_TEMPERATURE,
         max_new_tokens: int = DEFAULT_NUM_OUTPUTS,
         context_window: int = DEFAULT_CONTEXT_WINDOW,
-        messages_to_prompt: Optional[Callable] = None,
-        completion_to_prompt: Optional[Callable] = None,
         callback_manager: Optional[CallbackManager] = None,
         generate_kwargs: Optional[Dict[str, Any]] = None,
         model_kwargs: Optional[Dict[str, Any]] = None,
         verbose: bool = DEFAULT_LLAMA_CPP_MODEL_VERBOSITY,
+        system_prompt: Optional[str] = None,
+        messages_to_prompt: Optional[Callable[[Sequence[ChatMessage]], str]] = None,
+        completion_to_prompt: Optional[Callable[[str], str]] = None,
+        pydantic_program_mode: PydanticProgramMode = PydanticProgramMode.DEFAULT,
+        output_parser: Optional[BaseOutputParser] = None,
     ) -> None:
         try:
             from llama_cpp import Llama
@@ -135,9 +129,6 @@ class LlamaCPP(CustomLLM):
             self._model = Llama(model_path=model_path, **model_kwargs)
 
         model_path = model_path
-        messages_to_prompt = messages_to_prompt or generic_messages_to_prompt
-        completion_to_prompt = completion_to_prompt or (lambda x: x)
-
         generate_kwargs = generate_kwargs or {}
         generate_kwargs.update(
             {"temperature": temperature, "max_tokens": max_new_tokens}
@@ -149,12 +140,15 @@ class LlamaCPP(CustomLLM):
             temperature=temperature,
             context_window=context_window,
             max_new_tokens=max_new_tokens,
-            messages_to_prompt=messages_to_prompt,
-            completion_to_prompt=completion_to_prompt,
             callback_manager=callback_manager,
             generate_kwargs=generate_kwargs,
             model_kwargs=model_kwargs,
             verbose=verbose,
+            system_prompt=system_prompt,
+            messages_to_prompt=messages_to_prompt,
+            completion_to_prompt=completion_to_prompt,
+            pydantic_program_mode=pydantic_program_mode,
+            output_parser=output_parser,
         )
 
     @classmethod
diff --git a/llama_index/llms/llama_utils.py b/llama_index/llms/llama_utils.py
index d0ad1d1b5e9a02910f5c7625714285ec5a78d4b7..2ee0e950b7510c509ccaccdd1b734a5c8c0c228f 100644
--- a/llama_index/llms/llama_utils.py
+++ b/llama_index/llms/llama_utils.py
@@ -1,6 +1,6 @@
 from typing import List, Optional, Sequence
 
-from llama_index.llms.base import ChatMessage, MessageRole
+from llama_index.llms.types import ChatMessage, MessageRole
 
 BOS, EOS = "<s>", "</s>"
 B_INST, E_INST = "[INST]", "[/INST]"
diff --git a/llama_index/llms/llm.py b/llama_index/llms/llm.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d8e0aa193c781b661ec3854ad9fecbcde05ece6
--- /dev/null
+++ b/llama_index/llms/llm.py
@@ -0,0 +1,310 @@
+from collections import ChainMap
+from typing import Any, List, Optional, Protocol, Sequence, runtime_checkable
+
+from llama_index.bridge.pydantic import BaseModel, Field, validator
+from llama_index.callbacks import CBEventType, EventPayload
+from llama_index.llms.base import BaseLLM
+from llama_index.llms.generic_utils import (
+    messages_to_prompt as generic_messages_to_prompt,
+)
+from llama_index.llms.types import (
+    ChatMessage,
+    ChatResponseAsyncGen,
+    ChatResponseGen,
+    CompletionResponseAsyncGen,
+    CompletionResponseGen,
+    MessageRole,
+)
+from llama_index.prompts import BasePromptTemplate, PromptTemplate
+from llama_index.types import (
+    BaseOutputParser,
+    PydanticProgramMode,
+    TokenAsyncGen,
+    TokenGen,
+)
+
+
+# NOTE: These two protocols are needed to appease mypy
+@runtime_checkable
+class MessagesToPromptType(Protocol):
+    def __call__(self, messages: Sequence[ChatMessage]) -> str:
+        pass
+
+
+@runtime_checkable
+class CompletionToPromptType(Protocol):
+    def __call__(self, prompt: str) -> str:
+        pass
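+
+# Example (illustrative only): any callable with a matching signature satisfies
+# these protocols, e.g.
+#
+#   def simple_messages_to_prompt(messages: Sequence[ChatMessage]) -> str:
+#       return "\n".join(f"{m.role.value}: {m.content or ''}" for m in messages)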
+
+
+def stream_completion_response_to_tokens(
+    completion_response_gen: CompletionResponseGen,
+) -> TokenGen:
+    """Convert a stream completion response to a stream of tokens."""
+
+    def gen() -> TokenGen:
+        for response in completion_response_gen:
+            yield response.delta or ""
+
+    return gen()
+
+
+def stream_chat_response_to_tokens(
+    chat_response_gen: ChatResponseGen,
+) -> TokenGen:
+    """Convert a stream completion response to a stream of tokens."""
+
+    def gen() -> TokenGen:
+        for response in chat_response_gen:
+            yield response.delta or ""
+
+    return gen()
+
+
+async def astream_completion_response_to_tokens(
+    completion_response_gen: CompletionResponseAsyncGen,
+) -> TokenAsyncGen:
+    """Convert a stream completion response to a stream of tokens."""
+
+    async def gen() -> TokenAsyncGen:
+        async for response in completion_response_gen:
+            yield response.delta or ""
+
+    return gen()
+
+
+async def astream_chat_response_to_tokens(
+    chat_response_gen: ChatResponseAsyncGen,
+) -> TokenAsyncGen:
+    """Convert a stream completion response to a stream of tokens."""
+
+    async def gen() -> TokenAsyncGen:
+        async for response in chat_response_gen:
+            yield response.delta or ""
+
+    return gen()
+
+
+class LLM(BaseLLM):
+    system_prompt: Optional[str] = Field(description="System prompt for LLM calls.")
+    messages_to_prompt: MessagesToPromptType = Field(
+        description="Function to convert a list of messages to an LLM prompt.",
+        default=generic_messages_to_prompt,
+        exclude=True,
+    )
+    completion_to_prompt: CompletionToPromptType = Field(
+        description="Function to convert a completion to an LLM prompt.",
+        default=lambda x: x,
+        exclude=True,
+    )
+    output_parser: Optional[BaseOutputParser] = Field(
+        description="Output parser to parse, validate, and correct errors programmatically.",
+        default=None,
+        exclude=True,
+    )
+    pydantic_program_mode: PydanticProgramMode = PydanticProgramMode.DEFAULT
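+
+    # These fields are normally supplied through a concrete subclass constructor,
+    # e.g. (illustrative values only):
+    #   OpenAI(system_prompt="You are terse.", output_parser=my_parser)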
+
+    # deprecated
+    query_wrapper_prompt: Optional[BasePromptTemplate] = Field(
+        description="Query wrapper prompt for LLM calls.",
+        default=None,
+        exclude=True,
+    )
+
+    @validator("messages_to_prompt", pre=True)
+    def set_messages_to_prompt(
+        cls, messages_to_prompt: Optional[MessagesToPromptType]
+    ) -> MessagesToPromptType:
+        return messages_to_prompt or generic_messages_to_prompt
+
+    @validator("completion_to_prompt", pre=True)
+    def set_completion_to_prompt(
+        cls, completion_to_prompt: Optional[CompletionToPromptType]
+    ) -> CompletionToPromptType:
+        return completion_to_prompt or (lambda x: x)
+
+    def _log_template_data(
+        self, prompt: BasePromptTemplate, **prompt_args: Any
+    ) -> None:
+        template_vars = {
+            k: v
+            for k, v in ChainMap(prompt.kwargs, prompt_args).items()
+            if k in prompt.template_vars
+        }
+        with self.callback_manager.event(
+            CBEventType.TEMPLATING,
+            payload={
+                EventPayload.TEMPLATE: prompt.get_template(llm=self),
+                EventPayload.TEMPLATE_VARS: template_vars,
+                EventPayload.SYSTEM_PROMPT: self.system_prompt,
+                EventPayload.QUERY_WRAPPER_PROMPT: self.query_wrapper_prompt,
+            },
+        ):
+            pass
+
+    def _get_prompt(self, prompt: BasePromptTemplate, **prompt_args: Any) -> str:
+        formatted_prompt = prompt.format(
+            llm=self,
+            messages_to_prompt=self.messages_to_prompt,
+            completion_to_prompt=self.completion_to_prompt,
+            **prompt_args,
+        )
+        if self.output_parser is not None:
+            formatted_prompt = self.output_parser.format(formatted_prompt)
+        return self._extend_prompt(formatted_prompt)
+
+    def _get_messages(
+        self, prompt: BasePromptTemplate, **prompt_args: Any
+    ) -> List[ChatMessage]:
+        messages = prompt.format_messages(llm=self, **prompt_args)
+        if self.output_parser is not None:
+            messages = self.output_parser.format_messages(messages)
+        return self._extend_messages(messages)
+
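+    # Usage sketch (illustrative; `Song` stands in for any pydantic model):
+    #   song = llm.structured_predict(
+    #       Song, PromptTemplate("Write a song about {topic}."), topic="the sea"
+    #   )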
+    def structured_predict(
+        self,
+        output_cls: BaseModel,
+        prompt: PromptTemplate,
+        **prompt_args: Any,
+    ) -> BaseModel:
+        from llama_index.program.utils import get_program_for_llm
+
+        program = get_program_for_llm(
+            output_cls,
+            prompt,
+            self,
+            pydantic_program_mode=self.pydantic_program_mode,
+        )
+
+        return program(**prompt_args)
+
+    async def astructured_predict(
+        self,
+        output_cls: BaseModel,
+        prompt: PromptTemplate,
+        **prompt_args: Any,
+    ) -> BaseModel:
+        from llama_index.program.utils import get_program_for_llm
+
+        program = get_program_for_llm(
+            output_cls,
+            prompt,
+            self,
+            pydantic_program_mode=self.pydantic_program_mode,
+        )
+
+        return await program.acall(**prompt_args)
+
+    def _parse_output(self, output: str) -> str:
+        if self.output_parser is not None:
+            return str(self.output_parser.parse(output))
+
+        return output
+
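+    # `predict` routes to chat() for chat models and complete() otherwise, e.g.
+    # (illustrative): llm.predict(PromptTemplate("Summarize: {text}"), text=doc)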
+    def predict(
+        self,
+        prompt: BasePromptTemplate,
+        **prompt_args: Any,
+    ) -> str:
+        """Predict."""
+        self._log_template_data(prompt, **prompt_args)
+
+        if self.metadata.is_chat_model:
+            messages = self._get_messages(prompt, **prompt_args)
+            chat_response = self.chat(messages)
+            output = chat_response.message.content or ""
+        else:
+            formatted_prompt = self._get_prompt(prompt, **prompt_args)
+            response = self.complete(formatted_prompt)
+            output = response.text
+
+        return self._parse_output(output)
+
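+    # Streaming yields plain token strings, e.g. (illustrative):
+    #   for token in llm.stream(PromptTemplate("Tell me about {topic}"), topic="owls"):
+    #       print(token, end="")
+    # Output parsers are rejected below because parsing needs the full output.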
+    def stream(
+        self,
+        prompt: BasePromptTemplate,
+        **prompt_args: Any,
+    ) -> TokenGen:
+        """Stream."""
+        self._log_template_data(prompt, **prompt_args)
+
+        if self.metadata.is_chat_model:
+            messages = self._get_messages(prompt, **prompt_args)
+            chat_response = self.stream_chat(messages)
+            stream_tokens = stream_chat_response_to_tokens(chat_response)
+        else:
+            formatted_prompt = self._get_prompt(prompt, **prompt_args)
+            stream_response = self.stream_complete(formatted_prompt)
+            stream_tokens = stream_completion_response_to_tokens(stream_response)
+
+        if prompt.output_parser is not None or self.output_parser is not None:
+            raise NotImplementedError("Output parser is not supported for streaming.")
+
+        return stream_tokens
+
+    async def apredict(
+        self,
+        prompt: BasePromptTemplate,
+        **prompt_args: Any,
+    ) -> str:
+        """Async predict."""
+        self._log_template_data(prompt, **prompt_args)
+
+        if self.metadata.is_chat_model:
+            messages = self._get_messages(prompt, **prompt_args)
+            chat_response = await self.achat(messages)
+            output = chat_response.message.content or ""
+        else:
+            formatted_prompt = self._get_prompt(prompt, **prompt_args)
+            response = await self.acomplete(formatted_prompt)
+            output = response.text
+
+        return self._parse_output(output)
+
+    async def astream(
+        self,
+        prompt: BasePromptTemplate,
+        **prompt_args: Any,
+    ) -> TokenAsyncGen:
+        """Async stream."""
+        self._log_template_data(prompt, **prompt_args)
+
+        if self.metadata.is_chat_model:
+            messages = self._get_messages(prompt, **prompt_args)
+            chat_response = await self.astream_chat(messages)
+            stream_tokens = await astream_chat_response_to_tokens(chat_response)
+        else:
+            formatted_prompt = self._get_prompt(prompt, **prompt_args)
+            stream_response = await self.astream_complete(formatted_prompt)
+            stream_tokens = await astream_completion_response_to_tokens(stream_response)
+
+        if prompt.output_parser is not None or self.output_parser is not None:
+            raise NotImplementedError("Output parser is not supported for streaming.")
+
+        return stream_tokens
+
+    def _extend_prompt(
+        self,
+        formatted_prompt: str,
+    ) -> str:
+        """Add system and query wrapper prompts to base prompt."""
+        extended_prompt = formatted_prompt
+
+        if self.system_prompt:
+            extended_prompt = self.system_prompt + "\n\n" + extended_prompt
+
+        if self.query_wrapper_prompt:
+            extended_prompt = self.query_wrapper_prompt.format(
+                query_str=extended_prompt
+            )
+
+        return extended_prompt
+
+    def _extend_messages(self, messages: List[ChatMessage]) -> List[ChatMessage]:
+        """Add system prompt to chat message list."""
+        if self.system_prompt:
+            messages = [
+                ChatMessage(role=MessageRole.SYSTEM, content=self.system_prompt),
+                *messages,
+            ]
+        return messages
diff --git a/llama_index/llms/loading.py b/llama_index/llms/loading.py
index 2cbb9e74ad06eb89a43f4e93c357456556c07b00..a91e352c2df5aa25f0221c8093a4eba899a3ea37 100644
--- a/llama_index/llms/loading.py
+++ b/llama_index/llms/loading.py
@@ -1,12 +1,12 @@
 from typing import Dict, Type
 
-from llama_index.llms.base import LLM
 from llama_index.llms.bedrock import Bedrock
 from llama_index.llms.custom import CustomLLM
 from llama_index.llms.gradient import GradientBaseModelLLM, GradientModelAdapterLLM
 from llama_index.llms.huggingface import HuggingFaceLLM
 from llama_index.llms.langchain import LangChainLLM
 from llama_index.llms.llama_cpp import LlamaCPP
+from llama_index.llms.llm import LLM
 from llama_index.llms.mock import MockLLM
 from llama_index.llms.openai import OpenAI
 from llama_index.llms.palm import PaLM
diff --git a/llama_index/llms/localai.py b/llama_index/llms/localai.py
index d3baedf5512b7e9d78fde624564256a5f9a307cd..9f307254a65467668a8cc51e9906b9b0093d2bb7 100644
--- a/llama_index/llms/localai.py
+++ b/llama_index/llms/localai.py
@@ -7,14 +7,15 @@ Source: https://github.com/go-skynet/LocalAI
 
 import warnings
 from types import MappingProxyType
-from typing import Any, Dict, Mapping, Optional
+from typing import Any, Callable, Dict, Mapping, Optional, Sequence
 
 from llama_index.bridge.pydantic import Field
 from llama_index.constants import DEFAULT_CONTEXT_WINDOW
-from llama_index.llms.base import LLMMetadata
 from llama_index.llms.openai import OpenAI
 from llama_index.llms.openai_like import OpenAILike
 from llama_index.llms.openai_utils import is_function_calling_model
+from llama_index.llms.types import ChatMessage, LLMMetadata
+from llama_index.types import BaseOutputParser, PydanticProgramMode
 
 # Use these as kwargs for OpenAILike to connect to LocalAIs
 DEFAULT_LOCALAI_PORT = 8080
@@ -47,9 +48,23 @@ class LocalAI(OpenAI):
         self,
         api_key: Optional[str] = LOCALAI_DEFAULTS["api_key"],
         api_base: Optional[str] = LOCALAI_DEFAULTS["api_base"],
+        system_prompt: Optional[str] = None,
+        messages_to_prompt: Optional[Callable[[Sequence[ChatMessage]], str]] = None,
+        completion_to_prompt: Optional[Callable[[str], str]] = None,
+        pydantic_program_mode: PydanticProgramMode = PydanticProgramMode.DEFAULT,
+        output_parser: Optional[BaseOutputParser] = None,
         **kwargs: Any,
     ) -> None:
-        super().__init__(api_key=api_key, api_base=api_base, **kwargs)
+        super().__init__(
+            api_key=api_key,
+            api_base=api_base,
+            system_prompt=system_prompt,
+            messages_to_prompt=messages_to_prompt,
+            completion_to_prompt=completion_to_prompt,
+            pydantic_program_mode=pydantic_program_mode,
+            output_parser=output_parser,
+            **kwargs,
+        )
         warnings.warn(
             (
                 f"{type(self).__name__} subclass is deprecated in favor of"
diff --git a/llama_index/llms/mock.py b/llama_index/llms/mock.py
index 41b9ffd9aff7f27634902007e009f242b1192a8b..9e3cf32e2d2536bde51024c44f6148879ab8fbb3 100644
--- a/llama_index/llms/mock.py
+++ b/llama_index/llms/mock.py
@@ -1,13 +1,15 @@
-from typing import Any, Optional
+from typing import Any, Callable, Optional, Sequence
 
 from llama_index.callbacks import CallbackManager
-from llama_index.llms.base import (
+from llama_index.llms.base import llm_completion_callback
+from llama_index.llms.custom import CustomLLM
+from llama_index.llms.types import (
+    ChatMessage,
     CompletionResponse,
     CompletionResponseGen,
     LLMMetadata,
-    llm_completion_callback,
 )
-from llama_index.llms.custom import CustomLLM
+from llama_index.types import PydanticProgramMode
 
 
 class MockLLM(CustomLLM):
@@ -17,8 +19,19 @@ class MockLLM(CustomLLM):
         self,
         max_tokens: Optional[int] = None,
         callback_manager: Optional[CallbackManager] = None,
+        system_prompt: Optional[str] = None,
+        messages_to_prompt: Optional[Callable[[Sequence[ChatMessage]], str]] = None,
+        completion_to_prompt: Optional[Callable[[str], str]] = None,
+        pydantic_program_mode: PydanticProgramMode = PydanticProgramMode.DEFAULT,
     ) -> None:
-        super().__init__(max_tokens=max_tokens, callback_manager=callback_manager)
+        super().__init__(
+            max_tokens=max_tokens,
+            callback_manager=callback_manager,
+            system_prompt=system_prompt,
+            messages_to_prompt=messages_to_prompt,
+            completion_to_prompt=completion_to_prompt,
+            pydantic_program_mode=pydantic_program_mode,
+        )
 
     @classmethod
     def class_name(cls) -> str:
diff --git a/llama_index/llms/monsterapi.py b/llama_index/llms/monsterapi.py
index c7f12759ee909ca556fd8537d4a63e84a60dbd33..0e21207cb05d2ccf672526979d7ad7d95fe30f6e 100644
--- a/llama_index/llms/monsterapi.py
+++ b/llama_index/llms/monsterapi.py
@@ -3,19 +3,16 @@ from typing import Any, Callable, Dict, Optional, Sequence
 from llama_index.bridge.pydantic import Field, PrivateAttr
 from llama_index.callbacks import CallbackManager
 from llama_index.constants import DEFAULT_CONTEXT_WINDOW, DEFAULT_NUM_OUTPUTS
-from llama_index.llms.base import (
+from llama_index.llms.base import llm_chat_callback, llm_completion_callback
+from llama_index.llms.custom import CustomLLM
+from llama_index.llms.types import (
     ChatMessage,
     ChatResponse,
     CompletionResponse,
     CompletionResponseGen,
     LLMMetadata,
-    llm_chat_callback,
-    llm_completion_callback,
-)
-from llama_index.llms.custom import CustomLLM
-from llama_index.llms.generic_utils import (
-    messages_to_prompt as generic_messages_to_prompt,
 )
+from llama_index.types import BaseOutputParser, PydanticProgramMode
 
 DEFAULT_MONSTER_TEMP = 0.75
 
@@ -40,13 +37,6 @@ class MonsterLLM(CustomLLM):
         gt=0,
     )
 
-    messages_to_prompt: Callable = Field(
-        description="The function to convert messages to a prompt.", exclude=True
-    )
-    completion_to_prompt: Callable = Field(
-        description="The function to convert a completion to a prompt.", exclude=True
-    )
-
     _client: Any = PrivateAttr()
 
     def __init__(
@@ -57,14 +47,14 @@ class MonsterLLM(CustomLLM):
         temperature: float = DEFAULT_MONSTER_TEMP,
         context_window: int = DEFAULT_CONTEXT_WINDOW,
         callback_manager: Optional[CallbackManager] = None,
-        messages_to_prompt: Optional[Callable] = None,
-        completion_to_prompt: Optional[Callable] = None,
+        system_prompt: Optional[str] = None,
+        messages_to_prompt: Optional[Callable[[Sequence[ChatMessage]], str]] = None,
+        completion_to_prompt: Optional[Callable[[str], str]] = None,
+        pydantic_program_mode: PydanticProgramMode = PydanticProgramMode.DEFAULT,
+        output_parser: Optional[BaseOutputParser] = None,
     ) -> None:
         self._client, available_llms = self.initialize_client(monster_api_key)
 
-        _messages_to_prompt = messages_to_prompt or generic_messages_to_prompt
-        _completion_to_prompt = completion_to_prompt or (lambda x: x)
-
         # Check if provided model is supported
         if model not in available_llms:
             error_message = (
@@ -82,8 +72,11 @@ class MonsterLLM(CustomLLM):
             temperature=temperature,
             context_window=context_window,
             callback_manager=callback_manager,
-            messages_to_prompt=_messages_to_prompt,
-            completion_to_prompt=_completion_to_prompt,
+            system_prompt=system_prompt,
+            messages_to_prompt=messages_to_prompt,
+            completion_to_prompt=completion_to_prompt,
+            pydantic_program_mode=pydantic_program_mode,
+            output_parser=output_parser,
         )
 
     def initialize_client(self, monster_api_key: Optional[str]) -> Any:
diff --git a/llama_index/llms/ollama.py b/llama_index/llms/ollama.py
index bcc37c32c9c8d4d5caa90b5cfba198f666bda337..078e2f7a3c6ea76ccd5290303a17aee6a05b28fb 100644
--- a/llama_index/llms/ollama.py
+++ b/llama_index/llms/ollama.py
@@ -1,38 +1,35 @@
 import json
-from typing import Any, Callable, Dict, Iterator, Optional, Sequence
+from typing import Any, Dict, Iterator, Sequence
 
-from llama_index.bridge.pydantic import Field, PrivateAttr
-from llama_index.callbacks import CallbackManager
+from llama_index.bridge.pydantic import Field
 from llama_index.constants import (
     DEFAULT_CONTEXT_WINDOW,
     DEFAULT_NUM_OUTPUTS,
-    DEFAULT_TEMPERATURE,
 )
-from llama_index.llms.base import (
+from llama_index.llms.base import llm_chat_callback, llm_completion_callback
+from llama_index.llms.custom import CustomLLM
+from llama_index.llms.generic_utils import (
+    completion_response_to_chat_response,
+    stream_completion_response_to_chat_response,
+)
+from llama_index.llms.types import (
     ChatMessage,
     ChatResponse,
     ChatResponseGen,
     CompletionResponse,
     CompletionResponseGen,
     LLMMetadata,
-    llm_chat_callback,
-    llm_completion_callback,
-)
-from llama_index.llms.custom import CustomLLM
-from llama_index.llms.generic_utils import (
-    completion_response_to_chat_response,
-    stream_completion_response_to_chat_response,
-)
-from llama_index.llms.generic_utils import (
-    messages_to_prompt as generic_messages_to_prompt,
 )
 
 
 class Ollama(CustomLLM):
-    base_url: str = Field(description="Base url the model is hosted under.")
+    base_url: str = Field(
+        default="http://localhost:11434",
+        description="Base url the model is hosted under.",
+    )
     model: str = Field(description="The Ollama model to use.")
     temperature: float = Field(
-        default=DEFAULT_TEMPERATURE,
+        default=0.75,
         description="The temperature to use for sampling.",
         gte=0.0,
         lte=1.0,
@@ -49,34 +46,6 @@ class Ollama(CustomLLM):
         default_factory=dict, description="Additional kwargs for the Ollama API."
     )
 
-    _messages_to_prompt: Callable = PrivateAttr()
-    _completion_to_prompt: Callable = PrivateAttr()
-
-    def __init__(
-        self,
-        model: str,
-        base_url: str = "http://localhost:11434",
-        temperature: float = 0.75,
-        additional_kwargs: Optional[Dict[str, Any]] = None,
-        context_window: int = DEFAULT_CONTEXT_WINDOW,
-        prompt_key: str = "prompt",
-        messages_to_prompt: Optional[Callable] = None,
-        completion_to_prompt: Optional[Callable] = None,
-        callback_manager: Optional[CallbackManager] = None,
-    ) -> None:
-        self._messages_to_prompt = messages_to_prompt or generic_messages_to_prompt
-        self._completion_to_prompt = completion_to_prompt or (lambda x: x)
-
-        super().__init__(
-            model=model,
-            temperature=temperature,
-            base_url=base_url,
-            additional_kwargs=additional_kwargs or {},
-            context_window=context_window,
-            prompt_key=prompt_key,
-            callback_manager=callback_manager,
-        )
-
     @classmethod
     def class_name(cls) -> str:
         return "Ollama_llm"
@@ -112,16 +81,16 @@ class Ollama(CustomLLM):
 
     @llm_chat_callback()
     def chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponse:
-        prompt = self._messages_to_prompt(messages)
-        completion_response = self.complete(prompt, **kwargs)
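+        # messages_to_prompt has already rendered the full prompt, so pass
+        # formatted=True to skip completion_to_prompt inside complete().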
+        prompt = self.messages_to_prompt(messages)
+        completion_response = self.complete(prompt, formatted=True, **kwargs)
         return completion_response_to_chat_response(completion_response)
 
     @llm_chat_callback()
     def stream_chat(
         self, messages: Sequence[ChatMessage], **kwargs: Any
     ) -> ChatResponseGen:
-        prompt = self._messages_to_prompt(messages)
-        completion_response = self.stream_complete(prompt, **kwargs)
+        prompt = self.messages_to_prompt(messages)
+        completion_response = self.stream_complete(prompt, formatted=True, **kwargs)
         return stream_completion_response_to_chat_response(completion_response)
 
     @llm_completion_callback()
@@ -142,7 +111,9 @@ class Ollama(CustomLLM):
                 "Please install requests with `pip install requests`"
             )
         all_kwargs = self._get_all_kwargs(**kwargs)
-        prompt = self._completion_to_prompt(prompt)
+
+        if not kwargs.get("formatted", False):
+            prompt = self.completion_to_prompt(prompt)
         response = requests.post(
             url=f"{self.base_url}/api/generate/",
             headers={"Content-Type": "application/json"},
diff --git a/llama_index/llms/openai.py b/llama_index/llms/openai.py
index e2b35c914535df211d14fb8d3576928d455c9034..5a7ff6946b841ead9b3a9064a6fd1ef2ed0aa776 100644
--- a/llama_index/llms/openai.py
+++ b/llama_index/llms/openai.py
@@ -27,16 +27,6 @@ from llama_index.constants import (
     DEFAULT_TEMPERATURE,
 )
 from llama_index.llms.base import (
-    LLM,
-    ChatMessage,
-    ChatResponse,
-    ChatResponseAsyncGen,
-    ChatResponseGen,
-    CompletionResponse,
-    CompletionResponseAsyncGen,
-    CompletionResponseGen,
-    LLMMetadata,
-    MessageRole,
     llm_chat_callback,
     llm_completion_callback,
 )
@@ -50,6 +40,7 @@ from llama_index.llms.generic_utils import (
     stream_chat_to_completion_decorator,
     stream_completion_to_chat_decorator,
 )
+from llama_index.llms.llm import LLM
 from llama_index.llms.openai_utils import (
     from_openai_message,
     is_chat_model,
@@ -58,6 +49,18 @@ from llama_index.llms.openai_utils import (
     resolve_openai_credentials,
     to_openai_message_dicts,
 )
+from llama_index.llms.types import (
+    ChatMessage,
+    ChatResponse,
+    ChatResponseAsyncGen,
+    ChatResponseGen,
+    CompletionResponse,
+    CompletionResponseAsyncGen,
+    CompletionResponseGen,
+    LLMMetadata,
+    MessageRole,
+)
+from llama_index.types import BaseOutputParser, PydanticProgramMode
 
 DEFAULT_OPENAI_MODEL = "gpt-3.5-turbo"
 
@@ -131,6 +134,12 @@ class OpenAI(LLM):
         callback_manager: Optional[CallbackManager] = None,
         default_headers: Optional[Dict[str, str]] = None,
         http_client: Optional[httpx.Client] = None,
+        # base class
+        system_prompt: Optional[str] = None,
+        messages_to_prompt: Optional[Callable[[Sequence[ChatMessage]], str]] = None,
+        completion_to_prompt: Optional[Callable[[str], str]] = None,
+        pydantic_program_mode: PydanticProgramMode = PydanticProgramMode.DEFAULT,
+        output_parser: Optional[BaseOutputParser] = None,
         **kwargs: Any,
     ) -> None:
         additional_kwargs = additional_kwargs or {}
@@ -154,6 +163,11 @@ class OpenAI(LLM):
             timeout=timeout,
             reuse_client=reuse_client,
             default_headers=default_headers,
+            system_prompt=system_prompt,
+            messages_to_prompt=messages_to_prompt,
+            completion_to_prompt=completion_to_prompt,
+            pydantic_program_mode=pydantic_program_mode,
+            output_parser=output_parser,
             **kwargs,
         )
 
diff --git a/llama_index/llms/openai_like.py b/llama_index/llms/openai_like.py
index 72bdd778bd9f13db9cea58a2dbb6c5436a6e2363..ced6bda325cf1c12b0ef78ae8bb36e8bd088f621 100644
--- a/llama_index/llms/openai_like.py
+++ b/llama_index/llms/openai_like.py
@@ -2,8 +2,8 @@ from typing import Optional, Union
 
 from llama_index.bridge.pydantic import Field
 from llama_index.constants import DEFAULT_CONTEXT_WINDOW
-from llama_index.llms.base import LLMMetadata
 from llama_index.llms.openai import OpenAI, Tokenizer
+from llama_index.llms.types import LLMMetadata
 
 
 class OpenAILike(OpenAI):
diff --git a/llama_index/llms/openai_utils.py b/llama_index/llms/openai_utils.py
index 3ba53e465085cdeae70c4bcd9a3dc2eec4570a19..83365c77e64f32ab84ac02c35b2d2fedc74ab6fe 100644
--- a/llama_index/llms/openai_utils.py
+++ b/llama_index/llms/openai_utils.py
@@ -20,8 +20,8 @@ from tenacity import (
 from tenacity.stop import stop_base
 
 from llama_index.bridge.pydantic import BaseModel
-from llama_index.llms.base import ChatMessage
 from llama_index.llms.generic_utils import get_from_param_or_env
+from llama_index.llms.types import ChatMessage
 
 DEFAULT_OPENAI_API_TYPE = "open_ai"
 DEFAULT_OPENAI_API_BASE = "https://api.openai.com/v1"
diff --git a/llama_index/llms/openllm.py b/llama_index/llms/openllm.py
index 5fcf83c546aa05763e071e4b407064bb471ab530..4596ec11a2085d5713bfef636438ad9edfa65766 100644
--- a/llama_index/llms/openllm.py
+++ b/llama_index/llms/openllm.py
@@ -13,16 +13,7 @@ from typing import (
 
 from llama_index.bridge.pydantic import Field, PrivateAttr
 from llama_index.callbacks import CallbackManager
-from llama_index.llms import ChatResponseAsyncGen
 from llama_index.llms.base import (
-    LLM,
-    ChatMessage,
-    ChatResponse,
-    ChatResponseGen,
-    CompletionResponse,
-    CompletionResponseAsyncGen,
-    CompletionResponseGen,
-    LLMMetadata,
     llm_chat_callback,
     llm_completion_callback,
 )
@@ -32,6 +23,18 @@ from llama_index.llms.generic_utils import (
 from llama_index.llms.generic_utils import (
     messages_to_prompt as generic_messages_to_prompt,
 )
+from llama_index.llms.llm import LLM
+from llama_index.llms.types import (
+    ChatMessage,
+    ChatResponse,
+    ChatResponseAsyncGen,
+    ChatResponseGen,
+    CompletionResponse,
+    CompletionResponseAsyncGen,
+    CompletionResponseGen,
+    LLMMetadata,
+)
+from llama_index.types import PydanticProgramMode
 
 logger = logging.getLogger(__name__)
 
@@ -58,9 +61,6 @@ class OpenLLM(LLM):
     prompt_template: Optional[str] = Field(
         description="Optional prompt template to pass for this LLM."
     )
-    system_message: Optional[str] = Field(
-        description="Optional system message to pass for this LLM."
-    )
     backend: Optional[Literal["vllm", "pt"]] = Field(
         description="Optional backend to pass for this LLM. By default, it will use vLLM if vLLM is available in local system. Otherwise, it will fallback to PyTorch."
     )
@@ -85,22 +85,22 @@ class OpenLLM(LLM):
     else:
         _llm: Any = PrivateAttr()
 
-    _messages_to_prompt: Callable[[Sequence[ChatMessage]], Any] = PrivateAttr()
-
     def __init__(
         self,
         model_id: str,
         model_version: Optional[str] = None,
         model_tag: Optional[str] = None,
         prompt_template: Optional[str] = None,
-        system_message: Optional[str] = None,
         backend: Optional[Literal["vllm", "pt"]] = None,
         *args: Any,
         quantize: Optional[Literal["awq", "gptq", "int8", "int4", "squeezellm"]] = None,
         serialization: Literal["safetensors", "legacy"] = "safetensors",
         trust_remote_code: bool = False,
         callback_manager: Optional[CallbackManager] = None,
-        messages_to_prompt: Optional[Callable[..., Any]] = None,
+        system_prompt: Optional[str] = None,
+        messages_to_prompt: Optional[Callable[[Sequence[ChatMessage]], str]] = None,
+        completion_to_prompt: Optional[Callable[[str], str]] = None,
+        pydantic_program_mode: PydanticProgramMode = PydanticProgramMode.DEFAULT,
         **attrs: Any,
     ):
         try:
@@ -114,7 +114,7 @@ class OpenLLM(LLM):
             model_version=model_version,
             model_tag=model_tag,
             prompt_template=prompt_template,
-            system_message=system_message,
+            system_message=system_prompt,
             backend=backend,
             quantize=quantize,
             serialisation=serialization,
@@ -133,12 +133,15 @@ class OpenLLM(LLM):
             model_version=self._llm.revision,
             model_tag=str(self._llm.tag),
             prompt_template=prompt_template,
-            system_message=system_message,
             backend=self._llm.__llm_backend__,
             quantize=self._llm.quantise,
             serialization=self._llm._serialisation,
             trust_remote_code=self._llm.trust_remote_code,
             callback_manager=callback_manager,
+            system_prompt=system_prompt,
+            messages_to_prompt=messages_to_prompt,
+            completion_to_prompt=completion_to_prompt,
+            pydantic_program_mode=pydantic_program_mode,
         )
 
     @classmethod
diff --git a/llama_index/llms/palm.py b/llama_index/llms/palm.py
index d907acb0e19e7dda04f762bfc7b5af4baec32519..30b49b6e3248acb53142da4e37642c15f1a164d0 100644
--- a/llama_index/llms/palm.py
+++ b/llama_index/llms/palm.py
@@ -1,17 +1,19 @@
 """Palm API."""
 import os
-from typing import Any, Optional
+from typing import Any, Callable, Optional, Sequence
 
 from llama_index.bridge.pydantic import Field, PrivateAttr
 from llama_index.callbacks import CallbackManager
 from llama_index.constants import DEFAULT_NUM_OUTPUTS
-from llama_index.llms.base import (
+from llama_index.llms.base import llm_completion_callback
+from llama_index.llms.custom import CustomLLM
+from llama_index.llms.types import (
+    ChatMessage,
     CompletionResponse,
     CompletionResponseGen,
     LLMMetadata,
-    llm_completion_callback,
 )
-from llama_index.llms.custom import CustomLLM
+from llama_index.types import BaseOutputParser, PydanticProgramMode
 
 DEFAULT_PALM_MODEL = "models/text-bison-001"
 
@@ -39,6 +41,11 @@ class PaLM(CustomLLM):
         model_name: Optional[str] = DEFAULT_PALM_MODEL,
         num_output: Optional[int] = None,
         callback_manager: Optional[CallbackManager] = None,
+        system_prompt: Optional[str] = None,
+        messages_to_prompt: Optional[Callable[[Sequence[ChatMessage]], str]] = None,
+        completion_to_prompt: Optional[Callable[[str], str]] = None,
+        pydantic_program_mode: PydanticProgramMode = PydanticProgramMode.DEFAULT,
+        output_parser: Optional[BaseOutputParser] = None,
         **generate_kwargs: Any,
     ) -> None:
         """Initialize params."""
@@ -71,6 +78,11 @@ class PaLM(CustomLLM):
             num_output=num_output,
             generate_kwargs=generate_kwargs,
             callback_manager=callback_manager,
+            system_prompt=system_prompt,
+            messages_to_prompt=messages_to_prompt,
+            completion_to_prompt=completion_to_prompt,
+            pydantic_program_mode=pydantic_program_mode,
+            output_parser=output_parser,
         )
 
     @classmethod
diff --git a/llama_index/llms/perplexity.py b/llama_index/llms/perplexity.py
index aa5db405e1492e930bb88a2450161cfb4f5ff324..dd36e6bb22bb5ea8e49f4fdf7f01f85cb5b956ff 100644
--- a/llama_index/llms/perplexity.py
+++ b/llama_index/llms/perplexity.py
@@ -1,13 +1,14 @@
 import json
-from typing import Any, Dict, Optional, Sequence
+from typing import Any, Callable, Dict, Optional, Sequence
 
 import httpx
 import requests
 
 from llama_index.bridge.pydantic import Field
 from llama_index.callbacks import CallbackManager
-from llama_index.llms.base import (
-    LLM,
+from llama_index.llms.base import llm_chat_callback, llm_completion_callback
+from llama_index.llms.llm import LLM
+from llama_index.llms.types import (
     ChatMessage,
     ChatResponse,
     ChatResponseAsyncGen,
@@ -16,9 +17,8 @@ from llama_index.llms.base import (
     CompletionResponseAsyncGen,
     CompletionResponseGen,
     LLMMetadata,
-    llm_chat_callback,
-    llm_completion_callback,
 )
+from llama_index.types import BaseOutputParser, PydanticProgramMode
 
 
 class Perplexity(LLM):
@@ -60,6 +60,11 @@ class Perplexity(LLM):
         max_retries: int = 10,
         context_window: Optional[int] = None,
         callback_manager: Optional[CallbackManager] = None,
+        system_prompt: Optional[str] = None,
+        messages_to_prompt: Optional[Callable[[Sequence[ChatMessage]], str]] = None,
+        completion_to_prompt: Optional[Callable[[str], str]] = None,
+        pydantic_program_mode: PydanticProgramMode = PydanticProgramMode.DEFAULT,
+        output_parser: Optional[BaseOutputParser] = None,
         **kwargs: Any,
     ) -> None:
         additional_kwargs = additional_kwargs or {}
@@ -79,6 +84,11 @@ class Perplexity(LLM):
             api_base=api_base,
             headers=headers,
             context_window=context_window,
+            system_prompt=system_prompt,
+            messages_to_prompt=messages_to_prompt,
+            completion_to_prompt=completion_to_prompt,
+            pydantic_program_mode=pydantic_program_mode,
+            output_parser=output_parser,
             **kwargs,
         )
 
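Note on the pattern repeated across these constructors: the shared keyword arguments (system_prompt, messages_to_prompt, completion_to_prompt, pydantic_program_mode, output_parser) are simply forwarded to the base LLM class. A minimal sketch of what that enables for callers, using Perplexity as one example; the model name and API key are placeholders, and any parameter not visible in the hunks above is assumed from the existing integration:

```python
from llama_index.llms.perplexity import Perplexity
from llama_index.llms.types import ChatMessage, MessageRole

# These kwargs are now accepted by the shared base LLM class and passed
# straight through by the provider-specific constructor.
llm = Perplexity(
    model="pplx-7b-chat",        # placeholder model name
    api_key="...",               # assumed credential parameter
    system_prompt="Answer in one sentence.",
    completion_to_prompt=lambda prompt: prompt.strip(),
)
response = llm.chat([ChatMessage(role=MessageRole.USER, content="What is RAG?")])
```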
diff --git a/llama_index/llms/portkey.py b/llama_index/llms/portkey.py
index 5acf56b481aba3c0cd12117fc05e0016c11d84e6..48c92ca63860e1964d942d9882a1796ed6066779 100644
--- a/llama_index/llms/portkey.py
+++ b/llama_index/llms/portkey.py
@@ -1,19 +1,10 @@
 """
 Portkey integration with Llama_index for enhanced monitoring.
 """
-from typing import TYPE_CHECKING, Any, List, Optional, Sequence, Union, cast
+from typing import TYPE_CHECKING, Any, Callable, List, Optional, Sequence, Union, cast
 
 from llama_index.bridge.pydantic import Field, PrivateAttr
-from llama_index.llms.base import (
-    ChatMessage,
-    ChatResponse,
-    ChatResponseGen,
-    CompletionResponse,
-    CompletionResponseGen,
-    LLMMetadata,
-    llm_chat_callback,
-    llm_completion_callback,
-)
+from llama_index.llms.base import llm_chat_callback, llm_completion_callback
 from llama_index.llms.custom import CustomLLM
 from llama_index.llms.generic_utils import (
     chat_to_completion_decorator,
@@ -27,6 +18,15 @@ from llama_index.llms.portkey_utils import (
     get_llm,
     is_chat_model,
 )
+from llama_index.llms.types import (
+    ChatMessage,
+    ChatResponse,
+    ChatResponseGen,
+    CompletionResponse,
+    CompletionResponseGen,
+    LLMMetadata,
+)
+from llama_index.types import BaseOutputParser, PydanticProgramMode
 
 if TYPE_CHECKING:
     from portkey import (
@@ -63,6 +63,11 @@ class Portkey(CustomLLM):
         mode: Union["Modes", "ModesLiteral"],
         api_key: Optional[str] = None,
         base_url: Optional[str] = None,
+        system_prompt: Optional[str] = None,
+        messages_to_prompt: Optional[Callable[[Sequence[ChatMessage]], str]] = None,
+        completion_to_prompt: Optional[Callable[[str], str]] = None,
+        pydantic_program_mode: PydanticProgramMode = PydanticProgramMode.DEFAULT,
+        output_parser: Optional[BaseOutputParser] = None,
     ) -> None:
         """
         Initialize a Portkey instance.
@@ -82,6 +87,11 @@ class Portkey(CustomLLM):
         super().__init__(
             base_url=base_url,
             api_key=api_key,
+            system_prompt=system_prompt,
+            messages_to_prompt=messages_to_prompt,
+            completion_to_prompt=completion_to_prompt,
+            pydantic_program_mode=pydantic_program_mode,
+            output_parser=output_parser,
         )
         if api_key is not None:
             portkey.api_key = api_key
diff --git a/llama_index/llms/portkey_utils.py b/llama_index/llms/portkey_utils.py
index b328ea402f7505b252a7c338e030c49147c7529b..e23e6b5ee5455c58cb8d092c7faa57a74a0e143f 100644
--- a/llama_index/llms/portkey_utils.py
+++ b/llama_index/llms/portkey_utils.py
@@ -8,7 +8,6 @@ from typing import TYPE_CHECKING, List
 
 from llama_index.llms.anthropic import Anthropic
 from llama_index.llms.anthropic_utils import CLAUDE_MODELS
-from llama_index.llms.base import LLMMetadata
 from llama_index.llms.openai import OpenAI
 from llama_index.llms.openai_utils import (
     AZURE_TURBO_MODELS,
@@ -17,6 +16,7 @@ from llama_index.llms.openai_utils import (
     GPT4_MODELS,
     TURBO_MODELS,
 )
+from llama_index.llms.types import LLMMetadata
 
 if TYPE_CHECKING:
     from portkey import (
diff --git a/llama_index/llms/predibase.py b/llama_index/llms/predibase.py
index c7993044cb69ebf36292b4f55d55829309633978..38b86216f1016d8683c34f793da3896967810542 100644
--- a/llama_index/llms/predibase.py
+++ b/llama_index/llms/predibase.py
@@ -1,5 +1,5 @@
 import os
-from typing import Any, Optional
+from typing import Any, Callable, Optional, Sequence
 
 from llama_index.bridge.pydantic import Field, PrivateAttr
 from llama_index.callbacks import CallbackManager
@@ -8,13 +8,15 @@ from llama_index.constants import (
     DEFAULT_NUM_OUTPUTS,
     DEFAULT_TEMPERATURE,
 )
-from llama_index.llms.base import (
+from llama_index.llms.base import llm_completion_callback
+from llama_index.llms.custom import CustomLLM
+from llama_index.llms.types import (
+    ChatMessage,
     CompletionResponse,
     CompletionResponseGen,
     LLMMetadata,
-    llm_completion_callback,
 )
-from llama_index.llms.custom import CustomLLM
+from llama_index.types import BaseOutputParser, PydanticProgramMode
 
 
 class PredibaseLLM(CustomLLM):
@@ -49,6 +51,11 @@ class PredibaseLLM(CustomLLM):
         temperature: float = DEFAULT_TEMPERATURE,
         context_window: int = DEFAULT_CONTEXT_WINDOW,
         callback_manager: Optional[CallbackManager] = None,
+        system_prompt: Optional[str] = None,
+        messages_to_prompt: Optional[Callable[[Sequence[ChatMessage]], str]] = None,
+        completion_to_prompt: Optional[Callable[[str], str]] = None,
+        pydantic_program_mode: PydanticProgramMode = PydanticProgramMode.DEFAULT,
+        output_parser: Optional[BaseOutputParser] = None,
     ) -> None:
         predibase_api_key = (
             predibase_api_key
@@ -66,6 +73,11 @@ class PredibaseLLM(CustomLLM):
             temperature=temperature,
             context_window=context_window,
             callback_manager=callback_manager,
+            system_prompt=system_prompt,
+            messages_to_prompt=messages_to_prompt,
+            completion_to_prompt=completion_to_prompt,
+            pydantic_program_mode=pydantic_program_mode,
+            output_parser=output_parser,
         )
 
     @staticmethod
diff --git a/llama_index/llms/replicate.py b/llama_index/llms/replicate.py
index 8283ffe54064fb52918c36cb4f834678473b1aef..16c8adae63b45f6001ff1cef01c29c3652d8f861 100644
--- a/llama_index/llms/replicate.py
+++ b/llama_index/llms/replicate.py
@@ -1,25 +1,20 @@
-from typing import Any, Callable, Dict, Optional, Sequence
+from typing import Any, Dict, Sequence
 
-from llama_index.bridge.pydantic import Field, PrivateAttr
-from llama_index.callbacks import CallbackManager
+from llama_index.bridge.pydantic import Field
 from llama_index.constants import DEFAULT_CONTEXT_WINDOW, DEFAULT_NUM_OUTPUTS
-from llama_index.llms.base import (
+from llama_index.llms.base import llm_chat_callback, llm_completion_callback
+from llama_index.llms.custom import CustomLLM
+from llama_index.llms.generic_utils import (
+    completion_response_to_chat_response,
+    stream_completion_response_to_chat_response,
+)
+from llama_index.llms.types import (
     ChatMessage,
     ChatResponse,
     ChatResponseGen,
     CompletionResponse,
     CompletionResponseGen,
     LLMMetadata,
-    llm_chat_callback,
-    llm_completion_callback,
-)
-from llama_index.llms.custom import CustomLLM
-from llama_index.llms.generic_utils import (
-    completion_response_to_chat_response,
-    stream_completion_response_to_chat_response,
-)
-from llama_index.llms.generic_utils import (
-    messages_to_prompt as generic_messages_to_prompt,
 )
 
 DEFAULT_REPLICATE_TEMP = 0.75
@@ -34,14 +29,16 @@ class Replicate(CustomLLM):
         lte=1.0,
     )
     image: str = Field(
-        description="The image file for multimodal model to use. (optional)"
+        default="", description="The image file for multimodal model to use. (optional)"
     )
     context_window: int = Field(
         default=DEFAULT_CONTEXT_WINDOW,
         description="The maximum number of context tokens for the model.",
         gt=0,
     )
-    prompt_key: str = Field(description="The key to use for the prompt in API calls.")
+    prompt_key: str = Field(
+        default="prompt", description="The key to use for the prompt in API calls."
+    )
     additional_kwargs: Dict[str, Any] = Field(
         default_factory=dict, description="Additional kwargs for the Replicate API."
     )
@@ -49,36 +46,6 @@ class Replicate(CustomLLM):
         default=False, description="Whether the model is a chat model."
     )
 
-    _messages_to_prompt: Callable = PrivateAttr()
-    _completion_to_prompt: Callable = PrivateAttr()
-
-    def __init__(
-        self,
-        model: str,
-        temperature: float = DEFAULT_REPLICATE_TEMP,
-        image: Optional[str] = "",
-        additional_kwargs: Optional[Dict[str, Any]] = None,
-        context_window: int = DEFAULT_CONTEXT_WINDOW,
-        prompt_key: str = "prompt",
-        messages_to_prompt: Optional[Callable] = None,
-        completion_to_prompt: Optional[Callable] = None,
-        callback_manager: Optional[CallbackManager] = None,
-        is_chat_model: bool = False,
-    ) -> None:
-        self._messages_to_prompt = messages_to_prompt or generic_messages_to_prompt
-        self._completion_to_prompt = completion_to_prompt or (lambda x: x)
-
-        super().__init__(
-            model=model,
-            temperature=temperature,
-            image=image,
-            additional_kwargs=additional_kwargs or {},
-            context_window=context_window,
-            prompt_key=prompt_key,
-            callback_manager=callback_manager,
-            is_chat_model=is_chat_model,
-        )
-
     @classmethod
     def class_name(cls) -> str:
         return "Replicate_llm"
@@ -116,16 +83,16 @@ class Replicate(CustomLLM):
 
     @llm_chat_callback()
     def chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponse:
-        prompt = self._messages_to_prompt(messages)
-        completion_response = self.complete(prompt, **kwargs)
+        prompt = self.messages_to_prompt(messages)
+        completion_response = self.complete(prompt, formatted=True, **kwargs)
         return completion_response_to_chat_response(completion_response)
 
     @llm_chat_callback()
     def stream_chat(
         self, messages: Sequence[ChatMessage], **kwargs: Any
     ) -> ChatResponseGen:
-        prompt = self._messages_to_prompt(messages)
-        completion_response = self.stream_complete(prompt, **kwargs)
+        prompt = self.messages_to_prompt(messages)
+        completion_response = self.stream_complete(prompt, formatted=True, **kwargs)
         return stream_completion_response_to_chat_response(completion_response)
 
     @llm_completion_callback()
@@ -146,7 +113,8 @@ class Replicate(CustomLLM):
                 "Please install replicate with `pip install replicate`"
             )
 
-        prompt = self._completion_to_prompt(prompt)
+        if not kwargs.get("formatted", False):
+            prompt = self.completion_to_prompt(prompt)
         input_dict = self._get_input_dict(prompt, **kwargs)
         response_iter = replicate.run(self.model, input=input_dict)
 
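The reworked Replicate flow deserves a note: chat() and stream_chat() now format the message list themselves via the messages_to_prompt hook carried on the base class and pass formatted=True, so complete() only applies completion_to_prompt to raw, unformatted prompts. A rough sketch of the resulting call path; the model id is a placeholder and the prompt functions are illustrative only:

```python
from llama_index.llms.replicate import Replicate
from llama_index.llms.types import ChatMessage, MessageRole

llm = Replicate(
    model="meta/llama-2-70b-chat",  # placeholder Replicate model id
    messages_to_prompt=lambda msgs: "\n".join(str(m) for m in msgs) + "\nassistant: ",
    completion_to_prompt=lambda text: f"[INST] {text} [/INST]",
)

# chat() pre-formats the messages and sets formatted=True, so
# completion_to_prompt is not applied a second time.
chat_resp = llm.chat([ChatMessage(role=MessageRole.USER, content="Hi there")])

# A direct complete() call on a raw string still gets wrapped.
completion = llm.complete("Hi there")
```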
diff --git a/llama_index/llms/rungpt.py b/llama_index/llms/rungpt.py
index 65ff1e91e8053cec3860a1238908ada8c5073b6e..8351635325802089482f6b4744d628b9146df4f0 100644
--- a/llama_index/llms/rungpt.py
+++ b/llama_index/llms/rungpt.py
@@ -1,11 +1,12 @@
 import json
-from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union
 
 from llama_index.bridge.pydantic import Field
 from llama_index.callbacks import CallbackManager
 from llama_index.constants import DEFAULT_CONTEXT_WINDOW, DEFAULT_NUM_OUTPUTS
-from llama_index.llms.base import (
-    LLM,
+from llama_index.llms.base import llm_chat_callback, llm_completion_callback
+from llama_index.llms.llm import LLM
+from llama_index.llms.types import (
     ChatMessage,
     ChatResponse,
     ChatResponseAsyncGen,
@@ -15,9 +16,8 @@ from llama_index.llms.base import (
     CompletionResponseGen,
     LLMMetadata,
     MessageRole,
-    llm_chat_callback,
-    llm_completion_callback,
 )
+from llama_index.types import BaseOutputParser, PydanticProgramMode
 
 DEFAULT_RUNGPT_MODEL = "rungpt"
 DEFAULT_RUNGPT_TEMP = 0.75
@@ -62,6 +62,11 @@ class RunGptLLM(LLM):
         context_window: int = DEFAULT_CONTEXT_WINDOW,
         additional_kwargs: Optional[Dict[str, Any]] = None,
         callback_manager: Optional[CallbackManager] = None,
+        system_prompt: Optional[str] = None,
+        messages_to_prompt: Optional[Callable[[Sequence[ChatMessage]], str]] = None,
+        completion_to_prompt: Optional[Callable[[str], str]] = None,
+        pydantic_program_mode: PydanticProgramMode = PydanticProgramMode.DEFAULT,
+        output_parser: Optional[BaseOutputParser] = None,
     ):
         if endpoint.startswith("http://"):
             base_url = endpoint
@@ -76,6 +81,11 @@ class RunGptLLM(LLM):
             additional_kwargs=additional_kwargs or {},
             callback_manager=callback_manager or CallbackManager([]),
             base_url=base_url,
+            system_prompt=system_prompt,
+            messages_to_prompt=messages_to_prompt,
+            completion_to_prompt=completion_to_prompt,
+            pydantic_program_mode=pydantic_program_mode,
+            output_parser=output_parser,
         )
 
     @classmethod
diff --git a/llama_index/llms/types.py b/llama_index/llms/types.py
new file mode 100644
index 0000000000000000000000000000000000000000..9db785861d0a20d2b702d9aabe24c6bb07202d7b
--- /dev/null
+++ b/llama_index/llms/types.py
@@ -0,0 +1,110 @@
+from enum import Enum
+from typing import Any, AsyncGenerator, Generator, Optional
+
+from llama_index.bridge.pydantic import BaseModel, Field
+from llama_index.constants import DEFAULT_CONTEXT_WINDOW, DEFAULT_NUM_OUTPUTS
+
+
+class MessageRole(str, Enum):
+    """Message role."""
+
+    SYSTEM = "system"
+    USER = "user"
+    ASSISTANT = "assistant"
+    FUNCTION = "function"
+    TOOL = "tool"
+
+
+# ===== Generic Model Input - Chat =====
+class ChatMessage(BaseModel):
+    """Chat message."""
+
+    role: MessageRole = MessageRole.USER
+    content: Optional[Any] = ""
+    additional_kwargs: dict = Field(default_factory=dict)
+
+    def __str__(self) -> str:
+        return f"{self.role.value}: {self.content}"
+
+
+# ===== Generic Model Output - Chat =====
+class ChatResponse(BaseModel):
+    """Chat response."""
+
+    message: ChatMessage
+    raw: Optional[dict] = None
+    delta: Optional[str] = None
+    additional_kwargs: dict = Field(default_factory=dict)
+
+    def __str__(self) -> str:
+        return str(self.message)
+
+
+ChatResponseGen = Generator[ChatResponse, None, None]
+ChatResponseAsyncGen = AsyncGenerator[ChatResponse, None]
+
+
+# ===== Generic Model Output - Completion =====
+class CompletionResponse(BaseModel):
+    """
+    Completion response.
+
+    Fields:
+        text: Text content of the response if not streaming, or if streaming,
+            the current extent of streamed text.
+        additional_kwargs: Additional information on the response (i.e. token
+            counts, function calling information).
+        raw: Optional raw JSON that was parsed to populate text, if relevant.
+        delta: New text that just streamed in (only relevant when streaming).
+    """
+
+    text: str
+    additional_kwargs: dict = Field(default_factory=dict)
+    raw: Optional[dict] = None
+    delta: Optional[str] = None
+
+    def __str__(self) -> str:
+        return self.text
+
+
+CompletionResponseGen = Generator[CompletionResponse, None, None]
+CompletionResponseAsyncGen = AsyncGenerator[CompletionResponse, None]
+
+
+class LLMMetadata(BaseModel):
+    context_window: int = Field(
+        default=DEFAULT_CONTEXT_WINDOW,
+        description=(
+            "Total number of tokens the model can be input and output for one response."
+        ),
+    )
+    num_output: int = Field(
+        default=DEFAULT_NUM_OUTPUTS,
+        description="Number of tokens the model can output when generating a response.",
+    )
+    is_chat_model: bool = Field(
+        default=False,
+        description=(
+            "Set True if the model exposes a chat interface (i.e. can be passed a"
+            " sequence of messages, rather than text), like OpenAI's"
+            " /v1/chat/completions endpoint."
+        ),
+    )
+    is_function_calling_model: bool = Field(
+        default=False,
+        # SEE: https://openai.com/blog/function-calling-and-other-api-updates
+        description=(
+            "Set True if the model supports function calling messages, similar to"
+            " OpenAI's function calling API. For example, converting 'Email Anya to"
+            " see if she wants to get coffee next Friday' to a function call like"
+            " `send_email(to: string, body: string)`."
+        ),
+    )
+    model_name: str = Field(
+        default="unknown",
+        description=(
+            "The model's name used for logging, testing, and sanity checking. For some"
+            " models this can be automatically discerned. For other models, like"
+            " locally loaded models, this must be manually specified."
+        ),
+    )
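The new llama_index/llms/types.py is the canonical home for the chat and completion datatypes, which is why the import churn elsewhere in this diff consistently rewrites llama_index.llms.base imports. A minimal sketch of the types in isolation, no provider required:

```python
from llama_index.llms.types import (
    ChatMessage,
    ChatResponse,
    CompletionResponse,
    LLMMetadata,
    MessageRole,
)

msg = ChatMessage(role=MessageRole.USER, content="Hello!")
print(str(msg))                               # "user: Hello!"

print(str(ChatResponse(message=msg)))         # same string form as the message
print(str(CompletionResponse(text="Hi.")))    # "Hi."

metadata = LLMMetadata(context_window=4096, num_output=256, is_chat_model=True)
print(metadata.model_name)                    # "unknown" unless set explicitly
```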
diff --git a/llama_index/llms/utils.py b/llama_index/llms/utils.py
index 53b1bcb3b18e959f82e65aefb1ade932b6a65a7b..076a97c4961d26f5124890b23b5df88c2dea4bb9 100644
--- a/llama_index/llms/utils.py
+++ b/llama_index/llms/utils.py
@@ -3,9 +3,9 @@ from typing import TYPE_CHECKING, Optional, Union
 if TYPE_CHECKING:
     from langchain.base_language import BaseLanguageModel
 
-from llama_index.llms.base import LLM
 from llama_index.llms.llama_cpp import LlamaCPP
 from llama_index.llms.llama_utils import completion_to_prompt, messages_to_prompt
+from llama_index.llms.llm import LLM
 from llama_index.llms.mock import MockLLM
 from llama_index.llms.openai import OpenAI
 from llama_index.llms.openai_utils import validate_openai_api_key
diff --git a/llama_index/llms/vertex.py b/llama_index/llms/vertex.py
index e74964c3d33c0688c94c6009888822a68fcc86dc..b6f2764a332ba7860634e1e621e52cb180b30bff 100644
--- a/llama_index/llms/vertex.py
+++ b/llama_index/llms/vertex.py
@@ -1,9 +1,13 @@
-from typing import Any, Dict, Optional, Sequence
+from typing import Any, Callable, Dict, Optional, Sequence
 
 from llama_index.bridge.pydantic import Field, PrivateAttr
 from llama_index.callbacks import CallbackManager
 from llama_index.llms.base import (
-    LLM,
+    llm_chat_callback,
+    llm_completion_callback,
+)
+from llama_index.llms.llm import LLM
+from llama_index.llms.types import (
     ChatMessage,
     ChatResponse,
     ChatResponseAsyncGen,
@@ -13,8 +17,6 @@ from llama_index.llms.base import (
     CompletionResponseGen,
     LLMMetadata,
     MessageRole,
-    llm_chat_callback,
-    llm_completion_callback,
 )
 from llama_index.llms.vertex_utils import (
     CHAT_MODELS,
@@ -27,6 +29,7 @@ from llama_index.llms.vertex_utils import (
     completion_with_retry,
     init_vertexai,
 )
+from llama_index.types import BaseOutputParser, PydanticProgramMode
 
 
 class Vertex(LLM):
@@ -60,6 +63,11 @@ class Vertex(LLM):
         iscode: bool = False,
         additional_kwargs: Optional[Dict[str, Any]] = None,
         callback_manager: Optional[CallbackManager] = None,
+        system_prompt: Optional[str] = None,
+        messages_to_prompt: Optional[Callable[[Sequence[ChatMessage]], str]] = None,
+        completion_to_prompt: Optional[Callable[[str], str]] = None,
+        pydantic_program_mode: PydanticProgramMode = PydanticProgramMode.DEFAULT,
+        output_parser: Optional[BaseOutputParser] = None,
     ) -> None:
         init_vertexai(project=project, location=location, credentials=credentials)
 
@@ -96,6 +104,11 @@ class Vertex(LLM):
             examples=examples,
             iscode=iscode,
             callback_manager=callback_manager,
+            system_prompt=system_prompt,
+            messages_to_prompt=messages_to_prompt,
+            completion_to_prompt=completion_to_prompt,
+            pydantic_program_mode=pydantic_program_mode,
+            output_parser=output_parser,
         )
 
     @classmethod
diff --git a/llama_index/llms/vertex_utils.py b/llama_index/llms/vertex_utils.py
index 7f5e8fba570bb1713028d561f5360ca713b950e5..e87ff3e4ba3c310ffd73bfcaf2f4dc71e94f4d95 100644
--- a/llama_index/llms/vertex_utils.py
+++ b/llama_index/llms/vertex_utils.py
@@ -12,7 +12,7 @@ from tenacity import (
     wait_exponential,
 )
 
-from llama_index.llms.base import MessageRole
+from llama_index.llms.types import MessageRole
 
 CHAT_MODELS = ["chat-bison", "chat-bison-32k", "chat-bison@001"]
 TEXT_MODELS = ["text-bison", "text-bison-32k", "text-bison@001"]
diff --git a/llama_index/llms/vllm.py b/llama_index/llms/vllm.py
index 0fd30f90df20e524495fd5778fe6795cc9905389..f10259663312da4c9cce09c0777ccf8be21a310b 100644
--- a/llama_index/llms/vllm.py
+++ b/llama_index/llms/vllm.py
@@ -3,8 +3,16 @@ from typing import Any, Callable, Dict, List, Optional, Sequence
 
 from llama_index.bridge.pydantic import Field, PrivateAttr
 from llama_index.callbacks import CallbackManager
-from llama_index.llms.base import (
-    LLM,
+from llama_index.llms.base import llm_chat_callback, llm_completion_callback
+from llama_index.llms.generic_utils import (
+    completion_response_to_chat_response,
+    stream_completion_response_to_chat_response,
+)
+from llama_index.llms.generic_utils import (
+    messages_to_prompt as generic_messages_to_prompt,
+)
+from llama_index.llms.llm import LLM
+from llama_index.llms.types import (
     ChatMessage,
     ChatResponse,
     ChatResponseAsyncGen,
@@ -13,17 +21,9 @@ from llama_index.llms.base import (
     CompletionResponseAsyncGen,
     CompletionResponseGen,
     LLMMetadata,
-    llm_chat_callback,
-    llm_completion_callback,
-)
-from llama_index.llms.generic_utils import (
-    completion_response_to_chat_response,
-    stream_completion_response_to_chat_response,
-)
-from llama_index.llms.generic_utils import (
-    messages_to_prompt as generic_messages_to_prompt,
 )
 from llama_index.llms.vllm_utils import get_response, post_http_request
+from llama_index.types import BaseOutputParser, PydanticProgramMode
 
 
 class Vllm(LLM):
@@ -100,14 +100,6 @@ class Vllm(LLM):
         description="The data type for the model weights and activations.",
     )
 
-    messages_to_prompt: Callable = Field(
-        description="The function to convert messages to a prompt.", exclude=True
-    )
-
-    completion_to_prompt: Callable = Field(
-        description="The function to convert a completion to a prompt.", exclude=True
-    )
-
     download_dir: Optional[str] = Field(
         default=None,
         description="Directory to download and load the weights. (Default to the default cache dir of huggingface)",
@@ -126,8 +118,8 @@ class Vllm(LLM):
         self,
         model: str = "facebook/opt-125m",
         temperature: float = 1.0,
-        tensor_parallel_size: Optional[int] = 1,
-        trust_remote_code: Optional[bool] = True,
+        tensor_parallel_size: int = 1,
+        trust_remote_code: bool = True,
         n: int = 1,
         best_of: Optional[int] = None,
         presence_penalty: float = 0.0,
@@ -143,9 +135,12 @@ class Vllm(LLM):
         download_dir: Optional[str] = None,
         vllm_kwargs: Dict[str, Any] = {},
         api_url: Optional[str] = "",
-        messages_to_prompt: Optional[Callable] = None,
-        completion_to_prompt: Optional[Callable] = None,
         callback_manager: Optional[CallbackManager] = None,
+        system_prompt: Optional[str] = None,
+        messages_to_prompt: Optional[Callable[[Sequence[ChatMessage]], str]] = None,
+        completion_to_prompt: Optional[Callable[[str], str]] = None,
+        pydantic_program_mode: PydanticProgramMode = PydanticProgramMode.DEFAULT,
+        output_parser: Optional[BaseOutputParser] = None,
     ) -> None:
         try:
             from vllm import LLM as VLLModel
@@ -166,8 +161,6 @@ class Vllm(LLM):
         else:
             self._client = None
         callback_manager = callback_manager or CallbackManager([])
-        messages_to_prompt = messages_to_prompt or generic_messages_to_prompt
-        completion_to_prompt = completion_to_prompt or (lambda x: x)
         super().__init__(
             model=model,
             temperature=temperature,
@@ -184,10 +177,13 @@ class Vllm(LLM):
             logprobs=logprobs,
             dtype=dtype,
             download_dir=download_dir,
-            messages_to_prompt=messages_to_prompt,
-            completion_to_prompt=completion_to_prompt,
             vllm_kwargs=vllm_kwargs,
             api_url=api_url,
+            system_prompt=system_prompt,
+            messages_to_prompt=messages_to_prompt,
+            completion_to_prompt=completion_to_prompt,
+            pydantic_program_mode=pydantic_program_mode,
+            output_parser=output_parser,
         )
 
     @classmethod
@@ -304,6 +300,7 @@ class VllmServer(Vllm):
         completion_to_prompt: Optional[Callable] = None,
         vllm_kwargs: Dict[str, Any] = {},
         callback_manager: Optional[CallbackManager] = None,
+        output_parser: Optional[BaseOutputParser] = None,
     ) -> None:
         self._client = None
         messages_to_prompt = messages_to_prompt or generic_messages_to_prompt
@@ -331,6 +328,8 @@ class VllmServer(Vllm):
             completion_to_prompt=completion_to_prompt,
             vllm_kwargs=vllm_kwargs,
             api_url=api_url,
+            callback_manager=callback_manager,
+            output_parser=output_parser,
         )
 
     @classmethod
diff --git a/llama_index/llms/watsonx.py b/llama_index/llms/watsonx.py
index 5f1ad841985cfd6fd6b1a4a8c5680e4a4d722e69..15c69392c06af1205133c274276b1e25956ca56f 100644
--- a/llama_index/llms/watsonx.py
+++ b/llama_index/llms/watsonx.py
@@ -1,9 +1,14 @@
-from typing import Any, Dict, Optional, Sequence
+from typing import Any, Callable, Dict, Optional, Sequence
 
 from llama_index.bridge.pydantic import Field, PrivateAttr
 from llama_index.callbacks import CallbackManager
-from llama_index.llms.base import (
-    LLM,
+from llama_index.llms.base import llm_chat_callback, llm_completion_callback
+from llama_index.llms.generic_utils import (
+    completion_to_chat_decorator,
+    stream_completion_to_chat_decorator,
+)
+from llama_index.llms.llm import LLM
+from llama_index.llms.types import (
     ChatMessage,
     ChatResponse,
     ChatResponseAsyncGen,
@@ -12,18 +17,13 @@ from llama_index.llms.base import (
     CompletionResponseAsyncGen,
     CompletionResponseGen,
     LLMMetadata,
-    llm_chat_callback,
-    llm_completion_callback,
-)
-from llama_index.llms.generic_utils import (
-    completion_to_chat_decorator,
-    stream_completion_to_chat_decorator,
 )
 from llama_index.llms.watsonx_utils import (
     WATSONX_MODELS,
     get_from_param_or_env_without_error,
     watsonx_model_to_context_size,
 )
+from llama_index.types import BaseOutputParser, PydanticProgramMode
 
 
 class WatsonX(LLM):
@@ -51,6 +51,11 @@ class WatsonX(LLM):
         temperature: Optional[float] = 0.1,
         additional_kwargs: Optional[Dict[str, Any]] = None,
         callback_manager: Optional[CallbackManager] = None,
+        system_prompt: Optional[str] = None,
+        messages_to_prompt: Optional[Callable[[Sequence[ChatMessage]], str]] = None,
+        completion_to_prompt: Optional[Callable[[str], str]] = None,
+        pydantic_program_mode: PydanticProgramMode = PydanticProgramMode.DEFAULT,
+        output_parser: Optional[BaseOutputParser] = None,
     ) -> None:
         """Initialize params."""
         if model_id not in WATSONX_MODELS:
@@ -94,6 +99,11 @@ class WatsonX(LLM):
             additional_kwargs=additional_kwargs,
             model_info=self._model.get_details(),
             callback_manager=callback_manager,
+            system_prompt=system_prompt,
+            messages_to_prompt=messages_to_prompt,
+            completion_to_prompt=completion_to_prompt,
+            pydantic_program_mode=pydantic_program_mode,
+            output_parser=output_parser,
         )
 
     @classmethod
diff --git a/llama_index/llms/xinference.py b/llama_index/llms/xinference.py
index d0f15dd0bb00f92b391b6666a067b899d8c7e701..62c02e90f4e3564a4825d2ba9e0a9a2cc01f7171 100644
--- a/llama_index/llms/xinference.py
+++ b/llama_index/llms/xinference.py
@@ -1,9 +1,14 @@
 import warnings
-from typing import Any, Dict, Optional, Sequence, Tuple
+from typing import Any, Callable, Dict, Optional, Sequence, Tuple
 
 from llama_index.bridge.pydantic import Field, PrivateAttr
 from llama_index.callbacks import CallbackManager
 from llama_index.llms.base import (
+    llm_chat_callback,
+    llm_completion_callback,
+)
+from llama_index.llms.custom import CustomLLM
+from llama_index.llms.types import (
     ChatMessage,
     ChatResponse,
     ChatResponseGen,
@@ -11,14 +16,12 @@ from llama_index.llms.base import (
     CompletionResponseGen,
     LLMMetadata,
     MessageRole,
-    llm_chat_callback,
-    llm_completion_callback,
 )
-from llama_index.llms.custom import CustomLLM
 from llama_index.llms.xinference_utils import (
     xinference_message_to_history,
     xinference_modelname_to_contextsize,
 )
+from llama_index.types import BaseOutputParser, PydanticProgramMode
 
 # an approximation of the ratio between llama and GPT2 tokens
 TOKEN_RATIO = 2.5
@@ -50,6 +53,11 @@ class Xinference(CustomLLM):
         temperature: float = DEFAULT_XINFERENCE_TEMP,
         max_tokens: Optional[int] = None,
         callback_manager: Optional[CallbackManager] = None,
+        system_prompt: Optional[str] = None,
+        messages_to_prompt: Optional[Callable[[Sequence[ChatMessage]], str]] = None,
+        completion_to_prompt: Optional[Callable[[str], str]] = None,
+        pydantic_program_mode: PydanticProgramMode = PydanticProgramMode.DEFAULT,
+        output_parser: Optional[BaseOutputParser] = None,
     ) -> None:
         generator, context_window, model_description = self.load_model(
             model_uid, endpoint
@@ -71,6 +79,11 @@ class Xinference(CustomLLM):
             max_tokens=max_tokens,
             model_description=model_description,
             callback_manager=callback_manager,
+            system_prompt=system_prompt,
+            messages_to_prompt=messages_to_prompt,
+            completion_to_prompt=completion_to_prompt,
+            pydantic_program_mode=pydantic_program_mode,
+            output_parser=output_parser,
         )
 
     def load_model(self, model_uid: str, endpoint: str) -> Tuple[Any, int, dict]:
diff --git a/llama_index/llms/xinference_utils.py b/llama_index/llms/xinference_utils.py
index 56ad201bc0b1a65b06e0e8ff26a085a3dd861fbc..bc1be05157a563848cbf6c34924685a18e27a4b5 100644
--- a/llama_index/llms/xinference_utils.py
+++ b/llama_index/llms/xinference_utils.py
@@ -2,7 +2,7 @@ from typing import Optional
 
 from typing_extensions import NotRequired, TypedDict
 
-from llama_index.llms.base import ChatMessage
+from llama_index.llms.types import ChatMessage
 
 XINFERENCE_MODEL_SIZES = {
     "baichuan": 2048,
diff --git a/llama_index/memory/chat_memory_buffer.py b/llama_index/memory/chat_memory_buffer.py
index fc6014368cab2fbdd7bad5d0f9e89b81337b6d1c..baa22c299dab88d4f7e2f7c53730c4df59c789dd 100644
--- a/llama_index/memory/chat_memory_buffer.py
+++ b/llama_index/memory/chat_memory_buffer.py
@@ -1,7 +1,8 @@
 from typing import Any, Callable, Dict, List, Optional, cast
 
 from llama_index.bridge.pydantic import Field, root_validator
-from llama_index.llms.base import LLM, ChatMessage, MessageRole
+from llama_index.llms.llm import LLM
+from llama_index.llms.types import ChatMessage, MessageRole
 from llama_index.memory.types import BaseMemory
 from llama_index.utils import GlobalsHelper
 
diff --git a/llama_index/memory/types.py b/llama_index/memory/types.py
index 5375f5d408290a1418b388e0722b79c4a8159623..2c00253dfcb4784b3fd8e5a274cb2782211144d5 100644
--- a/llama_index/memory/types.py
+++ b/llama_index/memory/types.py
@@ -2,7 +2,8 @@ from abc import abstractmethod
 from typing import Any, List, Optional
 
 from llama_index.bridge.pydantic import BaseModel
-from llama_index.llms.base import LLM, ChatMessage
+from llama_index.llms.llm import LLM
+from llama_index.llms.types import ChatMessage
 
 
 class BaseMemory(BaseModel):
diff --git a/llama_index/multi_modal_llms/base.py b/llama_index/multi_modal_llms/base.py
index 9bd70c1e1a345e375a74aa5a374c59a4eb012af6..fe537352ba0e562859caf81e4772201deb756086 100644
--- a/llama_index/multi_modal_llms/base.py
+++ b/llama_index/multi_modal_llms/base.py
@@ -7,7 +7,7 @@ from llama_index.constants import (
     DEFAULT_NUM_INPUT_FILES,
     DEFAULT_NUM_OUTPUTS,
 )
-from llama_index.llms.base import (
+from llama_index.llms.types import (
     ChatMessage,
     ChatResponse,
     ChatResponseAsyncGen,
diff --git a/llama_index/multi_modal_llms/openai.py b/llama_index/multi_modal_llms/openai.py
index ca04fffdea278069b2b02f3d873408f5ae1074c6..2228c349e92a720159c06d8644db30755762512c 100644
--- a/llama_index/multi_modal_llms/openai.py
+++ b/llama_index/multi_modal_llms/openai.py
@@ -16,7 +16,11 @@ from llama_index.constants import (
     DEFAULT_NUM_OUTPUTS,
     DEFAULT_TEMPERATURE,
 )
-from llama_index.llms.base import (
+from llama_index.llms.generic_utils import (
+    messages_to_prompt as generic_messages_to_prompt,
+)
+from llama_index.llms.openai_utils import from_openai_message, to_openai_message_dicts
+from llama_index.llms.types import (
     ChatMessage,
     ChatResponse,
     ChatResponseAsyncGen,
@@ -26,10 +30,6 @@ from llama_index.llms.base import (
     CompletionResponseGen,
     MessageRole,
 )
-from llama_index.llms.generic_utils import (
-    messages_to_prompt as generic_messages_to_prompt,
-)
-from llama_index.llms.openai_utils import from_openai_message, to_openai_message_dicts
 from llama_index.multi_modal_llms import (
     MultiModalLLM,
     MultiModalLLMMetadata,
diff --git a/llama_index/multi_modal_llms/replicate_multi_modal.py b/llama_index/multi_modal_llms/replicate_multi_modal.py
index 23fe2a2f6e6dee025676245802237c2e2cebc452..b0ae63ca6aa2e399e984ee9ee263f709d5baca16 100644
--- a/llama_index/multi_modal_llms/replicate_multi_modal.py
+++ b/llama_index/multi_modal_llms/replicate_multi_modal.py
@@ -4,7 +4,10 @@ from typing import Any, Callable, Dict, Optional, Sequence
 from llama_index.bridge.pydantic import Field, PrivateAttr
 from llama_index.callbacks import CallbackManager
 from llama_index.constants import DEFAULT_CONTEXT_WINDOW, DEFAULT_NUM_OUTPUTS
-from llama_index.llms.base import (
+from llama_index.llms.generic_utils import (
+    messages_to_prompt as generic_messages_to_prompt,
+)
+from llama_index.llms.types import (
     ChatMessage,
     ChatResponse,
     ChatResponseAsyncGen,
@@ -13,9 +16,6 @@ from llama_index.llms.base import (
     CompletionResponseAsyncGen,
     CompletionResponseGen,
 )
-from llama_index.llms.generic_utils import (
-    messages_to_prompt as generic_messages_to_prompt,
-)
 from llama_index.multi_modal_llms import (
     MultiModalLLM,
     MultiModalLLMMetadata,
diff --git a/llama_index/node_parser/relational/unstructured_element.py b/llama_index/node_parser/relational/unstructured_element.py
index ad601def64b20548fc4ce4ebc06c7af956757c67..ca4415e57ad2b4e5f7a36154f8517d9f15254bc0 100644
--- a/llama_index/node_parser/relational/unstructured_element.py
+++ b/llama_index/node_parser/relational/unstructured_element.py
@@ -7,7 +7,8 @@ from tqdm import tqdm
 
 from llama_index.bridge.pydantic import BaseModel, Field, ValidationError
 from llama_index.callbacks.base import CallbackManager
-from llama_index.llms.openai import LLM, OpenAI
+from llama_index.llms.llm import LLM
+from llama_index.llms.openai import OpenAI
 from llama_index.node_parser.interface import NodeParser
 from llama_index.response.schema import PydanticResponse
 from llama_index.schema import BaseNode, Document, IndexNode, TextNode
diff --git a/llama_index/playground/base.py b/llama_index/playground/base.py
index 89fa2259c3cd82de797ea3522a7eef344e33996a..4f51efa817ca03c65a9a97f3d512e9cad8c3b6f3 100644
--- a/llama_index/playground/base.py
+++ b/llama_index/playground/base.py
@@ -11,7 +11,7 @@ from llama_index.indices.base import BaseIndex
 from llama_index.indices.list.base import ListRetrieverMode, SummaryIndex
 from llama_index.indices.tree.base import TreeIndex, TreeRetrieverMode
 from llama_index.indices.vector_store import VectorStoreIndex
-from llama_index.llm_predictor import LLMPredictor
+from llama_index.llm_predictor.base import LLMPredictor
 from llama_index.schema import Document
 from llama_index.utils import get_color_mapping, print_text
 
diff --git a/llama_index/postprocessor/llm_rerank.py b/llama_index/postprocessor/llm_rerank.py
index 000d10aad71d4797c2b18ccb5f095accb22a7329..63f30825aaff310fbc641a0c4d245f86d7fc8d43 100644
--- a/llama_index/postprocessor/llm_rerank.py
+++ b/llama_index/postprocessor/llm_rerank.py
@@ -84,7 +84,7 @@ class LLMRerank(BaseNodePostprocessor):
             query_str = query_bundle.query_str
             fmt_batch_str = self._format_node_batch_fn(nodes_batch)
             # call each batch independently
-            raw_response = self.service_context.llm_predictor.predict(
+            raw_response = self.service_context.llm.predict(
                 self.choice_select_prompt,
                 context_str=fmt_batch_str,
                 query_str=query_str,
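This hunk, like the pii, flare, and knowledge-graph changes that follow, applies the same migration: service_context.llm_predictor.predict(...) becomes service_context.llm.predict(...), since the LLM now exposes the predict interface directly. A small sketch of the pattern as user-facing code; the service context uses library defaults and the prompt is illustrative:

```python
from llama_index import ServiceContext
from llama_index.prompts import PromptTemplate

service_context = ServiceContext.from_defaults()  # picks up the default LLM
prompt = PromptTemplate("Give a one-line summary of: {context_str}")

# Previously: service_context.llm_predictor.predict(...)
summary = service_context.llm.predict(
    prompt,
    context_str="LlamaIndex routes prompts through a shared LLM interface.",
)
```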
diff --git a/llama_index/postprocessor/node.py b/llama_index/postprocessor/node.py
index a0b8d9b66b0d0c173d9fb7956e3df559655e6f91..a75ac53c4ed483aa1b559b302bc2a9154f49773e 100644
--- a/llama_index/postprocessor/node.py
+++ b/llama_index/postprocessor/node.py
@@ -268,7 +268,6 @@ class AutoPrevNextNodePostprocessor(BaseNodePostprocessor):
 
     Args:
         docstore (BaseDocumentStore): The document store.
-        llm_predictor (LLMPredictor): The LLM predictor.
         num_nodes (int): The number of nodes to return (default: 1)
         infer_prev_next_tmpl (str): The template to use for inference.
             Required fields are {context_str} and {query_str}.
@@ -319,7 +318,7 @@ class AutoPrevNextNodePostprocessor(BaseNodePostprocessor):
         all_nodes: Dict[str, NodeWithScore] = {}
         for node in nodes:
             all_nodes[node.node.node_id] = node
-            # use response builder instead of llm_predictor directly
+            # use response builder instead of llm directly
             # to be more robust to handling long context
             response_builder = get_response_synthesizer(
                 service_context=self.service_context,
diff --git a/llama_index/postprocessor/pii.py b/llama_index/postprocessor/pii.py
index 83eb2d6ada309848c786bdcb92b5a35b2a37a4a1..ae30b6b72691403dca97af3f6407ffaab3496fda 100644
--- a/llama_index/postprocessor/pii.py
+++ b/llama_index/postprocessor/pii.py
@@ -65,7 +65,7 @@ class PIINodePostprocessor(BaseNodePostprocessor):
             "Return the mapping in JSON."
         )
 
-        response = self.service_context.llm_predictor.predict(
+        response = self.service_context.llm.predict(
             pii_prompt, context_str=text, query_str=task_str
         )
         splits = response.split("Output Mapping:")
diff --git a/llama_index/program/llm_program.py b/llama_index/program/llm_program.py
index 13123da24f2ffd2196856e13f0e220acfad4f791..878c96c267e614faac1dedd8e08aca06fe66a63a 100644
--- a/llama_index/program/llm_program.py
+++ b/llama_index/program/llm_program.py
@@ -1,7 +1,7 @@
 from typing import Any, Optional, Type, cast
 
 from llama_index.bridge.pydantic import BaseModel
-from llama_index.llms.base import LLM
+from llama_index.llms.llm import LLM
 from llama_index.llms.openai import OpenAI
 from llama_index.output_parsers.pydantic import PydanticOutputParser
 from llama_index.prompts.base import BasePromptTemplate, PromptTemplate
diff --git a/llama_index/program/openai_program.py b/llama_index/program/openai_program.py
index e4e2adfc683e5e05833fdbcc3869a9b7b74bef9d..764ba93f21428148187d4627ea8222bb0b2c7949 100644
--- a/llama_index/program/openai_program.py
+++ b/llama_index/program/openai_program.py
@@ -2,7 +2,7 @@ import logging
 from typing import Any, Dict, Generator, List, Optional, Tuple, Type, Union, cast
 
 from llama_index.agent.openai_agent import resolve_tool_choice
-from llama_index.llms.base import LLM
+from llama_index.llms.llm import LLM
 from llama_index.llms.openai import OpenAI
 from llama_index.llms.openai_utils import OpenAIToolCall, to_openai_tool
 from llama_index.program.llm_prompt_program import BaseLLMFunctionProgram
diff --git a/llama_index/program/predefined/evaporate/extractor.py b/llama_index/program/predefined/evaporate/extractor.py
index bf8afb741141d1b7d6b6398874502e103973baa2..8bbc4925ed1bd6f8d922ab04b0efa9c2c121f7b2 100644
--- a/llama_index/program/predefined/evaporate/extractor.py
+++ b/llama_index/program/predefined/evaporate/extractor.py
@@ -130,8 +130,8 @@ class EvaporateExtractor:
         """
         field2count: dict = defaultdict(int)
         for node in nodes:
-            llm_predictor = self._service_context.llm_predictor
-            result = llm_predictor.predict(
+            llm = self._service_context.llm
+            result = llm.predict(
                 self._schema_id_prompt,
                 topic=topic,
                 chunk=node.get_content(metadata_mode=MetadataMode.LLM),
diff --git a/llama_index/program/utils.py b/llama_index/program/utils.py
index f188f7e5fa5ef8ba2287800e6535f72ef369ede3..df8393ee408475db333664aec9c7213f0ab557e8 100644
--- a/llama_index/program/utils.py
+++ b/llama_index/program/utils.py
@@ -3,7 +3,7 @@
 from typing import Any, List, Type
 
 from llama_index.bridge.pydantic import BaseModel, Field, create_model
-from llama_index.llms.base import LLM
+from llama_index.llms.llm import LLM
 from llama_index.output_parsers.pydantic import PydanticOutputParser
 from llama_index.prompts.base import PromptTemplate
 from llama_index.types import BasePydanticProgram, PydanticProgramMode
diff --git a/llama_index/prompts/__init__.py b/llama_index/prompts/__init__.py
index 3f120cd5a6d9d80028ed4043d02e25d20b93a6f0..3de7bf26dc52d6fcd4e7a791cdba6a68c2c59d2f 100644
--- a/llama_index/prompts/__init__.py
+++ b/llama_index/prompts/__init__.py
@@ -1,6 +1,6 @@
 """Prompt class."""
 
-from llama_index.llms.base import ChatMessage, MessageRole
+from llama_index.llms.types import ChatMessage, MessageRole
 from llama_index.prompts.base import (
     BasePromptTemplate,
     ChatPromptTemplate,
diff --git a/llama_index/prompts/base.py b/llama_index/prompts/base.py
index 86fde16447e38e218e7fd8739a4f455b0670414c..18f0532b25921861b1e07dde77881a6dcf8d69ee 100644
--- a/llama_index/prompts/base.py
+++ b/llama_index/prompts/base.py
@@ -3,7 +3,7 @@
 
 from abc import ABC, abstractmethod
 from copy import deepcopy
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Sequence, Tuple
 
 from llama_index.bridge.pydantic import Field
 
@@ -13,8 +13,14 @@ if TYPE_CHECKING:
         ConditionalPromptSelector as LangchainSelector,
     )
 from llama_index.bridge.pydantic import BaseModel
-from llama_index.llms.base import LLM, ChatMessage
-from llama_index.llms.generic_utils import messages_to_prompt, prompt_to_messages
+from llama_index.llms.base import BaseLLM
+from llama_index.llms.generic_utils import (
+    messages_to_prompt as default_messages_to_prompt,
+)
+from llama_index.llms.generic_utils import (
+    prompt_to_messages,
+)
+from llama_index.llms.types import ChatMessage
 from llama_index.prompts.prompt_type import PromptType
 from llama_index.prompts.utils import get_template_vars
 from llama_index.types import BaseOutputParser
@@ -88,17 +94,17 @@ class BasePromptTemplate(BaseModel, ABC):
         ...
 
     @abstractmethod
-    def format(self, llm: Optional[LLM] = None, **kwargs: Any) -> str:
+    def format(self, llm: Optional[BaseLLM] = None, **kwargs: Any) -> str:
         ...
 
     @abstractmethod
     def format_messages(
-        self, llm: Optional[LLM] = None, **kwargs: Any
+        self, llm: Optional[BaseLLM] = None, **kwargs: Any
     ) -> List[ChatMessage]:
         ...
 
     @abstractmethod
-    def get_template(self, llm: Optional[LLM] = None) -> str:
+    def get_template(self, llm: Optional[BaseLLM] = None) -> str:
         ...
 
 
@@ -147,7 +153,12 @@ class PromptTemplate(BasePromptTemplate):
         self.output_parser = output_parser
         return prompt
 
-    def format(self, llm: Optional[LLM] = None, **kwargs: Any) -> str:
+    def format(
+        self,
+        llm: Optional[BaseLLM] = None,
+        completion_to_prompt: Optional[Callable[[str], str]] = None,
+        **kwargs: Any,
+    ) -> str:
         """Format the prompt into a string."""
         del llm  # unused
         all_kwargs = {
@@ -157,19 +168,24 @@ class PromptTemplate(BasePromptTemplate):
 
         mapped_all_kwargs = self._map_all_vars(all_kwargs)
         prompt = self.template.format(**mapped_all_kwargs)
+
         if self.output_parser is not None:
             prompt = self.output_parser.format(prompt)
+
+        if completion_to_prompt is not None:
+            prompt = completion_to_prompt(prompt)
+
         return prompt
 
     def format_messages(
-        self, llm: Optional[LLM] = None, **kwargs: Any
+        self, llm: Optional[BaseLLM] = None, **kwargs: Any
     ) -> List[ChatMessage]:
         """Format the prompt into a list of chat messages."""
         del llm  # unused
         prompt = self.format(**kwargs)
         return prompt_to_messages(prompt)
 
-    def get_template(self, llm: Optional[LLM] = None) -> str:
+    def get_template(self, llm: Optional[BaseLLM] = None) -> str:
         return self.template
 
 
@@ -209,13 +225,22 @@ class ChatPromptTemplate(BasePromptTemplate):
         prompt.kwargs.update(kwargs)
         return prompt
 
-    def format(self, llm: Optional[LLM] = None, **kwargs: Any) -> str:
+    def format(
+        self,
+        llm: Optional[BaseLLM] = None,
+        messages_to_prompt: Optional[Callable[[Sequence[ChatMessage]], str]] = None,
+        **kwargs: Any,
+    ) -> str:
         del llm  # unused
         messages = self.format_messages(**kwargs)
-        return messages_to_prompt(messages)
+
+        if messages_to_prompt is not None:
+            return messages_to_prompt(messages)
+
+        return default_messages_to_prompt(messages)
 
     def format_messages(
-        self, llm: Optional[LLM] = None, **kwargs: Any
+        self, llm: Optional[BaseLLM] = None, **kwargs: Any
     ) -> List[ChatMessage]:
         del llm  # unused
         """Format the prompt into a list of chat messages."""
@@ -245,21 +270,21 @@ class ChatPromptTemplate(BasePromptTemplate):
 
         return messages
 
-    def get_template(self, llm: Optional[LLM] = None) -> str:
-        return messages_to_prompt(self.message_templates)
+    def get_template(self, llm: Optional[BaseLLM] = None) -> str:
+        return default_messages_to_prompt(self.message_templates)
 
 
 class SelectorPromptTemplate(BasePromptTemplate):
     default_template: BasePromptTemplate
     conditionals: Optional[
-        List[Tuple[Callable[[LLM], bool], BasePromptTemplate]]
+        List[Tuple[Callable[[BaseLLM], bool], BasePromptTemplate]]
     ] = None
 
     def __init__(
         self,
         default_template: BasePromptTemplate,
         conditionals: Optional[
-            List[Tuple[Callable[[LLM], bool], BasePromptTemplate]]
+            List[Tuple[Callable[[BaseLLM], bool], BasePromptTemplate]]
         ] = None,
     ):
         metadata = default_template.metadata
@@ -275,7 +300,7 @@ class SelectorPromptTemplate(BasePromptTemplate):
             output_parser=output_parser,
         )
 
-    def select(self, llm: Optional[LLM] = None) -> BasePromptTemplate:
+    def select(self, llm: Optional[BaseLLM] = None) -> BasePromptTemplate:
         # ensure output parser is up to date
         self.default_template.output_parser = self.output_parser
 
@@ -304,19 +329,19 @@ class SelectorPromptTemplate(BasePromptTemplate):
             default_template=default_template, conditionals=conditionals
         )
 
-    def format(self, llm: Optional[LLM] = None, **kwargs: Any) -> str:
+    def format(self, llm: Optional[BaseLLM] = None, **kwargs: Any) -> str:
         """Format the prompt into a string."""
         prompt = self.select(llm=llm)
         return prompt.format(**kwargs)
 
     def format_messages(
-        self, llm: Optional[LLM] = None, **kwargs: Any
+        self, llm: Optional[BaseLLM] = None, **kwargs: Any
     ) -> List[ChatMessage]:
         """Format the prompt into a list of chat messages."""
         prompt = self.select(llm=llm)
         return prompt.format_messages(**kwargs)
 
-    def get_template(self, llm: Optional[LLM] = None) -> str:
+    def get_template(self, llm: Optional[BaseLLM] = None) -> str:
         prompt = self.select(llm=llm)
         return prompt.get_template(llm=llm)
 
@@ -392,7 +417,7 @@ class LangchainPromptTemplate(BasePromptTemplate):
         lc_prompt.selector = lc_selector
         return lc_prompt
 
-    def format(self, llm: Optional[LLM] = None, **kwargs: Any) -> str:
+    def format(self, llm: Optional[BaseLLM] = None, **kwargs: Any) -> str:
         """Format the prompt into a string."""
         from llama_index.llms.langchain import LangChainLLM
 
@@ -414,7 +439,7 @@ class LangchainPromptTemplate(BasePromptTemplate):
         return lc_template.format(**mapped_kwargs)
 
     def format_messages(
-        self, llm: Optional[LLM] = None, **kwargs: Any
+        self, llm: Optional[BaseLLM] = None, **kwargs: Any
     ) -> List[ChatMessage]:
         """Format the prompt into a list of chat messages."""
         from llama_index.llms.langchain import LangChainLLM
@@ -439,7 +464,7 @@ class LangchainPromptTemplate(BasePromptTemplate):
         lc_messages = lc_prompt_value.to_messages()
         return from_lc_messages(lc_messages)
 
-    def get_template(self, llm: Optional[LLM] = None) -> str:
+    def get_template(self, llm: Optional[BaseLLM] = None) -> str:
         from llama_index.llms.langchain import LangChainLLM
 
         if llm is not None:
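Beyond loosening the type hints from LLM to BaseLLM, prompts/base.py adds formatting hooks: PromptTemplate.format() can now apply a completion_to_prompt callable after templating, and ChatPromptTemplate.format() accepts a messages_to_prompt callable instead of always using the generic default. A minimal sketch of both hooks; the template strings and wrapper lambdas are illustrative only:

```python
from llama_index.llms.types import ChatMessage, MessageRole
from llama_index.prompts import ChatPromptTemplate, PromptTemplate

text_tmpl = PromptTemplate("Question: {query_str}")
print(
    text_tmpl.format(
        query_str="What changed in this release?",
        completion_to_prompt=lambda p: f"[INST] {p} [/INST]",  # applied last
    )
)

chat_tmpl = ChatPromptTemplate(
    message_templates=[ChatMessage(role=MessageRole.USER, content="Summarize {topic}.")]
)
print(
    chat_tmpl.format(
        topic="the import reorganization",
        messages_to_prompt=lambda msgs: "\n\n".join(str(m) for m in msgs),
    )
)
```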
diff --git a/llama_index/prompts/chat_prompts.py b/llama_index/prompts/chat_prompts.py
index d6d41afc7ec41ce4029b25a55a103280088cf4a3..3fb855103103e1d34d184d204c17154776d784ee 100644
--- a/llama_index/prompts/chat_prompts.py
+++ b/llama_index/prompts/chat_prompts.py
@@ -1,6 +1,6 @@
 """Prompts for ChatGPT."""
 
-from llama_index.llms.base import ChatMessage, MessageRole
+from llama_index.llms.types import ChatMessage, MessageRole
 from llama_index.prompts.base import ChatPromptTemplate
 
 # text qa prompt
diff --git a/llama_index/prompts/lmformatenforcer_utils.py b/llama_index/prompts/lmformatenforcer_utils.py
index 8b9f25d70c6f39bb8e70833455b21d73a1683abc..c34618dc6dc2380cee95016e54fd4139347554f9 100644
--- a/llama_index/prompts/lmformatenforcer_utils.py
+++ b/llama_index/prompts/lmformatenforcer_utils.py
@@ -1,9 +1,9 @@
 from contextlib import contextmanager
 from typing import TYPE_CHECKING, Callable, Iterator
 
-from llama_index.llms.base import LLM
 from llama_index.llms.huggingface import HuggingFaceLLM
 from llama_index.llms.llama_cpp import LlamaCPP
+from llama_index.llms.llm import LLM
 
 if TYPE_CHECKING:
     from lmformatenforcer import CharacterLevelParser
diff --git a/llama_index/prompts/utils.py b/llama_index/prompts/utils.py
index b81c8c6496259935f203ecd6420810a11a8487cb..bf129ea8c6af32240bedcb848cec6820b45812a9 100644
--- a/llama_index/prompts/utils.py
+++ b/llama_index/prompts/utils.py
@@ -1,7 +1,7 @@
 from string import Formatter
 from typing import List
 
-from llama_index.llms.base import LLM
+from llama_index.llms.base import BaseLLM
 
 
 def get_template_vars(template_str: str) -> List[str]:
@@ -16,5 +16,5 @@ def get_template_vars(template_str: str) -> List[str]:
     return variables
 
 
-def is_chat_model(llm: LLM) -> bool:
+def is_chat_model(llm: BaseLLM) -> bool:
     return llm.metadata.is_chat_model
diff --git a/llama_index/query_engine/flare/answer_inserter.py b/llama_index/query_engine/flare/answer_inserter.py
index 1d996e61090ec5d728012715fd78799b4a8b225b..2434b910e1327b63948a39f557cbd11ae9cf56ba 100644
--- a/llama_index/query_engine/flare/answer_inserter.py
+++ b/llama_index/query_engine/flare/answer_inserter.py
@@ -173,7 +173,7 @@ class LLMLookaheadAnswerInserter(BaseLookaheadAnswerInserter):
         for query_task, answer in zip(query_tasks, answers):
             query_answer_pairs += f"Query: {query_task.query_str}\nAnswer: {answer}\n"
 
-        return self._service_context.llm_predictor.predict(
+        return self._service_context.llm.predict(
             self._answer_insert_prompt,
             lookahead_response=response,
             query_answer_pairs=query_answer_pairs,
diff --git a/llama_index/query_engine/flare/base.py b/llama_index/query_engine/flare/base.py
index bd429b75a5b9965ba172c5c6a2dabbbcf88f75f2..c83473f8e2ff78c6a7e081ef3dd1004bfffe7f5c 100644
--- a/llama_index/query_engine/flare/base.py
+++ b/llama_index/query_engine/flare/base.py
@@ -193,7 +193,7 @@ class FLAREInstructQueryEngine(BaseQueryEngine):
             # e.g.
             # The colors on the flag of Ghana have the following meanings. Red is
             # for [Search(Ghana flag meaning)],...
-            lookahead_resp = self._service_context.llm_predictor.predict(
+            lookahead_resp = self._service_context.llm.predict(
                 self._instruct_prompt,
                 query_str=query_bundle.query_str,
                 existing_answer=cur_response,
diff --git a/llama_index/query_engine/knowledge_graph_query_engine.py b/llama_index/query_engine/knowledge_graph_query_engine.py
index afee41f0f53e3397d8e35c78ca8cee2a460c81ac..0c156d97368b1bf0c28864e04af516ce97041200 100644
--- a/llama_index/query_engine/knowledge_graph_query_engine.py
+++ b/llama_index/query_engine/knowledge_graph_query_engine.py
@@ -183,7 +183,7 @@ class KnowledgeGraphQueryEngine(BaseQueryEngine):
         """Generate a Graph Store Query from a query bundle."""
         # Get the query engine query string
 
-        graph_store_query: str = self._service_context.llm_predictor.predict(
+        graph_store_query: str = self._service_context.llm.predict(
             self._graph_query_synthesis_prompt,
             query_str=query_str,
             schema=self._graph_schema,
@@ -195,7 +195,7 @@ class KnowledgeGraphQueryEngine(BaseQueryEngine):
         """Generate a Graph Store Query from a query bundle."""
         # Get the query engine query string
 
-        graph_store_query: str = await self._service_context.llm_predictor.apredict(
+        graph_store_query: str = await self._service_context.llm.apredict(
             self._graph_query_synthesis_prompt,
             query_str=query_str,
             schema=self._graph_schema,
diff --git a/llama_index/query_engine/pandas_query_engine.py b/llama_index/query_engine/pandas_query_engine.py
index 862dd5f4edec991230c36067a25cee17422a253a..a6ebb95307f3b7323be781f4c750dbc5aaceff9e 100644
--- a/llama_index/query_engine/pandas_query_engine.py
+++ b/llama_index/query_engine/pandas_query_engine.py
@@ -157,7 +157,7 @@ class PandasQueryEngine(BaseQueryEngine):
         """Answer a query."""
         context = self._get_table_context()
 
-        pandas_response_str = self._service_context.llm_predictor.predict(
+        pandas_response_str = self._service_context.llm.predict(
             self._pandas_prompt,
             df_str=context,
             query_str=query_bundle.query_str,
diff --git a/llama_index/query_engine/sql_join_query_engine.py b/llama_index/query_engine/sql_join_query_engine.py
index 30d06ee2b55d8e60e4f8966455caec06d353cd6f..faf5821fadb1e2e86060c70bef83c0d0b0fe9a43 100644
--- a/llama_index/query_engine/sql_join_query_engine.py
+++ b/llama_index/query_engine/sql_join_query_engine.py
@@ -10,8 +10,8 @@ from llama_index.indices.struct_store.sql_query import (
     BaseSQLTableQueryEngine,
     NLSQLTableQueryEngine,
 )
-from llama_index.llm_predictor import LLMPredictor
-from llama_index.llm_predictor.base import BaseLLMPredictor
+from llama_index.llm_predictor.base import LLMPredictorType
+from llama_index.llms.utils import resolve_llm
 from llama_index.prompts.base import BasePromptTemplate, PromptTemplate
 from llama_index.prompts.mixin import PromptDictType, PromptMixinType
 from llama_index.response.schema import RESPONSE_TYPE, Response
@@ -110,7 +110,7 @@ class SQLAugmentQueryTransform(BaseQueryTransform):
     after augmenting with SQL results.
 
     Args:
-        llm_predictor (LLMPredictor): LLM predictor to use for query transformation.
+        llm (LLM): LLM to use for query transformation.
         sql_augment_transform_prompt (BasePromptTemplate): PromptTemplate to use
             for query transformation.
         check_stop_parser (Optional[Callable[[str], bool]]): Check stop function.
@@ -119,12 +119,12 @@ class SQLAugmentQueryTransform(BaseQueryTransform):
 
     def __init__(
         self,
-        llm_predictor: Optional[BaseLLMPredictor] = None,
+        llm: Optional[LLMPredictorType] = None,
         sql_augment_transform_prompt: Optional[BasePromptTemplate] = None,
         check_stop_parser: Optional[Callable[[QueryBundle], bool]] = None,
     ) -> None:
         """Initialize params."""
-        self._llm_predictor = llm_predictor or LLMPredictor()
+        self._llm = llm or resolve_llm("default")
 
         self._sql_augment_transform_prompt = (
             sql_augment_transform_prompt or DEFAULT_SQL_AUGMENT_TRANSFORM_PROMPT
@@ -145,7 +145,7 @@ class SQLAugmentQueryTransform(BaseQueryTransform):
         query_str = query_bundle.query_str
         sql_query = metadata["sql_query"]
         sql_query_response = metadata["sql_query_response"]
-        new_query_str = self._llm_predictor.predict(
+        new_query_str = self._llm.predict(
             self._sql_augment_transform_prompt,
             query_str=query_str,
             sql_query_str=sql_query,
@@ -224,9 +224,7 @@ class SQLJoinQueryEngine(BaseQueryEngine):
         )
         self._sql_augment_query_transform = (
             sql_augment_query_transform
-            or SQLAugmentQueryTransform(
-                llm_predictor=self._service_context.llm_predictor
-            )
+            or SQLAugmentQueryTransform(llm=self._service_context.llm)
         )
         self._use_sql_join_synthesis = use_sql_join_synthesis
         self._verbose = verbose
@@ -284,7 +282,7 @@ class SQLJoinQueryEngine(BaseQueryEngine):
             print_text(f"query engine response: {other_response}\n", color="pink")
         logger.info(f"> query engine response: {other_response}")
 
-        response_str = self._service_context.llm_predictor.predict(
+        response_str = self._service_context.llm.predict(
             self._sql_join_synthesis_prompt,
             query_str=query_bundle.query_str,
             sql_query_str=sql_query,
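
Usage note: SQLAugmentQueryTransform now accepts an LLM directly instead of an LLMPredictor. A minimal sketch of the new constructor (MockLLM stands in for any resolved LLM and is an illustrative assumption, not part of this change):

from llama_index.llms.mock import MockLLM
from llama_index.query_engine.sql_join_query_engine import SQLAugmentQueryTransform

# Previously: SQLAugmentQueryTransform(llm_predictor=LLMPredictor(llm=...))
transform = SQLAugmentQueryTransform(llm=MockLLM())
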
diff --git a/llama_index/question_gen/llm_generators.py b/llama_index/question_gen/llm_generators.py
index 18b68fd9fafa3ba6c2a0351fd2b0188a017bc080..068cfabc460beaab4e95f79fa369d0a90d3fadfa 100644
--- a/llama_index/question_gen/llm_generators.py
+++ b/llama_index/question_gen/llm_generators.py
@@ -1,6 +1,6 @@
 from typing import List, Optional, Sequence, cast
 
-from llama_index.llm_predictor.base import BaseLLMPredictor
+from llama_index.llm_predictor.base import LLMPredictorType
 from llama_index.output_parsers.base import StructuredOutput
 from llama_index.prompts.base import BasePromptTemplate, PromptTemplate
 from llama_index.prompts.mixin import PromptDictType
@@ -20,10 +20,10 @@ from llama_index.types import BaseOutputParser
 class LLMQuestionGenerator(BaseQuestionGenerator):
     def __init__(
         self,
-        llm_predictor: BaseLLMPredictor,
+        llm: LLMPredictorType,
         prompt: BasePromptTemplate,
     ) -> None:
-        self._llm_predictor = llm_predictor
+        self._llm = llm
         self._prompt = prompt
 
         if self._prompt.output_parser is None:
@@ -47,7 +47,7 @@ class LLMQuestionGenerator(BaseQuestionGenerator):
             output_parser=output_parser,
             prompt_type=PromptType.SUB_QUESTION,
         )
-        return cls(service_context.llm_predictor, prompt)
+        return cls(service_context.llm, prompt)
 
     def _get_prompts(self) -> PromptDictType:
         """Get prompts."""
@@ -63,7 +63,7 @@ class LLMQuestionGenerator(BaseQuestionGenerator):
     ) -> List[SubQuestion]:
         tools_str = build_tools_text(tools)
         query_str = query.query_str
-        prediction = self._llm_predictor.predict(
+        prediction = self._llm.predict(
             prompt=self._prompt,
             tools_str=tools_str,
             query_str=query_str,
@@ -79,7 +79,7 @@ class LLMQuestionGenerator(BaseQuestionGenerator):
     ) -> List[SubQuestion]:
         tools_str = build_tools_text(tools)
         query_str = query.query_str
-        prediction = await self._llm_predictor.apredict(
+        prediction = await self._llm.apredict(
             prompt=self._prompt,
             tools_str=tools_str,
             query_str=query_str,
diff --git a/llama_index/question_gen/openai_generator.py b/llama_index/question_gen/openai_generator.py
index c461d32c1c0aefa56ab08064b7708cdc2881a632..8b96bbaf903574d940834f1b9634d7cd7701ff86 100644
--- a/llama_index/question_gen/openai_generator.py
+++ b/llama_index/question_gen/openai_generator.py
@@ -1,6 +1,6 @@
 from typing import List, Optional, Sequence, cast
 
-from llama_index.llms.base import LLM
+from llama_index.llms.llm import LLM
 from llama_index.llms.openai import OpenAI
 from llama_index.program.openai_program import OpenAIPydanticProgram
 from llama_index.prompts.mixin import PromptDictType
diff --git a/llama_index/response_synthesizers/accumulate.py b/llama_index/response_synthesizers/accumulate.py
index b56fc21f2da4b24f16161d4fb4943f2d9c46812b..6ff0fd357edaa4d86e4d1abcb0a610aacca71f94 100644
--- a/llama_index/response_synthesizers/accumulate.py
+++ b/llama_index/response_synthesizers/accumulate.py
@@ -1,5 +1,5 @@
 import asyncio
-from typing import Any, List, Optional, Sequence
+from typing import Any, Callable, List, Optional, Sequence
 
 from llama_index.async_utils import run_async_tasks
 from llama_index.prompts import BasePromptTemplate
@@ -114,18 +114,35 @@ class Accumulate(BaseSynthesizer):
             text_qa_template, [text_chunk]
         )
 
-        predictor = (
-            self._service_context.llm_predictor.apredict
-            if use_async
-            else self._service_context.llm_predictor.predict
-        )
+        predictor: Callable
+        if self._output_cls is None:
+            predictor = (
+                self._service_context.llm.apredict
+                if use_async
+                else self._service_context.llm.predict
+            )
 
-        return [
-            predictor(
-                text_qa_template,
-                context_str=cur_text_chunk,
-                output_cls=self._output_cls,
-                **response_kwargs,
+            return [
+                predictor(
+                    text_qa_template,
+                    context_str=cur_text_chunk,
+                    **response_kwargs,
+                )
+                for cur_text_chunk in text_chunks
+            ]
+        else:
+            predictor = (
+                self._service_context.llm.astructured_predict
+                if use_async
+                else self._service_context.llm.structured_predict
             )
-            for cur_text_chunk in text_chunks
-        ]
+
+            return [
+                predictor(
+                    self._output_cls,
+                    text_qa_template,
+                    context_str=cur_text_chunk,
+                    **response_kwargs,
+                )
+                for cur_text_chunk in text_chunks
+            ]
diff --git a/llama_index/response_synthesizers/generation.py b/llama_index/response_synthesizers/generation.py
index 825c282f75c8e188d9e834bee6dcbf87ca584915..128b958b091a1de0b998601920243e28665dcd3c 100644
--- a/llama_index/response_synthesizers/generation.py
+++ b/llama_index/response_synthesizers/generation.py
@@ -37,13 +37,13 @@ class Generation(BaseSynthesizer):
         del text_chunks
 
         if not self._streaming:
-            return await self._service_context.llm_predictor.apredict(
+            return await self._service_context.llm.apredict(
                 self._input_prompt,
                 query_str=query_str,
                 **response_kwargs,
             )
         else:
-            return self._service_context.llm_predictor.stream(
+            return self._service_context.llm.stream(
                 self._input_prompt,
                 query_str=query_str,
                 **response_kwargs,
@@ -59,13 +59,13 @@ class Generation(BaseSynthesizer):
         del text_chunks
 
         if not self._streaming:
-            return self._service_context.llm_predictor.predict(
+            return self._service_context.llm.predict(
                 self._input_prompt,
                 query_str=query_str,
                 **response_kwargs,
             )
         else:
-            return self._service_context.llm_predictor.stream(
+            return self._service_context.llm.stream(
                 self._input_prompt,
                 query_str=query_str,
                 **response_kwargs,
diff --git a/llama_index/response_synthesizers/refine.py b/llama_index/response_synthesizers/refine.py
index b031a758edfbef5dd063df5934c2b98c7c9ae243..9b575269f82ca8194a14e7618352490981ad6e0c 100644
--- a/llama_index/response_synthesizers/refine.py
+++ b/llama_index/response_synthesizers/refine.py
@@ -3,7 +3,7 @@ from typing import Any, Callable, Generator, Optional, Sequence, Type, cast
 
 from llama_index.bridge.pydantic import BaseModel, Field, ValidationError
 from llama_index.indices.utils import truncate_text
-from llama_index.llm_predictor.base import BaseLLMPredictor
+from llama_index.llm_predictor.base import LLMPredictorType
 from llama_index.prompts.base import BasePromptTemplate, PromptTemplate
 from llama_index.prompts.default_prompt_selectors import (
     DEFAULT_REFINE_PROMPT_SEL,
@@ -41,26 +41,45 @@ class DefaultRefineProgram(BasePydanticProgram):
     query_satisfied=True. In effect, doesn't do any answer filtering.
     """
 
-    def __init__(self, prompt: BasePromptTemplate, llm_predictor: BaseLLMPredictor):
+    def __init__(
+        self, prompt: BasePromptTemplate, llm: LLMPredictorType, output_cls: BaseModel
+    ):
         self._prompt = prompt
-        self._llm_predictor = llm_predictor
+        self._llm = llm
+        self._output_cls = output_cls
 
     @property
     def output_cls(self) -> Type[BaseModel]:
         return StructuredRefineResponse
 
     def __call__(self, *args: Any, **kwds: Any) -> StructuredRefineResponse:
-        answer = self._llm_predictor.predict(
-            self._prompt,
-            **kwds,
-        )
+        if self._output_cls is not None:
+            answer = self._llm.structured_predict(
+                self._output_cls,
+                self._prompt,
+                **kwds,
+            )
+            answer = answer.json()
+        else:
+            answer = self._llm.predict(
+                self._prompt,
+                **kwds,
+            )
         return StructuredRefineResponse(answer=answer, query_satisfied=True)
 
     async def acall(self, *args: Any, **kwds: Any) -> StructuredRefineResponse:
-        answer = await self._llm_predictor.apredict(
-            self._prompt,
-            **kwds,
-        )
+        if self._output_cls is not None:
+            answer = await self._llm.astructured_predict(
+                self._output_cls,
+                self._prompt,
+                **kwds,
+            )
+            answer = answer.json()
+        else:
+            answer = await self._llm.apredict(
+                self._prompt,
+                **kwds,
+            )
         return StructuredRefineResponse(answer=answer, query_satisfied=True)
 
 
@@ -155,7 +174,8 @@ class Refine(BaseSynthesizer):
         else:
             return DefaultRefineProgram(
                 prompt=prompt,
-                llm_predictor=self._service_context.llm_predictor,
+                llm=self._service_context.llm,
+                output_cls=self._output_cls,
             )
 
     def _give_response_single(
@@ -181,7 +201,6 @@ class Refine(BaseSynthesizer):
                         StructuredRefineResponse,
                         program(
                             context_str=cur_text_chunk,
-                            output_cls=self._output_cls,
                             **response_kwargs,
                         ),
                     )
@@ -193,10 +212,9 @@ class Refine(BaseSynthesizer):
                         f"Validation error on structured response: {e}", exc_info=True
                     )
             elif response is None and self._streaming:
-                response = self._service_context.llm_predictor.stream(
+                response = self._service_context.llm.stream(
                     text_qa_template,
                     context_str=cur_text_chunk,
-                    output_cls=self._output_cls,
                     **response_kwargs,
                 )
                 query_satisfied = True
@@ -265,7 +283,6 @@ class Refine(BaseSynthesizer):
                         StructuredRefineResponse,
                         program(
                             context_msg=cur_text_chunk,
-                            output_cls=self._output_cls,
                             **response_kwargs,
                         ),
                     )
@@ -285,10 +302,9 @@ class Refine(BaseSynthesizer):
                     query_str=query_str, existing_answer=response
                 )
 
-                response = self._service_context.llm_predictor.stream(
+                response = self._service_context.llm.stream(
                     refine_template,
                     context_msg=cur_text_chunk,
-                    output_cls=self._output_cls,
                     **response_kwargs,
                 )
 
@@ -371,7 +387,6 @@ class Refine(BaseSynthesizer):
                 try:
                     structured_response = await program.acall(
                         context_msg=cur_text_chunk,
-                        output_cls=self._output_cls,
                         **response_kwargs,
                     )
                     structured_response = cast(
@@ -414,7 +429,6 @@ class Refine(BaseSynthesizer):
                 try:
                     structured_response = await program.acall(
                         context_str=cur_text_chunk,
-                        output_cls=self._output_cls,
                         **response_kwargs,
                     )
                     structured_response = cast(
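
Usage note: DefaultRefineProgram now routes structured output through the LLM itself via structured_predict and serializes the result with .json(), rather than passing output_cls into predict(). A hedged sketch of that call path (the Answer model, prompt text, and OpenAI model name are assumptions, not taken from this diff):

from llama_index.bridge.pydantic import BaseModel
from llama_index.llms.openai import OpenAI
from llama_index.prompts.base import PromptTemplate


class Answer(BaseModel):
    answer: str


# Requires OpenAI credentials; any LLM implementing structured_predict would do.
llm = OpenAI(model="gpt-3.5-turbo")
prompt = PromptTemplate("Answer the question: {query_str}")
result = llm.structured_predict(Answer, prompt, query_str="What is 2 + 2?")
print(result.json())
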
diff --git a/llama_index/response_synthesizers/simple_summarize.py b/llama_index/response_synthesizers/simple_summarize.py
index 0930729a27e89f6f753e8b63da0d623deea411f6..07f5d2db4acdf694dc1ad275bdfe1550290da686 100644
--- a/llama_index/response_synthesizers/simple_summarize.py
+++ b/llama_index/response_synthesizers/simple_summarize.py
@@ -42,13 +42,13 @@ class SimpleSummarize(BaseSynthesizer):
 
         response: RESPONSE_TEXT_TYPE
         if not self._streaming:
-            response = await self._service_context.llm_predictor.apredict(
+            response = await self._service_context.llm.apredict(
                 text_qa_template,
                 context_str=node_text,
                 **response_kwargs,
             )
         else:
-            response = self._service_context.llm_predictor.stream(
+            response = self._service_context.llm.stream(
                 text_qa_template,
                 context_str=node_text,
                 **response_kwargs,
@@ -76,13 +76,13 @@ class SimpleSummarize(BaseSynthesizer):
 
         response: RESPONSE_TEXT_TYPE
         if not self._streaming:
-            response = self._service_context.llm_predictor.predict(
+            response = self._service_context.llm.predict(
                 text_qa_template,
                 context_str=node_text,
                 **kwargs,
             )
         else:
-            response = self._service_context.llm_predictor.stream(
+            response = self._service_context.llm.stream(
                 text_qa_template,
                 context_str=node_text,
                 **kwargs,
diff --git a/llama_index/response_synthesizers/tree_summarize.py b/llama_index/response_synthesizers/tree_summarize.py
index 773726d70faf047ee38c690022a79592697040dc..b85a80349ff065972067bfe8fadf7a0f345fd1e3 100644
--- a/llama_index/response_synthesizers/tree_summarize.py
+++ b/llama_index/response_synthesizers/tree_summarize.py
@@ -1,5 +1,5 @@
 import asyncio
-from typing import Any, List, Optional, Sequence
+from typing import Any, Optional, Sequence
 
 from llama_index.async_utils import run_async_tasks
 from llama_index.prompts import BasePromptTemplate
@@ -70,37 +70,54 @@ class TreeSummarize(BaseSynthesizer):
         if len(text_chunks) == 1:
             response: RESPONSE_TEXT_TYPE
             if self._streaming:
-                response = self._service_context.llm_predictor.stream(
+                response = self._service_context.llm.stream(
                     summary_template, context_str=text_chunks[0], **response_kwargs
                 )
             else:
-                response = await self._service_context.llm_predictor.apredict(
-                    summary_template,
-                    output_cls=self._output_cls,
-                    context_str=text_chunks[0],
-                    **response_kwargs,
-                )
+                if self._output_cls is None:
+                    response = await self._service_context.llm.apredict(
+                        summary_template,
+                        context_str=text_chunks[0],
+                        **response_kwargs,
+                    )
+                else:
+                    response = await self._service_context.llm.astructured_predict(
+                        self._output_cls,
+                        summary_template,
+                        context_str=text_chunks[0],
+                        **response_kwargs,
+                    )
 
             # return pydantic object if output_cls is specified
-            return (
-                response
-                if self._output_cls is None
-                else self._output_cls.parse_raw(response)
-            )
+            return response
 
         else:
             # summarize each chunk
-            tasks = [
-                self._service_context.llm_predictor.apredict(
-                    summary_template,
-                    output_cls=self._output_cls,
-                    context_str=text_chunk,
-                    **response_kwargs,
-                )
-                for text_chunk in text_chunks
-            ]
+            if self._output_cls is None:
+                tasks = [
+                    self._service_context.llm.apredict(
+                        summary_template,
+                        context_str=text_chunk,
+                        **response_kwargs,
+                    )
+                    for text_chunk in text_chunks
+                ]
+            else:
+                tasks = [
+                    self._service_context.llm.astructured_predict(
+                        self._output_cls,
+                        summary_template,
+                        context_str=text_chunk,
+                        **response_kwargs,
+                    )
+                    for text_chunk in text_chunks
+                ]
 
-            summaries: List[str] = await asyncio.gather(*tasks)
+            summary_responses = await asyncio.gather(*tasks)
+            if self._output_cls is not None:
+                summaries = [summary.json() for summary in summary_responses]
+            else:
+                summaries = summary_responses
 
             # recursively summarize the summaries
             return await self.aget_response(
@@ -129,48 +146,76 @@ class TreeSummarize(BaseSynthesizer):
         if len(text_chunks) == 1:
             response: RESPONSE_TEXT_TYPE
             if self._streaming:
-                response = self._service_context.llm_predictor.stream(
+                response = self._service_context.llm.stream(
                     summary_template, context_str=text_chunks[0], **response_kwargs
                 )
             else:
-                response = self._service_context.llm_predictor.predict(
-                    summary_template,
-                    output_cls=self._output_cls,
-                    context_str=text_chunks[0],
-                    **response_kwargs,
-                )
-
-            # return pydantic object if output_cls is specified
-            return (
-                response
-                if self._output_cls is None
-                else self._output_cls.parse_raw(response)
-            )
-
-        else:
-            # summarize each chunk
-            if self._use_async:
-                tasks = [
-                    self._service_context.llm_predictor.apredict(
+                if self._output_cls is None:
+                    response = self._service_context.llm.predict(
                         summary_template,
-                        output_cls=self._output_cls,
-                        context_str=text_chunk,
+                        context_str=text_chunks[0],
                         **response_kwargs,
                     )
-                    for text_chunk in text_chunks
-                ]
-
-                summaries: List[str] = run_async_tasks(tasks)
-            else:
-                summaries = [
-                    self._service_context.llm_predictor.predict(
+                else:
+                    response = self._service_context.llm.structured_predict(
+                        self._output_cls,
                         summary_template,
-                        output_cls=self._output_cls,
-                        context_str=text_chunk,
+                        context_str=text_chunks[0],
                         **response_kwargs,
                     )
-                    for text_chunk in text_chunks
-                ]
+
+            return response
+
+        else:
+            # summarize each chunk
+            if self._use_async:
+                if self._output_cls is None:
+                    tasks = [
+                        self._service_context.llm.apredict(
+                            summary_template,
+                            context_str=text_chunk,
+                            **response_kwargs,
+                        )
+                        for text_chunk in text_chunks
+                    ]
+                else:
+                    tasks = [
+                        self._service_context.llm.astructured_predict(
+                            self._output_cls,
+                            summary_template,
+                            context_str=text_chunk,
+                            **response_kwargs,
+                        )
+                        for text_chunk in text_chunks
+                    ]
+
+                summary_responses = run_async_tasks(tasks)
+
+                if self._output_cls is not None:
+                    summaries = [summary.json() for summary in summary_responses]
+                else:
+                    summaries = summary_responses
+            else:
+                if self._output_cls is None:
+                    summaries = [
+                        self._service_context.llm.predict(
+                            summary_template,
+                            context_str=text_chunk,
+                            **response_kwargs,
+                        )
+                        for text_chunk in text_chunks
+                    ]
+                else:
+                    summaries = [
+                        self._service_context.llm.structured_predict(
+                            self._output_cls,
+                            summary_template,
+                            context_str=text_chunk,
+                            **response_kwargs,
+                        )
+                        for text_chunk in text_chunks
+                    ]
+                    summaries = [summary.json() for summary in summaries]
 
             # recursively summarize the summaries
             return self.get_response(
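
Usage note: TreeSummarize now returns the Pydantic object produced by structured_predict directly instead of calling output_cls.parse_raw on a string. A hedged sketch (the Biography model and input text are assumptions; the default service context requires configured credentials):

from llama_index.bridge.pydantic import BaseModel
from llama_index.response_synthesizers import TreeSummarize


class Biography(BaseModel):
    name: str
    summary: str


summarizer = TreeSummarize(output_cls=Biography)
response = summarizer.get_response(
    query_str="Who is the text about?",
    text_chunks=["Ada Lovelace wrote the first published algorithm."],
)
# response is a Biography instance, not a JSON string to be re-parsed.
print(response.name)
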
diff --git a/llama_index/schema.py b/llama_index/schema.py
index d595fe6245bd50185b8a3534969d55906cfb9301..aa6cf7a468d63e0010a6eb13f8cd1758a6faefed 100644
--- a/llama_index/schema.py
+++ b/llama_index/schema.py
@@ -71,9 +71,11 @@ class BaseComponent(BaseModel):
 
         # remove local functions
         keys_to_remove = []
-        for key in state["__dict__"]:
+        for key, val in state["__dict__"].items():
             if key.endswith("_fn"):
                 keys_to_remove.append(key)
+            if "function <lambda>" in str(val):
+                keys_to_remove.append(key)
         for key in keys_to_remove:
             state["__dict__"].pop(key, None)
 
diff --git a/llama_index/selectors/llm_selectors.py b/llama_index/selectors/llm_selectors.py
index e4a2425491e9d7d615176d0cfcc0f66de5143cf5..eb3cf6945cee563863032ecd79d57263876d0ed5 100644
--- a/llama_index/selectors/llm_selectors.py
+++ b/llama_index/selectors/llm_selectors.py
@@ -1,6 +1,6 @@
 from typing import Any, Dict, List, Optional, Sequence, cast
 
-from llama_index.llm_predictor.base import BaseLLMPredictor
+from llama_index.llm_predictor.base import LLMPredictorType
 from llama_index.output_parsers.base import StructuredOutput
 from llama_index.output_parsers.selection import Answer, SelectionOutputParser
 from llama_index.prompts.mixin import PromptDictType
@@ -47,16 +47,16 @@ class LLMSingleSelector(BaseSelector):
     LLM-based selector that chooses one out of many options.
 
     Args:
-        llm_predictor (BaseLLMPredictor): An LLM predictor.
+        llm (LLM): An LLM.
         prompt (SingleSelectPrompt): A LLM prompt for selecting one out of many options.
     """
 
     def __init__(
         self,
-        llm_predictor: BaseLLMPredictor,
+        llm: LLMPredictorType,
         prompt: SingleSelectPrompt,
     ) -> None:
-        self._llm_predictor = llm_predictor
+        self._llm = llm
         self._prompt = prompt
 
         if self._prompt.output_parser is None:
@@ -80,7 +80,7 @@ class LLMSingleSelector(BaseSelector):
             output_parser=output_parser,
             prompt_type=PromptType.SINGLE_SELECT,
         )
-        return cls(service_context.llm_predictor, prompt)
+        return cls(service_context.llm, prompt)
 
     def _get_prompts(self) -> Dict[str, Any]:
         """Get prompts."""
@@ -98,7 +98,7 @@ class LLMSingleSelector(BaseSelector):
         choices_text = _build_choices_text(choices)
 
         # predict
-        prediction = self._llm_predictor.predict(
+        prediction = self._llm.predict(
             prompt=self._prompt,
             num_choices=len(choices),
             context_list=choices_text,
@@ -117,7 +117,7 @@ class LLMSingleSelector(BaseSelector):
         choices_text = _build_choices_text(choices)
 
         # predict
-        prediction = await self._llm_predictor.apredict(
+        prediction = await self._llm.apredict(
             prompt=self._prompt,
             num_choices=len(choices),
             context_list=choices_text,
@@ -136,18 +136,18 @@ class LLMMultiSelector(BaseSelector):
     LLM-based selector that chooses multiple out of many options.
 
     Args:
-        llm_predictor (LLMPredictor): An LLM predictor.
+        llm (LLM): An LLM.
         prompt (SingleSelectPrompt): A LLM prompt for selecting multiple out of many
             options.
     """
 
     def __init__(
         self,
-        llm_predictor: BaseLLMPredictor,
+        llm: LLMPredictorType,
         prompt: MultiSelectPrompt,
         max_outputs: Optional[int] = None,
     ) -> None:
-        self._llm_predictor = llm_predictor
+        self._llm = llm
         self._prompt = prompt
         self._max_outputs = max_outputs
 
@@ -175,7 +175,7 @@ class LLMMultiSelector(BaseSelector):
             output_parser=output_parser,
             prompt_type=PromptType.MULTI_SELECT,
         )
-        return cls(service_context.llm_predictor, prompt, max_outputs)
+        return cls(service_context.llm, prompt, max_outputs)
 
     def _get_prompts(self) -> Dict[str, Any]:
         """Get prompts."""
@@ -193,7 +193,7 @@ class LLMMultiSelector(BaseSelector):
         context_list = _build_choices_text(choices)
         max_outputs = self._max_outputs or len(choices)
 
-        prediction = self._llm_predictor.predict(
+        prediction = self._llm.predict(
             prompt=self._prompt,
             num_choices=len(choices),
             max_outputs=max_outputs,
@@ -212,7 +212,7 @@ class LLMMultiSelector(BaseSelector):
         context_list = _build_choices_text(choices)
         max_outputs = self._max_outputs or len(choices)
 
-        prediction = await self._llm_predictor.apredict(
+        prediction = await self._llm.apredict(
             prompt=self._prompt,
             num_choices=len(choices),
             max_outputs=max_outputs,
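
Usage note: the selector from_defaults constructors now wire in service_context.llm instead of service_context.llm_predictor. A minimal sketch (assumes credentials for the default LLM and embedding model are configured):

from llama_index.selectors.llm_selectors import LLMSingleSelector
from llama_index.service_context import ServiceContext

service_context = ServiceContext.from_defaults()
selector = LLMSingleSelector.from_defaults(service_context=service_context)
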
diff --git a/llama_index/selectors/utils.py b/llama_index/selectors/utils.py
index c9fbbb5a81ad8c4c5b57d2267903a507676b4596..651d7b2aece97c2bc363d58b5cda680b6b8be8be 100644
--- a/llama_index/selectors/utils.py
+++ b/llama_index/selectors/utils.py
@@ -17,13 +17,13 @@ def get_selector_from_context(
 
     if is_multi:
         try:
-            llm = service_context.llm_predictor.llm
+            llm = service_context.llm
             selector = PydanticMultiSelector.from_defaults(llm=llm)  # type: ignore
         except ValueError:
             selector = LLMMultiSelector.from_defaults(service_context=service_context)
     else:
         try:
-            llm = service_context.llm_predictor.llm
+            llm = service_context.llm
             selector = PydanticSingleSelector.from_defaults(llm=llm)  # type: ignore
         except ValueError:
             selector = LLMSingleSelector.from_defaults(service_context=service_context)
diff --git a/llama_index/service_context.py b/llama_index/service_context.py
index 2a9006d572d30d3f65aaedf64d2aafc2749e4ab2..c4378f24896dec9d13c94f0f4725e284c26ff1c8 100644
--- a/llama_index/service_context.py
+++ b/llama_index/service_context.py
@@ -10,7 +10,7 @@ from llama_index.embeddings.utils import EmbedType, resolve_embed_model
 from llama_index.indices.prompt_helper import PromptHelper
 from llama_index.llm_predictor import LLMPredictor
 from llama_index.llm_predictor.base import BaseLLMPredictor, LLMMetadata
-from llama_index.llms.base import LLM
+from llama_index.llms.llm import LLM
 from llama_index.llms.utils import LLMType, resolve_llm
 from llama_index.logger import LlamaLogger
 from llama_index.node_parser.interface import NodeParser, TextSplitter
@@ -164,6 +164,14 @@ class ServiceContext:
             if llm_predictor is not None:
                 raise ValueError("Cannot specify both llm and llm_predictor")
             llm = resolve_llm(llm)
+            llm.system_prompt = llm.system_prompt or system_prompt
+            llm.query_wrapper_prompt = llm.query_wrapper_prompt or query_wrapper_prompt
+            llm.pydantic_program_mode = (
+                llm.pydantic_program_mode or pydantic_program_mode
+            )
+
+        if llm_predictor is not None:
+            print("LLMPredictor is deprecated, please use LLM instead.")
         llm_predictor = llm_predictor or LLMPredictor(
             llm=llm, pydantic_program_mode=pydantic_program_mode
         )
@@ -311,8 +319,6 @@ class ServiceContext:
 
     @property
     def llm(self) -> LLM:
-        if not isinstance(self.llm_predictor, LLMPredictor):
-            raise ValueError("llm_predictor must be an instance of LLMPredictor")
         return self.llm_predictor.llm
 
     @property
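
Usage note: with this change an LLM is passed into ServiceContext directly and absorbs system_prompt / query_wrapper_prompt / pydantic_program_mode, while llm_predictor remains only as a deprecated shim. A hedged sketch (MockLLM and the prompt text are illustrative; the default embed model may still require credentials):

from llama_index.llms.mock import MockLLM
from llama_index.service_context import ServiceContext

llm = MockLLM()
service_context = ServiceContext.from_defaults(
    llm=llm,
    system_prompt="You are a terse assistant.",
)
# The system prompt is copied onto the LLM itself, and service_context.llm
# returns that same LLM through the deprecated LLMPredictor wrapper.
print(service_context.llm.system_prompt)
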
diff --git a/llama_index/types.py b/llama_index/types.py
index c0c9b65efe3620c2f268c9a73bb59a4c03311540..e454b18e8ed12b7e29c836fe498bea1324ea863d 100644
--- a/llama_index/types.py
+++ b/llama_index/types.py
@@ -14,7 +14,7 @@ from typing import (
 )
 
 from llama_index.bridge.pydantic import BaseModel
-from llama_index.llms.base import ChatMessage, MessageRole
+from llama_index.llms.types import ChatMessage, MessageRole
 
 Model = TypeVar("Model", bound=BaseModel)
 
@@ -33,9 +33,9 @@ class BaseOutputParser(Protocol):
     def parse(self, output: str) -> Any:
         """Parse, validate, and correct errors programmatically."""
 
-    @abstractmethod
     def format(self, query: str) -> str:
         """Format a query with structured output formatting instructions."""
+        return query
 
     def format_messages(self, messages: List[ChatMessage]) -> List[ChatMessage]:
         """Format a list of messages with structured output formatting instructions."""
diff --git a/tests/agent/openai/test_openai_agent.py b/tests/agent/openai/test_openai_agent.py
index 36fda875a5b869e2f0478bba73646f8f650259ce..a6859dac74df7b4e12976946720b9de8b79d4f88 100644
--- a/tests/agent/openai/test_openai_agent.py
+++ b/tests/agent/openai/test_openai_agent.py
@@ -4,9 +4,9 @@ from unittest.mock import MagicMock, patch
 import pytest
 from llama_index.agent.openai_agent import OpenAIAgent, call_tool_with_error_handling
 from llama_index.chat_engine.types import AgentChatResponse
-from llama_index.llms.base import ChatMessage, ChatResponse
 from llama_index.llms.mock import MockLLM
 from llama_index.llms.openai import OpenAI
+from llama_index.llms.types import ChatMessage, ChatResponse
 from llama_index.tools.function_tool import FunctionTool
 from openai.types.chat.chat_completion import ChatCompletion, Choice
 from openai.types.chat.chat_completion_message import ChatCompletionMessage
diff --git a/tests/agent/react/test_react_agent.py b/tests/agent/react/test_react_agent.py
index 75e21f00b8eda7a071120575623dc74954512925..308d55a96676b3413dc80a0960140d350726adde 100644
--- a/tests/agent/react/test_react_agent.py
+++ b/tests/agent/react/test_react_agent.py
@@ -5,13 +5,13 @@ import pytest
 from llama_index.agent.react.base import ReActAgent
 from llama_index.bridge.pydantic import PrivateAttr
 from llama_index.chat_engine.types import AgentChatResponse, StreamingAgentChatResponse
-from llama_index.llms.base import (
+from llama_index.llms.mock import MockLLM
+from llama_index.llms.types import (
     ChatMessage,
     ChatResponse,
     ChatResponseGen,
     MessageRole,
 )
-from llama_index.llms.mock import MockLLM
 from llama_index.tools.function_tool import FunctionTool
 
 
diff --git a/tests/chat_engine/test_condense_plus_context.py b/tests/chat_engine/test_condense_plus_context.py
index 6d61d6164235abc56ea98bc4bcad2e88ffe55c1b..4f246e437edddc756e72ba44d3d9dc1126e97511 100644
--- a/tests/chat_engine/test_condense_plus_context.py
+++ b/tests/chat_engine/test_condense_plus_context.py
@@ -1,15 +1,24 @@
 from typing import Any, List
-from unittest.mock import Mock
+from unittest.mock import Mock, patch
 
 from llama_index.chat_engine.condense_plus_context import CondensePlusContextChatEngine
 from llama_index.indices.base_retriever import BaseRetriever
 from llama_index.indices.service_context import ServiceContext
-from llama_index.llm_predictor.base import LLMPredictor
+from llama_index.llms.mock import MockLLM
 from llama_index.memory.chat_memory_buffer import ChatMemoryBuffer
-from llama_index.prompts.base import BasePromptTemplate
+from llama_index.prompts import BasePromptTemplate
 from llama_index.schema import NodeWithScore, TextNode
 
 
+def override_predict(self: Any, prompt: BasePromptTemplate, **prompt_args: Any) -> str:
+    return prompt.format(**prompt_args)
+
+
+@patch.object(
+    MockLLM,
+    "predict",
+    override_predict,
+)
 def test_condense_plus_context_chat_engine(
     mock_service_context: ServiceContext,
 ) -> None:
@@ -39,13 +48,6 @@ def test_condense_plus_context_chat_engine(
 
     mock_retriever.retrieve.side_effect = override_retrieve
 
-    mock_llm_predictor = Mock(spec=LLMPredictor)
-
-    def override_predict(prompt: BasePromptTemplate, **prompt_args: Any) -> str:
-        return prompt.format(**prompt_args)
-
-    mock_llm_predictor.predict.side_effect = override_predict
-
     context_prompt = "Context information: {context_str}"
 
     condense_prompt = (
@@ -56,8 +58,7 @@ def test_condense_plus_context_chat_engine(
 
     engine = CondensePlusContextChatEngine(
         retriever=mock_retriever,
-        llm=mock_service_context.llm,
-        llm_predictor=mock_llm_predictor,
+        llm=MockLLM(),
         memory=ChatMemoryBuffer.from_defaults(
             chat_history=[], llm=mock_service_context.llm
         ),
diff --git a/tests/chat_engine/test_condense_question.py b/tests/chat_engine/test_condense_question.py
index 349fe686b328703c39651c6bf9adc0de7e6b43ca..fb249a5a3d5ea169adf11c557798b00b0dd90db5 100644
--- a/tests/chat_engine/test_condense_question.py
+++ b/tests/chat_engine/test_condense_question.py
@@ -2,7 +2,7 @@ from unittest.mock import Mock
 
 from llama_index.chat_engine.condense_question import CondenseQuestionChatEngine
 from llama_index.core import BaseQueryEngine
-from llama_index.llms.base import ChatMessage, MessageRole
+from llama_index.llms.types import ChatMessage, MessageRole
 from llama_index.response.schema import Response
 from llama_index.service_context import ServiceContext
 
diff --git a/tests/chat_engine/test_simple.py b/tests/chat_engine/test_simple.py
index e84fcdbddebc57692ce664a39667e98b279bda5c..fa6e191b25776285702d845c2415e3d9d6186abb 100644
--- a/tests/chat_engine/test_simple.py
+++ b/tests/chat_engine/test_simple.py
@@ -1,5 +1,5 @@
 from llama_index.chat_engine.simple import SimpleChatEngine
-from llama_index.llms.base import ChatMessage, MessageRole
+from llama_index.llms.types import ChatMessage, MessageRole
 from llama_index.service_context import ServiceContext
 
 
diff --git a/tests/conftest.py b/tests/conftest.py
index 5791c4d5d1ec2038cfe686af1ae4050bcbc3ea61..5d6b5e2d1586e4c9a79f56d69bbad56081732fb8 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -6,8 +6,8 @@ from typing import Any, List, Optional
 import openai
 import pytest
 from llama_index.llm_predictor.base import LLMPredictor
-from llama_index.llms.base import LLMMetadata
 from llama_index.llms.mock import MockLLM
+from llama_index.llms.types import LLMMetadata
 from llama_index.node_parser.text import SentenceSplitter, TokenTextSplitter
 from llama_index.service_context import ServiceContext
 
@@ -68,10 +68,27 @@ def patch_llm_predictor(monkeypatch: pytest.MonkeyPatch) -> None:
         LLMMetadata(),
     )
 
+    monkeypatch.setattr(
+        MockLLM,
+        "predict",
+        patch_llmpredictor_predict,
+    )
+    monkeypatch.setattr(
+        MockLLM,
+        "apredict",
+        patch_llmpredictor_apredict,
+    )
+    monkeypatch.setattr(
+        MockLLM,
+        "metadata",
+        LLMMetadata(),
+    )
+
 
 @pytest.fixture()
 def mock_service_context(
-    patch_token_text_splitter: Any, patch_llm_predictor: Any
+    patch_token_text_splitter: Any,
+    patch_llm_predictor: Any,
 ) -> ServiceContext:
     return ServiceContext.from_defaults(embed_model=MockEmbedding())
 
diff --git a/tests/indices/list/test_retrievers.py b/tests/indices/list/test_retrievers.py
index 5fcbb38200a2e631f4d298e4d54f7fcd24ca4835..903c85a73d67a1bd8ebc208ef09fb1a176129f8e 100644
--- a/tests/indices/list/test_retrievers.py
+++ b/tests/indices/list/test_retrievers.py
@@ -3,7 +3,7 @@ from unittest.mock import patch
 
 from llama_index.indices.list.base import SummaryIndex
 from llama_index.indices.list.retrievers import SummaryIndexEmbeddingRetriever
-from llama_index.llm_predictor.base import LLMPredictor
+from llama_index.llms.mock import MockLLM
 from llama_index.prompts import BasePromptTemplate
 from llama_index.schema import Document
 from llama_index.service_context import ServiceContext
@@ -55,7 +55,7 @@ def mock_llmpredictor_predict(
 
 
 @patch.object(
-    LLMPredictor,
+    MockLLM,
     "predict",
     mock_llmpredictor_predict,
 )
diff --git a/tests/indices/query/query_transform/test_base.py b/tests/indices/query/query_transform/test_base.py
index 438acd468288b1b96ce256b3620cf85691996903..b1c71c0fcf72173d1c324305a6dc2db9ce1d08a3 100644
--- a/tests/indices/query/query_transform/test_base.py
+++ b/tests/indices/query/query_transform/test_base.py
@@ -11,7 +11,7 @@ def test_decompose_query_transform(mock_service_context: ServiceContext) -> None
     """Test decompose query transform."""
     query_transform = DecomposeQueryTransform(
         decompose_query_prompt=MOCK_DECOMPOSE_PROMPT,
-        llm_predictor=mock_service_context.llm_predictor,
+        llm=mock_service_context.llm,
     )
 
     query_str = "What is?"
diff --git a/tests/indices/response/test_tree_summarize.py b/tests/indices/response/test_tree_summarize.py
index 8130df52469924d073634d23217105307a783d7f..93432e2fa4310849e66813754c4025f4e2816aa0 100644
--- a/tests/indices/response/test_tree_summarize.py
+++ b/tests/indices/response/test_tree_summarize.py
@@ -1,11 +1,13 @@
 """Test tree summarize."""
 
-from typing import List, Sequence
-from unittest.mock import Mock
+from typing import Any, List, Sequence
+from unittest.mock import Mock, patch
 
 import pytest
 from llama_index.bridge.pydantic import BaseModel
 from llama_index.indices.prompt_helper import PromptHelper
+from llama_index.llm_predictor import LLMPredictor
+from llama_index.llms.mock import MockLLM
 from llama_index.prompts.base import PromptTemplate
 from llama_index.prompts.prompt_type import PromptType
 from llama_index.response_synthesizers import TreeSummarize
@@ -53,11 +55,19 @@ def test_tree_summarize(mock_service_context_merge_chunks: ServiceContext) -> No
     assert str(response) == "Text chunk 1\nText chunk 2\nText chunk 3\nText chunk 4"
 
 
+class TestModel(BaseModel):
+    hello: str
+
+
+def mock_return_class(*args: Any, **kwargs: Any) -> TestModel:
+    return TestModel(hello="Test Chunk 5")
+
+
+@patch.object(MockLLM, "structured_predict", mock_return_class)
 def test_tree_summarize_output_cls(
     mock_service_context_merge_chunks: ServiceContext,
 ) -> None:
-    class TestModel(BaseModel):
-        hello: str
+    mock_service_context_merge_chunks.llm_predictor = LLMPredictor(MockLLM())
 
     mock_summary_prompt_tmpl = "{context_str}{query_str}"
     mock_summary_prompt = PromptTemplate(
@@ -71,9 +81,7 @@ def test_tree_summarize_output_cls(
         '{"hello":"Test Chunk 3"}',
         '{"hello":"Test Chunk 4"}',
     ]
-    response_rtr = {"hello": "Test Chunk 5"}
-    TestModel.parse_raw = Mock(name="parse_raw")  # type: ignore
-    TestModel.parse_raw.return_value = response_rtr
+    response_dict = {"hello": "Test Chunk 5"}
 
     # test sync
     tree_summarize = TreeSummarize(
@@ -83,8 +91,8 @@ def test_tree_summarize_output_cls(
     )
     full_response = "\n".join(texts)
     response = tree_summarize.get_response(text_chunks=texts, query_str=query_str)
-    TestModel.parse_raw.assert_called_once_with(full_response)
-    assert response == response_rtr
+    assert isinstance(response, TestModel)
+    assert response.dict() == response_dict
 
 
 def test_tree_summarize_use_async(
diff --git a/tests/indices/struct_store/test_json_query.py b/tests/indices/struct_store/test_json_query.py
index 3b1bc4757cae778dee7939404707be712fab9b2d..a84b13ad5cd7e8de3ade1b3e3760a120805088f3 100644
--- a/tests/indices/struct_store/test_json_query.py
+++ b/tests/indices/struct_store/test_json_query.py
@@ -2,11 +2,14 @@
 
 import asyncio
 import json
-from typing import Any, Dict, Generator, cast
-from unittest.mock import AsyncMock, MagicMock, patch
+from typing import Any, Dict, cast
+from unittest.mock import patch
 
 import pytest
 from llama_index.indices.struct_store.json_query import JSONQueryEngine, JSONType
+from llama_index.llm_predictor import LLMPredictor
+from llama_index.llms.mock import MockLLM
+from llama_index.prompts.base import BasePromptTemplate
 from llama_index.response.schema import Response
 from llama_index.schema import QueryBundle
 from llama_index.service_context import ServiceContext
@@ -21,23 +24,35 @@ TEST_PARAMS = [
 TEST_LLM_OUTPUT = "test_llm_output"
 
 
-@pytest.fixture()
-def mock_json_service_ctx(
-    mock_service_context: ServiceContext,
-) -> Generator[ServiceContext, None, None]:
-    with patch.object(mock_service_context, "llm_predictor") as mock_llm_predictor:
-        mock_llm_predictor.apredict = AsyncMock(return_value=TEST_LLM_OUTPUT)
-        mock_llm_predictor.predict = MagicMock(return_value=TEST_LLM_OUTPUT)
-        yield mock_service_context
+def mock_predict(self: Any, prompt: BasePromptTemplate, **prompt_args: Any) -> str:
+    return TEST_LLM_OUTPUT
+
+
+async def amock_predict(
+    self: Any, prompt: BasePromptTemplate, **prompt_args: Any
+) -> str:
+    return TEST_LLM_OUTPUT
 
 
 @pytest.mark.parametrize(("synthesize_response", "call_apredict"), TEST_PARAMS)
+@patch.object(
+    MockLLM,
+    "predict",
+    mock_predict,
+)
+@patch.object(
+    MockLLM,
+    "apredict",
+    amock_predict,
+)
 def test_json_query_engine(
     synthesize_response: bool,
     call_apredict: bool,
-    mock_json_service_ctx: ServiceContext,
+    mock_service_context: ServiceContext,
 ) -> None:
     """Test GPTNLJSONQueryEngine."""
+    mock_service_context.llm_predictor = LLMPredictor(MockLLM())
+
     # Test on some sample data
     json_val = cast(JSONType, {})
     json_schema = cast(JSONType, {})
@@ -53,7 +68,7 @@ def test_json_query_engine(
     query_engine = JSONQueryEngine(
         json_value=json_val,
         json_schema=json_schema,
-        service_context=mock_json_service_ctx,
+        service_context=mock_service_context,
         output_processor=test_output_processor,
         verbose=True,
         synthesize_response=synthesize_response,
diff --git a/tests/llms/test_anthropic.py b/tests/llms/test_anthropic.py
index 5366d3fc3942885c978b5fa93adc86dc822d6762..c7386ffbd46a2eeb4c769a8f4135d26d64903497 100644
--- a/tests/llms/test_anthropic.py
+++ b/tests/llms/test_anthropic.py
@@ -1,6 +1,6 @@
 import pytest
 from llama_index.llms.anthropic import Anthropic
-from llama_index.llms.base import ChatMessage
+from llama_index.llms.types import ChatMessage
 
 try:
     import anthropic
diff --git a/tests/llms/test_anthropic_utils.py b/tests/llms/test_anthropic_utils.py
index ed3d978568b7b7980bc7f310e6a5595669795f2f..76b4dce62cae9741b414d5d620d1a9ff1938468b 100644
--- a/tests/llms/test_anthropic_utils.py
+++ b/tests/llms/test_anthropic_utils.py
@@ -3,7 +3,7 @@ from llama_index.llms.anthropic_utils import (
     anthropic_modelname_to_contextsize,
     messages_to_anthropic_prompt,
 )
-from llama_index.llms.base import ChatMessage, MessageRole
+from llama_index.llms.types import ChatMessage, MessageRole
 
 
 def test_messages_to_anthropic_prompt() -> None:
diff --git a/tests/llms/test_bedrock.py b/tests/llms/test_bedrock.py
index f462ce5b9ad1c347cd40e3680c8e87f8adcf17c5..169efc742bbd00a3aee8636f69fbdddb9de361e1 100644
--- a/tests/llms/test_bedrock.py
+++ b/tests/llms/test_bedrock.py
@@ -5,7 +5,7 @@ from typing import Any, Generator
 from botocore.response import StreamingBody
 from botocore.stub import Stubber
 from llama_index.llms import Bedrock
-from llama_index.llms.base import ChatMessage
+from llama_index.llms.types import ChatMessage
 from pytest import MonkeyPatch
 
 
diff --git a/tests/llms/test_cohere.py b/tests/llms/test_cohere.py
index 99d3afe3ccbe20bb29e49e8a2ed1659b45ef4a3d..1d65c83a36dc41196a1b1d53a00183a421998a22 100644
--- a/tests/llms/test_cohere.py
+++ b/tests/llms/test_cohere.py
@@ -1,7 +1,7 @@
 from typing import Any
 
 import pytest
-from llama_index.llms.base import ChatMessage
+from llama_index.llms.types import ChatMessage
 from pytest import MonkeyPatch
 
 try:
diff --git a/tests/llms/test_custom.py b/tests/llms/test_custom.py
index cd957cb98ff7c43d646b9a3a72aa23661a204980..3cd79eca330d4879384ef2d1408281efbf4d9691 100644
--- a/tests/llms/test_custom.py
+++ b/tests/llms/test_custom.py
@@ -1,12 +1,12 @@
 from typing import Any
 
-from llama_index.llms.base import (
+from llama_index.llms.custom import CustomLLM
+from llama_index.llms.types import (
     ChatMessage,
     CompletionResponse,
     CompletionResponseGen,
     LLMMetadata,
 )
-from llama_index.llms.custom import CustomLLM
 
 
 class TestLLM(CustomLLM):
diff --git a/tests/llms/test_konko.py b/tests/llms/test_konko.py
index 019494ac60599d199dc231508df53145f891b786..8b62dd0b6113a8cf4b5b7b6a66e50b729a1a2b4c 100644
--- a/tests/llms/test_konko.py
+++ b/tests/llms/test_konko.py
@@ -1,8 +1,8 @@
 from typing import Any, Generator
 
 import pytest
-from llama_index.llms.base import ChatMessage
 from llama_index.llms.konko import Konko
+from llama_index.llms.types import ChatMessage
 from pytest import MonkeyPatch
 
 try:
diff --git a/tests/llms/test_langchain.py b/tests/llms/test_langchain.py
index dae2cd6827db8ad0d6361dbb088f97d71f63dcaa..15b1c03f583a01538a03324ab2e2a565f52451a6 100644
--- a/tests/llms/test_langchain.py
+++ b/tests/llms/test_langchain.py
@@ -1,7 +1,7 @@
 from typing import List
 
 import pytest
-from llama_index.llms.base import ChatMessage, MessageRole
+from llama_index.llms.types import ChatMessage, MessageRole
 
 try:
     import cohere
diff --git a/tests/llms/test_litellm.py b/tests/llms/test_litellm.py
index 3d6643f18d50b2f42d38c252df812f311f6c4103..8786f7b506793283dd298f0c667833f477cf020d 100644
--- a/tests/llms/test_litellm.py
+++ b/tests/llms/test_litellm.py
@@ -6,8 +6,8 @@ except ImportError:
     litellm = None  # type: ignore
 
 import pytest
-from llama_index.llms.base import ChatMessage
 from llama_index.llms.litellm import LiteLLM
+from llama_index.llms.types import ChatMessage
 from pytest import MonkeyPatch
 
 from tests.conftest import CachedOpenAIApiKeys
diff --git a/tests/llms/test_llama_utils.py b/tests/llms/test_llama_utils.py
index 84326c8f6b1860c4a8ac7d7664971d0e9501fc53..b8587d7a5d2b8c09491b80c1f2d0b53d64ded916 100644
--- a/tests/llms/test_llama_utils.py
+++ b/tests/llms/test_llama_utils.py
@@ -1,7 +1,6 @@
 from typing import Sequence
 
 import pytest
-from llama_index.llms.base import ChatMessage, MessageRole
 from llama_index.llms.llama_utils import (
     B_INST,
     B_SYS,
@@ -13,6 +12,7 @@ from llama_index.llms.llama_utils import (
     completion_to_prompt,
     messages_to_prompt,
 )
+from llama_index.llms.types import ChatMessage, MessageRole
 
 
 @pytest.fixture()
diff --git a/tests/llms/test_localai.py b/tests/llms/test_localai.py
index e8e93e7ff99e12704e585938a356719ece4a4fc4..eda548c0ab3f7a9d6ef99316d0bd9bd1fe0d3ac6 100644
--- a/tests/llms/test_localai.py
+++ b/tests/llms/test_localai.py
@@ -2,7 +2,7 @@ from unittest.mock import MagicMock, patch
 
 import pytest
 from llama_index.llms import LocalAI
-from llama_index.llms.base import ChatMessage
+from llama_index.llms.types import ChatMessage
 from openai.types import Completion, CompletionChoice
 from openai.types.chat.chat_completion import ChatCompletion, Choice
 from openai.types.chat.chat_completion_message import ChatCompletionMessage
diff --git a/tests/llms/test_openai.py b/tests/llms/test_openai.py
index 9921739e81321dcc93581e6b9f88b455996bf56c..ebc42b20929bec45147b195b87cb2306ab863b18 100644
--- a/tests/llms/test_openai.py
+++ b/tests/llms/test_openai.py
@@ -3,8 +3,8 @@ from typing import Any, AsyncGenerator, Generator
 from unittest.mock import AsyncMock, MagicMock, patch
 
 import pytest
-from llama_index.llms.base import ChatMessage
 from llama_index.llms.openai import OpenAI
+from llama_index.llms.types import ChatMessage
 from openai.types.chat.chat_completion import (
     ChatCompletion,
     ChatCompletionMessage,
diff --git a/tests/llms/test_openai_like.py b/tests/llms/test_openai_like.py
index 463a2d97b966d91026698276c5d2b2c0a1df6035..99a96f6f44e948e7219265ee698f3158a836433c 100644
--- a/tests/llms/test_openai_like.py
+++ b/tests/llms/test_openai_like.py
@@ -2,8 +2,8 @@ from typing import List
 from unittest.mock import MagicMock, call, patch
 
 from llama_index.llms import LOCALAI_DEFAULTS, OpenAILike
-from llama_index.llms.base import ChatMessage, MessageRole
 from llama_index.llms.openai import Tokenizer
+from llama_index.llms.types import ChatMessage, MessageRole
 from openai.types import Completion, CompletionChoice
 from openai.types.chat.chat_completion import ChatCompletion, Choice
 from openai.types.chat.chat_completion_message import ChatCompletionMessage
diff --git a/tests/llms/test_openai_utils.py b/tests/llms/test_openai_utils.py
index d5dcd76c2828376da252f6ef76eef48adf659736..9a7005f5c42cad2360a5678adeab74f7439e9ba7 100644
--- a/tests/llms/test_openai_utils.py
+++ b/tests/llms/test_openai_utils.py
@@ -1,12 +1,12 @@
 from typing import List
 
 import pytest
-from llama_index.llms.base import ChatMessage, MessageRole
 from llama_index.llms.openai_utils import (
     from_openai_message_dicts,
     from_openai_messages,
     to_openai_message_dicts,
 )
+from llama_index.llms.types import ChatMessage, MessageRole
 from openai.types.chat.chat_completion_assistant_message_param import (
     FunctionCall as FunctionCallParam,
 )
diff --git a/tests/llms/test_palm.py b/tests/llms/test_palm.py
index 6068008b59df7d36e22d0ca8c5b0aa9ef0a622b7..c145f24ba64b5bad2d71c83acfc77980f7d9802e 100644
--- a/tests/llms/test_palm.py
+++ b/tests/llms/test_palm.py
@@ -36,8 +36,8 @@ sys.modules["google.generativeai"] = MockPalmPackage()
 
 from typing import Any
 
-from llama_index.llms.base import CompletionResponse
 from llama_index.llms.palm import PaLM
+from llama_index.llms.types import CompletionResponse
 
 
 @pytest.mark.skipif(
diff --git a/tests/llms/test_rungpt.py b/tests/llms/test_rungpt.py
index ea286272a2d64a2297d6bc24e4e7b07522bcb94f..475e719c8a73bf8a0e4cf366588b6493b83b66ee 100644
--- a/tests/llms/test_rungpt.py
+++ b/tests/llms/test_rungpt.py
@@ -2,11 +2,11 @@ from typing import Any, Dict, Generator, List
 from unittest.mock import MagicMock, patch
 
 import pytest
-from llama_index.llms.base import (
+from llama_index.llms.rungpt import RunGptLLM
+from llama_index.llms.types import (
     ChatMessage,
     MessageRole,
 )
-from llama_index.llms.rungpt import RunGptLLM
 
 try:
     import sseclient
diff --git a/tests/llms/test_vertex.py b/tests/llms/test_vertex.py
index 2f93a47f634536bac63533f3095e5736983621cb..4d6f00686397faddca6542f3e4a43992b0b308c5 100644
--- a/tests/llms/test_vertex.py
+++ b/tests/llms/test_vertex.py
@@ -1,5 +1,5 @@
 import pytest
-from llama_index.llms.base import CompletionResponse
+from llama_index.llms.types import CompletionResponse
 from llama_index.llms.vertex import Vertex
 from llama_index.llms.vertex_utils import init_vertexai
 
diff --git a/tests/llms/test_watsonx.py b/tests/llms/test_watsonx.py
index 0176e501b9f65532922a6c71d4b1a0cabe4abced..990028bc9116592c2eff30822f8c5646d8e2836d 100644
--- a/tests/llms/test_watsonx.py
+++ b/tests/llms/test_watsonx.py
@@ -3,7 +3,7 @@ from typing import Any, Dict, Generator, Optional
 from unittest.mock import MagicMock
 
 import pytest
-from llama_index.llms.base import ChatMessage
+from llama_index.llms.types import ChatMessage
 
 try:
     import ibm_watson_machine_learning
diff --git a/tests/llms/test_xinference.py b/tests/llms/test_xinference.py
index 9787961140163d2d87d0b822705c6bd1af6de604..3c2000746a106ea70f7da1e5737ed3d128ff8dd8 100644
--- a/tests/llms/test_xinference.py
+++ b/tests/llms/test_xinference.py
@@ -1,7 +1,7 @@
 from typing import Any, Dict, Generator, Iterator, List, Mapping, Sequence, Tuple, Union
 
 import pytest
-from llama_index.llms.base import (
+from llama_index.llms.types import (
     ChatMessage,
     ChatResponse,
     CompletionResponse,
diff --git a/tests/postprocessor/test_llm_rerank.py b/tests/postprocessor/test_llm_rerank.py
index 07a79438d8a438201ecf2877e6c2783cf95935a2..f50216c5751711350569b7ce964f6d66d4c6b037 100644
--- a/tests/postprocessor/test_llm_rerank.py
+++ b/tests/postprocessor/test_llm_rerank.py
@@ -3,7 +3,7 @@
 from typing import Any, List
 from unittest.mock import patch
 
-from llama_index.llm_predictor import LLMPredictor
+from llama_index.llms.mock import MockLLM
 from llama_index.postprocessor.llm_rerank import LLMRerank
 from llama_index.prompts import BasePromptTemplate
 from llama_index.schema import BaseNode, NodeWithScore, QueryBundle, TextNode
@@ -42,7 +42,7 @@ def mock_format_node_batch_fn(nodes: List[BaseNode]) -> str:
 
 
 @patch.object(
-    LLMPredictor,
+    MockLLM,
     "predict",
     mock_llmpredictor_predict,
 )
diff --git a/tests/program/test_llm_program.py b/tests/program/test_llm_program.py
index 5b03af7117c2bf819ab315136eb4ce964b6a5a29..ae8d4dcab16b1bb6b443c803fdf9970c69b27e07 100644
--- a/tests/program/test_llm_program.py
+++ b/tests/program/test_llm_program.py
@@ -4,7 +4,7 @@ import json
 from unittest.mock import MagicMock
 
 from llama_index.bridge.pydantic import BaseModel
-from llama_index.llms.base import (
+from llama_index.llms.types import (
     ChatMessage,
     ChatResponse,
     CompletionResponse,
diff --git a/tests/program/test_lmformatenforcer.py b/tests/program/test_lmformatenforcer.py
index daecf975fd77cbefe3da38ba090c78841dc7e0d5..9b9468c3d694fef3bc4cc14c85ca2d654f933268 100644
--- a/tests/program/test_lmformatenforcer.py
+++ b/tests/program/test_lmformatenforcer.py
@@ -3,8 +3,8 @@ from unittest.mock import MagicMock
 
 import pytest
 from llama_index.bridge.pydantic import BaseModel
-from llama_index.llms.base import CompletionResponse
 from llama_index.llms.huggingface import HuggingFaceLLM
+from llama_index.llms.types import CompletionResponse
 from llama_index.program.lmformatenforcer_program import LMFormatEnforcerPydanticProgram
 
 has_lmformatenforcer = find_spec("lmformatenforcer") is not None
diff --git a/tests/program/test_multi_modal_llm_program.py b/tests/program/test_multi_modal_llm_program.py
index c0f7d1977ab52a1cf389a7b60690436fec0e7ce6..7d1fe9b848c1f0b3ecfec513f95e1651541e2fa0 100644
--- a/tests/program/test_multi_modal_llm_program.py
+++ b/tests/program/test_multi_modal_llm_program.py
@@ -5,7 +5,7 @@ from typing import Sequence
 from unittest.mock import MagicMock
 
 from llama_index.bridge.pydantic import BaseModel
-from llama_index.llms.base import (
+from llama_index.llms.types import (
     CompletionResponse,
 )
 from llama_index.multi_modal_llms import MultiModalLLMMetadata
diff --git a/tests/prompts/test_base.py b/tests/prompts/test_base.py
index ec4b9f1e90ad971bb51857da6d8b244bff453a06..00993b6f610aa3f0b39838b9ecd8da59dc3b30fa 100644
--- a/tests/prompts/test_base.py
+++ b/tests/prompts/test_base.py
@@ -5,7 +5,7 @@ from typing import Any
 
 import pytest
 from llama_index.llms import MockLLM
-from llama_index.llms.base import ChatMessage, MessageRole
+from llama_index.llms.types import ChatMessage, MessageRole
 from llama_index.prompts import (
     ChatPromptTemplate,
     LangchainPromptTemplate,
diff --git a/tests/query_engine/test_retriever_query_engine.py b/tests/query_engine/test_retriever_query_engine.py
index eedb032fec73ebaa3858e0fa690e7b9c5fe5316b..431837d9cb1eb2c051e8abe341bbeae84a06cbc1 100644
--- a/tests/query_engine/test_retriever_query_engine.py
+++ b/tests/query_engine/test_retriever_query_engine.py
@@ -1,7 +1,6 @@
 import pytest
 from llama_index import (
     Document,
-    LLMPredictor,
     ServiceContext,
     TreeIndex,
 )
@@ -19,16 +18,14 @@ except ImportError:
 @pytest.mark.skipif(anthropic is None, reason="anthropic not installed")
 def test_query_engine_falls_back_to_inheriting_retrievers_service_context() -> None:
     documents = [Document(text="Hi")]
-    gpt35turbo_predictor = LLMPredictor(
-        llm=OpenAI(
-            temperature=0,
-            model_name="gpt-3.5-turbo-0613",
-            streaming=True,
-            openai_api_key="test-test-test",
-        ),
+    gpt35turbo_predictor = OpenAI(
+        temperature=0,
+        model_name="gpt-3.5-turbo-0613",
+        streaming=True,
+        openai_api_key="test-test-test",
     )
     gpt35_sc = ServiceContext.from_defaults(
-        llm_predictor=gpt35turbo_predictor,
+        llm=gpt35turbo_predictor,
         chunk_size=512,
     )
 
@@ -37,21 +34,21 @@ def test_query_engine_falls_back_to_inheriting_retrievers_service_context() -> N
     query_engine = RetrieverQueryEngine(retriever=retriever)
 
     assert (
-        retriever._service_context.llm_predictor.metadata.model_name
-        == gpt35turbo_predictor._llm.metadata.model_name
+        retriever._service_context.llm.metadata.model_name
+        == gpt35turbo_predictor.metadata.model_name
     )
     assert (
-        query_engine._response_synthesizer.service_context.llm_predictor.metadata.model_name
-        == retriever._service_context.llm_predictor.metadata.model_name
+        query_engine._response_synthesizer.service_context.llm.metadata.model_name
+        == retriever._service_context.llm.metadata.model_name
     )
     assert (
         query_engine._response_synthesizer.service_context == retriever._service_context
     )
 
     documents = [Document(text="Hi")]
-    claude_predictor = LLMPredictor(llm=Anthropic(model="claude-2"))
+    claude_predictor = Anthropic(model="claude-2")
     claude_sc = ServiceContext.from_defaults(
-        llm_predictor=claude_predictor,
+        llm=claude_predictor,
         chunk_size=512,
     )
 
@@ -60,12 +57,12 @@ def test_query_engine_falls_back_to_inheriting_retrievers_service_context() -> N
     query_engine = RetrieverQueryEngine(retriever=retriever)
 
     assert (
-        retriever._service_context.llm_predictor.metadata.model_name
-        == claude_predictor._llm.metadata.model_name
+        retriever._service_context.llm.metadata.model_name
+        == claude_predictor.metadata.model_name
     )
     assert (
-        query_engine._response_synthesizer.service_context.llm_predictor.metadata.model_name
-        == retriever._service_context.llm_predictor.metadata.model_name
+        query_engine._response_synthesizer.service_context.llm.metadata.model_name
+        == retriever._service_context.llm.metadata.model_name
     )
     assert (
         query_engine._response_synthesizer.service_context == retriever._service_context
diff --git a/tests/token_predictor/test_base.py b/tests/token_predictor/test_base.py
index 8d15fafab0f068a4330f22f1478fd7c950c2f4f9..1a397eea60ca7a9d6558309938afdc19db1934e9 100644
--- a/tests/token_predictor/test_base.py
+++ b/tests/token_predictor/test_base.py
@@ -6,7 +6,7 @@ from unittest.mock import patch
 from llama_index.indices.keyword_table.base import KeywordTableIndex
 from llama_index.indices.list.base import SummaryIndex
 from llama_index.indices.tree.base import TreeIndex
-from llama_index.llm_predictor.mock import MockLLMPredictor
+from llama_index.llms.mock import MockLLM
 from llama_index.node_parser import TokenTextSplitter
 from llama_index.schema import Document
 from llama_index.service_context import ServiceContext
@@ -26,8 +26,8 @@ def test_token_predictor(mock_split: Any) -> None:
         "This is a test v2."
     )
     document = Document(text=doc_text)
-    llm_predictor = MockLLMPredictor(max_tokens=256)
-    service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)
+    llm = MockLLM(max_tokens=256)
+    service_context = ServiceContext.from_defaults(llm=llm)
 
     # test tree index
     index = TreeIndex.from_documents([document], service_context=service_context)