diff --git a/docs/BUILD b/docs/BUILD new file mode 100644 index 0000000000000000000000000000000000000000..db46e8d6c978c67e301dd6c47bee08c1b3fd141c --- /dev/null +++ b/docs/BUILD @@ -0,0 +1 @@ +python_sources() diff --git a/docs/examples/discover_llamaindex/document_management/BUILD b/docs/examples/discover_llamaindex/document_management/BUILD new file mode 100644 index 0000000000000000000000000000000000000000..db46e8d6c978c67e301dd6c47bee08c1b3fd141c --- /dev/null +++ b/docs/examples/discover_llamaindex/document_management/BUILD @@ -0,0 +1 @@ +python_sources() diff --git a/docs/examples/finetuning/embeddings/BUILD b/docs/examples/finetuning/embeddings/BUILD new file mode 100644 index 0000000000000000000000000000000000000000..db46e8d6c978c67e301dd6c47bee08c1b3fd141c --- /dev/null +++ b/docs/examples/finetuning/embeddings/BUILD @@ -0,0 +1 @@ +python_sources() diff --git a/docs/examples/output_parsing/BUILD b/docs/examples/output_parsing/BUILD new file mode 100644 index 0000000000000000000000000000000000000000..db46e8d6c978c67e301dd6c47bee08c1b3fd141c --- /dev/null +++ b/docs/examples/output_parsing/BUILD @@ -0,0 +1 @@ +python_sources() diff --git a/llama-datasets/10k/uber_2021/BUILD b/llama-datasets/10k/uber_2021/BUILD new file mode 100644 index 0000000000000000000000000000000000000000..db46e8d6c978c67e301dd6c47bee08c1b3fd141c --- /dev/null +++ b/llama-datasets/10k/uber_2021/BUILD @@ -0,0 +1 @@ +python_sources() diff --git a/llama-datasets/10k/uber_2021/README.md b/llama-datasets/10k/uber_2021/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8edf323890a039f586be1e1d64f15e3f044ff18c --- /dev/null +++ b/llama-datasets/10k/uber_2021/README.md @@ -0,0 +1,61 @@ +# Uber 10K Dataset 2021 + +## CLI Usage + +You can download `llamadatasets` directly using `llamaindex-cli`, which comes installed with the `llama-index` python package: + +```bash +llamaindex-cli download-llamadataset Uber10KDataset2021 --download-dir ./data +``` + +You can then inspect the files at `./data`. When you're ready to load the data into +python, you can use the below snippet of code: + +```python +from llama_index.core import SimpleDirectoryReader +from llama_index.core.llama_dataset import LabelledRagDataset + +rag_dataset = LabelledRagDataset.from_json("./data/rag_dataset.json") +documents = SimpleDirectoryReader(input_dir="./data/source_files").load_data() +``` + +## Code Usage + +You can download the dataset to a directory, say `./data` directly in Python +as well. From there, you can use the convenient `RagEvaluatorPack` llamapack to +run your own LlamaIndex RAG pipeline with the `llamadataset`. 
+ +```python +from llama_index.core.llama_dataset import download_llama_dataset +from llama_index.core.llama_pack import download_llama_pack +from llama_index.core import VectorStoreIndex + +# download and install dependencies for benchmark dataset +rag_dataset, documents = download_llama_dataset("Uber10KDataset2021", "./data") + +# build basic RAG system +index = VectorStoreIndex.from_documents(documents=documents) +query_engine = index.as_query_engine() + +# evaluate using the RagEvaluatorPack +RagEvaluatorPack = download_llama_pack( + "RagEvaluatorPack", "./rag_evaluator_pack" +) +rag_evaluator_pack = RagEvaluatorPack( + rag_dataset=rag_dataset, + query_engine=query_engine, + show_progress=True, +) + +############################################################################ +# NOTE: If have a lower tier subscription for OpenAI API like Usage Tier 1 # +# then you'll need to use different batch_size and sleep_time_in_seconds. # +# For Usage Tier 1, settings that seemed to work well were batch_size=5, # +# and sleep_time_in_seconds=15 (as of December 2023.) # +############################################################################ + +benchmark_df = await rag_evaluator_pack.arun( + batch_size=20, # batches the number of openai api calls to make + sleep_time_in_seconds=1, # seconds to sleep before making an api call +) +``` diff --git a/llama-datasets/10k/uber_2021/card.json b/llama-datasets/10k/uber_2021/card.json new file mode 100644 index 0000000000000000000000000000000000000000..a6fb854c1eafcee77b6bbe590a9f22ed918ebe56 --- /dev/null +++ b/llama-datasets/10k/uber_2021/card.json @@ -0,0 +1,27 @@ +{ + "name": "Uber 10K Dataset 2021", + "className": "LabelledRagDataset", + "description": "A labelled RAG dataset based on the Uber 2021 10K document, consisting of queries, reference answers, and reference contexts.", + "numberObservations": 822, + "containsExamplesByHumans": false, + "containsExamplesByAi": true, + "sourceUrls": [], + "baselines": [ + { + "name": "llamaindex", + "config": { + "chunkSize": 1024, + "llm": "gpt-3.5-turbo", + "similarityTopK": 2, + "embedModel": "text-embedding-ada-002" + }, + "metrics": { + "contextSimilarity": 0.943, + "correctness": 3.874, + "faithfulness": 0.667, + "relevancy": 0.844 + }, + "codeUrl": "https://github.com/run-llama/llama-hub/blob/main/llama_hub/llama_datasets/10k/uber_2021/llamaindex_baseline.py" + } + ] +} diff --git a/llama-datasets/10k/uber_2021/llamaindex_baseline.py b/llama-datasets/10k/uber_2021/llamaindex_baseline.py new file mode 100644 index 0000000000000000000000000000000000000000..8af8eb51b536ab64c7443aa48ab6063008d3d057 --- /dev/null +++ b/llama-datasets/10k/uber_2021/llamaindex_baseline.py @@ -0,0 +1,41 @@ +import asyncio + +from llama_index.core.llama_dataset import download_llama_dataset +from llama_index.core.llama_pack import download_llama_pack +from llama_index.core import VectorStoreIndex +from llama_index.llms import OpenAI + + +async def main(): + # DOWNLOAD LLAMADATASET + rag_dataset, documents = download_llama_dataset( + "Uber10KDataset2021", "./uber10k_2021_dataset" + ) + + # BUILD BASIC RAG PIPELINE + index = VectorStoreIndex.from_documents(documents=documents) + query_engine = index.as_query_engine() + + # EVALUATE WITH PACK + RagEvaluatorPack = download_llama_pack("RagEvaluatorPack", "./pack_stuff") + judge_llm = OpenAI(model="gpt-3.5-turbo") + rag_evaluator = RagEvaluatorPack( + query_engine=query_engine, rag_dataset=rag_dataset, judge_llm=judge_llm + ) + + 
############################################################################ + # NOTE: If have a lower tier subscription for OpenAI API like Usage Tier 1 # + # then you'll need to use different batch_size and sleep_time_in_seconds. # + # For Usage Tier 1, settings that seemed to work well were batch_size=5, # + # and sleep_time_in_seconds=15 (as of December 2023.) # + ############################################################################ + benchmark_df = await rag_evaluator.arun( + batch_size=20, # batches the number of openai api calls to make + sleep_time_in_seconds=1, # number of seconds sleep before making an api call + ) + print(benchmark_df) + + +if __name__ == "__main__": + loop = asyncio.get_event_loop() + loop.run_until_complete(main) diff --git a/llama-datasets/__init__.py b/llama-datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/llama-datasets/blockchain_solana/BUILD b/llama-datasets/blockchain_solana/BUILD new file mode 100644 index 0000000000000000000000000000000000000000..db46e8d6c978c67e301dd6c47bee08c1b3fd141c --- /dev/null +++ b/llama-datasets/blockchain_solana/BUILD @@ -0,0 +1 @@ +python_sources() diff --git a/llama-datasets/blockchain_solana/README.md b/llama-datasets/blockchain_solana/README.md new file mode 100644 index 0000000000000000000000000000000000000000..cebaed8787417a50148ab9fa7e66ffeb7f292386 --- /dev/null +++ b/llama-datasets/blockchain_solana/README.md @@ -0,0 +1,61 @@ +# Blockchain Solana Dataset + +## CLI Usage + +You can download `llamadatasets` directly using `llamaindex-cli`, which comes installed with the `llama-index` python package: + +```bash +llamaindex-cli download-llamadataset BlockchainSolanaDataset --download-dir ./data +``` + +You can then inspect the files at `./data`. When you're ready to load the data into +python, you can use the below snippet of code: + +```python +from llama_index.core import SimpleDirectoryReader +from llama_index.core.llama_dataset import LabelledRagDataset + +rag_dataset = LabelledRagDataset.from_json("./data/rag_dataset.json") +documents = SimpleDirectoryReader(input_dir="./data/source_files").load_data() +``` + +## Code Usage + +You can download the dataset to a directory, say `./data` directly in Python +as well. From there, you can use the convenient `RagEvaluatorPack` llamapack to +run your own LlamaIndex RAG pipeline with the `llamadataset`. + +```python +from llama_index.core.llama_dataset import download_llama_dataset +from llama_index.core.llama_pack import download_llama_pack +from llama_index.core import VectorStoreIndex + +# download and install dependencies for benchmark dataset +rag_dataset, documents = download_llama_dataset( + "BlockchainSolanaDataset", "./data" +) + +# build basic RAG system +index = VectorStoreIndex.from_documents(documents=documents) +query_engine = index.as_query_engine() + +# evaluate using the RagEvaluatorPack +RagEvaluatorPack = download_llama_pack( + "RagEvaluatorPack", "./rag_evaluator_pack" +) +rag_evaluator_pack = RagEvaluatorPack( + rag_dataset=rag_dataset, query_engine=query_engine +) + +############################################################################ +# NOTE: If have a lower tier subscription for OpenAI API like Usage Tier 1 # +# then you'll need to use different batch_size and sleep_time_in_seconds. # +# For Usage Tier 1, settings that seemed to work well were batch_size=5, # +# and sleep_time_in_seconds=15 (as of December 2023.) 
# +############################################################################ + +benchmark_df = await rag_evaluator_pack.arun( + batch_size=20, # batches the number of openai api calls to make + sleep_time_in_seconds=1, # seconds to sleep before making an api call +) +``` diff --git a/llama-datasets/blockchain_solana/card.json b/llama-datasets/blockchain_solana/card.json new file mode 100644 index 0000000000000000000000000000000000000000..d2dcba74d5de594c630faf16419fcd9073992d46 --- /dev/null +++ b/llama-datasets/blockchain_solana/card.json @@ -0,0 +1,27 @@ +{ + "name": "Blockchain Solana", + "className": "LabelledRagDataset", + "description": "A labelled RAG dataset based off an article, From Bitcoin to Solana – Innovating Blockchain towards Enterprise Applications),by Xiangyu Li, Xinyu Wang, Tingli Kong, Junhao Zheng and Min Luo, consisting of queries, reference answers, and reference contexts.", + "numberObservations": 58, + "containsExamplesByHumans": false, + "containsExamplesByAi": true, + "sourceUrls": ["https://arxiv.org/abs/2207.05240"], + "baselines": [ + { + "name": "llamaindex", + "config": { + "chunkSize": 1024, + "llm": "gpt-3.5-turbo", + "similarityTopK": 2, + "embedModel": "text-embedding-ada-002" + }, + "metrics": { + "contextSimilarity": 0.945, + "correctness": 4.457, + "faithfulness": 1.0, + "relevancy": 1.0 + }, + "codeUrl": "https://github.com/run-llama/llama-hub/blob/main/llama_hub/llama_datasets/blockchain_solana/llamaindex_baseline.py" + } + ] +} diff --git a/llama-datasets/blockchain_solana/llamaindex_baseline.py b/llama-datasets/blockchain_solana/llamaindex_baseline.py new file mode 100644 index 0000000000000000000000000000000000000000..0d9979c490dde9104be55a2a9b4137f6ea9cb26f --- /dev/null +++ b/llama-datasets/blockchain_solana/llamaindex_baseline.py @@ -0,0 +1,37 @@ +import asyncio + +from llama_index.core.llama_dataset import download_llama_dataset +from llama_index.core.llama_pack import download_llama_pack +from llama_index.core import VectorStoreIndex + + +async def main(): + # DOWNLOAD LLAMADATASET + rag_dataset, documents = download_llama_dataset( + "BlockchainSolanaDataset", "./blockchain_solana" + ) + + # BUILD BASIC RAG PIPELINE + index = VectorStoreIndex.from_documents(documents=documents) + query_engine = index.as_query_engine() + + # EVALUATE WITH PACK + RagEvaluatorPack = download_llama_pack("RagEvaluatorPack", "./pack_stuff") + rag_evaluator = RagEvaluatorPack(query_engine=query_engine, rag_dataset=rag_dataset) + + ############################################################################ + # NOTE: If have a lower tier subscription for OpenAI API like Usage Tier 1 # + # then you'll need to use different batch_size and sleep_time_in_seconds. # + # For Usage Tier 1, settings that seemed to work well were batch_size=5, # + # and sleep_time_in_seconds=15 (as of December 2023.) 
# + ############################################################################ + benchmark_df = await rag_evaluator.arun( + batch_size=20, # batches the number of openai api calls to make + sleep_time_in_seconds=1, # number of seconds sleep before making an api call + ) + print(benchmark_df) + + +if __name__ == "__main__": + loop = asyncio.get_event_loop() + loop.run_until_complete(main) diff --git a/llama-datasets/braintrust_coda/BUILD b/llama-datasets/braintrust_coda/BUILD new file mode 100644 index 0000000000000000000000000000000000000000..db46e8d6c978c67e301dd6c47bee08c1b3fd141c --- /dev/null +++ b/llama-datasets/braintrust_coda/BUILD @@ -0,0 +1 @@ +python_sources() diff --git a/llama-datasets/braintrust_coda/README.md b/llama-datasets/braintrust_coda/README.md new file mode 100644 index 0000000000000000000000000000000000000000..96bedb61f87f878c0db6c582143ec55c86bebb43 --- /dev/null +++ b/llama-datasets/braintrust_coda/README.md @@ -0,0 +1,65 @@ +# Braintrust Coda Help Desk Dataset + +[](https://www.braintrustdata.com/) + +_This dataset was kindly provided by Kenny Wong and Ankur Goyal._ + +## CLI Usage + +You can download `llamadatasets` directly using `llamaindex-cli`, which comes installed with the `llama-index` python package: + +```bash +llamaindex-cli download-llamadataset BraintrustCodaHelpDeskDataset --download-dir ./data +``` + +You can then inspect the files at `./data`. When you're ready to load the data into +python, you can use the below snippet of code: + +```python +from llama_index.core import SimpleDirectoryReader +from llama_index.core.llama_dataset import LabelledRagDataset + +rag_dataset = LabelledRagDataset.from_json("./data/rag_dataset.json") +documents = SimpleDirectoryReader(input_dir="./data/source_files").load_data() +``` + +## Code Usage + +You can download the dataset to a directory, say `./data` directly in Python +as well. From there, you can use the convenient `RagEvaluatorPack` llamapack to +run your own LlamaIndex RAG pipeline with the `llamadataset`. + +```python +from llama_index.core.llama_dataset import download_llama_dataset +from llama_index.core.llama_pack import download_llama_pack +from llama_index.core import VectorStoreIndex + +# download and install dependencies for benchmark dataset +rag_dataset, documents = download_llama_dataset( + "BraintrustCodaHelpDeskDataset", "./data" +) + +# build basic RAG system +index = VectorStoreIndex.from_documents(documents=documents) +query_engine = index.as_query_engine() + +# evaluate using the RagEvaluatorPack +RagEvaluatorPack = download_llama_pack( + "RagEvaluatorPack", "./rag_evaluator_pack" +) +rag_evaluator_pack = RagEvaluatorPack( + rag_dataset=rag_dataset, query_engine=query_engine +) + +############################################################################ +# NOTE: If have a lower tier subscription for OpenAI API like Usage Tier 1 # +# then you'll need to use different batch_size and sleep_time_in_seconds. # +# For Usage Tier 1, settings that seemed to work well were batch_size=5, # +# and sleep_time_in_seconds=15 (as of December 2023.) 
# +############################################################################ + +benchmark_df = await rag_evaluator_pack.arun( + batch_size=20, # batches the number of openai api calls to make + sleep_time_in_seconds=1, # seconds to sleep before making an api call +) +``` diff --git a/llama-datasets/braintrust_coda/__init__.py b/llama-datasets/braintrust_coda/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/llama-datasets/braintrust_coda/card.json b/llama-datasets/braintrust_coda/card.json new file mode 100644 index 0000000000000000000000000000000000000000..6da7d2f45d7cec169a1b4d9de44078207f57efeb --- /dev/null +++ b/llama-datasets/braintrust_coda/card.json @@ -0,0 +1,29 @@ +{ + "name": "Braintrust Coda Help Desk", + "className": "LabelledRagDataset", + "description": "A list of automatically generated question/answer pairs from the Coda (https://coda.io/) help docs. This dataset is interesting because most models include Coda’s documentation as part of their training set, so you can baseline performance without RAG.", + "numberObservations": 100, + "containsExamplesByHumans": false, + "containsExamplesByAi": true, + "sourceUrls": [ + "https://gist.githubusercontent.com/wong-codaio/b8ea0e087f800971ca5ec9eef617273e/raw/39f8bd2ebdecee485021e20f2c1d40fd649a4c77/articles.json" + ], + "baselines": [ + { + "name": "llamaindex", + "config": { + "chunkSize": 1024, + "llm": "gpt-3.5-turbo", + "similarityTopK": 2, + "embedModel": "text-embedding-ada-002" + }, + "metrics": { + "contextSimilarity": 0.955, + "correctness": 4.32, + "faithfulness": 0.9, + "relevancy": 0.93 + }, + "codeUrl": "https://github.com/run-llama/llama-hub/blob/main/llama_hub/llama_datasets/braintrust_coda/llamaindex_baseline.py" + } + ] +} diff --git a/llama-datasets/braintrust_coda/llamaindex_baseline.py b/llama-datasets/braintrust_coda/llamaindex_baseline.py new file mode 100644 index 0000000000000000000000000000000000000000..13fd55153f03c644f09d65d774c4d4c1dab4bd28 --- /dev/null +++ b/llama-datasets/braintrust_coda/llamaindex_baseline.py @@ -0,0 +1,37 @@ +import asyncio + +from llama_index.core.llama_dataset import download_llama_dataset +from llama_index.core.llama_pack import download_llama_pack +from llama_index.core import VectorStoreIndex + + +async def main(): + # DOWNLOAD LLAMADATASET + rag_dataset, documents = download_llama_dataset( + "BraintrustCodaHelpDeskDataset", "./braintrust_codahdd" + ) + + # BUILD BASIC RAG PIPELINE + index = VectorStoreIndex.from_documents(documents=documents) + query_engine = index.as_query_engine() + + # EVALUATE WITH PACK + RagEvaluatorPack = download_llama_pack("RagEvaluatorPack", "./pack_stuff") + rag_evaluator = RagEvaluatorPack(query_engine=query_engine, rag_dataset=rag_dataset) + + ############################################################################ + # NOTE: If have a lower tier subscription for OpenAI API like Usage Tier 1 # + # then you'll need to use different batch_size and sleep_time_in_seconds. # + # For Usage Tier 1, settings that seemed to work well were batch_size=5, # + # and sleep_time_in_seconds=15 (as of December 2023.) 
# + ############################################################################ + benchmark_df = await rag_evaluator.arun( + batch_size=20, # batches the number of openai api calls to make + sleep_time_in_seconds=1, # number of seconds sleep before making an api call + ) + print(benchmark_df) + + +if __name__ == "__main__": + loop = asyncio.get_event_loop() + loop.run_until_complete(main) diff --git a/llama-datasets/covidqa/BUILD b/llama-datasets/covidqa/BUILD new file mode 100644 index 0000000000000000000000000000000000000000..db46e8d6c978c67e301dd6c47bee08c1b3fd141c --- /dev/null +++ b/llama-datasets/covidqa/BUILD @@ -0,0 +1 @@ +python_sources() diff --git a/llama-datasets/covidqa/README.md b/llama-datasets/covidqa/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1a725277534dcf07b42ebb247684f181186f58c8 --- /dev/null +++ b/llama-datasets/covidqa/README.md @@ -0,0 +1,59 @@ +# Covid Qa Dataset + +## CLI Usage + +You can download `llamadatasets` directly using `llamaindex-cli`, which comes installed with the `llama-index` python package: + +```bash +llamaindex-cli download-llamadataset CovidQaDataset --download-dir ./data +``` + +You can then inspect the files at `./data`. When you're ready to load the data into +python, you can use the below snippet of code: + +```python +from llama_index.core import SimpleDirectoryReader +from llama_index.core.llama_dataset import LabelledRagDataset + +rag_dataset = LabelledRagDataset.from_json("./data/rag_dataset.json") +documents = SimpleDirectoryReader(input_dir="./data/source_files").load_data() +``` + +## Code Usage + +You can download the dataset to a directory, say `./data` directly in Python +as well. From there, you can use the convenient `RagEvaluatorPack` llamapack to +run your own LlamaIndex RAG pipeline with the `llamadataset`. + +```python +from llama_index.core.llama_dataset import download_llama_dataset +from llama_index.core.llama_pack import download_llama_pack +from llama_index.core import VectorStoreIndex + +# download and install dependencies for benchmark dataset +rag_dataset, documents = download_llama_dataset("CovidQaDataset", "./data") + +# build basic RAG system +index = VectorStoreIndex.from_documents(documents=documents) +query_engine = index.as_query_engine() + +# evaluate using the RagEvaluatorPack +RagEvaluatorPack = download_llama_pack( + "RagEvaluatorPack", "./rag_evaluator_pack" +) +rag_evaluator_pack = RagEvaluatorPack( + rag_dataset=rag_dataset, query_engine=query_engine, show_progress=True +) + +############################################################################ +# NOTE: If have a lower tier subscription for OpenAI API like Usage Tier 1 # +# then you'll need to use different batch_size and sleep_time_in_seconds. # +# For Usage Tier 1, settings that seemed to work well were batch_size=5, # +# and sleep_time_in_seconds=15 (as of December 2023.) 
# +############################################################################ + +benchmark_df = await rag_evaluator_pack.arun( + batch_size=40, # batches the number of openai api calls to make + sleep_time_in_seconds=1, # seconds to sleep before making an api call +) +``` diff --git a/llama-datasets/covidqa/card.json b/llama-datasets/covidqa/card.json new file mode 100644 index 0000000000000000000000000000000000000000..6e362e4a70f6f2e460b27da509e046f69fe568b3 --- /dev/null +++ b/llama-datasets/covidqa/card.json @@ -0,0 +1,29 @@ +{ + "name": "Covid QA Dataset", + "className": "LabelledRagDataset", + "description": "A human-annotated RAG dataset consisting of over 300 question-answer pairs. This dataset represents a subset of the Covid-QA dataset available on Kaggle and authored by Xhlulu. It is a collection of frequently asked questions on COVID from various websites. This subset only considers the top 10 webpages containing the most question-answer pairs.", + "numberObservations": 316, + "containsExamplesByHumans": true, + "containsExamplesByAi": false, + "sourceUrls": [ + "https://www.kaggle.com/datasets/xhlulu/covidqa/?select=news.csv" + ], + "baselines": [ + { + "name": "llamaindex", + "config": { + "chunkSize": 1024, + "llm": "gpt-3.5-turbo", + "similarityTopK": 2, + "embedModel": "text-embedding-ada-002" + }, + "metrics": { + "contextSimilarity": null, + "correctness": 3.96, + "faithfulness": 0.889, + "relevancy": 0.848 + }, + "codeUrl": "https://github.com/run-llama/llama-hub/blob/main/llama_hub/llama_datasets/covidqa/llamaindex_baseline.py" + } + ] +} diff --git a/llama-datasets/covidqa/llamaindex_baseline.py b/llama-datasets/covidqa/llamaindex_baseline.py new file mode 100644 index 0000000000000000000000000000000000000000..152490e025f41282cbaedddc7bfeb20b914f38ce --- /dev/null +++ b/llama-datasets/covidqa/llamaindex_baseline.py @@ -0,0 +1,35 @@ +import asyncio + +from llama_index.core.llama_dataset import download_llama_dataset +from llama_index.core.llama_pack import download_llama_pack +from llama_index.core import VectorStoreIndex + + +async def main(): + # DOWNLOAD LLAMADATASET + rag_dataset, documents = download_llama_dataset("CovidQaDataset", "./data") + + # BUILD BASIC RAG PIPELINE + index = VectorStoreIndex.from_documents(documents=documents) + query_engine = index.as_query_engine() + + # EVALUATE WITH PACK + RagEvaluatorPack = download_llama_pack("RagEvaluatorPack", "./pack") + rag_evaluator = RagEvaluatorPack(query_engine=query_engine, rag_dataset=rag_dataset) + + ############################################################################ + # NOTE: If have a lower tier subscription for OpenAI API like Usage Tier 1 # + # then you'll need to use different batch_size and sleep_time_in_seconds. # + # For Usage Tier 1, settings that seemed to work well were batch_size=5, # + # and sleep_time_in_seconds=15 (as of December 2023.) 
# + ############################################################################ + benchmark_df = await rag_evaluator.arun( + batch_size=40, # batches the number of openai api calls to make + sleep_time_in_seconds=1, # number of seconds sleep before making an api call + ) + print(benchmark_df) + + +if __name__ == "__main__": + loop = asyncio.get_event_loop() + loop.run_until_complete(main) diff --git a/llama-datasets/docugami_kg_rag/sec_10_q/BUILD b/llama-datasets/docugami_kg_rag/sec_10_q/BUILD new file mode 100644 index 0000000000000000000000000000000000000000..db46e8d6c978c67e301dd6c47bee08c1b3fd141c --- /dev/null +++ b/llama-datasets/docugami_kg_rag/sec_10_q/BUILD @@ -0,0 +1 @@ +python_sources() diff --git a/llama-datasets/docugami_kg_rag/sec_10_q/README.md b/llama-datasets/docugami_kg_rag/sec_10_q/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fcd97d2c03d8499837113eae273aa5e7e8c25802 --- /dev/null +++ b/llama-datasets/docugami_kg_rag/sec_10_q/README.md @@ -0,0 +1,63 @@ +# Docugami KG-RAG - Sec 10-Q + +A labelled RAG dataset with SEC 10-Q documents for major tech companies including queries across multiple docs and chunks, with reference answers. See [https://github.com/docugami/KG-RAG-datasets](https://github.com/docugami/KG-RAG-datasets) for details. + +## CLI Usage + +You can download `llamadatasets` directly using `llamaindex-cli`, which comes installed with the `llama-index` python package: + +```bash +llamaindex-cli download-llamadataset DocugamiKgRagSec10Q --download-dir ./data +``` + +You can then inspect the files at `./data`. When you're ready to load the data into +python, you can use the below snippet of code: + +```python +from llama_index.core import SimpleDirectoryReader +from llama_index.core.llama_dataset import LabelledRagDataset + +rag_dataset = LabelledRagDataset.from_json("./data/rag_dataset.json") +documents = SimpleDirectoryReader(input_dir="./data/source_files").load_data() +``` + +## Code Usage + +You can download the dataset to a directory, say `./data` directly in Python +as well. From there, you can use the convenient `RagEvaluatorPack` llamapack to +run your own LlamaIndex RAG pipeline with the `llamadataset`. + +```python +from llama_index.core.llama_dataset import download_llama_dataset +from llama_index.core.llama_pack import download_llama_pack +from llama_index.core import VectorStoreIndex + +# download and install dependencies for benchmark dataset +rag_dataset, documents = download_llama_dataset( + "DocugamiKgRagSec10Q", "./data" +) + +# build basic RAG system +index = VectorStoreIndex.from_documents(documents=documents) +query_engine = index.as_query_engine() + +# evaluate using the RagEvaluatorPack +RagEvaluatorPack = download_llama_pack( + "RagEvaluatorPack", "./rag_evaluator_pack" +) +rag_evaluator_pack = RagEvaluatorPack( + rag_dataset=rag_dataset, query_engine=query_engine +) + +############################################################################ +# NOTE: If have a lower tier subscription for OpenAI API like Usage Tier 1 # +# then you'll need to use different batch_size and sleep_time_in_seconds. # +# For Usage Tier 1, settings that seemed to work well were batch_size=5, # +# and sleep_time_in_seconds=15 (as of December 2023.) 
# +############################################################################ + +benchmark_df = await rag_evaluator_pack.arun( + batch_size=20, # batches the number of openai api calls to make + sleep_time_in_seconds=1, # seconds to sleep before making an api call +) +``` diff --git a/llama-datasets/docugami_kg_rag/sec_10_q/card.json b/llama-datasets/docugami_kg_rag/sec_10_q/card.json new file mode 100644 index 0000000000000000000000000000000000000000..cfc73290dfc5d761261333f6aad34996f8e022db --- /dev/null +++ b/llama-datasets/docugami_kg_rag/sec_10_q/card.json @@ -0,0 +1,27 @@ +{ + "name": "Docugami KG-RAG - SEC 10-Q", + "className": "LabelledRagDataset", + "description": "A labelled RAG dataset with SEC 10-Q documents for major tech companies including queries across multiple docs and chunks, with reference answers. See https://github.com/docugami/KG-RAG-datasets for details.", + "numberObservations": 195, + "containsExamplesByHumans": true, + "containsExamplesByAi": false, + "sourceUrls": [], + "baselines": [ + { + "name": "llamaindex", + "config": { + "chunkSize": 1024, + "llm": "gpt-3.5-turbo", + "similarityTopK": 2, + "embedModel": "text-embedding-ada-002" + }, + "metrics": { + "contextSimilarity": null, + "correctness": 2.703, + "faithfulness": 0.897, + "relevancy": 0.826 + }, + "codeUrl": "" + } + ] +} diff --git a/llama-datasets/docugami_kg_rag/sec_10_q/llamaindex_baseline.py b/llama-datasets/docugami_kg_rag/sec_10_q/llamaindex_baseline.py new file mode 100644 index 0000000000000000000000000000000000000000..0945521bd17abe3a5445125b9953a08decff1547 --- /dev/null +++ b/llama-datasets/docugami_kg_rag/sec_10_q/llamaindex_baseline.py @@ -0,0 +1,41 @@ +import asyncio + +from llama_index.core.llama_dataset import download_llama_dataset +from llama_index.core.llama_pack import download_llama_pack +from llama_index.core import VectorStoreIndex +from llama_index.llms import OpenAI + + +async def main(): + # DOWNLOAD LLAMADATASET + rag_dataset, documents = download_llama_dataset( + "DocugamiKgRagSec10Q", "./docugami_kg_rag_sec_10_q" + ) + + # BUILD BASIC RAG PIPELINE + index = VectorStoreIndex.from_documents(documents=documents) + query_engine = index.as_query_engine() + + # EVALUATE WITH PACK + RagEvaluatorPack = download_llama_pack("RagEvaluatorPack", "./pack_stuff") + judge_llm = OpenAI(model="gpt-3.5-turbo") + rag_evaluator = RagEvaluatorPack( + query_engine=query_engine, rag_dataset=rag_dataset, judge_llm=judge_llm + ) + + ############################################################################ + # NOTE: If have a lower tier subscription for OpenAI API like Usage Tier 1 # + # then you'll need to use different batch_size and sleep_time_in_seconds. # + # For Usage Tier 1, settings that seemed to work well were batch_size=5, # + # and sleep_time_in_seconds=15 (as of December 2023.) 
# + ############################################################################ + benchmark_df = await rag_evaluator.arun( + batch_size=20, # batches the number of openai api calls to make + sleep_time_in_seconds=1, # number of seconds sleep before making an api call + ) + print(benchmark_df) + + +if __name__ == "__main__": + loop = asyncio.get_event_loop() + loop.run_until_complete(main) diff --git a/llama-datasets/eval_llm_survey_paper/BUILD b/llama-datasets/eval_llm_survey_paper/BUILD new file mode 100644 index 0000000000000000000000000000000000000000..db46e8d6c978c67e301dd6c47bee08c1b3fd141c --- /dev/null +++ b/llama-datasets/eval_llm_survey_paper/BUILD @@ -0,0 +1 @@ +python_sources() diff --git a/llama-datasets/eval_llm_survey_paper/README.md b/llama-datasets/eval_llm_survey_paper/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5252e714a4a4d0d6f6dd619898fae30a1cf5666a --- /dev/null +++ b/llama-datasets/eval_llm_survey_paper/README.md @@ -0,0 +1,61 @@ +# Evaluating Llm Survey Paper Dataset + +## CLI Usage + +You can download `llamadatasets` directly using `llamaindex-cli`, which comes installed with the `llama-index` python package: + +```bash +llamaindex-cli download-llamadataset EvaluatingLlmSurveyPaperDataset --download-dir ./data +``` + +You can then inspect the files at `./data`. When you're ready to load the data into +python, you can use the below snippet of code: + +```python +from llama_index.core import SimpleDirectoryReader +from llama_index.core.llama_dataset import LabelledRagDataset + +rag_dataset = LabelledRagDataset.from_json("./data/rag_dataset.json") +documents = SimpleDirectoryReader(input_dir="./data/source_files").load_data() +``` + +## Code Usage + +You can download the dataset to a directory, say `./data` directly in Python +as well. From there, you can use the convenient `RagEvaluatorPack` llamapack to +run your own LlamaIndex RAG pipeline with the `llamadataset`. + +```python +from llama_index.core.llama_dataset import download_llama_dataset +from llama_index.core.llama_pack import download_llama_pack +from llama_index.core import VectorStoreIndex + +# download and install dependencies for benchmark dataset +rag_dataset, documents = download_llama_dataset( + "EvaluatingLlmSurveyPaperDataset", "./data" +) + +# build basic RAG system +index = VectorStoreIndex.from_documents(documents=documents) +query_engine = index.as_query_engine() + +# evaluate using the RagEvaluatorPack +RagEvaluatorPack = download_llama_pack( + "RagEvaluatorPack", "./rag_evaluator_pack" +) +rag_evaluator_pack = RagEvaluatorPack( + rag_dataset=rag_dataset, query_engine=query_engine +) + +############################################################################ +# NOTE: If have a lower tier subscription for OpenAI API like Usage Tier 1 # +# then you'll need to use different batch_size and sleep_time_in_seconds. # +# For Usage Tier 1, settings that seemed to work well were batch_size=5, # +# and sleep_time_in_seconds=15 (as of December 2023.) 
#
+############################################################################
+
+benchmark_df = await rag_evaluator_pack.arun(
+    batch_size=20,  # batches the number of openai api calls to make
+    sleep_time_in_seconds=1,  # seconds to sleep before making an api call
+)
+```
diff --git a/llama-datasets/eval_llm_survey_paper/card.json b/llama-datasets/eval_llm_survey_paper/card.json
new file mode 100644
index 0000000000000000000000000000000000000000..54c5e4f25556b47e527d79f6dc9ffcde8554f631
--- /dev/null
+++ b/llama-datasets/eval_llm_survey_paper/card.json
@@ -0,0 +1,27 @@
+{
+  "name": "Evaluating LLM Survey Paper Dataset",
+  "className": "LabelledRagDataset",
+  "description": "A labelled RAG dataset over a comprehensive, 111-page survey on evaluating LLMs.",
+  "numberObservations": 276,
+  "containsExamplesByHumans": false,
+  "containsExamplesByAi": true,
+  "sourceUrls": ["https://arxiv.org/pdf/2310.19736.pdf"],
+  "baselines": [
+    {
+      "name": "llamaindex",
+      "config": {
+        "chunkSize": 1024,
+        "llm": "gpt-3.5-turbo",
+        "similarityTopK": 2,
+        "embedModel": "text-embedding-ada-002"
+      },
+      "metrics": {
+        "contextSimilarity": 0.923,
+        "correctness": 3.81,
+        "faithfulness": 0.888,
+        "relevancy": 0.808
+      },
+      "codeUrl": "https://github.com/run-llama/llama-hub/blob/main/llama_hub/llama_datasets/mini_squadv2/llamaindex_baseline.py"
+    }
+  ]
+}
diff --git a/llama-datasets/eval_llm_survey_paper/llamaindex_baseline.py b/llama-datasets/eval_llm_survey_paper/llamaindex_baseline.py
new file mode 100644
index 0000000000000000000000000000000000000000..001accf42813c25f8bdf05bacdaab5d22b74ab72
--- /dev/null
+++ b/llama-datasets/eval_llm_survey_paper/llamaindex_baseline.py
@@ -0,0 +1,36 @@
+import asyncio
+
+from llama_index.core.llama_dataset import download_llama_dataset
+from llama_index.core.llama_pack import download_llama_pack
+from llama_index.core import VectorStoreIndex
+
+
+async def main():
+    # DOWNLOAD LLAMADATASET
+    rag_dataset, documents = download_llama_dataset(
+        "EvaluatingLlmSurveyPaperDataset", "./data"
+    )
+
+    # BUILD BASIC RAG PIPELINE
+    index = VectorStoreIndex.from_documents(documents=documents)
+    query_engine = index.as_query_engine()
+
+    # EVALUATE WITH PACK
+    RagEvaluatorPack = download_llama_pack("RagEvaluatorPack", "./pack")
+    rag_evaluator = RagEvaluatorPack(query_engine=query_engine, rag_dataset=rag_dataset)
+
+    ############################################################################
+    # NOTE: If you have a lower tier OpenAI API subscription like Usage Tier 1 #
+    # then you'll need to use different batch_size and sleep_time_in_seconds. #
+    # For Usage Tier 1, settings that seemed to work well were batch_size=5, #
+    # and sleep_time_in_seconds=15 (as of December 2023.) #
+    ############################################################################
+    benchmark_df = await rag_evaluator.arun(
+        batch_size=20,  # batches the number of openai api calls to make
+        sleep_time_in_seconds=1,  # number of seconds to sleep before making an api call
+    )
+    print(benchmark_df)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/llama-datasets/history_of_alexnet/BUILD b/llama-datasets/history_of_alexnet/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..db46e8d6c978c67e301dd6c47bee08c1b3fd141c
--- /dev/null
+++ b/llama-datasets/history_of_alexnet/BUILD
@@ -0,0 +1 @@
+python_sources()
diff --git a/llama-datasets/history_of_alexnet/README.md b/llama-datasets/history_of_alexnet/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1cf0f1e64caa0537761527b0a95e261bfa8ee216
--- /dev/null
+++ b/llama-datasets/history_of_alexnet/README.md
@@ -0,0 +1,61 @@
+# History Of Alexnet Dataset
+
+## CLI Usage
+
+You can download `llamadatasets` directly using `llamaindex-cli`, which comes installed with the `llama-index` python package:
+
+```bash
+llamaindex-cli download-llamadataset HistoryOfAlexnetDataset --download-dir ./data
+```
+
+You can then inspect the files at `./data`. When you're ready to load the data into
+python, you can use the below snippet of code:
+
+```python
+from llama_index.core import SimpleDirectoryReader
+from llama_index.core.llama_dataset import LabelledRagDataset
+
+rag_dataset = LabelledRagDataset.from_json("./data/rag_dataset.json")
+documents = SimpleDirectoryReader(input_dir="./data/source_files").load_data()
+```
+
+## Code Usage
+
+You can download the dataset to a directory, say `./data` directly in Python
+as well. From there, you can use the convenient `RagEvaluatorPack` llamapack to
+run your own LlamaIndex RAG pipeline with the `llamadataset`.
+
+```python
+from llama_index.core.llama_dataset import download_llama_dataset
+from llama_index.core.llama_pack import download_llama_pack
+from llama_index.core import VectorStoreIndex
+
+# download and install dependencies for benchmark dataset
+rag_dataset, documents = download_llama_dataset(
+    "HistoryOfAlexnetDataset", "./data"
+)
+
+# build basic RAG system
+index = VectorStoreIndex.from_documents(documents=documents)
+query_engine = index.as_query_engine()
+
+# evaluate using the RagEvaluatorPack
+RagEvaluatorPack = download_llama_pack(
+    "RagEvaluatorPack", "./rag_evaluator_pack"
+)
+rag_evaluator_pack = RagEvaluatorPack(
+    rag_dataset=rag_dataset, query_engine=query_engine
+)
+
+############################################################################
+# NOTE: If you have a lower tier OpenAI API subscription like Usage Tier 1 #
+# then you'll need to use different batch_size and sleep_time_in_seconds. #
+# For Usage Tier 1, settings that seemed to work well were batch_size=5, #
+# and sleep_time_in_seconds=15 (as of December 2023.) 
# +############################################################################ + +benchmark_df = await rag_evaluator_pack.arun( + batch_size=20, # batches the number of openai api calls to make + sleep_time_in_seconds=1, # seconds to sleep before making an api call +) +``` diff --git a/llama-datasets/history_of_alexnet/card.json b/llama-datasets/history_of_alexnet/card.json new file mode 100644 index 0000000000000000000000000000000000000000..632136c4aec75da1f7bfb98349ce0bfed0e09402 --- /dev/null +++ b/llama-datasets/history_of_alexnet/card.json @@ -0,0 +1,27 @@ +{ + "name": "History of Alexnet Dataset", + "className": "LabelledRagDataset", + "description": "A labelled RAG dataset based off an article, The History Began from AlexNet: A Comprehensive Survey on Deep Learning Approaches, by Md Zahangir Alom, Tarek M. Taha, Christopher Yakopcic, Stefan Westberg, Paheding Sidike, Mst Shamima Nasrin, Brian C Van Esesn, Abdul A S. Awwal, Vijayan K. Asari, consisting of queries, reference answers, and reference contexts.", + "numberObservations": 160, + "containsExamplesByHumans": false, + "containsExamplesByAi": true, + "sourceUrls": ["https://arxiv.org/abs/1803.01164"], + "baselines": [ + { + "name": "llamaindex", + "config": { + "chunkSize": 1024, + "llm": "gpt-3.5-turbo", + "similarityTopK": 2, + "embedModel": "text-embedding-ada-002" + }, + "metrics": { + "contextSimilarity": 0.931, + "correctness": 4.434, + "faithfulness": 0.963, + "relevancy": 0.931 + }, + "codeUrl": "https://github.com/run-llama/llama-hub/blob/main/llama_hub/llama_datasets/history_of_alexnet/llamaindex_baseline.py" + } + ] +} diff --git a/llama-datasets/history_of_alexnet/llamaindex_baseline.py b/llama-datasets/history_of_alexnet/llamaindex_baseline.py new file mode 100644 index 0000000000000000000000000000000000000000..dcf09d21a2db246a7e3d9ba8943f13429a7c33b7 --- /dev/null +++ b/llama-datasets/history_of_alexnet/llamaindex_baseline.py @@ -0,0 +1,35 @@ +import asyncio + +from llama_index.core.llama_dataset import download_llama_dataset +from llama_index.core.llama_pack import download_llama_pack +from llama_index.core import VectorStoreIndex + + +async def main(): + # DOWNLOAD LLAMADATASET + rag_dataset, documents = download_llama_dataset("HistoryOfAlexnetDataset", "./data") + + # BUILD BASIC RAG PIPELINE + index = VectorStoreIndex.from_documents(documents=documents) + query_engine = index.as_query_engine() + + # EVALUATE WITH PACK + RagEvaluatorPack = download_llama_pack("RagEvaluatorPack", "./pack_stuff") + rag_evaluator = RagEvaluatorPack(query_engine=query_engine, rag_dataset=rag_dataset) + + ############################################################################ + # NOTE: If have a lower tier subscription for OpenAI API like Usage Tier 1 # + # then you'll need to use different batch_size and sleep_time_in_seconds. # + # For Usage Tier 1, settings that seemed to work well were batch_size=5, # + # and sleep_time_in_seconds=15 (as of December 2023.) 
# + ############################################################################ + benchmark_df = await rag_evaluator.arun( + batch_size=20, # batches the number of openai api calls to make + sleep_time_in_seconds=1, # number of seconds sleep before making an api call + ) + print(benchmark_df) + + +if __name__ == "__main__": + loop = asyncio.get_event_loop() + loop.run_until_complete(main) diff --git a/llama-datasets/library.json b/llama-datasets/library.json new file mode 100644 index 0000000000000000000000000000000000000000..c30a293ccd74ac054b7c3d8a1fce644b70980e80 --- /dev/null +++ b/llama-datasets/library.json @@ -0,0 +1,87 @@ +{ + "PaulGrahamEssayDataset": { + "id": "paul_graham_essay", + "author": "nerdai", + "keywords": ["rag"] + }, + "BraintrustCodaHelpDeskDataset": { + "id": "braintrust_coda", + "author": "dashk", + "keywords": ["rag", "help desk"] + }, + "PatronusAIFinanceBenchDataset": { + "id": "patronus_financebench", + "author": "anandnk24", + "keywords": ["rag", "finance"] + }, + "BlockchainSolanaDataset": { + "id": "blockchain_solana", + "author": "CalculusC", + "keywords": ["rag", "cryptocurrency"] + }, + "MiniTruthfulQADataset": { + "id": "mini_truthfulqa", + "author": "nerdai", + "keywords": ["rag", "truthfulqa"] + }, + "Llama2PaperDataset": { + "id": "llama2_paper", + "author": "jerryjliu", + "keywords": ["rag", "llama2"] + }, + "Uber10KDataset2021": { + "id": "10k/uber_2021", + "author": "jerryjliu", + "keywords": ["sec", "uber", "10k"] + }, + "MiniSquadV2Dataset": { + "id": "mini_squadv2", + "author": "axiomofjoy", + "keywords": ["rag", "squadv2"] + }, + "OriginOfCovid19Dataset": { + "id": "origin_of_covid19", + "author": "CalculusC", + "keywords": ["rag", "covid-19"] + }, + "EvaluatingLlmSurveyPaperDataset": { + "id": "eval_llm_survey_paper", + "author": "nerdai", + "keywords": ["rag", "evaluation", "paper"] + }, + "CovidQaDataset": { + "id": "covidqa", + "author": "nerdai", + "keywords": ["rag", "covid"] + }, + "MiniCovidQaDataset": { + "id": "mini_covidqa", + "author": "nerdai", + "keywords": ["rag", "covid", "mini"] + }, + "HistoryOfAlexnetDataset": { + "id": "history_of_alexnet", + "author": "CalculusC", + "keywords": ["rag", "alexnet"] + }, + "DocugamiKgRagSec10Q": { + "id": "docugami_kg_rag/sec_10_q", + "author": "Docugami", + "keywords": ["rag", "kg-rag", "10q", "docugami"] + }, + "MtBenchHumanJudgementDataset": { + "id": "mt_bench_humanjudgement", + "author": "nerdai", + "keywords": ["evaluator", "llm as judge", "human agreement"] + }, + "MiniMtBenchSingleGradingDataset": { + "id": "mini_mt_bench_singlegrading", + "author": "nerdai", + "keywords": ["evaluator", "llm as judge"] + }, + "MiniEsgBenchDataset": { + "id": "mini_esg_bench", + "author": "nerdai", + "keywords": ["rag", "pdf", "esg"] + } +} diff --git a/llama-datasets/llama2_paper/BUILD b/llama-datasets/llama2_paper/BUILD new file mode 100644 index 0000000000000000000000000000000000000000..db46e8d6c978c67e301dd6c47bee08c1b3fd141c --- /dev/null +++ b/llama-datasets/llama2_paper/BUILD @@ -0,0 +1 @@ +python_sources() diff --git a/llama-datasets/llama2_paper/README.md b/llama-datasets/llama2_paper/README.md new file mode 100644 index 0000000000000000000000000000000000000000..457cd8fc2f188b56320513c2e54d9893b4b0fb52 --- /dev/null +++ b/llama-datasets/llama2_paper/README.md @@ -0,0 +1,59 @@ +# Llama 2 Paper Dataset + +## CLI Usage + +You can download `llamadatasets` directly using `llamaindex-cli`, which comes installed with the `llama-index` python package: + +```bash +llamaindex-cli download-llamadataset 
Llama2PaperDataset --download-dir ./data +``` + +You can then inspect the files at `./data`. When you're ready to load the data into +python, you can use the below snippet of code: + +```python +from llama_index.core import SimpleDirectoryReader +from llama_index.core.llama_dataset import LabelledRagDataset + +rag_dataset = LabelledRagDataset.from_json("./data/rag_dataset.json") +documents = SimpleDirectoryReader(input_dir="./data/source_files").load_data() +``` + +## Code Usage + +You can download the dataset to a directory, say `./data` directly in Python +as well. From there, you can use the convenient `RagEvaluatorPack` llamapack to +run your own LlamaIndex RAG pipeline with the `llamadataset`. + +```python +from llama_index.core.llama_dataset import download_llama_dataset +from llama_index.core.llama_pack import download_llama_pack +from llama_index.core import VectorStoreIndex + +# download and install dependencies for benchmark dataset +rag_dataset, documents = download_llama_dataset("Llama2PaperDataset", "./data") + +# build basic RAG system +index = VectorStoreIndex.from_documents(documents=documents) +query_engine = index.as_query_engine() + +# evaluate using the RagEvaluatorPack +RagEvaluatorPack = download_llama_pack( + "RagEvaluatorPack", "./rag_evaluator_pack" +) +rag_evaluator_pack = RagEvaluatorPack( + rag_dataset=rag_dataset, query_engine=query_engine +) + +############################################################################ +# NOTE: If have a lower tier subscription for OpenAI API like Usage Tier 1 # +# then you'll need to use different batch_size and sleep_time_in_seconds. # +# For Usage Tier 1, settings that seemed to work well were batch_size=5, # +# and sleep_time_in_seconds=15 (as of December 2023.) # +############################################################################ + +benchmark_df = await rag_evaluator_pack.arun( + batch_size=20, # batches the number of openai api calls to make + sleep_time_in_seconds=1, # seconds to sleep before making an api call +) +``` diff --git a/llama-datasets/llama2_paper/__init__.py b/llama-datasets/llama2_paper/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/llama-datasets/llama2_paper/card.json b/llama-datasets/llama2_paper/card.json new file mode 100644 index 0000000000000000000000000000000000000000..ce772a8f76205e2aeb7dfb4adcffb80bbfc76c4e --- /dev/null +++ b/llama-datasets/llama2_paper/card.json @@ -0,0 +1,27 @@ +{ + "name": "Llama 2 Paper Dataset", + "className": "LabelledRagDataset", + "description": "A labelled RAG dataset based off the Llama 2 ArXiv PDF.", + "numberObservations": 100, + "containsExamplesByHumans": false, + "containsExamplesByAi": true, + "sourceUrls": ["https://arxiv.org/abs/2307.09288"], + "baselines": [ + { + "name": "llamaindex", + "config": { + "chunkSize": 1024, + "llm": "gpt-3.5-turbo", + "similarityTopK": 2, + "embedModel": "text-embedding-ada-002" + }, + "metrics": { + "contextSimilarity": 0.939, + "correctness": 4.08, + "faithfulness": 0.97, + "relevancy": 0.95 + }, + "codeUrl": "https://github.com/run-llama/llama-hub/blob/main/llama_hub/llama_datasets/llama2_paper/llamaindex_baseline.py" + } + ] +} diff --git a/llama-datasets/llama2_paper/llamaindex_baseline.py b/llama-datasets/llama2_paper/llamaindex_baseline.py new file mode 100644 index 0000000000000000000000000000000000000000..110208f68248c7c3cf789f2cab6596a494eb82e8 --- /dev/null +++ b/llama-datasets/llama2_paper/llamaindex_baseline.py @@ -0,0 
+1,35 @@
+import asyncio
+
+from llama_index.core.llama_dataset import download_llama_dataset
+from llama_index.core.llama_pack import download_llama_pack
+from llama_index.core import VectorStoreIndex
+
+
+async def main():
+    # DOWNLOAD LLAMADATASET
+    rag_dataset, documents = download_llama_dataset("Llama2PaperDataset", "./data")
+
+    # BUILD BASIC RAG PIPELINE
+    index = VectorStoreIndex.from_documents(documents=documents)
+    query_engine = index.as_query_engine()
+
+    # EVALUATE WITH PACK
+    RagEvaluatorPack = download_llama_pack("RagEvaluatorPack", "./pack_stuff")
+    rag_evaluator = RagEvaluatorPack(query_engine=query_engine, rag_dataset=rag_dataset)
+
+    ############################################################################
+    # NOTE: If you have a lower tier OpenAI API subscription like Usage Tier 1 #
+    # then you'll need to use different batch_size and sleep_time_in_seconds. #
+    # For Usage Tier 1, settings that seemed to work well were batch_size=5, #
+    # and sleep_time_in_seconds=15 (as of December 2023.) #
+    ############################################################################
+    benchmark_df = await rag_evaluator.arun(
+        batch_size=20,  # batches the number of openai api calls to make
+        sleep_time_in_seconds=1,  # number of seconds to sleep before making an api call
+    )
+    print(benchmark_df)
+
+
+if __name__ == "__main__":
+    loop = asyncio.get_event_loop()
+    loop.run_until_complete(main())
diff --git a/llama-datasets/mini_covidqa/BUILD b/llama-datasets/mini_covidqa/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..db46e8d6c978c67e301dd6c47bee08c1b3fd141c
--- /dev/null
+++ b/llama-datasets/mini_covidqa/BUILD
@@ -0,0 +1 @@
+python_sources()
diff --git a/llama-datasets/mini_covidqa/README.md b/llama-datasets/mini_covidqa/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..350c82c739e3339965b4c97bb19b6d30e3b3b20c
--- /dev/null
+++ b/llama-datasets/mini_covidqa/README.md
@@ -0,0 +1,59 @@
+# Mini Covid Qa Dataset
+
+## CLI Usage
+
+You can download `llamadatasets` directly using `llamaindex-cli`, which comes installed with the `llama-index` python package:
+
+```bash
+llamaindex-cli download-llamadataset MiniCovidQaDataset --download-dir ./data
+```
+
+You can then inspect the files at `./data`. When you're ready to load the data into
+python, you can use the below snippet of code:
+
+```python
+from llama_index.core import SimpleDirectoryReader
+from llama_index.core.llama_dataset import LabelledRagDataset
+
+rag_dataset = LabelledRagDataset.from_json("./data/rag_dataset.json")
+documents = SimpleDirectoryReader(input_dir="./data/source_files").load_data()
+```
+
+## Code Usage
+
+You can download the dataset to a directory, say `./data` directly in Python
+as well. From there, you can use the convenient `RagEvaluatorPack` llamapack to
+run your own LlamaIndex RAG pipeline with the `llamadataset`. 
+ +```python +from llama_index.core.llama_dataset import download_llama_dataset +from llama_index.core.llama_pack import download_llama_pack +from llama_index.core import VectorStoreIndex + +# download and install dependencies for benchmark dataset +rag_dataset, documents = download_llama_dataset("MiniCovidQaDataset", "./data") + +# build basic RAG system +index = VectorStoreIndex.from_documents(documents=documents) +query_engine = index.as_query_engine() + +# evaluate using the RagEvaluatorPack +RagEvaluatorPack = download_llama_pack( + "RagEvaluatorPack", "./rag_evaluator_pack" +) +rag_evaluator_pack = RagEvaluatorPack( + rag_dataset=rag_dataset, query_engine=query_engine +) + +############################################################################ +# NOTE: If have a lower tier subscription for OpenAI API like Usage Tier 1 # +# then you'll need to use different batch_size and sleep_time_in_seconds. # +# For Usage Tier 1, settings that seemed to work well were batch_size=5, # +# and sleep_time_in_seconds=15 (as of December 2023.) # +############################################################################ + +benchmark_df = await rag_evaluator_pack.arun( + batch_size=20, # batches the number of openai api calls to make + sleep_time_in_seconds=1, # seconds to sleep before making an api call +) +``` diff --git a/llama-datasets/mini_covidqa/card.json b/llama-datasets/mini_covidqa/card.json new file mode 100644 index 0000000000000000000000000000000000000000..99c5b97746ad68718166f04a6ccd5133f6f536ee --- /dev/null +++ b/llama-datasets/mini_covidqa/card.json @@ -0,0 +1,29 @@ +{ + "name": "Mini Covid QA Dataset", + "className": "LabelledRagDataset", + "description": "This dataset is a mini version of CovidQaDataset.\n A human-annotated RAG dataset consisting of over 300 question-answer pairs. This dataset represents a subset of the Covid-QA dataset available on Kaggle and authored by Xhlulu. It is a collection of frequently asked questions on COVID from various websites. 
This subset only considers the top 10 webpages containing the most question-answer pairs.", + "numberObservations": 42, + "containsExamplesByHumans": true, + "containsExamplesByAi": false, + "sourceUrls": [ + "https://www.kaggle.com/datasets/xhlulu/mini_covidqa/?select=news.csv" + ], + "baselines": [ + { + "name": "llamaindex", + "config": { + "chunkSize": 1024, + "llm": "gpt-3.5-turbo", + "similarityTopK": 2, + "embedModel": "text-embedding-ada-002" + }, + "metrics": { + "contextSimilarity": null, + "correctness": 4.214, + "faithfulness": 0.857, + "relevancy": 0.833 + }, + "codeUrl": "https://github.com/run-llama/llama-hub/blob/main/llama_hub/llama_datasets/mini_covidqa/llamaindex_baseline.py" + } + ] +} diff --git a/llama-datasets/mini_covidqa/llamaindex_baseline.py b/llama-datasets/mini_covidqa/llamaindex_baseline.py new file mode 100644 index 0000000000000000000000000000000000000000..bcc9e5b963f815a9f18029db50a9b8a67f04c408 --- /dev/null +++ b/llama-datasets/mini_covidqa/llamaindex_baseline.py @@ -0,0 +1,35 @@ +import asyncio + +from llama_index.core.llama_dataset import download_llama_dataset +from llama_index.core.llama_pack import download_llama_pack +from llama_index.core import VectorStoreIndex + + +async def main(): + # DOWNLOAD LLAMADATASET + rag_dataset, documents = download_llama_dataset("MiniCovidQaDataset", "./data") + + # BUILD BASIC RAG PIPELINE + index = VectorStoreIndex.from_documents(documents=documents) + query_engine = index.as_query_engine() + + # EVALUATE WITH PACK + RagEvaluatorPack = download_llama_pack("RagEvaluatorPack", "./pack") + rag_evaluator = RagEvaluatorPack(query_engine=query_engine, rag_dataset=rag_dataset) + + ############################################################################ + # NOTE: If have a lower tier subscription for OpenAI API like Usage Tier 1 # + # then you'll need to use different batch_size and sleep_time_in_seconds. # + # For Usage Tier 1, settings that seemed to work well were batch_size=5, # + # and sleep_time_in_seconds=15 (as of December 2023.) # + ############################################################################ + benchmark_df = await rag_evaluator.arun( + batch_size=40, # batches the number of openai api calls to make + sleep_time_in_seconds=1, # number of seconds sleep before making an api call + ) + print(benchmark_df) + + +if __name__ == "__main__": + loop = asyncio.get_event_loop() + loop.run_until_complete(main) diff --git a/llama-datasets/mini_esg_bench/BUILD b/llama-datasets/mini_esg_bench/BUILD new file mode 100644 index 0000000000000000000000000000000000000000..db46e8d6c978c67e301dd6c47bee08c1b3fd141c --- /dev/null +++ b/llama-datasets/mini_esg_bench/BUILD @@ -0,0 +1 @@ +python_sources() diff --git a/llama-datasets/mini_esg_bench/README.md b/llama-datasets/mini_esg_bench/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2f3c89c00115b79540e6948f91df90eedee4c3be --- /dev/null +++ b/llama-datasets/mini_esg_bench/README.md @@ -0,0 +1,75 @@ +# Mini Esg Bench Dataset + +## CLI Usage + +You can download `llamadatasets` directly using `llamaindex-cli`, which comes installed with the `llama-index` python package: + +```bash +llamaindex-cli download-llamadataset MiniEsgBenchDataset --download-dir ./data +``` + +You can then inspect the files at `./data`. 
When you're ready to load the data into +python, you can use the below snippet of code: + +```python +from llama_index.core import SimpleDirectoryReader +from llama_index.core.llama_dataset import LabelledRagDataset + +rag_dataset = LabelledRagDataset.from_json("./data/rag_dataset.json") +documents = SimpleDirectoryReader(input_dir="./data/source_files").load_data() +``` + +## Code Usage + +You can download the dataset to a directory, say `./data` directly in Python +as well. From there, you can use the convenient `RagEvaluatorPack` llamapack to +run your own LlamaIndex RAG pipeline with the `llamadataset`. + +```python +from llama_index.core.llama_dataset import download_llama_dataset +from llama_index.core.llama_pack import download_llama_pack +from llama_index.core import VectorStoreIndex + +# download and install dependencies for benchmark dataset +rag_dataset, documents = download_llama_dataset( + "MiniEsgBenchDataset", "./data" +) + +# build basic RAG system +index = VectorStoreIndex.from_documents(documents=documents) +query_engine = index.as_query_engine() + +# evaluate using the RagEvaluatorPack +RagEvaluatorPack = download_llama_pack( + "RagEvaluatorPack", "./rag_evaluator_pack" +) +rag_evaluator_pack = RagEvaluatorPack( + rag_dataset=rag_dataset, query_engine=query_engine +) + +############################################################################ +# NOTE: If have a lower tier subscription for OpenAI API like Usage Tier 1 # +# then you'll need to use different batch_size and sleep_time_in_seconds. # +# For Usage Tier 1, settings that seemed to work well were batch_size=5, # +# and sleep_time_in_seconds=15 (as of December 2023.) # +############################################################################ + +benchmark_df = await rag_evaluator_pack.arun( + batch_size=20, # batches the number of openai api calls to make + sleep_time_in_seconds=1, # seconds to sleep before making an api call +) +``` + +## Citing the data + +If you choose to use this dataset for research, it would be appreciated if you +could cite it with given details below. + +```text +@misc{llamaindex_mini_esg_bench_2023, + title={Mini ESG Bench}, + author={Val Andrei Fajardo}, + year={2023}, + organization={llamaindex} +} +``` diff --git a/llama-datasets/mini_esg_bench/card.json b/llama-datasets/mini_esg_bench/card.json new file mode 100644 index 0000000000000000000000000000000000000000..726fd0909bb4aa47f7cc7bfc5b1a9b3f605c73f3 --- /dev/null +++ b/llama-datasets/mini_esg_bench/card.json @@ -0,0 +1,27 @@ +{ + "name": "Mini ESG Bench Dataset", + "className": "LabelledRagDataset", + "description": "This dataset is meant to be a difficult benchmark for pdf parsers. In particular, adopting the terminology used in the PDFTriage paper (https://arxiv.org/abs/2309.08872), we curate difficult questions involving structural knowledge of the PDF documents. 
The examples in this dataset come from the Environment, Social and (corporate) Governance (ESG) reports of FAANG (companies) and Microsoft in 2021-2022.", + "numberObservations": 50, + "containsExamplesByHumans": true, + "containsExamplesByAi": false, + "sourceUrls": [], + "baselines": [ + { + "name": "llamaindex", + "config": { + "chunkSize": 1024, + "llm": "gpt-3.5-turbo", + "similarityTopK": 2, + "embedModel": "text-embedding-ada-002" + }, + "metrics": { + "contextSimilarity": 0.836, + "correctness": 1.88, + "faithfulness": 0.84, + "relevancy": 0.6 + }, + "codeUrl": "https://github.com/run-llama/llama-hub/blob/main/llama_hub/llama_datasets/mini_esg_bench/llamaindex_baseline.py" + } + ] +} diff --git a/llama-datasets/mini_esg_bench/llamaindex_baseline.py b/llama-datasets/mini_esg_bench/llamaindex_baseline.py new file mode 100644 index 0000000000000000000000000000000000000000..7aa4607b0f2715f5348dd4844cfe7dd4ba6244e9 --- /dev/null +++ b/llama-datasets/mini_esg_bench/llamaindex_baseline.py @@ -0,0 +1,35 @@ +import asyncio + +from llama_index.core.llama_dataset import download_llama_dataset +from llama_index.core.llama_pack import download_llama_pack +from llama_index.core import VectorStoreIndex + + +async def main(): + # DOWNLOAD LLAMADATASET + rag_dataset, documents = download_llama_dataset("MiniEsgBenchDataset", "./data") + + # BUILD BASIC RAG PIPELINE + index = VectorStoreIndex.from_documents(documents=documents) + query_engine = index.as_query_engine() + + # EVALUATE WITH PACK + RagEvaluatorPack = download_llama_pack("RagEvaluatorPack", "./pack_stuff") + rag_evaluator = RagEvaluatorPack(query_engine=query_engine, rag_dataset=rag_dataset) + + ############################################################################ + # NOTE: If have a lower tier subscription for OpenAI API like Usage Tier 1 # + # then you'll need to use different batch_size and sleep_time_in_seconds. # + # For Usage Tier 1, settings that seemed to work well were batch_size=5, # + # and sleep_time_in_seconds=15 (as of December 2023.) # + ############################################################################ + benchmark_df = await rag_evaluator.arun( + batch_size=20, # batches the number of openai api calls to make + sleep_time_in_seconds=1, # number of seconds sleep before making an api call + ) + print(benchmark_df) + + +if __name__ == "__main__": + loop = asyncio.get_event_loop() + loop.run_until_complete(main) diff --git a/llama-datasets/mini_mt_bench_singlegrading/BUILD b/llama-datasets/mini_mt_bench_singlegrading/BUILD new file mode 100644 index 0000000000000000000000000000000000000000..db46e8d6c978c67e301dd6c47bee08c1b3fd141c --- /dev/null +++ b/llama-datasets/mini_mt_bench_singlegrading/BUILD @@ -0,0 +1 @@ +python_sources() diff --git a/llama-datasets/mini_mt_bench_singlegrading/README.md b/llama-datasets/mini_mt_bench_singlegrading/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ed7170a4f932161c76407c10088e5749121c8937 --- /dev/null +++ b/llama-datasets/mini_mt_bench_singlegrading/README.md @@ -0,0 +1,82 @@ +# Mini Mt Bench Single Grading Dataset + +## CLI Usage + +You can download `llamadatasets` directly using `llamaindex-cli`, which comes installed with the `llama-index` python package: + +```bash +llamaindex-cli download-llamadataset MiniMtBenchSingleGradingDataset --download-dir ./data +``` + +You can then inspect the files at `./data`. 
When you're ready to load the data into +python, you can use the below snippet of code: + +```python +from llama_index.core import SimpleDirectoryReader +from llama_index.core.llama_dataset import LabelledEvaluatorDataset + +evaluator_dataset = LabelledEvaluatorDataset.from_json( + "./data/pairwise_evaluation_dataset.json" +) +``` + +## Code Usage + +You can download the dataset to a directory, say `./data` directly in Python +as well. From there, you can use the convenient `EvaluatorBenchmarkerPack` llamapack to +run your own LlamaIndex RAG pipeline with the `llamadataset`. + +```python +from llama_index.core.llama_dataset import download_llama_dataset +from llama_index.core.llama_pack import download_llama_pack +from llama_index.core.evaluation import CorrectnessEvaluator +from llama_index.llms import OpenAI +from llama_index.core import ServiceContext + +# download benchmark dataset +evaluator_dataset, _ = download_llama_dataset( + "MiniMtBenchSingleGradingDataset", "./data" +) + +# define your evaluator +gpt_4_context = ServiceContext.from_defaults( + llm=OpenAI(temperature=0, model="gpt-4"), +) + +evaluator = CorrectnessEvaluator(service_context=gpt_4_context) + +# evaluate using the EvaluatorBenchmarkerPack +EvaluatorBenchmarkerPack = download_llama_pack( + "EvaluatorBenchmarkerPack", "./pack" +) +evaluator_benchmarker = EvaluatorBenchmarkerPack( + evaluator=evaluator, + eval_dataset=evaluator_dataset, + show_progress=True, +) + +############################################################################ +# NOTE: If have a lower tier subscription for OpenAI API like Usage Tier 1 # +# then you'll need to use different batch_size and sleep_time_in_seconds. # +# For Usage Tier 1, settings that seemed to work well were batch_size=5, # +# and sleep_time_in_seconds=15 (as of December 2023.) # +############################################################################ + +benchmark_df = await evaluator_benchmarker.arun( + batch_size=20, # batches the number of openai api calls to make + sleep_time_in_seconds=1, # seconds to sleep before making an api call +) +``` + +## Original data citation + +```text +@misc{zheng2023judging, + title={Judging LLM-as-a-judge with MT-Bench and Chatbot Arena}, + author={Lianmin Zheng and Wei-Lin Chiang and Ying Sheng and Siyuan Zhuang and Zhanghao Wu and Yonghao Zhuang and Zi Lin and Zhuohan Li and Dacheng Li and Eric. P Xing and Hao Zhang and Joseph E. 
Gonzalez and Ion Stoica}, + year={2023}, + eprint={2306.05685}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` diff --git a/llama-datasets/mini_mt_bench_singlegrading/baselines.py b/llama-datasets/mini_mt_bench_singlegrading/baselines.py new file mode 100644 index 0000000000000000000000000000000000000000..f8096e882ba40fb22e78a78f40dd7fb9a1a1e112 --- /dev/null +++ b/llama-datasets/mini_mt_bench_singlegrading/baselines.py @@ -0,0 +1,84 @@ +import asyncio + +from llama_index.core.llama_dataset import download_llama_dataset +from llama_index.core.llama_pack import download_llama_pack +from llama_index.core.evaluation import CorrectnessEvaluator +from llama_index.llms import OpenAI, Gemini +from llama_index.core import ServiceContext +import pandas as pd + + +async def main(): + # DOWNLOAD LLAMADATASET + evaluator_dataset, _ = download_llama_dataset( + "MiniMtBenchSingleGradingDataset", "./mini_mt_bench_data" + ) + + # DEFINE EVALUATORS + gpt_4_context = ServiceContext.from_defaults( + llm=OpenAI(temperature=0, model="gpt-4"), + ) + + gpt_3p5_context = ServiceContext.from_defaults( + llm=OpenAI(temperature=0, model="gpt-3.5-turbo"), + ) + + gemini_pro_context = ServiceContext.from_defaults( + llm=Gemini(model="models/gemini-pro", temperature=0) + ) + + evaluators = { + "gpt-4": CorrectnessEvaluator(service_context=gpt_4_context), + "gpt-3.5": CorrectnessEvaluator(service_context=gpt_3p5_context), + "gemini-pro": CorrectnessEvaluator(service_context=gemini_pro_context), + } + + # EVALUATE WITH PACK + ############################################################################ + # NOTE: If have a lower tier subscription for OpenAI API like Usage Tier 1 # + # then you'll need to use different batch_size and sleep_time_in_seconds. # + # For Usage Tier 1, settings that seemed to work well were batch_size=5, # + # and sleep_time_in_seconds=15 (as of December 2023.) 
# + ############################################################################ + EvaluatorBenchmarkerPack = download_llama_pack("EvaluatorBenchmarkerPack", "./pack") + evaluator_benchmarker = EvaluatorBenchmarkerPack( + evaluator=evaluators["gpt-3.5"], + eval_dataset=evaluator_dataset, + show_progress=True, + ) + gpt_3p5_benchmark_df = await evaluator_benchmarker.arun( + batch_size=100, sleep_time_in_seconds=0 + ) + + evaluator_benchmarker = EvaluatorBenchmarkerPack( + evaluator=evaluators["gpt-4"], + eval_dataset=evaluator_dataset, + show_progress=True, + ) + gpt_4_benchmark_df = await evaluator_benchmarker.arun( + batch_size=100, sleep_time_in_seconds=0 + ) + + evaluator_benchmarker = EvaluatorBenchmarkerPack( + evaluator=evaluators["gemini-pro"], + eval_dataset=evaluator_dataset, + show_progress=True, + ) + gemini_pro_benchmark_df = await evaluator_benchmarker.arun( + batch_size=5, sleep_time_in_seconds=0.5 + ) + + benchmark_df = pd.concat( + [ + gpt_3p5_benchmark_df, + gpt_4_benchmark_df, + gemini_pro_benchmark_df, + ], + axis=0, + ) + print(benchmark_df) + + +if __name__ == "__main__": + loop = asyncio.get_event_loop() + loop.run_until_complete(main) diff --git a/llama-datasets/mini_mt_bench_singlegrading/card.json b/llama-datasets/mini_mt_bench_singlegrading/card.json new file mode 100644 index 0000000000000000000000000000000000000000..06bfe889a7dd2924630444ee1d42e14e1a839907 --- /dev/null +++ b/llama-datasets/mini_mt_bench_singlegrading/card.json @@ -0,0 +1,55 @@ +{ + "name": "Mini MT Bench Dataset", + "className": "LabelledEvaluatorDataset", + "description": "This is a miniature version to the original MT Bench (Single-Grading) Dataset. In particular, this dataset only consists of answers produced by Llama2-70b LLM to the 160 questions i.e., 80 x 2 since there are two turns. 
The reference evaluations are done using the `CorrectnessEvaluator` class and with GPT-4 as the judge LLM.", + "numberObservations": 160, + "containsExamplesByHumans": false, + "containsExamplesByAi": true, + "sourceUrls": [ + "https://huggingface.co/spaces/lmsys/mt-bench/tree/main/data/mt_bench" + ], + "baselines": [ + { + "name": "gpt-3.5", + "config": { + "promptUrl": "https://github.com/run-llama/llama_index.core/blob/e471e5f8a93ddae6d366cdbba8a497cd6728c7f8/llama_index.core/evaluation/correctness.py#L17", + "llm": "gpt-3.5" + }, + "metrics": { + "invalidPredictions": 0, + "correlation": 0.317, + "meanAbsoluteError": 1.119, + "hamming": 27 + }, + "codeUrl": "https://github.com/run-llama/llama-hub/blob/main/llama_hub/llama_datasets/mini_mt_bench_singlegrading/baselines.py" + }, + { + "name": "gpt-4", + "config": { + "promptUrl": "https://github.com/run-llama/llama_index.core/blob/e471e5f8a93ddae6d366cdbba8a497cd6728c7f8/llama_index.core/evaluation/correctness.py#L17", + "llm": "gpt-4" + }, + "metrics": { + "invalidPredictions": 0, + "correlation": 0.966, + "meanAbsoluteError": 0.094, + "hamming": 143 + }, + "codeUrl": "https://github.com/run-llama/llama-hub/blob/main/llama_hub/llama_datasets/mini_mt_bench_singlegrading/baselines.py" + }, + { + "name": "gemini-pro", + "config": { + "promptUrl": "https://github.com/run-llama/llama_index.core/blob/e471e5f8a93ddae6d366cdbba8a497cd6728c7f8/llama_index.core/evaluation/correctness.py#L17", + "llm": "gemini-pro" + }, + "metrics": { + "invalidPredictions": 1, + "correlation": 0.295, + "meanAbsoluteError": 1.22, + "hamming": 12 + }, + "codeUrl": "https://github.com/run-llama/llama-hub/blob/main/llama_hub/llama_datasets/mini_mt_bench_singlegrading/baselines.py" + } + ] +} diff --git a/llama-datasets/mini_squadv2/BUILD b/llama-datasets/mini_squadv2/BUILD new file mode 100644 index 0000000000000000000000000000000000000000..db46e8d6c978c67e301dd6c47bee08c1b3fd141c --- /dev/null +++ b/llama-datasets/mini_squadv2/BUILD @@ -0,0 +1 @@ +python_sources() diff --git a/llama-datasets/mini_squadv2/README.md b/llama-datasets/mini_squadv2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ea5cdfd259e9fe67fccf9d3a47341c7b78315849 --- /dev/null +++ b/llama-datasets/mini_squadv2/README.md @@ -0,0 +1,79 @@ +# Mini Squad V2 Dataset + +[](https://arize.com/) + +This dataset was prepared in collaboration with Xander Song of Arize AI. + +## CLI Usage + +You can download `llamadatasets` directly using `llamaindex-cli`, which comes installed with the `llama-index` python package: + +```bash +llamaindex-cli download-llamadataset MiniSquadV2Dataset --download-dir ./data +``` + +You can then inspect the files at `./data`. When you're ready to load the data into +python, you can use the below snippet of code: + +```python +from llama_index.core import SimpleDirectoryReader +from llama_index.core.llama_dataset import LabelledRagDataset + +rag_dataset = LabelledRagDataset.from_json("./data/rag_dataset.json") +documents = SimpleDirectoryReader(input_dir="./data/source_files").load_data() +``` + +## Code Usage + +You can download the dataset to a directory, say `./data` directly in Python +as well. From there, you can use the convenient `RagEvaluatorPack` llamapack to +run your own LlamaIndex RAG pipeline with the `llamadataset`. 
+ +```python +from llama_index.core.llama_dataset import download_llama_dataset +from llama_index.core.llama_pack import download_llama_pack +from llama_index.core import VectorStoreIndex + +# download and install dependencies for benchmark dataset +rag_dataset, documents = download_llama_dataset("MiniSquadV2Dataset", "./data") + +# build basic RAG system +index = VectorStoreIndex.from_documents(documents=documents) +query_engine = index.as_query_engine() + +# evaluate using the RagEvaluatorPack +RagEvaluatorPack = download_llama_pack( + "RagEvaluatorPack", "./rag_evaluator_pack" +) +rag_evaluator_pack = RagEvaluatorPack( + rag_dataset=rag_dataset, query_engine=query_engine +) + +############################################################################ +# NOTE: If have a lower tier subscription for OpenAI API like Usage Tier 1 # +# then you'll need to use different batch_size and sleep_time_in_seconds. # +# For Usage Tier 1, settings that seemed to work well were batch_size=5, # +# and sleep_time_in_seconds=15 (as of December 2023.) # +############################################################################ + +benchmark_df = await rag_evaluator_pack.arun( + batch_size=20, # batches the number of openai api calls to make + sleep_time_in_seconds=1, # seconds to sleep before making an api call +) +``` + +## Original data citation + +```tex +@article{2016arXiv160605250R, + author = {{Rajpurkar}, Pranav and {Zhang}, Jian and {Lopyrev}, + Konstantin and {Liang}, Percy}, + title = "{SQuAD: 100,000+ Questions for Machine Comprehension of Text}", + journal = {arXiv e-prints}, + year = 2016, + eid = {arXiv:1606.05250}, + pages = {arXiv:1606.05250}, +archivePrefix = {arXiv}, + eprint = {1606.05250}, +} +``` diff --git a/llama-datasets/mini_squadv2/card.json b/llama-datasets/mini_squadv2/card.json new file mode 100644 index 0000000000000000000000000000000000000000..6c5adffa7cd01a244bf737f1c34ab45ef818f519 --- /dev/null +++ b/llama-datasets/mini_squadv2/card.json @@ -0,0 +1,27 @@ +{ + "name": "Mini Squad V2 Dataset", + "className": "LabelledRagDataset", + "description": "This is a subset of the original SquadV2 dataset. 
In particular, it considers only the top 10 Wikipedia pages in terms of having questions about them.", + "numberObservations": 195, + "containsExamplesByHumans": true, + "containsExamplesByAi": false, + "sourceUrls": ["https://huggingface.co/datasets/squad_v2"], + "baselines": [ + { + "name": "llamaindex", + "config": { + "chunkSize": 1024, + "llm": "gpt-3.5-turbo", + "similarityTopK": 2, + "embedModel": "text-embedding-ada-002" + }, + "metrics": { + "contextSimilarity": 0.878, + "correctness": 3.464, + "faithfulness": 0.815, + "relevancy": 0.697 + }, + "codeUrl": "https://github.com/run-llama/llama-hub/blob/main/llama_hub/llama_datasets/mini_squadv2/llamaindex_baseline.py" + } + ] +} diff --git a/llama-datasets/mini_squadv2/llamaindex_baseline.py b/llama-datasets/mini_squadv2/llamaindex_baseline.py new file mode 100644 index 0000000000000000000000000000000000000000..5bfe2a58b6451679d1a0d2e7b40094c0560ab038 --- /dev/null +++ b/llama-datasets/mini_squadv2/llamaindex_baseline.py @@ -0,0 +1,35 @@ +import asyncio + +from llama_index.core.llama_dataset import download_llama_dataset +from llama_index.core.llama_pack import download_llama_pack +from llama_index.core import VectorStoreIndex + + +async def main(): + # DOWNLOAD LLAMADATASET + rag_dataset, documents = download_llama_dataset("MiniSquadV2Dataset", "./data") + + # BUILD BASIC RAG PIPELINE + index = VectorStoreIndex.from_documents(documents=documents) + query_engine = index.as_query_engine() + + # EVALUATE WITH PACK + RagEvaluatorPack = download_llama_pack("RagEvaluatorPack", "./pack_stuff") + rag_evaluator = RagEvaluatorPack(query_engine=query_engine, rag_dataset=rag_dataset) + + ############################################################################ + # NOTE: If have a lower tier subscription for OpenAI API like Usage Tier 1 # + # then you'll need to use different batch_size and sleep_time_in_seconds. # + # For Usage Tier 1, settings that seemed to work well were batch_size=5, # + # and sleep_time_in_seconds=15 (as of December 2023.) # + ############################################################################ + benchmark_df = await rag_evaluator.arun( + batch_size=20, # batches the number of openai api calls to make + sleep_time_in_seconds=1, # number of seconds sleep before making an api call + ) + print(benchmark_df) + + +if __name__ == "__main__": + loop = asyncio.get_event_loop() + loop.run_until_complete(main) diff --git a/llama-datasets/mini_truthfulqa/BUILD b/llama-datasets/mini_truthfulqa/BUILD new file mode 100644 index 0000000000000000000000000000000000000000..db46e8d6c978c67e301dd6c47bee08c1b3fd141c --- /dev/null +++ b/llama-datasets/mini_truthfulqa/BUILD @@ -0,0 +1 @@ +python_sources() diff --git a/llama-datasets/mini_truthfulqa/README.md b/llama-datasets/mini_truthfulqa/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ae8b99b1177c841c23364251277beecc8215e9c --- /dev/null +++ b/llama-datasets/mini_truthfulqa/README.md @@ -0,0 +1,74 @@ +# Mini TruthfulQA Dataset + +## CLI Usage + +You can download `llamadatasets` directly using `llamaindex-cli`, which comes installed with the `llama-index` python package: + +```bash +llamaindex-cli download-llamadataset MiniTruthfulQADataset --download-dir ./data +``` + +You can then inspect the files at `./data`. 
When you're ready to load the data into +python, you can use the below snippet of code: + +```python +from llama_index.core import SimpleDirectoryReader +from llama_index.core.llama_dataset import LabelledRagDataset + +rag_dataset = LabelledRagDataset.from_json("./data/rag_dataset.json") +documents = SimpleDirectoryReader(input_dir="./data/source_files").load_data() +``` + +## Code Usage + +You can download the dataset to a directory, say `./data` directly in Python +as well. From there, you can use the convenient `RagEvaluatorPack` llamapack to +run your own LlamaIndex RAG pipeline with the `llamadataset`. + +```python +from llama_index.core.llama_dataset import download_llama_dataset +from llama_index.core.llama_pack import download_llama_pack +from llama_index.core import VectorStoreIndex + +# download and install dependencies for benchmark dataset +rag_dataset, documents = download_llama_dataset( + "MiniTruthfulQADataset", "./data" +) + +# build basic RAG system +index = VectorStoreIndex.from_documents(documents=documents) +query_engine = index.as_query_engine() + +# evaluate using the RagEvaluatorPack +RagEvaluatorPack = download_llama_pack( + "RagEvaluatorPack", "./rag_evaluator_pack" +) +rag_evaluator_pack = RagEvaluatorPack( + rag_dataset=rag_dataset, query_engine=query_engine +) + +############################################################################ +# NOTE: If have a lower tier subscription for OpenAI API like Usage Tier 1 # +# then you'll need to use different batch_size and sleep_time_in_seconds. # +# For Usage Tier 1, settings that seemed to work well were batch_size=5, # +# and sleep_time_in_seconds=15 (as of December 2023.) # +############################################################################ + +benchmark_df = await rag_evaluator_pack.arun( + batch_size=20, # batches the number of openai api calls to make + sleep_time_in_seconds=1, # seconds to sleep before making an api call +) +``` + +## Original data citation + +```tex +@misc{lin2021truthfulqa, + title={TruthfulQA: Measuring How Models Mimic Human Falsehoods}, + author={Stephanie Lin and Jacob Hilton and Owain Evans}, + year={2021}, + eprint={2109.07958}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` diff --git a/llama-datasets/mini_truthfulqa/card.json b/llama-datasets/mini_truthfulqa/card.json new file mode 100644 index 0000000000000000000000000000000000000000..dae559b9612700cc3202eafcb2f87f281608d5a7 --- /dev/null +++ b/llama-datasets/mini_truthfulqa/card.json @@ -0,0 +1,27 @@ +{ + "name": "Mini TruthfulQA Dataset", + "className": "LabelledRagDataset", + "description": "This is a subset of the TruthfulQA benchmark. Only examples that are based off of Wikipedia pages are considered; and furthermore, Wikipedia pages that contain only one question are also dropped. 
The result is 152 examples for evaluating a RAG system.", + "numberObservations": 152, + "containsExamplesByHumans": true, + "containsExamplesByAi": false, + "sourceUrls": ["https://huggingface.co/datasets/truthful_qa"], + "baselines": [ + { + "name": "llamaindex", + "config": { + "chunkSize": 1024, + "llm": "gpt-3.5-turbo", + "similarityTopK": 2, + "embedModel": "text-embedding-ada-002" + }, + "metrics": { + "contextSimilarity": null, + "correctness": 3.845, + "faithfulness": 0.605, + "relevancy": 0.599 + }, + "codeUrl": "https://github.com/run-llama/llama-hub/blob/main/llama_hub/llama_datasets/mini_truthfulqa/llamaindex_baseline.py" + } + ] +} diff --git a/llama-datasets/mini_truthfulqa/llamaindex_baseline.py b/llama-datasets/mini_truthfulqa/llamaindex_baseline.py new file mode 100644 index 0000000000000000000000000000000000000000..457fafcc5863663b5583f18819c247287961606b --- /dev/null +++ b/llama-datasets/mini_truthfulqa/llamaindex_baseline.py @@ -0,0 +1,35 @@ +import asyncio + +from llama_index.core.llama_dataset import download_llama_dataset +from llama_index.core.llama_pack import download_llama_pack +from llama_index.core import VectorStoreIndex + + +async def main(): + # DOWNLOAD LLAMADATASET + rag_dataset, documents = download_llama_dataset("MiniTruthfulQADataset", "./data") + + # BUILD BASIC RAG PIPELINE + index = VectorStoreIndex.from_documents(documents=documents) + query_engine = index.as_query_engine() + + # EVALUATE WITH PACK + RagEvaluatorPack = download_llama_pack("RagEvaluatorPack", "./pack") + rag_evaluator = RagEvaluatorPack(query_engine=query_engine, rag_dataset=rag_dataset) + + ############################################################################ + # NOTE: If have a lower tier subscription for OpenAI API like Usage Tier 1 # + # then you'll need to use different batch_size and sleep_time_in_seconds. # + # For Usage Tier 1, settings that seemed to work well were batch_size=5, # + # and sleep_time_in_seconds=15 (as of December 2023.) # + ############################################################################ + benchmark_df = await rag_evaluator.arun( + batch_size=20, # batches the number of openai api calls to make + sleep_time_in_seconds=1, # number of seconds sleep before making an api call + ) + print(benchmark_df) + + +if __name__ == "__main__": + loop = asyncio.get_event_loop() + loop.run_until_complete(main) diff --git a/llama-datasets/mt_bench_humanjudgement/BUILD b/llama-datasets/mt_bench_humanjudgement/BUILD new file mode 100644 index 0000000000000000000000000000000000000000..db46e8d6c978c67e301dd6c47bee08c1b3fd141c --- /dev/null +++ b/llama-datasets/mt_bench_humanjudgement/BUILD @@ -0,0 +1 @@ +python_sources() diff --git a/llama-datasets/mt_bench_humanjudgement/README.md b/llama-datasets/mt_bench_humanjudgement/README.md new file mode 100644 index 0000000000000000000000000000000000000000..9f9e9d762e513765b949890891f829e989b07403 --- /dev/null +++ b/llama-datasets/mt_bench_humanjudgement/README.md @@ -0,0 +1,82 @@ +# Mt Bench Human Judgement Dataset + +## CLI Usage + +You can download `llamadatasets` directly using `llamaindex-cli`, which comes installed with the `llama-index` python package: + +```bash +llamaindex-cli download-llamadataset MtBenchHumanJudgementDataset --download-dir ./data +``` + +You can then inspect the files at `./data`. 
When you're ready to load the data into +python, you can use the below snippet of code: + +```python +from llama_index.core import SimpleDirectoryReader +from llama_index.core.llama_dataset import LabelledPairwiseEvaluatorDataset + +pairwise_evaluator_dataset = LabelledPairwiseEvaluatorDataset.from_json( + "./data/pairwise_evaluator_dataset.json" +) +``` + +## Code Usage + +You can download the dataset to a directory, say `./data` directly in Python +as well. From there, you can use the convenient `EvaluatorBenchmarkerPack` llamapack to +run your own LlamaIndex RAG pipeline with the `llamadataset`. + +```python +from llama_index.core.llama_dataset import download_llama_dataset +from llama_index.core.llama_pack import download_llama_pack +from llama_index.core.evaluator import PairwiseComparisonEvaluator +from llama_index.llms import OpenAI +from llama_index.core import ServiceContext + +# download benchmark dataset +pairwise_evaluator_dataset, _ = download_llama_dataset( + "MtBenchHumanJudgementDataset", "./data" +) + +# define your evaluator +gpt_4_context = ServiceContext.from_defaults( + llm=OpenAI(temperature=0, model="gpt-4"), +) + +evaluator = PairwiseComparisonEvaluator(service_context=gpt_4_context) + +# evaluate using the EvaluatorBenchmarkerPack +EvaluatorBenchmarkerPack = download_llama_pack( + "EvaluatorBenchmarkerPack", "./pack" +) +evaluator_benchmarker = EvaluatorBenchmarkerPack( + evaluator=evaluator, + eval_dataset=pairwise_evaluator_dataset, + show_progress=True, +) + +############################################################################ +# NOTE: If have a lower tier subscription for OpenAI API like Usage Tier 1 # +# then you'll need to use different batch_size and sleep_time_in_seconds. # +# For Usage Tier 1, settings that seemed to work well were batch_size=5, # +# and sleep_time_in_seconds=15 (as of December 2023.) # +############################################################################ + +benchmark_df = await evaluator_benchmarker.arun( + batch_size=20, # batches the number of openai api calls to make + sleep_time_in_seconds=1, # seconds to sleep before making an api call +) +``` + +## Original data citation + +```text +@misc{zheng2023judging, + title={Judging LLM-as-a-judge with MT-Bench and Chatbot Arena}, + author={Lianmin Zheng and Wei-Lin Chiang and Ying Sheng and Siyuan Zhuang and Zhanghao Wu and Yonghao Zhuang and Zi Lin and Zhuohan Li and Dacheng Li and Eric. P Xing and Hao Zhang and Joseph E. 
Gonzalez and Ion Stoica}, + year={2023}, + eprint={2306.05685}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` diff --git a/llama-datasets/mt_bench_humanjudgement/baselines.py b/llama-datasets/mt_bench_humanjudgement/baselines.py new file mode 100644 index 0000000000000000000000000000000000000000..3e8094ef9154064c806604c30431a05c50d82542 --- /dev/null +++ b/llama-datasets/mt_bench_humanjudgement/baselines.py @@ -0,0 +1,84 @@ +import asyncio + +from llama_index.core.llama_dataset import download_llama_dataset +from llama_index.core.llama_pack import download_llama_pack +from llama_index.core.evaluation import PairwiseComparisonEvaluator +from llama_index.llms import OpenAI, Gemini +from llama_index.core import ServiceContext +import pandas as pd + + +async def main(): + # DOWNLOAD LLAMADATASET + pairwise_evaluator_dataset, _ = download_llama_dataset( + "MtBenchHumanJudgementDataset", "./mt_bench_data" + ) + + # DEFINE EVALUATORS + gpt_4_context = ServiceContext.from_defaults( + llm=OpenAI(temperature=0, model="gpt-4"), + ) + + gpt_3p5_context = ServiceContext.from_defaults( + llm=OpenAI(temperature=0, model="gpt-3.5-turbo"), + ) + + gemini_pro_context = ServiceContext.from_defaults( + llm=Gemini(model="models/gemini-pro", temperature=0) + ) + + evaluators = { + "gpt-4": PairwiseComparisonEvaluator(service_context=gpt_4_context), + "gpt-3.5": PairwiseComparisonEvaluator(service_context=gpt_3p5_context), + "gemini-pro": PairwiseComparisonEvaluator(service_context=gemini_pro_context), + } + + # EVALUATE WITH PACK + ############################################################################ + # NOTE: If have a lower tier subscription for OpenAI API like Usage Tier 1 # + # then you'll need to use different batch_size and sleep_time_in_seconds. # + # For Usage Tier 1, settings that seemed to work well were batch_size=5, # + # and sleep_time_in_seconds=15 (as of December 2023.) 
# + ############################################################################ + EvaluatorBenchmarkerPack = download_llama_pack("EvaluatorBenchmarkerPack", "./pack") + evaluator_benchmarker = EvaluatorBenchmarkerPack( + evaluator=evaluators["gpt-3.5"], + eval_dataset=pairwise_evaluator_dataset, + show_progress=True, + ) + gpt_3p5_benchmark_df = await evaluator_benchmarker.arun( + batch_size=100, sleep_time_in_seconds=0 + ) + + evaluator_benchmarker = EvaluatorBenchmarkerPack( + evaluator=evaluators["gpt-4"], + eval_dataset=pairwise_evaluator_dataset, + show_progress=True, + ) + gpt_4_benchmark_df = await evaluator_benchmarker.arun( + batch_size=100, sleep_time_in_seconds=0 + ) + + evaluator_benchmarker = EvaluatorBenchmarkerPack( + evaluator=evaluators["gemini-pro"], + eval_dataset=pairwise_evaluator_dataset, + show_progress=True, + ) + gemini_pro_benchmark_df = await evaluator_benchmarker.arun( + batch_size=5, sleep_time_in_seconds=0.5 + ) + + benchmark_df = pd.concat( + [ + gpt_3p5_benchmark_df, + gpt_4_benchmark_df, + gemini_pro_benchmark_df, + ], + axis=0, + ) + print(benchmark_df) + + +if __name__ == "__main__": + loop = asyncio.get_event_loop() + loop.run_until_complete(main) diff --git a/llama-datasets/mt_bench_humanjudgement/card.json b/llama-datasets/mt_bench_humanjudgement/card.json new file mode 100644 index 0000000000000000000000000000000000000000..286ab624ce64de57bc8ff420c2af025faeeb8f20 --- /dev/null +++ b/llama-datasets/mt_bench_humanjudgement/card.json @@ -0,0 +1,58 @@ +{ + "name": "MT Bench Human Judgement Dataset", + "className": "LabelledPairwiseEvaluatorDataset", + "description": "This is an adaptation of the original MT Bench Human Judgement dataset, where human evaluators compare two llm model responses and rank them according to their own preference. In the original version, there can be more than one human evaluator for a given example (query, two model responses). In this adapted version however, we aggregate these 'repeated' entries and convert the 'winner' column of the original schema to instead represent the proportion of times 'model_a' wins across all of the human evaluators. 
To adapt this to a llama-dataset, and to better consider ties (albeit with small samples) we set an uncertainty threshold for this proportion in that if it is between [0.4, 0.6] then we consider there to be no winner between the two models.", + "numberObservations": 1204, + "containsExamplesByHumans": true, + "containsExamplesByAi": false, + "sourceUrls": [ + "https://huggingface.co/datasets/lmsys/mt_bench_human_judgments" + ], + "baselines": [ + { + "name": "gpt-3.5", + "config": { + "promptUrl": "https://github.com/run-llama/llama_index.core/blob/e471e5f8a93ddae6d366cdbba8a497cd6728c7f8/llama_index.core/evaluation/pairwise.py#L21", + "llm": "gpt-3.5" + }, + "metrics": { + "invalidPredictions": 89, + "inconclusives": 407, + "ties": 51, + "agreementRateWithTies": 0.743, + "agreementRateWithoutTies": 0.798 + }, + "codeUrl": "https://github.com/run-llama/llama-hub/blob/main/llama_hub/llama_datasets/mt_bench_humanjudgement/baselines.py" + }, + { + "name": "gpt-4", + "config": { + "promptUrl": "https://github.com/run-llama/llama_index.core/blob/e471e5f8a93ddae6d366cdbba8a497cd6728c7f8/llama_index.core/evaluation/pairwise.py#L21", + "llm": "gpt-4" + }, + "metrics": { + "invalidPredictions": 1, + "inconclusives": 107, + "ties": 102, + "agreementRateWithTies": 0.709, + "agreementRateWithoutTies": 0.779 + }, + "codeUrl": "https://github.com/run-llama/llama-hub/blob/main/llama_hub/llama_datasets/mt_bench_humanjudgement/baselines.py" + }, + { + "name": "gemini-pro", + "config": { + "promptUrl": "https://github.com/run-llama/llama_index.core/blob/e471e5f8a93ddae6d366cdbba8a497cd6728c7f8/llama_index.core/evaluation/pairwise.py#L21", + "llm": "gemini-pro" + }, + "metrics": { + "invalidPredictions": 2, + "inconclusives": 295, + "ties": 60, + "agreementRateWithTies": 0.742, + "agreementRateWithoutTies": 0.793 + }, + "codeUrl": "https://github.com/run-llama/llama-hub/blob/main/llama_hub/llama_datasets/mt_bench_humanjudgement/baselines.py" + } + ] +} diff --git a/llama-datasets/origin_of_covid19/BUILD b/llama-datasets/origin_of_covid19/BUILD new file mode 100644 index 0000000000000000000000000000000000000000..db46e8d6c978c67e301dd6c47bee08c1b3fd141c --- /dev/null +++ b/llama-datasets/origin_of_covid19/BUILD @@ -0,0 +1 @@ +python_sources() diff --git a/llama-datasets/origin_of_covid19/README.md b/llama-datasets/origin_of_covid19/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a7d8b4b9bd8b67391f1118fcab7c9aa964e066b2 --- /dev/null +++ b/llama-datasets/origin_of_covid19/README.md @@ -0,0 +1,62 @@ +# Origin Of COVID-19 Dataset + +## CLI Usage + +You can download `llamadatasets` directly using `llamaindex-cli`, which comes installed with the `llama-index` python package: + +```bash +llamaindex-cli download-llamadataset OriginOfCovid19Dataset --download-dir ./data +``` + +You can then inspect the files at `./data`. When you're ready to load the data into +python, you can use the below snippet of code: + +```python +from llama_index.core import SimpleDirectoryReader +from llama_index.core.llama_dataset import LabelledRagDataset + +rag_dataset = LabelledRagDataset.from_json("./data/rag_dataset.json") +documents = SimpleDirectoryReader(input_dir="./data/source_files").load_data() +``` + +## Code Usage + +You can download the dataset to a directory, say `./data` directly in Python +as well. From there, you can use the convenient `RagEvaluatorPack` llamapack to +run your own LlamaIndex RAG pipeline with the `llamadataset`. 
+ +```python +from llama_index.core.llama_dataset import download_llama_dataset +from llama_index.core.llama_pack import download_llama_pack +from llama_index.core import VectorStoreIndex + +# download and install dependencies for benchmark dataset +rag_dataset, documents = download_llama_dataset( + "OriginOfCovid19Dataset", "./data" +) + +# build basic RAG system +index = VectorStoreIndex.from_documents(documents=documents) +query_engine = index.as_query_engine() + +# evaluate using the RagEvaluatorPack +RagEvaluatorPack = download_llama_pack( + "RagEvaluatorPack", "./rag_evaluator_pack" +) +rag_evaluator_pack = RagEvaluatorPack( + rag_dataset=rag_dataset, query_engine=query_engine +) +benchmark_df = rag_evaluator_pack.run() # async arun() supported as well + +############################################################################ +# NOTE: If have a lower tier subscription for OpenAI API like Usage Tier 1 # +# then you'll need to use different batch_size and sleep_time_in_seconds. # +# For Usage Tier 1, settings that seemed to work well were batch_size=5, # +# and sleep_time_in_seconds=15 (as of December 2023.) # +############################################################################ + +benchmark_df = await rag_evaluator_pack.arun( + batch_size=20, # batches the number of openai api calls to make + sleep_time_in_seconds=1, # seconds to sleep before making an api call +) +``` diff --git a/llama-datasets/origin_of_covid19/card.json b/llama-datasets/origin_of_covid19/card.json new file mode 100644 index 0000000000000000000000000000000000000000..6686d3a68b978917c801cd66541600e1b7a77733 --- /dev/null +++ b/llama-datasets/origin_of_covid19/card.json @@ -0,0 +1,27 @@ +{ + "name": "Origin Of Covid19 Dataset", + "className": "LabelledRagDataset", + "description": "A labelled RAG dataset based off an article, The Origin Of COVID-19 and Why It Matters, by Morens DM, Breman JG, Calisher CH, Doherty PC, Hahn BH, Keusch GT, Kramer LD, LeDuc JW, Monath TP, Taubenberger JK, consisting of queries, reference answers, and reference contexts.", + "numberObservations": 24, + "containsExamplesByHumans": false, + "containsExamplesByAi": true, + "sourceUrls": ["https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7470595/"], + "baselines": [ + { + "name": "llamaindex", + "config": { + "chunkSize": 1024, + "llm": "gpt-3.5-turbo", + "similarityTopK": 2, + "embedModel": "text-embedding-ada-002" + }, + "metrics": { + "contextSimilarity": 0.952, + "correctness": 4.562, + "faithfulness": 1.0, + "relevancy": 0.958 + }, + "codeUrl": "https://github.com/run-llama/llama-hub/blob/main/llama_hub/llama_datasets/origin_of_covid19/llamaindex_baseline.py" + } + ] +} diff --git a/llama-datasets/origin_of_covid19/llamaindex_baseline.py b/llama-datasets/origin_of_covid19/llamaindex_baseline.py new file mode 100644 index 0000000000000000000000000000000000000000..83a5f7415054cfecf6a2e89c70aecc9e05bf7667 --- /dev/null +++ b/llama-datasets/origin_of_covid19/llamaindex_baseline.py @@ -0,0 +1,35 @@ +import asyncio + +from llama_index.core.llama_dataset import download_llama_dataset +from llama_index.core.llama_pack import download_llama_pack +from llama_index.core import VectorStoreIndex + + +async def main(): + # DOWNLOAD LLAMADATASET + rag_dataset, documents = download_llama_dataset("OriginOfCovid19", "./data") + + # BUILD BASIC RAG PIPELINE + index = VectorStoreIndex.from_documents(documents=documents) + query_engine = index.as_query_engine() + + # EVALUATE WITH PACK + RagEvaluatorPack = download_llama_pack("RagEvaluatorPack", 
"./pack") + rag_evaluator = RagEvaluatorPack(query_engine=query_engine, rag_dataset=rag_dataset) + + ############################################################################ + # NOTE: If have a lower tier subscription for OpenAI API like Usage Tier 1 # + # then you'll need to use different batch_size and sleep_time_in_seconds. # + # For Usage Tier 1, settings that seemed to work well were batch_size=5, # + # and sleep_time_in_seconds=15 (as of December 2023.) # + ############################################################################ + benchmark_df = await rag_evaluator.arun( + batch_size=20, # batches the number of openai api calls to make + sleep_time_in_seconds=1, # number of seconds sleep before making an api call + ) + print(benchmark_df) + + +if __name__ == "__main__": + loop = asyncio.get_event_loop() + loop.run_until_complete(main) diff --git a/llama-datasets/patronus_financebench/BUILD b/llama-datasets/patronus_financebench/BUILD new file mode 100644 index 0000000000000000000000000000000000000000..db46e8d6c978c67e301dd6c47bee08c1b3fd141c --- /dev/null +++ b/llama-datasets/patronus_financebench/BUILD @@ -0,0 +1 @@ +python_sources() diff --git a/llama-datasets/patronus_financebench/README.md b/llama-datasets/patronus_financebench/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3fb0fdce8129dcfe79a87096c0e9788d34c7a2d0 --- /dev/null +++ b/llama-datasets/patronus_financebench/README.md @@ -0,0 +1,68 @@ +# Patronus AI FinanceBench Dataset + +[](https://www.patronus.ai/) + +This dataset is a subset of the original FinanceBench dataset. In particular, to +make this benchmark more computationally efficient, we only keep the documents for +which there are 2 or more questions. Such filtering, reduced the total unique pdf +documents from 98 to 32. + +## CLI Usage + +You can download `llamadatasets` directly using `llamaindex-cli`, which comes installed with the `llama-index` python package: + +```bash +llamaindex-cli download-llamadataset PatronusAIFinanceBenchDataset --download-dir ./data +``` + +You can then inspect the files at `./data`. When you're ready to load the data into +python, you can use the below snippet of code: + +```python +from llama_index.core import SimpleDirectoryReader +from llama_index.core.llama_dataset import LabelledRagDataset + +rag_dataset = LabelledRagDataset.from_json("./data/rag_dataset.json") +documents = SimpleDirectoryReader(input_dir="./data/source_files").load_data() +``` + +## Code Usage + +You can download the dataset to a directory, say `./data` directly in Python +as well. From there, you can use the convenient `RagEvaluatorPack` llamapack to +run your own LlamaIndex RAG pipeline with the `llamadataset`. 
+ +```python +from llama_index.core.llama_dataset import download_llama_dataset +from llama_index.core.llama_pack import download_llama_pack +from llama_index.core import VectorStoreIndex + +# download and install dependencies for benchmark dataset +rag_dataset, documents = download_llama_dataset( + "PatronusAIFinanceBenchDataset", "./data" +) + +# build basic RAG system +index = VectorStoreIndex.from_documents(documents=documents) +query_engine = index.as_query_engine() + +# evaluate using the RagEvaluatorPack +RagEvaluatorPack = download_llama_pack( + "RagEvaluatorPack", "./rag_evaluator_pack" +) +rag_evaluator_pack = RagEvaluatorPack( + rag_dataset=rag_dataset, query_engine=query_engine +) + +############################################################################ +# NOTE: If have a lower tier subscription for OpenAI API like Usage Tier 1 # +# then you'll need to use different batch_size and sleep_time_in_seconds. # +# For Usage Tier 1, settings that seemed to work well were batch_size=5, # +# and sleep_time_in_seconds=15 (as of December 2023.) # +############################################################################ + +benchmark_df = await rag_evaluator_pack.arun( + batch_size=20, # batches the number of openai api calls to make + sleep_time_in_seconds=1, # seconds to sleep before making an api call +) +``` diff --git a/llama-datasets/patronus_financebench/__init__.py b/llama-datasets/patronus_financebench/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/llama-datasets/patronus_financebench/card.json b/llama-datasets/patronus_financebench/card.json new file mode 100644 index 0000000000000000000000000000000000000000..38bcfffb6d09d64dafae7e7accb7da74fcfe2b5d --- /dev/null +++ b/llama-datasets/patronus_financebench/card.json @@ -0,0 +1,27 @@ +{ + "name": "Patronus AI FinanceBench", + "className": "LabelledRagDataset", + "description": "This is a subset of the original FinanceBench dataset. FinanceBench is a first-of-its-kind test suite for evaluating the performance of LLMs on open book financial question answering (QA). This is an open source sample of 150 annotated examples used in the evaluation and analysis of models assessed in the FinanceBench paper. The dataset comprises of questions about publicly traded companies, with corresponding answers and evidence strings. The questions in FinanceBench are ecologically valid and cover a diverse set of scenarios. 
They are intended to be clear-cut and straightforward to answer to serve as a minimum performance standard.", + "numberObservations": 98, + "containsExamplesByHumans": true, + "containsExamplesByAi": false, + "sourceUrls": ["https://huggingface.co/datasets/PatronusAI/financebench"], + "baselines": [ + { + "name": "llamaindex", + "config": { + "chunkSize": 1024, + "llm": "gpt-3.5-turbo", + "similarityTopK": 1, + "embedModel": "text-embedding-ada-002" + }, + "metrics": { + "contextSimilarity": 0.87, + "correctness": 2.622, + "faithfulness": 0.755, + "relevancy": 0.684 + }, + "codeUrl": "https://github.com/run-llama/llama-hub/blob/main/llama_hub/llama_datasets/patronus_financebench/llamaindex_baseline.py" + } + ] +} diff --git a/llama-datasets/patronus_financebench/llamaindex_baseline.py b/llama-datasets/patronus_financebench/llamaindex_baseline.py new file mode 100644 index 0000000000000000000000000000000000000000..7b9b31b20ec9f880cd018be87c351b2f8878bb1a --- /dev/null +++ b/llama-datasets/patronus_financebench/llamaindex_baseline.py @@ -0,0 +1,37 @@ +import asyncio + +from llama_index.core.llama_dataset import download_llama_dataset +from llama_index.core.llama_pack import download_llama_pack +from llama_index.core import VectorStoreIndex + + +async def main(): + # DOWNLOAD LLAMADATASET + rag_dataset, documents = download_llama_dataset( + "PatronusAIFinanceBenchDataset", "./patronus_financebench" + ) + + # BUILD BASIC RAG PIPELINE + index = VectorStoreIndex.from_documents(documents=documents) + query_engine = index.as_query_engine() + + # EVALUATE WITH PACK + RagEvaluatorPack = download_llama_pack("RagEvaluatorPack", "./pack_stuff") + rag_evaluator = RagEvaluatorPack(query_engine=query_engine, rag_dataset=rag_dataset) + + ############################################################################ + # NOTE: If have a lower tier subscription for OpenAI API like Usage Tier 1 # + # then you'll need to use different batch_size and sleep_time_in_seconds. # + # For Usage Tier 1, settings that seemed to work well were batch_size=5, # + # and sleep_time_in_seconds=15 (as of December 2023.) # + ############################################################################ + benchmark_df = await rag_evaluator.arun( + batch_size=20, # batches the number of openai api calls to make + sleep_time_in_seconds=1, # number of seconds sleep before making an api call + ) + print(benchmark_df) + + +if __name__ == "__main__": + loop = asyncio.get_event_loop() + loop.run_until_complete(main) diff --git a/llama-datasets/paul_graham_essay/BUILD b/llama-datasets/paul_graham_essay/BUILD new file mode 100644 index 0000000000000000000000000000000000000000..db46e8d6c978c67e301dd6c47bee08c1b3fd141c --- /dev/null +++ b/llama-datasets/paul_graham_essay/BUILD @@ -0,0 +1 @@ +python_sources() diff --git a/llama-datasets/paul_graham_essay/README.md b/llama-datasets/paul_graham_essay/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ac92763359758c68da8a4ce2085871ce3c36587d --- /dev/null +++ b/llama-datasets/paul_graham_essay/README.md @@ -0,0 +1,61 @@ +# Paul Graham Essay Dataset + +## CLI Usage + +You can download `llamadatasets` directly using `llamaindex-cli`, which comes installed with the `llama-index` python package: + +```bash +llamaindex-cli download-llamadataset PaulGrahamEssayDataset --download-dir ./data +``` + +You can then inspect the files at `./data`. 
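+If you want a quick look at the labelled examples themselves before building
+anything, a short sketch like the following can help (this assumes the
+dataset's `to_pandas()` helper, which renders the examples as a pandas
+DataFrame):
+
+```python
+from llama_index.core.llama_dataset import LabelledRagDataset
+
+# load the labelled examples and preview the first few rows
+rag_dataset = LabelledRagDataset.from_json("./data/rag_dataset.json")
+print(rag_dataset.to_pandas().head())
+```
+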
When you're ready to load the data into +python, you can use the below snippet of code: + +```python +from llama_index.core import SimpleDirectoryReader +from llama_index.core.llama_dataset import LabelledRagDataset + +rag_dataset = LabelledRagDataset.from_json("./data/rag_dataset.json") +documents = SimpleDirectoryReader(input_dir="./data/source_files").load_data() +``` + +## Code Usage + +You can download the dataset to a directory, say `./data` directly in Python +as well. From there, you can use the convenient `RagEvaluatorPack` llamapack to +run your own LlamaIndex RAG pipeline with the `llamadataset`. + +```python +from llama_index.core.llama_dataset import download_llama_dataset +from llama_index.core.llama_pack import download_llama_pack +from llama_index.core import VectorStoreIndex + +# download and install dependencies for benchmark dataset +rag_dataset, documents = download_llama_dataset( + "PaulGrahamEssayDataset", "./data" +) + +# build basic RAG system +index = VectorStoreIndex.from_documents(documents=documents) +query_engine = index.as_query_engine() + +# evaluate using the RagEvaluatorPack +RagEvaluatorPack = download_llama_pack( + "RagEvaluatorPack", "./rag_evaluator_pack" +) +rag_evaluator_pack = RagEvaluatorPack( + rag_dataset=rag_dataset, query_engine=query_engine +) + +############################################################################ +# NOTE: If have a lower tier subscription for OpenAI API like Usage Tier 1 # +# then you'll need to use different batch_size and sleep_time_in_seconds. # +# For Usage Tier 1, settings that seemed to work well were batch_size=5, # +# and sleep_time_in_seconds=15 (as of December 2023.) # +############################################################################ + +benchmark_df = await rag_evaluator_pack.arun( + batch_size=20, # batches the number of openai api calls to make + sleep_time_in_seconds=1, # seconds to sleep before making an api call +) +``` diff --git a/llama-datasets/paul_graham_essay/__init__.py b/llama-datasets/paul_graham_essay/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/llama-datasets/paul_graham_essay/card.json b/llama-datasets/paul_graham_essay/card.json new file mode 100644 index 0000000000000000000000000000000000000000..8d4eb8d67dc98cd013fd7182722834c335fdf1d5 --- /dev/null +++ b/llama-datasets/paul_graham_essay/card.json @@ -0,0 +1,27 @@ +{ + "name": "Paul Graham Essay", + "className": "LabelledRagDataset", + "description": "A labelled RAG dataset based off an essay by Paul Graham, consisting of queries, reference answers, and reference contexts.", + "numberObservations": 44, + "containsExamplesByHumans": false, + "containsExamplesByAi": true, + "sourceUrls": ["http://www.paulgraham.com/articles.html"], + "baselines": [ + { + "name": "llamaindex", + "config": { + "chunkSize": 1024, + "llm": "gpt-3.5-turbo", + "similarityTopK": 2, + "embedModel": "text-embedding-ada-002" + }, + "metrics": { + "contextSimilarity": 0.934, + "correctness": 4.239, + "faithfulness": 0.977, + "relevancy": 0.977 + }, + "codeUrl": "https://github.com/run-llama/llama-hub/blob/main/llama_hub/llama_datasets/paul_graham_essay/llamaindex_baseline.py" + } + ] +} diff --git a/llama-datasets/paul_graham_essay/llamaindex_baseline.py b/llama-datasets/paul_graham_essay/llamaindex_baseline.py new file mode 100644 index 0000000000000000000000000000000000000000..f5f3a76eed9263f095e29704c47535d6d15ec672 --- /dev/null +++ 
b/llama-datasets/paul_graham_essay/llamaindex_baseline.py @@ -0,0 +1,37 @@ +import asyncio + +from llama_index.core.llama_dataset import download_llama_dataset +from llama_index.core.llama_pack import download_llama_pack +from llama_index.core import VectorStoreIndex + + +async def main(): + # DOWNLOAD LLAMADATASET + rag_dataset, documents = download_llama_dataset( + "PaulGrahamEssayDataset", "./paul_graham" + ) + + # BUILD BASIC RAG PIPELINE + index = VectorStoreIndex.from_documents(documents=documents) + query_engine = index.as_query_engine() + + # EVALUATE WITH PACK + RagEvaluatorPack = download_llama_pack("RagEvaluatorPack", "./pack_stuff") + rag_evaluator = RagEvaluatorPack(query_engine=query_engine, rag_dataset=rag_dataset) + + ############################################################################ + # NOTE: If have a lower tier subscription for OpenAI API like Usage Tier 1 # + # then you'll need to use different batch_size and sleep_time_in_seconds. # + # For Usage Tier 1, settings that seemed to work well were batch_size=5, # + # and sleep_time_in_seconds=15 (as of December 2023.) # + ############################################################################ + benchmark_df = await rag_evaluator.arun( + batch_size=20, # batches the number of openai api calls to make + sleep_time_in_seconds=1, # number of seconds sleep before making an api call + ) + print(benchmark_df) + + +if __name__ == "__main__": + loop = asyncio.get_event_loop() + loop.run_until_complete(main) diff --git a/llama-datasets/template_README.md b/llama-datasets/template_README.md new file mode 100644 index 0000000000000000000000000000000000000000..10852a6ffc3cd4264b9aff4b486991257afd87d9 --- /dev/null +++ b/llama-datasets/template_README.md @@ -0,0 +1,59 @@ +# {NAME} + +## CLI Usage + +You can download `llamadatasets` directly using `llamaindex-cli`, which comes installed with the `llama-index` python package: + +```bash +llamaindex-cli download-llamadataset {NAME_CAMELCASE} --download-dir ./data +``` + +You can then inspect the files at `./data`. When you're ready to load the data into +python, you can use the below snippet of code: + +```python +from llama_index.core import SimpleDirectoryReader +from llama_index.core.llama_dataset import LabelledRagDataset + +rag_dataset = LabelledRagDataset.from_json("./data/rag_dataset.json") +documents = SimpleDirectoryReader(input_dir="./data/source_files").load_data() +``` + +## Code Usage + +You can download the dataset to a directory, say `./data` directly in Python +as well. From there, you can use the convenient `RagEvaluatorPack` llamapack to +run your own LlamaIndex RAG pipeline with the `llamadataset`. 
+ +```python +from llama_index.core.llama_dataset import download_llama_dataset +from llama_index.core.llama_pack import download_llama_pack +from llama_index.core import VectorStoreIndex + +# download and install dependencies for benchmark dataset +rag_dataset, documents = download_llama_dataset("{NAME_CAMELCASE}", "./data") + +# build basic RAG system +index = VectorStoreIndex.from_documents(documents=documents) +query_engine = index.as_query_engine() + +# evaluate using the RagEvaluatorPack +RagEvaluatorPack = download_llama_pack( + "RagEvaluatorPack", "./rag_evaluator_pack" +) +rag_evaluator_pack = RagEvaluatorPack( + rag_dataset=rag_dataset, query_engine=query_engine +) + +############################################################################ +# NOTE: If have a lower tier subscription for OpenAI API like Usage Tier 1 # +# then you'll need to use different batch_size and sleep_time_in_seconds. # +# For Usage Tier 1, settings that seemed to work well were batch_size=5, # +# and sleep_time_in_seconds=15 (as of December 2023.) # +############################################################################ + +benchmark_df = await rag_evaluator_pack.arun( + batch_size=20, # batches the number of openai api calls to make + sleep_time_in_seconds=1, # seconds to sleep before making an api call +) +``` diff --git a/llama-index-core/llama_index/core/download/dataset.py b/llama-index-core/llama_index/core/download/dataset.py index 8900107626cdbc1ec797737c6bf9e4327529c9cc..57398975982857523a1b95e8ce3f4bbf6de69e00 100644 --- a/llama-index-core/llama_index/core/download/dataset.py +++ b/llama-index-core/llama_index/core/download/dataset.py @@ -6,7 +6,6 @@ from pathlib import Path from typing import Any, Dict, List, Optional, Union import tqdm -from llama_index.core.download.module import LLAMA_HUB_URL from llama_index.core.download.utils import ( get_file_content, get_file_content_bytes, @@ -14,6 +13,12 @@ from llama_index.core.download.utils import ( initialize_directory, ) +LLAMA_INDEX_CONTENTS_URL = ( + f"https://raw.githubusercontent.com/run-llama/llama_index/main" +) +LLAMA_DATASETS_PATH = "/llama-datasets" +LLAMA_DATASETS_URL = LLAMA_INDEX_CONTENTS_URL + LLAMA_DATASETS_PATH + LLAMA_DATASETS_LFS_URL = ( f"https://media.githubusercontent.com/media/run-llama/llama-datasets/main" ) @@ -91,7 +96,8 @@ def get_dataset_info( source_files = [] if dataset_class_name == "LabelledRagDataset": source_files = get_source_files_list( - str(remote_source_dir_path), f"/{dataset_id}/{source_files_path}" + str(remote_source_dir_path), + f"/llama_datasets/{dataset_id}/{source_files_path}", ) # create cache dir if needed @@ -141,7 +147,7 @@ def download_dataset_and_source_files( base_file_name = _resolve_dataset_file_name(dataset_class_name) dataset_raw_content, _ = get_file_content( - str(remote_lfs_dir_path), f"/{dataset_id}/{base_file_name}" + str(remote_lfs_dir_path), f"/llama_datasets/{dataset_id}/{base_file_name}" ) with open(f"{module_path}/{base_file_name}", "w") as f: @@ -158,7 +164,7 @@ def download_dataset_and_source_files( if ".pdf" in source_file: source_file_raw_content_bytes, _ = get_file_content_bytes( str(remote_lfs_dir_path), - f"/{dataset_id}/{source_files_dir_path}/{source_file}", + f"/llama_datasets/{dataset_id}/{source_files_dir_path}/{source_file}", ) with open( f"{module_path}/{source_files_dir_path}/{source_file}", "wb" @@ -167,7 +173,7 @@ def download_dataset_and_source_files( else: source_file_raw_content, _ = get_file_content( str(remote_lfs_dir_path), - 
f"/{dataset_id}/{source_files_dir_path}/{source_file}", + f"/llama_datasets/{dataset_id}/{source_files_dir_path}/{source_file}", ) with open( f"{module_path}/{source_files_dir_path}/{source_file}", "w" @@ -177,7 +183,7 @@ def download_dataset_and_source_files( def download_llama_dataset( dataset_class: str, - llama_hub_url: str = LLAMA_HUB_URL, + llama_datasets_url: str = LLAMA_DATASETS_URL, llama_datasets_lfs_url: str = LLAMA_DATASETS_LFS_URL, llama_datasets_source_files_tree_url: str = LLAMA_DATASETS_SOURCE_FILES_GITHUB_TREE_URL, refresh_cache: bool = False, @@ -218,7 +224,7 @@ def download_llama_dataset( # fetch info from library.json file dataset_info = get_dataset_info( local_dir_path=dirpath, - remote_dir_path=llama_hub_url, + remote_dir_path=llama_datasets_url, remote_source_dir_path=llama_datasets_source_files_tree_url, dataset_class=dataset_class, refresh_cache=refresh_cache, diff --git a/llama-index-core/llama_index/core/llama_dataset/download.py b/llama-index-core/llama_index/core/llama_dataset/download.py index e17c5657be675ae8241144b721bdff0729bf9d04..29622e7e4daddae3137c12b510fcf7a571e216e2 100644 --- a/llama-index-core/llama_index/core/llama_dataset/download.py +++ b/llama-index-core/llama_index/core/llama_dataset/download.py @@ -4,10 +4,10 @@ from llama_index.core import Document from llama_index.core.download.dataset import ( LLAMA_DATASETS_LFS_URL, LLAMA_DATASETS_SOURCE_FILES_GITHUB_TREE_URL, + LLAMA_DATASETS_URL, ) from llama_index.core.download.dataset import download_llama_dataset as download from llama_index.core.download.module import ( - LLAMA_HUB_URL, MODULE_TYPE, track_download, ) @@ -35,7 +35,7 @@ def _resolve_dataset_class(filename: str) -> Type[BaseLlamaDataset]: def download_llama_dataset( llama_dataset_class: str, download_dir: str, - llama_hub_url: str = LLAMA_HUB_URL, + llama_datasets_url: str = LLAMA_DATASETS_URL, llama_datasets_lfs_url: str = LLAMA_DATASETS_LFS_URL, llama_datasets_source_files_tree_url: str = LLAMA_DATASETS_SOURCE_FILES_GITHUB_TREE_URL, show_progress: bool = False, @@ -67,12 +67,12 @@ def download_llama_dataset( """ filenames: Tuple[str, str] = download( llama_dataset_class, - llama_hub_url=llama_hub_url, + llama_datasets_url=llama_datasets_url, llama_datasets_lfs_url=llama_datasets_lfs_url, llama_datasets_source_files_tree_url=llama_datasets_source_files_tree_url, refresh_cache=True, custom_path=download_dir, - library_path="llama_datasets/library.json", + library_path="library.json", disable_library_cache=True, override_path=True, show_progress=show_progress,