diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..fddb41d4977d2badb5a8b801139b346eaa14e2d4 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +rag/ +postgres_data/ diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7ea6299abfa82ad3fe5bccdb8e2695f25e531f29 --- /dev/null +++ b/README.md @@ -0,0 +1,49 @@ +# Building, Evaluating, and Optimizing your RAG App for Production + +Large Language Models (LLMs) are revolutionizing how users can search for, interact with, and generate new content. Some recent stacks and toolkits around Retrieval-Augmented Generation (RAG) have emerged, enabling users to build applications such as chatbots using LLMs on their private data. However, while setting up a naive RAG stack is straightforward, having it meet a production quality bar is hard. To be an AI engineer, you need to learn principled development practices for evaluation and optimization of your RAG app - from data parameters to retrieval algorithms to fine-tuning. + +This workshop will guide you through this development process. You'll start with the basic RAG stack, create an initial evaluation suite, and then experiment with different advanced techniques to improve RAG performance. + +## Environment Setup +Set up the Python environment +1. Create and activate a Python virtual environment +``` +python3 -m venv rag +source rag/bin/activate +``` +2. Install dependencies +``` +pip install -r requirements.txt +``` + +Set up Postgres +1. Install Docker: follow the OS-specific instructions at https://docs.docker.com/engine/install/ +2. Launch Postgres with Docker Compose (from the project directory) +``` +docker-compose up -d +``` + +Prepare OpenAI credentials +1. Create an API key at https://platform.openai.com/account/api-keys if you don't already have one + +## Get Started +We will work through 3 notebooks. To follow along, launch JupyterLab: +``` +jupyter lab +``` + + +## Core Dependencies +``` +llama-index +ray[data] + +# for notebooks +jupyter + +# for postgres +sqlalchemy[asyncio] +pgvector +psycopg2-binary +asyncpg +``` diff --git a/datasets/.gitignore b/datasets/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..c77c1c901ea46ed35d9e7ad5626e87c72c0351cd --- /dev/null +++ b/datasets/.gitignore @@ -0,0 +1,2 @@ +docs.ray.io/ +sql_dumps/ diff --git a/docs.zip b/datasets/docs.zip similarity index 100% rename from docs.zip rename to datasets/docs.zip diff --git a/notebooks/golden-responses.json b/datasets/golden-responses.json similarity index 100% rename from notebooks/golden-responses.json rename to datasets/golden-responses.json diff --git a/docker-compose.yaml b/docker-compose.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d4cf99d2682ec82c85088db1a411919918051edd --- /dev/null +++ b/docker-compose.yaml @@ -0,0 +1,13 @@ +version: "3.9" + +services: + postgres: + image: ankane/pgvector + ports: + - 5432:5432 + volumes: + - ./postgres_data:/var/lib/postgresql/data + environment: + - POSTGRES_PASSWORD=postgres + - POSTGRES_USER=postgres + - POSTGRES_DB=postgres diff --git a/notebooks/.gitignore b/notebooks/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..6bbbd0dc3927bd4759db702ffc6709d5813b962f --- /dev/null +++ b/notebooks/.gitignore @@ -0,0 +1,2 @@ +__pycache__/ +.ipynb_checkpoints/ diff --git a/notebooks/01_rag.ipynb b/notebooks/01_rag.ipynb index f552d7dc072ea714454de22b5244854b5d7fc471..42d771cdc716da36fcce3aefaf44dc8889f4e0be 100644 --- 
a/notebooks/01_rag.ipynb +++ b/notebooks/01_rag.ipynb @@ -13,8 +13,7 @@ "id": "a2ab54b8-5341-42fa-8790-93e71bbc43b5", "metadata": {}, "source": [ - "- GitHub repository: https://github.com/anyscale/ray-summit-2023-training/tree/main\n", - "- Anyscale Endpoints: https://endpoints.anyscale.com/\n", + "- GitHub repository: https://github.com/Disiok/ai-engineer-workshop/\n", "- Ray documentation: https://docs.ray.io/\n", "- LlamaIndex documentation: https://gpt-index.readthedocs.io/en/stable/" ] @@ -28,9 +27,9 @@ "\n", "In this notebook we will learn how to:\n", "1. 💻 Develop a retrieval augmented generation (RAG) based LLM application.\n", - "2. 🚀 Scale the major components (embed, index, serve, etc.) in our application.\n", + "2. 🚀 Scale the major components (embed, index, etc.) in our application.\n", "\n", - "We will use both [LlamaIndex](https://gpt-index.readthedocs.io/en/stable/) and [Ray](https://docs.ray.io/) for developing our LLM application, and [Anyscale Endpoints](https://endpoints.anyscale.com/) as the LLM engine. \n", + "We will use both [LlamaIndex](https://gpt-index.readthedocs.io/en/stable/) and [Ray](https://docs.ray.io/) for developing our LLM application. \n", "\n", "<img width=\"500\" src=\"https://images.ctfassets.net/xjan103pcp94/4PX0l1ruKqfH17YvUiMFPw/c60a7a665125cb8056bebcc146c23b76/image8.png\">" ] @@ -40,7 +39,7 @@ "id": "7aa52945-492f-47ae-aabc-18ad43430f6d", "metadata": {}, "source": [ - "## Setup Credentials" + "## Setup" ] }, { @@ -48,12 +47,12 @@ "id": "b1f4fa1b-e1a6-402e-8f8a-462b3d02c87d", "metadata": {}, "source": [ - "Let's setup our credentials for Anyscale Endpoints, and optionally for Open AI" + "Let's setup our credentials for Open AI" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 46, "id": "e991060f-c95d-46f0-8bb9-7a310fc17ed3", "metadata": { "tags": [] @@ -62,10 +61,6 @@ "source": [ "import os\n", "\n", - "os.environ[\"ANYSCALE_API_BASE\"] = \"https://api.endpoints.anyscale.com/v1/chat/completions\"\n", - "os.environ[\"ANYSCALE_API_KEY\"] = \"esecret_2hvvt43kbmpgzev7k2xqa9h6dv\"\n", - "\n", - "os.environ[\"OPENAI_API_BASE\"] = \"https://api.openai.com/v1\"\n", "# os.environ[\"OPENAI_API_KEY\"] = ..." ] }, @@ -120,6 +115,20 @@ "The Ray documentation has already been downloaded and is stored in shared storage directory in our Anyscale workspace. We parse the html files in the downloaded documentation, and create a Ray Dataset out of the doc paths." 
] }, + { + "cell_type": "code", + "execution_count": null, + "id": "20d5d0b9-be8b-491c-8879-09532c70dee6", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "%cd ../datasets\n", + "!unzip -o docs.zip" + ] + }, { "cell_type": "code", "execution_count": null, @@ -131,7 +140,7 @@ "source": [ "from pathlib import Path\n", "\n", - "RAY_DOCS_DIRECTORY = \"/efs/shared_storage/amog/docs.ray.io/en/master/\"" + "RAY_DOCS_DIRECTORY = \"../datasets/docs.ray.io/en/master/\"" ] }, { @@ -164,6 +173,14 @@ "<img width=\"800\" src=\"https://images.ctfassets.net/xjan103pcp94/1eFnKmG5xqPIFtPupZ327X/f6152723e18322b90aaa8be5d2d5a6e4/image5.png\">" ] }, + { + "cell_type": "markdown", + "id": "a2ed7959-77c6-473a-a9c5-22bd4e4875f1", + "metadata": {}, + "source": [ + "### Parse data" + ] + }, { "cell_type": "code", "execution_count": null, @@ -262,7 +279,7 @@ "id": "b77115cd-734a-4228-b49e-5546739b8694", "metadata": {}, "source": [ - "We now have a list of Documents (with text and source of each section) but we shouldn't directly use this as context to our RAG application just yet. The text lengths of each section are all varied and many are quite large chunks. If were to use these large sections, then we'd be inserting a lot of noisy/unwanted context and because all LLMs have a maximum context length, we wouldn't be able to fit too many relevant contexts. Therefore, we're going to split the text within each section into smaller chunks. Intuitively, smaller chunks will encapsulate single/few concepts and will be less noisy compared to larger chunks. We're going to choose some typical text splitting values (ex. `chunk_size=300`) to create our chunks for now but we'll be experiments with a range of values later.\n", + "We now have a list of Documents (with text and source of each section) but we shouldn't directly use this as context to our RAG application just yet. The text lengths of each section are all varied and many are quite large chunks. If we were to use these large sections, then we'd be inserting a lot of noisy/unwanted context, and because all LLMs have a maximum context length, we wouldn't be able to fit too many relevant contexts. Therefore, we're going to split the text within each section into smaller chunks. Intuitively, smaller chunks will encapsulate single/few concepts and will be less noisy compared to larger chunks. We're going to choose some typical text splitting values (ex. `chunk_size=512`) to create our chunks for now but we'll be experimenting with a range of values later.\n", "\n", "<img src=\"../images/length-distribution.png\" alt=\"Section length distributions\" width=\"1000\"/>" ] }, @@ -296,7 +313,7 @@ }, "outputs": [], "source": [ - "chunk_size = 300\n", + "chunk_size = 512\n", "chunk_overlap = 50\n", "\n", "def chunk_document(document):\n", @@ -388,23 +405,18 @@ "outputs": [], "source": [ "import numpy as np\n", - "from langchain.embeddings import OpenAIEmbeddings\n", - "from langchain.embeddings.huggingface import HuggingFaceEmbeddings\n", + "from llama_index.embeddings import OpenAIEmbedding, HuggingFaceEmbedding\n", "\n", - "def get_embedding_model(model_name):\n", + "def get_embedding_model(model_name, embed_batch_size=100):\n", " if model_name == \"text-embedding-ada-002\":\n", - " return OpenAIEmbeddings(\n", + " return OpenAIEmbedding(\n", " model=model_name,\n", - " openai_api_base=os.environ[\"OPENAI_API_BASE\"],\n", - " openai_api_key=os.environ[\"OPENAI_API_KEY\"])\n", + " embed_batch_size=embed_batch_size,\n", + " api_key=os.environ[\"OPENAI_API_KEY\"])\n", " else:\n", - " model_kwargs = {\"device\": \"cuda\"}\n", - " encode_kwargs = {\"device\": \"cuda\", \"batch_size\": 100}\n", - "\n", - " return HuggingFaceEmbeddings(\n", + " return HuggingFaceEmbedding(\n", " model_name=model_name,\n", - " model_kwargs=model_kwargs,\n", - " encode_kwargs=encode_kwargs)" + " embed_batch_size=embed_batch_size)" ] }, { @@ -434,7 +446,7 @@ " text = [node.text for node in nodes]\n", " \n", " # Embed the batch of text.\n", - " embeddings = self.embedding_model.embed_documents(text)\n", + " embeddings = self.embedding_model.get_text_embedding_batch(text)\n", " assert len(nodes) == len(embeddings)\n", "\n", " # Store the embedding in the LlamaIndex node.\n", @@ -453,8 +465,10 @@ "outputs": [], "source": [ "# Specify the embedding model to use.\n", + "embedding_model_name = \"text-embedding-ada-002\"\n", + "\n", "# Specify \"text-embedding-ada-002\" for Open AI embeddings.\n", - "embedding_model_name = \"thenlper/gte-base\"" + "# embedding_model_name = \"thenlper/gte-base\"" ] }, { @@ -518,7 +532,7 @@ " EmbedChunks,\n", " fn_constructor_kwargs={\"model_name\": embedding_model_name},\n", " batch_size=100, \n", - " num_gpus=1 if embedding_model_name!=\"text-embedding-ada-002\" else 0,\n", + " num_gpus=0,\n", " compute=ActorPoolStrategy(size=2))" ] }, @@ -542,32 +556,28 @@ }, { "cell_type": "markdown", - "id": "1d26ef0f-14a5-423c-a429-c6d71dfe6e03", + "id": "587030d3-4b28-4cf3-82c4-08bfcc7fa3c9", "metadata": {}, "source": [ - "Let's setup a Postgres database. We have already installed Postgres for you in this workspace" + "As the final step in our data pipeline, we will store the embeddings into our Postgres database" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "1203cbbb-965a-4055-bd48-7e1ad8fb2f43", + "cell_type": "markdown", + "id": "e3f5b67c-adfd-4f97-a3f8-3d2891674b56", "metadata": { "tags": [] }, - "outputs": [], "source": [ - "%%bash\n", - "# Set up pgvector\n", - "bash ../setup-pgvector.sh" + "#### Postgres Vector Store" ] }, { "cell_type": "markdown", - "id": "587030d3-4b28-4cf3-82c4-08bfcc7fa3c9", + "id": "1d26ef0f-14a5-423c-a429-c6d71dfe6e03", "metadata": {}, "source": [ - "As the final step in our data pipeline, we will store the embeddings into our Postgres database" + "Let's set up our Postgres database. The following assumes you have Docker installed and have launched Postgres in a local container, i.e. 
via `docker-compose up -d`" ] }, { @@ -581,7 +591,7 @@ "source": [ "%%bash\n", "# Drop existing table if it exists\n", - "sudo -u postgres psql -d postgres -c \"DROP TABLE IF EXISTS data_document;\"" + "docker exec -u postgres ai-engineer-workshop-postgres-1 psql -d postgres -c \"DROP TABLE IF EXISTS data_document;\"" ] }, { @@ -604,7 +614,7 @@ " host=\"localhost\", \n", " table_name=\"document\",\n", " port=\"5432\",\n", - " embed_dim=768,\n", + " embed_dim=1536,\n", " )\n", "\n", "store = get_postgres_store()\n", @@ -633,7 +643,7 @@ { "cell_type": "code", "execution_count": null, - "id": "295c75b2-3d1d-4dd0-af1f-046016a3f2ab", + "id": "e3054ffa-50c3-4e71-9df7-11ad5905bfe0", "metadata": { "tags": [] }, @@ -671,7 +681,7 @@ "outputs": [], "source": [ "%%bash\n", - "sudo -u postgres psql -c \"SELECT count(*) FROM data_document;\"" + "docker exec -u postgres ai-engineer-workshop-postgres-1 psql -c \"SELECT count(*) FROM data_document;\"" ] }, { @@ -679,7 +689,7 @@ "id": "19b0a6c5-2963-43c4-b002-7b2c48c69842", "metadata": {}, "source": [ - "## Retrieval" + "## Step 2: Retrieval" ] }, { @@ -779,7 +789,7 @@ "id": "b06ee9e3-9482-41c3-9edb-852d2376a276", "metadata": {}, "source": [ - "## Response generation" + "## Step 3: Response generation" ] }, { @@ -811,7 +821,7 @@ }, "outputs": [], "source": [ - "from llama_index.llms import Anyscale" + "from llama_index.llms import OpenAI" ] }, { @@ -823,8 +833,8 @@ }, "outputs": [], "source": [ - "# Use Anyscale endpoints as the LLM to LlamaIndex.\n", - "llm = Anyscale(model=\"meta-llama/Llama-2-70b-chat-hf\", temperature=0.1)\n", + "# Use OpenAI as the LLM to LlamaIndex.\n", + "llm = OpenAI(model=\"gpt-3.5-turbo\", temperature=0.1)\n", "\n", "# Use the same embedding model that we used to embed our documents.\n", "embedding_model = get_embedding_model(embedding_model_name)\n", @@ -907,7 +917,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.13" + "version": "3.11.4" } }, "nbformat": 4, diff --git a/notebooks/02_evaluation.ipynb b/notebooks/02_evaluation.ipynb index cb5ca42c1b7afe9e25f27a0a53c99faafef7faa4..61d68920f1aae09cf7da5c8b9eaa93a9a6929e3c 100644 --- a/notebooks/02_evaluation.ipynb +++ b/notebooks/02_evaluation.ipynb @@ -39,10 +39,6 @@ "source": [ "import os\n", "\n", - "os.environ[\"ANYSCALE_API_BASE\"] = \"https://api.endpoints.anyscale.com/v1/chat/completions\"\n", - "os.environ[\"ANYSCALE_API_KEY\"] = \"esecret_2hvvt43kbmpgzev7k2xqa9h6dv\"\n", - "\n", - "os.environ[\"OPENAI_API_BASE\"] = \"https://api.openai.com/v1\"\n", "# os.environ[\"OPENAI_API_KEY\"] = ..." ] }, @@ -159,7 +155,7 @@ "source": [ "from pathlib import Path\n", "\n", - "RAY_DOCS_DIRECTORY = Path(\"/efs/shared_storage/amog/docs.ray.io/en/master/\")" + "RAY_DOCS_DIRECTORY = Path(\"docs.ray.io/en/master/\")" ] }, { @@ -362,7 +358,7 @@ }, "outputs": [], "source": [ - "retriever = get_retriever(similarity_top_k=5)" + "retriever = get_retriever(similarity_top_k=5, embedding_model_name='text-embedding-ada-002')" ] }, { @@ -382,9 +378,10 @@ }, "outputs": [], "source": [ + "from tqdm import tqdm\n", "results = []\n", "\n", - "for entry in data:\n", + "for entry in tqdm(data):\n", " query = entry[\"question\"]\n", " expected_source = entry['source']\n", " \n", @@ -492,9 +489,9 @@ "id": "ac0266f2-8a34-411e-a661-4c2fc8cbb5ef", "metadata": {}, "source": [ - "To generate ground truth responses, and then to evaluate the generated responses vs. the ground truth, we need a \"golden\" LLM. But which LLM should we use? 
We now run into a problem: we need to determine the quality of different LLMs to choose as a \"golden\" LLM, but doing so requires a \"golden\" LLM. Leaderboards on general benchmarks provide a rough indication on which LLMs perform better, but in this case, we will go with the eye-test.\n", + "To generate ground truth responses, and then to evaluate the generated responses vs. the ground truth, we need a \"golden\" LLM. But which LLM should we use? We now run into a problem: we need to determine the quality of different LLMs to choose as a \"golden\" LLM, but doing so requires a \"golden\" LLM. \n", "\n", - "Let's get responses from both GPT-4 and Llama2-70B and see for ourselves which one is better." + "Leaderboards on general benchmarks provide a rough indication on which LLMs perform better, we will go with OpenAI's GPT-4 here since it's been shown to be [well aligned with human preferences](https://arxiv.org/pdf/2306.05685.pdf)" ] }, { @@ -510,7 +507,7 @@ "\n", "def fetch_text_from_source(source: str):\n", " url, anchor = source.split(\"#\") if \"#\" in source else (source, None)\n", - " file_path = Path(\"/efs/shared_storage/amog/\", url.split(\"https://\")[-1])\n", + " file_path = Path(\"./\", url.split(\"https://\")[-1])\n", " with open(file_path, \"r\", encoding=\"utf-8\") as file:\n", " html_content = file.read()\n", " soup = BeautifulSoup(html_content, \"html.parser\")\n", @@ -574,124 +571,12 @@ " return responses" ] }, - { - "cell_type": "markdown", - "id": "d4747619-5678-4e8a-b59e-0576322ad727", - "metadata": {}, - "source": [ - "Let's get responses from gpt-4" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e4e97c5a-2ed5-4e91-964d-80d2f7663107", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "llm = OpenAI(model='gpt-4', temperature=0.0)\n", - "gpt4_responses = generate_responses(data[:5], llm)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0230ccc5-a2d2-475a-9ca7-cb594d99c9f7", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "gpt4_responses" - ] - }, - { - "cell_type": "markdown", - "id": "e7dee70c-4b84-435b-9dc9-463ab4ce5d5e", - "metadata": {}, - "source": [ - "Now let's get responses from LLama2-70b" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c3433a5b-cbcb-44bc-8049-36c847bba3c5", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from llama_index.llms import Anyscale\n", - "from llama_index import ServiceContext\n", - "\n", - "llm = Anyscale(model='meta-llama/Llama-2-70b-chat-hf', temperature=0.0)\n", - "llama_responses = generate_responses(data[:5], llm)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b39e4fd2-c6e7-4ba8-837f-543b1087d00d", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "llama_responses" - ] - }, - { - "cell_type": "markdown", - "id": "df65c767-a3f5-4987-a7e6-58067b16c351", - "metadata": {}, - "source": [ - "Now let's compare the two" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1d441a97-f5c6-451c-9852-4cf13328a466", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "BOLD = '\\033[1m'\n", - "END = '\\033[0m'\n", - " \n", - "for query, gpt_response, llama_response in zip(data[:5], gpt4_responses, llama_responses):\n", - " print(f\"{BOLD}Query:{END} {query['question']}\")\n", - " print(f\"{BOLD}GPT4 answer:{END} {gpt_response}\")\n", - " print(f\"{BOLD}Llama2-70B answer:{END} {llama_response}\")\n", - " print(\"\\n\")" - ] - }, - { - 
"cell_type": "markdown", - "id": "66e3fc5d-13bb-4314-b148-ab4be5b98dcb", - "metadata": {}, - "source": [ - "Based on these answers, we go with GPT-4 as our \"golden\" LLM." - ] - }, - { - "cell_type": "markdown", - "id": "18a6cfa4-8f59-43e7-a694-843b9613b1b3", - "metadata": {}, - "source": [ - "### Generating our Golden Responses" - ] - }, { "cell_type": "markdown", "id": "08f5c2a6-a5ac-4b9c-ab45-61d444b97b9b", "metadata": {}, "source": [ - "Now that we have chosen which LLM to use, we can generate our reference responses. Let's generate 10 reference responses and save them to a file." + "We can now generate our reference responses. Let's generate 10 reference responses and save them to a file." ] }, { @@ -729,7 +614,7 @@ }, "outputs": [], "source": [ - "with open(\"golden-responses.json\", \"w\") as file:\n", + "with open(\"../datasets/golden-responses.json\", \"w\") as file:\n", " json.dump(reference_dataset, file, indent=4)" ] }, @@ -758,7 +643,7 @@ }, "outputs": [], "source": [ - "with open(\"golden-responses.json\", \"r\") as file:\n", + "with open(\"../datasets/golden-responses.json\", \"r\") as file:\n", " golden_responses = json.load(file)" ] }, @@ -795,7 +680,7 @@ }, "outputs": [], "source": [ - "query_engine = get_query_engine(similarity_top_k=5, llm_model_name='meta-llama/Llama-2-70b-chat-hf')\n", + "query_engine = get_query_engine(similarity_top_k=5, llm_model_name='gpt-3.5-turbo', embedding_model_name='text-embedding-ada-002')\n", "\n", "# Store both the original response object and the response string.\n", "rag_responses = []\n", @@ -1088,7 +973,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.13" + "version": "3.11.4" } }, "nbformat": 4, diff --git a/notebooks/03_optimize.ipynb b/notebooks/03_optimize.ipynb index b8749548f8b48c517ff14a85d5352b2da6543dcf..ef962c7b305630dcc027703756427aad5ca7bf57 100644 --- a/notebooks/03_optimize.ipynb +++ b/notebooks/03_optimize.ipynb @@ -15,8 +15,7 @@ "tags": [] }, "source": [ - "- GitHub repository: https://github.com/anyscale/ray-summit-2023-training/tree/main\n", - "- Anyscale Endpoints: https://endpoints.anyscale.com/\n", + "- GitHub repository: https://github.com/Disiok/ai-engineer-workshop/\n", "- Ray documentation: https://docs.ray.io/\n", "- LlamaIndex documentation: https://gpt-index.readthedocs.io/en/stable/" ] @@ -40,7 +39,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 25, "id": "d2ba2156-1355-4529-a881-24232b42a878", "metadata": { "tags": [] @@ -49,10 +48,6 @@ "source": [ "import os\n", "\n", - "os.environ[\"ANYSCALE_API_BASE\"] = \"https://api.endpoints.anyscale.com/v1/chat/completions\"\n", - "os.environ[\"ANYSCALE_API_KEY\"] = \"esecret_2hvvt43kbmpgzev7k2xqa9h6dv\"\n", - "\n", - "os.environ[\"OPENAI_API_BASE\"] = \"https://api.openai.com/v1\"\n", "# os.environ[\"OPENAI_API_KEY\"] = ...\n", "\n", "os.environ[\"DB_CONNECTION_STRING\"] = \"dbname=postgres user=postgres host=localhost password=postgres\"" @@ -60,7 +55,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "id": "b309dbd2-ffbe-4f21-add9-e1c5449569d5", "metadata": { "tags": [] @@ -70,16 +65,13 @@ "name": "stderr", "output_type": "stream", "text": [ - "2023-09-18 21:47:15,911\tINFO worker.py:1465 -- Connecting to existing Ray cluster at address: 10.0.33.113:6379...\n", - "2023-09-18 21:47:15,921\tINFO worker.py:1640 -- Connected to Ray cluster. 
View the dashboard at \u001b[1m\u001b[32mhttps://session-hvq6cjxyd917stdzvn4cs58auc.i.anyscaleuserdata.com \u001b[39m\u001b[22m\n", - "2023-09-18 21:47:15,925\tINFO packaging.py:346 -- Pushing file package 'gcs://_ray_pkg_adcf6c6b0a7f5609e26ea5822b1e1b1a.zip' (0.69MiB) to Ray cluster...\n", - "2023-09-18 21:47:15,927\tINFO packaging.py:359 -- Successfully pushed file package 'gcs://_ray_pkg_adcf6c6b0a7f5609e26ea5822b1e1b1a.zip'.\n" + "2023-10-08 16:35:57,171\tINFO worker.py:1642 -- Started a local Ray instance.\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "7a962b665e884c1dbb844874a13eefb2", + "model_id": "a6b2f0e20ec34200ad18b682e36d4f35", "version_major": 2, "version_minor": 0 }, @@ -103,27 +95,23 @@ " <table class=\"jp-RenderedHTMLCommon\" style=\"border-collapse: collapse;color: var(--jp-ui-font-color1);font-size: var(--jp-ui-font-size1);\">\n", " <tr>\n", " <td style=\"text-align: left\"><b>Python version:</b></td>\n", - " <td style=\"text-align: left\"><b>3.10.8</b></td>\n", + " <td style=\"text-align: left\"><b>3.11.4</b></td>\n", " </tr>\n", " <tr>\n", " <td style=\"text-align: left\"><b>Ray version:</b></td>\n", - " <td style=\"text-align: left\"><b>3.0.0.dev0</b></td>\n", + " <td style=\"text-align: left\"><b>2.7.0</b></td>\n", " </tr>\n", - " <tr>\n", - " <td style=\"text-align: left\"><b>Dashboard:</b></td>\n", - " <td style=\"text-align: left\"><b><a href=\"http://session-hvq6cjxyd917stdzvn4cs58auc.i.anyscaleuserdata.com\" target=\"_blank\">http://session-hvq6cjxyd917stdzvn4cs58auc.i.anyscaleuserdata.com</a></b></td>\n", - "</tr>\n", - "\n", + " \n", "</table>\n", "\n", " </div>\n", "</div>\n" ], "text/plain": [ - "RayContext(dashboard_url='session-hvq6cjxyd917stdzvn4cs58auc.i.anyscaleuserdata.com', python_version='3.10.8', ray_version='3.0.0.dev0', ray_commit='883049d823fee5da883af2cd10756413d5b47407', protocol_version=None)" + "RayContext(dashboard_url='', python_version='3.11.4', ray_version='2.7.0', ray_commit='b4bba4717f5ba04ee25580fe8f88eed63ef0c5dc', protocol_version=None)" ] }, - "execution_count": 3, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -134,10 +122,7 @@ "# Credentials\n", "ray.init(runtime_env={\n", " \"env_vars\": {\n", - " \"OPENAI_API_BASE\": os.environ[\"OPENAI_API_BASE\"],\n", " \"OPENAI_API_KEY\": os.environ[\"OPENAI_API_KEY\"], \n", - " \"ANYSCALE_API_BASE\": os.environ[\"ANYSCALE_API_BASE\"],\n", - " \"ANYSCALE_API_KEY\": os.environ[\"ANYSCALE_API_KEY\"],\n", " \"DB_CONNECTION_STRING\": os.environ[\"DB_CONNECTION_STRING\"],\n", " },\n", "})" @@ -145,7 +130,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 3, "id": "ab654d85-882a-40bd-bd8e-3c94758cbee9", "metadata": { "tags": [] @@ -225,7 +210,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 21, "id": "3b464eac-e2a9-4d08-ae48-5814c59484b4", "metadata": { "tags": [] @@ -239,13 +224,15 @@ "ROOT_DIR = Path(os.getcwd()).parent\n", "EXPERIMENTS_DIR = Path(ROOT_DIR, \"experiments\")\n", "\n", - "EFS_DIR = Path(\"/efs/shared_storage/simon\")\n", - "DOCS_PATH = Path(EFS_DIR, \"docs.ray.io/en/master/\")" + "DATA_DIR = Path(ROOT_DIR, 'datasets')\n", + "DOCS_PATH = Path(DATA_DIR, \"docs.ray.io/en/master/\")\n", + "SQL_DUMP_DIR = Path(DATA_DIR, \"sql_dumps\")\n", + "SQL_DUMP_DIR.mkdir(parents=True, exist_ok=True)" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 22, "id": "7e982bd1-4810-4b06-b872-ba2a0c50be25", "metadata": { "tags": [] @@ -261,25 +248,24 @@ "\n", "def 
build_or_load_index(docs_path, embedding_model_name, chunk_size, chunk_overlap):\n", " # Drop current Vector DB and prepare for new one\n", - " execute_bash(f'psql \"{os.environ[\"DB_CONNECTION_STRING\"]}\" -c \"DROP TABLE data_document;\"')\n", - " #execute_bash(f'sudo -u postgres psql -f ../migrations/vector-{EMBEDDING_DIMENSIONS[embedding_model_name]}.sql')\n", - " SQL_DUMP_FP = Path(EFS_DIR, \"sql_dumps\", f\"{embedding_model_name.split('/')[-1]}_{chunk_size}_{chunk_overlap}.sql\")\n", + " execute_bash(f'docker exec -u postgres ai-engineer-workshop-postgres-1 psql -c \"DROP TABLE data_document;\"')\n", + " SQL_DUMP_FP = Path(SQL_DUMP_DIR, f\"{embedding_model_name.split('/')[-1]}_{chunk_size}_{chunk_overlap}.sql\")\n", " \n", " # Vector DB\n", " if SQL_DUMP_FP.exists(): # Load from SQL dump\n", " print('Loading from SQL dump')\n", - " execute_bash(f'psql \"{os.environ[\"DB_CONNECTION_STRING\"]}\" -f {SQL_DUMP_FP}')\n", + " execute_bash(f'docker exec -i -u postgres ai-engineer-workshop-postgres-1 psql < {SQL_DUMP_FP}')\n", " else: \n", " print('Creating new index')\n", " build_index(docs_path, embedding_model_name, chunk_size, chunk_overlap)\n", " \n", " print('Saving to SQL dump')\n", - " execute_bash(f\"sudo -u postgres pg_dump -c > {SQL_DUMP_FP}\")" + " execute_bash(f\"docker exec -u postgres ai-engineer-workshop-postgres-1 pg_dump -c > {SQL_DUMP_FP}\")" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 23, "id": "f328c952-fc4c-48dc-b877-51e013757045", "metadata": { "tags": [] }, @@ -293,7 +279,7 @@ "from pprint import pprint\n", "\n", "DEFAULT_EXP_CONFIG = {\n", - " 'embedding_model_name': \"thenlper/gte-base\",\n", + " 'embedding_model_name': \"text-embedding-ada-002\",\n", " 'chunk_size': 1024,\n", " 'chunk_overlap': 50,\n", " 'similarity_top_k': 5,\n", @@ -356,7 +342,7 @@ " temperature=run_exp_config['temperature'],\n", " )\n", " \n", - " with open(\"golden-responses.json\", \"r\") as file:\n", + " with open(\"../datasets/golden-responses.json\", \"r\") as file:\n", " golden_dataset = json.load(file)\n", " \n", " queries = [item['question'] for item in golden_dataset]\n", @@ -387,7 +373,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "id": "3422311c-30c4-4431-9dd4-d31faa70a964", "metadata": { "tags": [] }, @@ -403,7 +389,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 20, "id": "12e78197-e32c-4c11-947a-97d5f3522d3b", "metadata": { "tags": [] }, @@ -1619,7 +1605,7 @@ }, "outputs": [], "source": [ - "with open(\"golden-responses.json\", \"r\") as file:\n", + "with open(\"../datasets/golden-responses.json\", \"r\") as file:\n", " golden_dataset = json.load(file)\n", "\n", "e2e_queries = [item['question'] for item in golden_dataset]\n", @@ -2128,7 +2114,7 @@ }, "outputs": [], "source": [ - "with open(\"golden-responses.json\", \"r\") as file:\n", + "with open(\"../datasets/golden-responses.json\", \"r\") as file:\n", " golden_dataset = json.load(file)\n", "\n", "queries = [item['question'] for item in golden_dataset]\n", @@ -2781,7 +2767,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.8" + "version": "3.11.4" } }, "nbformat": 4, diff --git a/notebooks/data.py b/notebooks/data.py index fa03a637b5959416a1ca8287c4107baaea4e6ee8..077018fa87498b61543aed3fc0dfe554c4b54ae4 100644 --- a/notebooks/data.py +++ b/notebooks/data.py @@ -5,8 +5,7 @@ import ray from ray.data import ActorPoolStrategy from ray.util.scheduling_strategies import 
NodeAffinitySchedulingStrategy -from langchain.embeddings import OpenAIEmbeddings -from langchain.embeddings.huggingface import HuggingFaceEmbeddings +from llama_index.embeddings import OpenAIEmbedding, HuggingFaceEmbedding from llama_index.readers import HTMLTagReader from llama_index.vector_stores import PGVectorStore from llama_index.node_parser import SimpleNodeParser @@ -35,20 +34,17 @@ def extract_sections(record): return [{"document": document} for document in documents] -def get_embedding_model(model_name): +def get_embedding_model(model_name, embed_batch_size=100): if model_name == "text-embedding-ada-002": - return OpenAIEmbeddings( + return OpenAIEmbedding( model=model_name, - openai_api_base=os.environ["OPENAI_API_BASE"], - openai_api_key=os.environ["OPENAI_API_KEY"]) + embed_batch_size=embed_batch_size, + api_key=os.environ["OPENAI_API_KEY"]) else: - model_kwargs = {"device": "cuda"} - encode_kwargs = {"device": "cuda", "batch_size": 100} - - return HuggingFaceEmbeddings( + return HuggingFaceEmbedding( model_name=model_name, - model_kwargs=model_kwargs, - encode_kwargs=encode_kwargs) + embed_batch_size=embed_batch_size + ) class EmbedChunks: @@ -61,7 +57,7 @@ class EmbedChunks: text = [node.text for node in nodes] # Embed the batch of text. - embeddings = self.embedding_model.embed_documents(text) + embeddings = self.embedding_model.get_text_embedding_batch(text) assert len(nodes) == len(embeddings) # Store the embedding in the LlamaIndex node. @@ -117,7 +113,7 @@ def build_index(docs_path, embedding_model_name, chunk_size, chunk_overlap): EmbedChunks, fn_constructor_kwargs={"model_name": embedding_model_name}, batch_size=100, - num_gpus=1 if embedding_model_name!="text-embedding-ada-002" else 0, + num_gpus=0 if embedding_model_name!="text-embedding-ada-002" else 0, compute=ActorPoolStrategy(size=2)) # Index data diff --git a/notebooks/eval-scores.json b/notebooks/eval-scores.json index bfbd280fbbc70c8b20877b6c7d8c788bccb24e69..b6a9fef4db690791831ec3ee5770fb0273309961 100644 --- a/notebooks/eval-scores.json +++ b/notebooks/eval-scores.json @@ -1,72 +1,112 @@ [ { "question": "I\u2019m struggling a bit with Ray Data type conversions when I do map_batches. Any advice?", - "golden_response": "Sure, when using the `map_batches()` function in Ray Data, you can specify the batch format by using the `batch_format` parameter. If you want to work with NumPy ndarrays, you can set `batch_format=\"numpy\"`. For example, if you have a function that increases the brightness of an image, you can use it like this:\n\n```python\nfrom typing import Dict\nimport numpy as np\nimport ray\n\ndef increase_brightness(batch: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:\n batch[\"image\"] = np.clip(batch[\"image\"] + 4, 0, 255)\n return batch\n\nds = (\n ray.data.read_images(\"s3://[email\u00a0protected]/image-datasets/simple\")\n .map_batches(increase_brightness, batch_format=\"numpy\")\n)\n```\n\nOn the other hand, if you prefer to work with pandas DataFrames, you can set `batch_format=\"pandas\"`. 
For instance, if you have a function that drops NA values from a DataFrame, you can use it like this:\n\n```python\nimport pandas as pd\nimport ray\n\ndef drop_nas(batch: pd.DataFrame) -> pd.DataFrame:\n return batch.dropna()\n\nds = (\n ray.data.read_csv(\"s3://[email\u00a0protected]/iris.csv\")\n .map_batches(drop_nas, batch_format=\"pandas\")\n)\n```\n\nRemember, the function you pass to `map_batches()` should be able to handle the data type you specify in `batch_format`.", - "generated_response": "Ray Data type conversions can be a bit tricky when using map_batches. One thing to keep in mind is that Ray Data represents batches as dictionaries of NumPy ndarrays or pandas DataFrames by default. However, you can configure the batch type by specifying the batch_format parameter in map_batches(). You can return either format (NumPy ndarrays or pandas DataFrames) from your function.\n\nAdditionally, it's worth noting that Ray Data automatically converts torch tensors to NumPy arrays under the hood. So if you're working with torch tensors, be aware that subsequent transformations will accept NumPy arrays as input, not torch tensors.\n\nIf you're experiencing slow performance with the map, flat_map, and filter operations, consider using the map_batches() method instead. The warning message suggests that these operations are unvectorized and can be slow. By using map_batches(), you can potentially improve performance, especially if you're using vectorized transformations.\n\nOverall, understanding the default batch format, configuring the batch type, and being aware of the automatic conversion of torch tensors can help you navigate Ray Data type conversions more effectively.", + "golden_response": { + "question": "I\u2019m struggling a bit with Ray Data type conversions when I do map_batches. Any advice?", + "source": "https://docs.ray.io/en/master/data/transforming-data.html#configuring-batch-format", + "response": "Sure, when using the `map_batches()` function in Ray Data, you can specify the batch format by using the `batch_format` parameter. If you want to work with NumPy ndarrays, you can set `batch_format=\"numpy\"`. For example, if you have a function that increases the brightness of an image, you can use this format. \n\nHere's a sample code:\n\n```python\nfrom typing import Dict\nimport numpy as np\nimport ray\n\ndef increase_brightness(batch: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:\n batch[\"image\"] = np.clip(batch[\"image\"] + 4, 0, 255)\n return batch\n\nds = (\n ray.data.read_images(\"s3://[email\u00a0protected]/image-datasets/simple\")\n .map_batches(increase_brightness, batch_format=\"numpy\")\n)\n```\n\nOn the other hand, if you prefer to work with pandas DataFrames, you can set `batch_format=\"pandas\"`. For instance, if you have a function that drops NA values from a DataFrame, you can use this format.\n\nHere's a sample code:\n\n```python\nimport pandas as pd\nimport ray\n\ndef drop_nas(batch: pd.DataFrame) -> pd.DataFrame:\n return batch.dropna()\n\nds = (\n ray.data.read_csv(\"s3://[email\u00a0protected]/iris.csv\")\n .map_batches(drop_nas, batch_format=\"pandas\")\n)\n```\n\nRemember, by default, Ray Data represents batches as dicts of NumPy ndarrays." + }, + "generated_response": "Consider specifying the batch format in the `map_batches()` function when working with Ray Data type conversions. By default, Ray Data represents batches as dictionaries of NumPy ndarrays. 
However, you can configure the batch type by setting the `batch_format` parameter in `map_batches()`. This allows you to return batches in either NumPy ndarrays or pandas DataFrames format. By specifying the desired batch format, you can ensure that the type conversions are handled correctly during the `map_batches()` operation.", "score": 4.5, - "reasoning": "The generated answer is highly relevant to the user query and provides a detailed explanation of how to handle type conversions in Ray Data when using the `map_batches()` function. It correctly mentions the use of the `batch_format` parameter and the automatic conversion of torch tensors to NumPy arrays. However, it does not provide specific examples of how to use the `map_batches()` function with different `batch_format` values, which was included in the reference answer. This slight omission prevents the generated answer from receiving a perfect score." + "reasoning": "The generated answer is relevant and mostly correct. It provides the user with advice on how to handle type conversions in Ray Data when using the `map_batches()` function, which is what the user asked for. However, it lacks the specific examples provided in the reference answer, which would have made the explanation clearer and more practical." }, { "question": "How does autoscaling work in a Ray Serve application?", - "golden_response": "In a Ray Serve application, autoscaling works by adjusting to traffic spikes through observing queue sizes and making scaling decisions to add or remove replicas. This is achieved by setting the autoscaling_config field in deployment options. The autoscaling_config parameters include min_replicas, initial_replicas, max_replicas, and target_num_ongoing_requests_per_replica. \n\nThe min_replicas and max_replicas fields configure the range of replicas which the Serve autoscaler chooses from. Deployments will start with initial_replicas, which is optional and set to min_replicas by default. The target_num_ongoing_requests_per_replica configuration specifies how aggressively the autoscaler should react to traffic. \n\nThe Ray Serve Autoscaler is an application-level autoscaler that sits on top of the Ray Autoscaler. It asks Ray to start a number of replica actors based on the request demand. If the Ray Autoscaler determines there aren\u2019t enough available CPUs to place these actors, it responds by requesting more Ray nodes. Similarly, when Ray Serve scales down and terminates some replica actors, it will try to do so in a way that results in the most nodes having no Ray actors or tasks running on them, at which point the Ray autoscaler will remove those nodes. \n\nOther parameters that influence autoscaling decisions include downscale_delay_s, upscale_delay_s, smoothing_factor, and metrics_interval_s. These parameters control the frequency of autoscaling work, the speed of each autoscaling step, and how often each replica sends metrics to the autoscaler, respectively.", - "generated_response": "The autoscaling feature in a Ray Serve application automatically adjusts the number of replicas based on the load. The Serve Autoscaler, which runs in the Serve Controller actor, periodically checks the ServeHandle queues and in-flight queries on replicas to determine whether to scale the number of replicas. Each ServeHandle continuously polls the controller for new deployment replicas and sends any buffered or new queries to the replica until the maximum number of concurrent queries is reached. 
Autoscaling in Ray Serve is an application-level autoscaler that asks Ray to start a number of replica actors based on the request demand. If there aren't enough available CPUs, the Ray Autoscaler requests more Ray nodes from the underlying cloud provider. When scaling down, Ray Serve tries to terminate replica actors in a way that results in the most nodes having no Ray actors or tasks running on them, and the Ray Autoscaler removes those nodes.", - "score": 5.0, - "reasoning": "The generated answer is highly relevant to the user query and provides a detailed explanation of how autoscaling works in a Ray Serve application. It correctly explains the process of adjusting the number of replicas based on the load, the role of the Serve Autoscaler, and how it interacts with the Ray Autoscaler. The answer also correctly describes the process of scaling up and down based on available resources. Although it does not mention the specific parameters of the autoscaling_config field, it still provides a comprehensive overview of the autoscaling process in Ray Serve." + "golden_response": { + "question": "How does autoscaling work in a Ray Serve application?", + "source": "https://docs.ray.io/en/master/serve/scaling-and-resource-allocation.html#autoscaling", + "response": "In a Ray Serve application, autoscaling works by adjusting to traffic spikes through observing queue sizes and making scaling decisions to add or remove replicas. This is achieved by setting the autoscaling_config field in deployment options. The configuration includes parameters like min_replicas, max_replicas, initial_replicas, and target_num_ongoing_requests_per_replica. \n\nThe min_replicas and max_replicas fields define the range of replicas that the autoscaler can choose from. The initial_replicas field sets the starting number of replicas, and if it's not specified, it defaults to min_replicas. The target_num_ongoing_requests_per_replica parameter determines how aggressively the autoscaler should react to traffic. \n\nThe Ray Serve Autoscaler operates at the application level and sits on top of the Ray Autoscaler. It requests Ray to start a number of replica actors based on the request demand. If the Ray Autoscaler determines there aren\u2019t enough available CPUs to place these actors, it requests more Ray nodes from the underlying cloud provider. \n\nThere are also parameters like downscale_delay_s and upscale_delay_s that control the frequency of autoscaling work, and smoothing_factor to speed up or slow down each autoscaling step. The metrics_interval_s parameter controls how often each replica sends metrics to the autoscaler." + }, + "generated_response": "The autoscaling feature in a Ray Serve application automatically adjusts the number of replicas based on the load. The Ray Serve Autoscaler, which runs in the Serve Controller actor, receives metrics from each ServeHandle and replica. It periodically checks the ServeHandle queues and in-flight queries on replicas to determine whether to scale the number of replicas. When new replicas are discovered, the ServeHandle sends any buffered or new queries to the replica until the maximum number of concurrent queries is reached. Autoscaling in Ray Serve considers logical resource requests for scaling, such as those specified in @ray.remote and displayed in ray status, rather than physical machine utilization. If there are insufficient resources, the request will be queued, and the autoscaler will add nodes to satisfy the resource demands in the queue. 
Idle nodes, which have no active tasks, actors, or objects, are removed by the autoscaler after a certain period of time.", + "score": 4.5, + "reasoning": "The generated answer is highly relevant to the user query and provides a detailed explanation of how autoscaling works in a Ray Serve application. It correctly describes the process of adjusting the number of replicas based on load, the role of the Serve Controller actor, and the consideration of logical resource requests for scaling. However, it does not mention specific configuration parameters like min_replicas, max_replicas, initial_replicas, and target_num_ongoing_requests_per_replica, which are included in the reference answer." }, { "question": "how do I get the address of a ray node", - "golden_response": "You can get the address of a Ray node by accessing the 'NodeManagerAddress' field of the node's information. This field contains the PrivateIP of the node that the raylet is on.", - "generated_response": "To get the address of a Ray node, you can use the Ray.nodes API to fetch all nodes and map the node ID to the corresponding IP. This will give you the IP address of the node, which can be used as the address of the Ray node.", - "score": 4.5, - "reasoning": "The generated answer is relevant and mostly correct. It provides a method to get the address of a Ray node, which is what the user asked for. However, it doesn't mention the 'NodeManagerAddress' field specifically, which is a minor detail that was included in the reference answer." + "golden_response": { + "question": "how do I get the address of a ray node", + "source": "https://docs.ray.io/en/master/ray-core/miscellaneous.html#node-information", + "response": "You can get the address of a Ray node by accessing the 'NodeManagerAddress' field of the node's information. This field provides the PrivateIP of the node that the raylet is on." + }, + "generated_response": "You can use the `ray.nodes` API to fetch all nodes and map the node ID to the corresponding IP address.", + "score": 3.5, + "reasoning": "The generated answer is relevant to the user query and provides a method to get the address of a Ray node. However, it does not mention the 'NodeManagerAddress' field which is part of the correct procedure according to the reference answer." }, { "question": "Does Ray support NCCL?", - "golden_response": "Yes, Ray does support NCCL. It uses NCCL as one of the optional high-performance communication backends in its collective communication library.", + "golden_response": { + "question": "Does Ray support NCCL?", + "source": "https://docs.ray.io/en/master/ray-more-libs/ray-collective.html", + "response": "Yes, Ray does support NCCL. It uses NCCL as one of the optional high-performance communication backends in its collective communication library." + }, "generated_response": "Yes, Ray supports NCCL as one of the optional high-performance communication backends for its collective communication library.", "score": 5.0, - "reasoning": "The generated answer is completely relevant to the user query and is fully correct. It matches the reference answer in terms of content and accuracy." + "reasoning": "The generated answer is completely relevant and correct, providing the same information as the reference answer. It is also concise and clear." }, { "question": "Is Ray integrated with DeepSpeed?", - "golden_response": "Yes, Ray is integrated with DeepSpeed. 
This integration allows for the optimization of the training process, including the ability to offload and partition optimizer and parameter states, which can reduce GPU memory usage. This is particularly useful when working with large models that may not fit on smaller GPU types.", + "golden_response": { + "question": "Is Ray integrated with DeepSpeed?", + "source": "https://docs.ray.io/en/master/ray-air/examples/gptj_deepspeed_fine_tuning.html#fine-tuning-the-model-with-ray-air-a-name-train-a", + "response": "Yes, Ray is integrated with DeepSpeed. This integration allows for the optimization of the training process, including the ability to offload and partition optimizer and parameter states, which can reduce GPU memory usage. This is particularly useful when working with large models that may not fit on smaller GPU types." + }, "generated_response": "Yes, Ray is integrated with DeepSpeed.", "score": 4.0, - "reasoning": "The generated answer is relevant and correct, as it confirms that Ray is integrated with DeepSpeed. However, it lacks the detailed explanation provided in the reference answer, which would have made it more informative and complete." + "reasoning": "The generated answer is correct and relevant to the user query. However, it lacks the detailed explanation provided in the reference answer." }, { "question": "what will happen if I use AsyncIO's await to wait for a Ray future like `await x.remote()`", - "golden_response": "If you use AsyncIO's await to wait for a Ray future like `await x.remote()`, it will allow you to wait for the completion of the remote task. This is possible because ObjectRefs can be translated to asyncio.Futures. This feature enables you to integrate Ray futures into existing concurrent applications.", - "generated_response": "The given context information does not provide any specific information about what will happen if you use AsyncIO's await to wait for a Ray future like `await x.remote()`.", - "score": 1.0, - "reasoning": "The generated answer is not relevant to the user query. It does not provide any information about what will happen if AsyncIO's await is used to wait for a Ray future." + "golden_response": { + "question": "what will happen if I use AsyncIO's await to wait for a Ray future like `await x.remote()`", + "source": "https://docs.ray.io/en/master/ray-core/actors/async_api.html#objectrefs-as-asyncio-futures", + "response": "If you use AsyncIO's await to wait for a Ray future like `await x.remote()`, it will allow you to wait for the completion of the remote task. This feature is useful for integrating Ray futures into existing concurrent applications." + }, + "generated_response": "Using AsyncIO's `await` to wait for a Ray future like `await x.remote()` will allow the program to continue executing other tasks while waiting for the Ray future to complete. This enables concurrency and parallelism in the program, as other tasks can be executed while waiting for the result of the Ray future. This is in contrast to using `ray.get()` which is a blocking call and would prevent the program from executing other tasks until the result is available.", + "score": 4.5, + "reasoning": "The generated answer is relevant and mostly correct. It accurately explains the benefits of using AsyncIO's `await` with Ray futures, such as enabling concurrency and parallelism. However, it does not explicitly mention that it allows you to wait for the completion of the remote task, which is mentioned in the reference answer." 
}, { "question": "How would you compare Spark, Ray, Dask?", - "golden_response": "The context does not provide information on how to compare Spark, Ray, and Dask.", - "generated_response": "Spark, Ray, and Dask are all frameworks that are used for distributed computing and data processing. Spark is a popular framework that is designed for big data processing and analytics. It provides a wide range of libraries and APIs for various data processing tasks and has a strong ecosystem.\n\nRay, on the other hand, is a general-purpose distributed computing framework that is designed for high-performance and fault-tolerant applications. It provides a flexible and scalable platform for building distributed applications and supports various programming languages.\n\nDask is a Python library that is specifically designed for parallel computing and scaling analytics workloads. It provides familiar APIs that mimic the NumPy and Pandas libraries, allowing for easy integration with existing Python data processing workflows. Dask also provides automatic data parallelism and optimized operations for efficient computation.\n\nIn summary, Spark is a comprehensive big data processing framework, Ray is a general-purpose distributed computing framework, and Dask is a Python library for parallel computing and scaling analytics workloads. The choice between these frameworks depends on the specific requirements and use cases of the application.", + "golden_response": { + "question": "How would you compare Spark, Ray, Dask?", + "source": "https://docs.ray.io/en/master/data/overview.html#how-does-ray-data-compare-to-x-for-offline-inference", + "response": "The context does not provide information on how to compare Spark, Ray, and Dask." + }, + "generated_response": "Spark, Ray, and Dask are all frameworks that are used for distributed computing and data processing. Spark is a popular framework that is known for its ability to handle large-scale batch processing workloads. It provides a streaming paradigm and is commonly used for big data analytics and machine learning tasks. Ray, on the other hand, is a framework that is designed for distributed computing and is particularly well-suited for GPU workloads, such as deep learning inference. It provides features like launching cloud clusters and shared-memory store. Dask is a Python library that is focused on parallel computing and is geared towards scaling analytics and scientific computing workloads. It provides big data collections that mimic the APIs of NumPy and Pandas, allowing for operations on larger-than-memory data and execution on multi-machine clusters. Overall, while Spark is widely used for batch processing, Ray is better suited for GPU workloads, and Dask is focused on parallel computing and scaling analytics workloads.", "score": 5.0, - "reasoning": "The generated answer is highly relevant to the user query and provides a detailed comparison of Spark, Ray, and Dask. It correctly identifies the key features and use cases of each framework, and provides a summary that ties everything together. The answer is also well-structured and easy to understand." + "reasoning": "The generated answer is highly relevant to the user query and provides a detailed comparison of Spark, Ray, and Dask. It not only explains what each of these frameworks is used for, but also highlights their unique features and use cases. The answer is fully correct and provides more information than the reference answer." 
}, { "question": "why would ray overload a node w/ more task that the resources allow ?", - "golden_response": "Ray allows for overloading a node with more tasks than the resources allow because it operates on logical resources, not physical ones. This means that the resource requirements of tasks or actors do not impose limits on actual physical resource usage. Ray doesn't prevent a task from using more resources than specified. It's the user's responsibility to ensure tasks or actors do not exceed the specified resource requirements. Ray doesn't provide CPU isolation for tasks or actors, instead, it lets the operating system schedule and run the task.", + "golden_response": { + "question": "why would ray overload a node w/ more task that the resources allow ?", + "source": "https://docs.ray.io/en/master/ray-core/scheduling/resources.html#physical-resources-and-logical-resources", + "response": "Ray allows for overloading a node with more tasks than the resources allow because it treats resources as logical rather than physical. This means that the resource requirements of tasks or actors do not impose limits on actual physical resource usage. It's the user's responsibility to ensure that tasks or actors do not use more resources than specified. Ray doesn't provide CPU isolation for tasks or actors, and it lets the operating system schedule and run the task. However, it does provide GPU isolation by setting the CUDA_VISIBLE_DEVICES environment variable." + }, "generated_response": "Ray may overload a node with more tasks than the resources allow if the tasks or actors use more than their proportionate share of memory. This can lead to issues like out-of-memory (OOM) errors. To prevent this, it is recommended to reduce the number of concurrently running tasks or actors on each node by increasing the amount of resources requested by them. By increasing the amount of resources, Ray ensures that the sum of the resource requirements of all concurrently running tasks and actors on a given node does not exceed the node's total resources.", - "score": 3.5, - "reasoning": "The generated answer is relevant to the user query and provides useful information about how Ray might overload a node with more tasks than the resources allow. However, it does not fully align with the reference answer. The generated answer focuses on memory usage and how to prevent overloading, while the reference answer explains that Ray operates on logical resources and does not provide CPU isolation. The generated answer is partially correct but lacks some key details." + "score": 2.5, + "reasoning": "The generated answer is relevant to the user query but it contains mistakes. It incorrectly suggests that Ray overloads a node due to tasks or actors using more than their share of memory and recommends increasing the amount of resources requested by them. The reference answer correctly explains that Ray treats resources as logical rather than physical, and it's the user's responsibility to ensure tasks or actors do not use more resources than specified." }, { "question": "when should I use Ray Client?", - "golden_response": "Ray Client should be used when you want to connect an interactive Python shell to a remote cluster. It is particularly useful for developing interactively in a local Python shell. However, it requires a stable connection to the remote cluster and will terminate the workload if the connection is lost for more than 30 seconds. 
If you have a long running workload that you want to run on your cluster, it is recommended to use Ray Jobs instead. Please note that Ray Client may not work as expected for ML workloads like Ray Tune or Ray Train.", - "generated_response": "Ray Client should be used when you want to connect an interactive Python shell to a remote cluster. It allows you to leverage a remote Ray cluster just like you would with Ray running on your local machine. By changing `ray.init()` to `ray.init(\"ray://<head_node_host>:<port>\")`, you can connect from your laptop or anywhere else directly to a remote cluster and scale-out your Ray code, while still being able to develop interactively in a Python shell. However, it is important to note that Ray Client may not work as expected when using Ray for ML workloads like Ray Tune or Ray Train. In such cases, it is recommended to use the Ray Jobs API for interactive development on ML projects.", + "golden_response": { + "question": "when should I use Ray Client?", + "source": "https://docs.ray.io/en/master/cluster/running-applications/job-submission/ray-client.html#when-to-use-ray-client", + "response": "Ray Client should be used when you want to connect an interactive Python shell to a remote cluster. It is particularly useful for developing interactively in a local Python shell. However, it requires a stable connection to the remote cluster and will terminate the workload if the connection is lost for more than 30 seconds. If you have a long running workload that you want to run on your cluster, it is recommended to use Ray Jobs instead. Please note that Ray Client may not work as expected for ML workloads, such as Ray Tune or Ray Train." + }, + "generated_response": "Ray Client should be used when you want to connect an interactive Python shell to a remote cluster. It is useful for developing interactively in a local Python shell while leveraging a remote Ray cluster. However, it has architectural limitations and may not work as expected when using Ray for ML workloads. If you have a long-running workload that you want to run on your cluster, it is recommended to use Ray Jobs instead.", "score": 4.5, - "reasoning": "The generated answer is highly relevant and mostly correct. It accurately explains when to use Ray Client, how to use it, and its limitations. However, it does not mention the requirement of a stable connection to the remote cluster and the termination of the workload if the connection is lost for more than 30 seconds, which is included in the reference answer. This is a minor omission, but it is an important detail for users to know." + "reasoning": "The generated answer is highly relevant and mostly correct. It accurately explains when to use Ray Client and mentions its limitations. However, it omits the detail about the requirement of a stable connection and the termination of the workload if the connection is lost for more than 30 seconds, which was present in the reference answer." }, { "question": "how to scatter actors across the cluster?", - "golden_response": "To scatter actors across the cluster, you can use the \"SPREAD\" scheduling strategy. This can be done by creating instances of the actor with the \"SPREAD\" option. 
Here is an example:\n\n```python\nactors = [SpreadActor.options(scheduling_strategy=\"SPREAD\").remote() for _ in range(10)]\n```\n\nIn this example, 10 instances of the SpreadActor class are created and spread across the cluster.", - "generated_response": "To scatter actors across the cluster, you can use the \"SPREAD\" scheduling strategy. By specifying the scheduling strategy as \"SPREAD\" when creating the actors, Ray will try to spread the actors among the available nodes in the cluster. This can be done using the `options` method when creating the actors. Here is an example:\n\n```python\n@ray.remote(num_cpus=1)\nclass SpreadActor:\n pass\n\n# Spread actors across the cluster.\nactors = [SpreadActor.options(scheduling_strategy=\"SPREAD\").remote() for _ in range(10)]\n```\n\nBy setting the `scheduling_strategy` parameter to \"SPREAD\" in the `options` method, the actors will be scattered across the cluster.", - "score": 5.0, - "reasoning": "The generated answer is highly relevant to the user query and is fully correct. It provides a clear explanation of how to scatter actors across the cluster using the \"SPREAD\" scheduling strategy, and even includes a code example to illustrate the process. The answer also provides additional context by explaining what the \"SPREAD\" scheduling strategy does and how to use the `options` method when creating the actors." + "golden_response": { + "question": "how to scatter actors across the cluster?", + "source": "https://docs.ray.io/en/master/ray-core/scheduling/index.html#spread", + "response": "To scatter actors across the cluster, you can use the \"SPREAD\" scheduling strategy. This can be done by creating instances of the actor with the \"SPREAD\" option. Here is an example:\n\n```python\nactors = [SpreadActor.options(scheduling_strategy=\"SPREAD\").remote() for _ in range(10)]\n```\n\nIn this example, 10 instances of the SpreadActor class are created and spread across the cluster." + }, + "generated_response": "To scatter actors across the cluster, you can use the \"SPREAD\" scheduling strategy. By specifying the scheduling_strategy parameter as \"SPREAD\" when creating the actors, Ray will try to spread the tasks or actors among available nodes in the cluster. This ensures that the actors are distributed across different nodes rather than being concentrated on a single node.", + "score": 4.5, + "reasoning": "The generated answer is relevant and correct. It explains the use of the \"SPREAD\" scheduling strategy to scatter actors across the cluster, similar to the reference answer. However, it lacks the specific code example provided in the reference answer." 
} ] \ No newline at end of file diff --git a/notebooks/utils.py b/notebooks/utils.py index 38c3b9b94e13dee8a9d644b1cff7b00f82cc7c59..74440020d6c6ff26da223625bbf7446f68e38801 100644 --- a/notebooks/utils.py +++ b/notebooks/utils.py @@ -23,7 +23,7 @@ def _get_vector_store_index( def get_query_engine( llm_model_name: str = "meta-llama/Llama-2-70b-chat-hf", temperature: float = 0.1, - embedding_model_name = "thenlper/gte-base", + embedding_model_name = "text-embedding-ada-002", similarity_top_k=2 ): embed_model = get_embedding_model(embedding_model_name) @@ -40,7 +40,7 @@ def get_query_engine( def get_retriever( - embedding_model_name = "thenlper/gte-base", + embedding_model_name = "text-embedding-ada-002", similarity_top_k=2 ): diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..5a1715bdaeb5aaab5477a9bc83e58ce190da93e6 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,138 @@ +aiohttp==3.8.6 +aiosignal==1.3.1 +annotated-types==0.6.0 +anyio==3.7.1 +appnope==0.1.3 +argon2-cffi==23.1.0 +argon2-cffi-bindings==21.2.0 +arrow==1.3.0 +asttokens==2.4.0 +async-lru==2.0.4 +async-timeout==4.0.3 +asyncpg==0.28.0 +attrs==23.1.0 +Babel==2.13.0 +backcall==0.2.0 +beautifulsoup4==4.12.2 +bleach==6.1.0 +certifi==2023.7.22 +cffi==1.16.0 +charset-normalizer==3.3.0 +click==8.1.7 +comm==0.1.4 +dataclasses-json==0.6.1 +debugpy==1.8.0 +decorator==5.1.1 +defusedxml==0.7.1 +executing==2.0.0 +fastjsonschema==2.18.1 +filelock==3.12.4 +fqdn==1.5.1 +frozenlist==1.4.0 +fsspec==2023.9.2 +greenlet==3.0.0 +idna==3.4 +ipykernel==6.25.2 +ipython==8.16.1 +ipython-genutils==0.2.0 +ipywidgets==8.1.1 +isoduration==20.11.0 +jedi==0.19.1 +Jinja2==3.1.2 +joblib==1.3.2 +json5==0.9.14 +jsonpatch==1.33 +jsonpointer==2.4 +jsonschema==4.19.1 +jsonschema-specifications==2023.7.1 +jupyter==1.0.0 +jupyter-console==6.6.3 +jupyter-events==0.7.0 +jupyter-lsp==2.2.0 +jupyter_client==8.3.1 +jupyter_core==5.3.2 +jupyter_server==2.7.3 +jupyter_server_terminals==0.4.4 +jupyterlab==4.0.6 +jupyterlab-pygments==0.2.2 +jupyterlab-widgets==3.0.9 +jupyterlab_server==2.25.0 +langchain==0.0.310 +langsmith==0.0.43 +llama-index==0.8.41 +MarkupSafe==2.1.3 +marshmallow==3.20.1 +matplotlib-inline==0.1.6 +mistune==3.0.2 +msgpack==1.0.7 +multidict==6.0.4 +mypy-extensions==1.0.0 +nbclient==0.8.0 +nbconvert==7.9.2 +nbformat==5.9.2 +nest-asyncio==1.5.8 +nltk==3.8.1 +notebook==7.0.4 +notebook_shim==0.2.3 +numpy==1.26.0 +openai==0.28.1 +overrides==7.4.0 +packaging==23.2 +pandas==2.1.1 +pandocfilters==1.5.0 +parso==0.8.3 +pexpect==4.8.0 +pgvector==0.2.3 +pickleshare==0.7.5 +platformdirs==3.11.0 +prometheus-client==0.17.1 +prompt-toolkit==3.0.39 +protobuf==4.24.4 +psutil==5.9.5 +psycopg2-binary==2.9.9 +ptyprocess==0.7.0 +pure-eval==0.2.2 +pyarrow==13.0.0 +pycparser==2.21 +pydantic==2.4.2 +pydantic_core==2.10.1 +Pygments==2.16.1 +python-dateutil==2.8.2 +python-json-logger==2.0.7 +pytz==2023.3.post1 +PyYAML==6.0.1 +pyzmq==25.1.1 +qtconsole==5.4.4 +QtPy==2.4.0 +ray==2.7.0 +referencing==0.30.2 +regex==2023.10.3 +requests==2.31.0 +rfc3339-validator==0.1.4 +rfc3986-validator==0.1.1 +rpds-py==0.10.4 +Send2Trash==1.8.2 +six==1.16.0 +sniffio==1.3.0 +soupsieve==2.5 +SQLAlchemy==2.0.21 +stack-data==0.6.3 +tenacity==8.2.3 +terminado==0.17.1 +tiktoken==0.5.1 +tinycss2==1.2.1 +tornado==6.3.3 +tqdm==4.66.1 +traitlets==5.11.2 +types-python-dateutil==2.8.19.14 +typing-inspect==0.9.0 +typing_extensions==4.8.0 +tzdata==2023.3 +uri-template==1.3.0 +urllib3==1.26.17 +wcwidth==0.2.8 +webcolors==1.13 +webencodings==0.5.1 
+websocket-client==1.6.4
+widgetsnbextension==4.0.9
+yarl==1.9.2
diff --git a/setup-pgvector.sh b/setup-pgvector.sh
deleted file mode 100755
index dcebb07d854fdbeefb1be312f7d222dbc01f790c..0000000000000000000000000000000000000000
--- a/setup-pgvector.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/bin/bash
-# Install postgres
-sudo apt install -y wget ca-certificates
-wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | sudo apt-key add -
-sudo sh -c 'echo "deb http://apt.postgresql.org/pub/repos/apt/ $(lsb_release -cs)-pgdg main" >> /etc/apt/sources.list.d/pgdg.list'
-sudo apt update && sudo apt install -y postgresql postgresql-contrib
-# Install pgvector
-sudo apt install -y postgresql-server-dev-16
-pushd /tmp && git clone --branch v0.4.4 https://github.com/pgvector/pgvector.git && pushd pgvector && make && sudo make install && popd && popd
-# Activate pgvector and the database
-echo 'ray ALL=(ALL:ALL) NOPASSWD:ALL' | sudo tee /etc/sudoers
-sudo service postgresql start
-# pragma: allowlist nextline secret
-sudo -u postgres psql -c "ALTER USER postgres with password 'postgres';"
-sudo -u postgres psql -c "CREATE EXTENSION vector;"
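
Note on the removed setup-pgvector.sh: the containerized setup now provides the pgvector-enabled Postgres that this script used to install by hand. The sketch below is one quick way to confirm the database is usable from Python; it assumes the postgres/postgres credentials and the `CREATE EXTENSION vector;` step shown in the deleted script, a server reachable on localhost:5432 (the Postgres default port), and the psycopg2-binary package pinned in requirements.txt.

```python
# Minimal check that the pgvector-enabled Postgres is reachable.
# Assumes postgres/postgres credentials on localhost:5432, matching what
# the removed setup-pgvector.sh provisioned; adjust for your environment.
import psycopg2

conn = psycopg2.connect(
    host="localhost",
    port=5432,
    user="postgres",
    password="postgres",
    dbname="postgres",
)
conn.autocommit = True
with conn.cursor() as cur:
    # Same statement the deleted shell script ran via psql.
    cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")
    cur.execute("SELECT extversion FROM pg_extension WHERE extname = 'vector';")
    print("pgvector version:", cur.fetchone()[0])
conn.close()
```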
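
The updated notebooks/utils.py defaults to the OpenAI text-embedding-ada-002 embedding model, so the helpers need OPENAI_API_KEY in the environment. A rough usage sketch follows; it assumes the pgvector index has already been built by the notebooks, that get_retriever and get_query_engine return standard LlamaIndex retriever and query-engine objects, and that the LLM is overridden to an OpenAI chat model (the patch leaves the Llama-2 default in place, so the "gpt-3.5-turbo" override here is an assumption).

```python
# Rough usage sketch for the helpers in notebooks/utils.py.
# Assumptions: the pgvector index was already populated by the notebooks,
# OPENAI_API_KEY is set, and "gpt-3.5-turbo" is an acceptable override for
# llm_model_name (the patch keeps "meta-llama/Llama-2-70b-chat-hf" as default).
from utils import get_query_engine, get_retriever

# Retrieval only: fetch the top-k most similar documentation chunks.
retriever = get_retriever(similarity_top_k=2)
for node_with_score in retriever.retrieve("when should I use Ray Client?"):
    print(node_with_score.score, node_with_score.node.get_content()[:80])

# Full RAG: retrieve context and synthesize an answer with the LLM.
query_engine = get_query_engine(llm_model_name="gpt-3.5-turbo", similarity_top_k=2)
print(query_engine.query("when should I use Ray Client?"))
```

The example question is taken from the evaluation records in the JSON diff above, so the retrieved chunks can be compared against the recorded golden response.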
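
The revised golden response for the node-overloading question rests on Ray's logical-versus-physical resource model. A tiny illustration of that behaviour, not taken from the repository and assuming only the `ray` package pinned in requirements.txt:

```python
# Sketch of Ray's logical resources: num_cpus is scheduler bookkeeping,
# not an enforced physical limit on what each task actually consumes.
import ray

# Advertise 2 logical CPUs to the scheduler.
ray.init(num_cpus=2)

@ray.remote(num_cpus=1)
def task(i: int) -> int:
    # At most 2 of these run concurrently (2 logical CPUs / 1 per task),
    # but Ray does not stop the task body from using more physical CPU or
    # memory than it requested -- keeping within the request is up to the user.
    return i * i

print(ray.get([task.remote(i) for i in range(4)]))
ray.shutdown()
```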