diff --git a/demo_apps/OctoAI_API_examples/Getting_to_know_Llama.ipynb b/demo_apps/OctoAI_API_examples/Getting_to_know_Llama.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..d0105e10bb7393f00f56617df4171397d2649911
--- /dev/null
+++ b/demo_apps/OctoAI_API_examples/Getting_to_know_Llama.ipynb
@@ -0,0 +1,1029 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "LERqQn5v8-ak"
+   },
+   "source": [
+    "# **Getting to know Llama 2: Everything you need to start building**\n",
+    "Our goal in this session is to provide a guided tour of Llama 2, including understanding different Llama 2 models, how and where to access them, Generative AI and Chatbot architectures, prompt engineering, RAG (Retrieval Augmented Generation), Fine-tuning and more. All this is implemented with a starter code for you to take it and use it in your Llama 2 projects."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "ioVMNcTesSEk"
+   },
+   "source": [
+    "##**0 - Prerequisites**\n",
+    "* Basic understanding of Large Language Models\n",
+    "\n",
+    "* Basic understanding of Python"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "executionInfo": {
+     "elapsed": 248,
+     "status": "ok",
+     "timestamp": 1695832228254,
+     "user": {
+      "displayName": "Amit Sangani",
+      "userId": "11552178012079240149"
+     },
+     "user_tz": 420
+    },
+    "id": "ktEA7qXmwdUM"
+   },
+   "outputs": [],
+   "source": [
+    "# presentation layer code\n",
+    "\n",
+    "import base64\n",
+    "from IPython.display import Image, display\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "def mm(graph):\n",
+    "  graphbytes = graph.encode(\"ascii\")\n",
+    "  base64_bytes = base64.b64encode(graphbytes)\n",
+    "  base64_string = base64_bytes.decode(\"ascii\")\n",
+    "  display(Image(url=\"https://mermaid.ink/img/\" + base64_string))\n",
+    "\n",
+    "def genai_app_arch():\n",
+    "  mm(\"\"\"\n",
+    "  flowchart TD\n",
+    "    A[Users] --> B(Applications e.g. mobile, web)\n",
+    "    B --> |Hosted API|C(Platforms e.g. Custom, OctoAI, HuggingFace, Replicate)\n",
+    "    B -- optional --> E(Frameworks e.g. LangChain)\n",
+    "    C-->|User Input|D[Llama 2]\n",
+    "    D-->|Model Output|C\n",
+    "    E --> C\n",
+    "    classDef default fill:#CCE6FF,stroke:#84BCF5,textColor:#1C2B33,fontFamily:trebuchet ms;\n",
+    "  \"\"\")\n",
+    "\n",
+    "def rag_arch():\n",
+    "  mm(\"\"\"\n",
+    "  flowchart TD\n",
+    "    A[User Prompts] --> B(Frameworks e.g. LangChain)\n",
+    "    B <--> |Database, Docs, XLS|C[fa:fa-database External Data]\n",
+    "    B -->|API|D[Llama 2]\n",
+    "    classDef default fill:#CCE6FF,stroke:#84BCF5,textColor:#1C2B33,fontFamily:trebuchet ms;\n",
+    "  \"\"\")\n",
+    "\n",
+    "def llama2_family():\n",
+    "  mm(\"\"\"\n",
+    "  graph LR;\n",
+    "      llama-2 --> llama-2-7b\n",
+    "      llama-2 --> llama-2-13b\n",
+    "      llama-2 --> llama-2-70b\n",
+    "      llama-2-7b --> llama-2-7b-chat\n",
+    "      llama-2-13b --> llama-2-13b-chat\n",
+    "      llama-2-70b --> llama-2-70b-chat\n",
+    "      classDef default fill:#CCE6FF,stroke:#84BCF5,textColor:#1C2B33,fontFamily:trebuchet ms;\n",
+    "  \"\"\")\n",
+    "\n",
+    "def apps_and_llms():\n",
+    "  mm(\"\"\"\n",
+    "  graph LR;\n",
+    "    users --> apps\n",
+    "    apps --> frameworks\n",
+    "    frameworks --> platforms\n",
+    "    platforms --> Llama 2\n",
+    "    classDef default fill:#CCE6FF,stroke:#84BCF5,textColor:#1C2B33,fontFamily:trebuchet ms;\n",
+    "  \"\"\")\n",
+    "\n",
+    "import ipywidgets as widgets\n",
+    "from IPython.display import display, Markdown\n",
+    "\n",
+    "# Create a text widget\n",
+    "API_KEY = widgets.Password(\n",
+    "    value='',\n",
+    "    placeholder='',\n",
+    "    description='API_KEY:',\n",
+    "    disabled=False\n",
+    ")\n",
+    "\n",
+    "def md(t):\n",
+    "  display(Markdown(t))\n",
+    "\n",
+    "def bot_arch():\n",
+    "  mm(\"\"\"\n",
+    "  graph LR;\n",
+    "  user --> prompt\n",
+    "  prompt --> i_safety\n",
+    "  i_safety --> context\n",
+    "  context --> Llama_2\n",
+    "  Llama_2 --> output\n",
+    "  output --> o_safety\n",
+    "  i_safety --> memory\n",
+    "  o_safety --> memory\n",
+    "  memory --> context\n",
+    "  o_safety --> user\n",
+    "  classDef default fill:#CCE6FF,stroke:#84BCF5,textColor:#1C2B33,fontFamily:trebuchet ms;\n",
+    "  \"\"\")\n",
+    "\n",
+    "def fine_tuned_arch():\n",
+    "  mm(\"\"\"\n",
+    "  graph LR;\n",
+    "      Custom_Dataset --> Pre-trained_Llama\n",
+    "      Pre-trained_Llama --> Fine-tuned_Llama\n",
+    "      Fine-tuned_Llama --> RLHF\n",
+    "      RLHF --> |Loss:Cross-Entropy|Fine-tuned_Llama\n",
+    "      classDef default fill:#CCE6FF,stroke:#84BCF5,textColor:#1C2B33,fontFamily:trebuchet ms;\n",
+    "  \"\"\")\n",
+    "\n",
+    "def load_data_faiss_arch():\n",
+    "  mm(\"\"\"\n",
+    "  graph LR;\n",
+    "      documents --> textsplitter\n",
+    "      textsplitter --> embeddings\n",
+    "      embeddings --> vectorstore\n",
+    "      classDef default fill:#CCE6FF,stroke:#84BCF5,textColor:#1C2B33,fontFamily:trebuchet ms;\n",
+    "  \"\"\")\n",
+    "\n",
+    "def mem_context():\n",
+    "  mm(\"\"\"\n",
+    "      graph LR\n",
+    "      context(text)\n",
+    "      user_prompt --> context\n",
+    "      instruction --> context\n",
+    "      examples --> context\n",
+    "      memory --> context\n",
+    "      context --> tokenizer\n",
+    "      tokenizer --> embeddings\n",
+    "      embeddings --> LLM\n",
+    "      classDef default fill:#CCE6FF,stroke:#84BCF5,textColor:#1C2B33,fontFamily:trebuchet ms;\n",
+    "  \"\"\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "i4Np_l_KtIno"
+   },
+   "source": [
+    "##**1 - Understanding Llama 2**"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "PGPSI3M5PGTi"
+   },
+   "source": [
+    "### **1.1 - What is Llama 2?**\n",
+    "\n",
+    "* State of the art (SOTA), Open Source LLM\n",
+    "* 7B, 13B, 70B\n",
+    "* Pretrained + Chat\n",
+    "* Choosing model: Size, Quality, Cost, Speed\n",
+    "* [Research paper](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/)\n",
+    "\n",
+    "* [Responsible use guide](https://ai.meta.com/llama/responsible-use-guide/)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 240
+    },
+    "executionInfo": {
+     "elapsed": 248,
+     "status": "ok",
+     "timestamp": 1695832233087,
+     "user": {
+      "displayName": "Amit Sangani",
+      "userId": "11552178012079240149"
+     },
+     "user_tz": 420
+    },
+    "id": "OXRCC7wexZXd",
+    "outputId": "1feb1918-df4b-4cec-d09e-ffe55c12090b"
+   },
+   "outputs": [],
+   "source": [
+    "llama2_family()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "aYeHVVh45bdT"
+   },
+   "source": [
+    "###**1.2 - Accessing Llama 2**\n",
+    "* Download + Self Host (on-premise)\n",
+    "* Hosted API Platform (e.g. [OctoAI](https://octoai.cloud/), [Replicate](https://replicate.com/meta))\n",
+    "* Hosted Container Platform (e.g. [Azure](https://techcommunity.microsoft.com/t5/ai-machine-learning-blog/introducing-llama-2-on-azure/ba-p/3881233), [AWS](https://aws.amazon.com/blogs/machine-learning/llama-2-foundation-models-from-meta-are-now-available-in-amazon-sagemaker-jumpstart/), [GCP](https://console.cloud.google.com/vertex-ai/publishers/google/model-garden/139))\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "kBuSay8vtzL4"
+   },
+   "source": [
+    "### **1.3 - Use Cases of Llama 2**\n",
+    "* Content Generation\n",
+    "* Chatbots\n",
+    "* Summarization\n",
+    "* Programming (e.g. Code Llama)\n",
+    "\n",
+    "* and many more..."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "sd54g0OHuqBY"
+   },
+   "source": [
+    "##**2 - Using Llama 2**\n",
+    "\n",
+    "In this notebook, we are going to access [Llama 13b chat model](https://octoai.cloud/tools/text/chat?mode=demo&model=llama-2-13b-chat-fp16) using hosted API from OctoAI."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "h3YGMDJidHtH"
+   },
+   "source": [
+    "### **2.1 - Install dependencies**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "VhN6hXwx7FCp"
+   },
+   "outputs": [],
+   "source": [
+    "# Install dependencies and initialize\n",
+    "%pip install -qU \\\n",
+    "    octoai-sdk \\\n",
+    "    langchain \\\n",
+    "    sentence_transformers \\\n",
+    "    pdf2image \\\n",
+    "    pdfminer \\\n",
+    "    pdfminer.six \\\n",
+    "    unstructured \\\n",
+    "    faiss-cpu \\\n",
+    "    pillow-heif \\\n",
+    "    opencv-python \\\n",
+    "    unstructured-inference \\\n",
+    "    pikepdf"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "Z8Y8qjEjmg50"
+   },
+   "outputs": [],
+   "source": [
+    "# model on OctoAI platform that we will use for inferencing\n",
+    "# We will use llama 13b chat model hosted on OctoAI server ()\n",
+    "\n",
+    "llama2_13b = \"llama-2-13b-chat-fp16\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "8hkWpqWD28ho"
+   },
+   "outputs": [],
+   "source": [
+    "# We will use OctoAI hosted cloud environment\n",
+    "# Obtain OctoAI API key → https://octo.ai/docs/getting-started/how-to-create-an-octoai-access-token\n",
+    "\n",
+    "# enter your replicate api token\n",
+    "from getpass import getpass\n",
+    "import os\n",
+    "\n",
+    "OCTOAI_API_TOKEN = getpass()\n",
+    "os.environ[\"OCTOAI_API_TOKEN\"] = OCTOAI_API_TOKEN\n",
+    "\n",
+    "# alternatively, you can also store the tokens in environment variables and load it here"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "bVCHZmETk36v"
+   },
+   "outputs": [],
+   "source": [
+    "# we will use OctoAI's hosted API\n",
+    "from octoai.client import Client\n",
+    "\n",
+    "client = Client(OCTOAI_API_TOKEN)\n",
+    "\n",
+    "# text completion with input prompt\n",
+    "def Completion(prompt):\n",
+    "    output = client.chat.completions.create(\n",
+    "        messages=[\n",
+    "            {\n",
+    "                \"role\": \"user\",\n",
+    "                \"content\": prompt\n",
+    "            }\n",
+    "        ],\n",
+    "        model=\"llama-2-13b-chat-fp16\",\n",
+    "        max_tokens=1000\n",
+    "    )\n",
+    "    return output.choices[0].message.content\n",
+    "\n",
+    "# chat completion with input prompt and system prompt\n",
+    "def ChatCompletion(prompt, system_prompt=None):\n",
+    "    output = client.chat.completions.create(\n",
+    "        messages=[\n",
+    "            {\n",
+    "                \"role\": \"system\",\n",
+    "                \"content\": system_prompt\n",
+    "            },\n",
+    "            {\n",
+    "                \"role\": \"user\",\n",
+    "                \"content\": prompt\n",
+    "            }\n",
+    "        ],\n",
+    "        model=\"llama-2-13b-chat-fp16\",\n",
+    "        max_tokens=1000\n",
+    "    )\n",
+    "    return output.choices[0].message.content"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "5Jxq0pmf6L73"
+   },
+   "source": [
+    "### **2.2 - Basic completion**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "H93zZBIk6tNU"
+   },
+   "outputs": [],
+   "source": [
+    "output = Completion(prompt=\"The typical color of a llama is: \")\n",
+    "md(output)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "StccjUDh6W0Q"
+   },
+   "source": [
+    "### **2.3 - System prompts**\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "VRnFogxd6rTc"
+   },
+   "outputs": [],
+   "source": [
+    "output = ChatCompletion(\n",
+    "    prompt=\"The typical color of a llama is: \",\n",
+    "    system_prompt=\"respond with only one word\"\n",
+    "  )\n",
+    "md(output)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "Hp4GNa066pYy"
+   },
+   "source": [
+    "### **2.4 - Response formats**\n",
+    "* Can support different formatted outputs e.g. text, JSON, etc."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "HTN79h4RptgQ"
+   },
+   "outputs": [],
+   "source": [
+    "output = ChatCompletion(\n",
+    "    prompt=\"The typical color of a llama is: \",\n",
+    "    system_prompt=\"response in json format\"\n",
+    "  )\n",
+    "md(output)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "cWs_s9y-avIT"
+   },
+   "source": [
+    "## **3 - Gen AI Application Architecture**\n",
+    "\n",
+    "Here is the high-level tech stack/architecture of Generative AI application."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 446
+    },
+    "executionInfo": {
+     "elapsed": 405,
+     "status": "ok",
+     "timestamp": 1695832253437,
+     "user": {
+      "displayName": "Amit Sangani",
+      "userId": "11552178012079240149"
+     },
+     "user_tz": 420
+    },
+    "id": "j9BGuI-9AOL5",
+    "outputId": "72b2613f-a434-4219-f063-52a409af97cc"
+   },
+   "outputs": [],
+   "source": [
+    "genai_app_arch()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "6UlxBtbgys6j"
+   },
+   "source": [
+    "##4 - **Chatbot Architecture**\n",
+    "\n",
+    "Here are the key components and the information flow in a chatbot.\n",
+    "\n",
+    "* User Prompts\n",
+    "* Input Safety\n",
+    "* Llama 2\n",
+    "* Output Safety\n",
+    "\n",
+    "* Memory & Context"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 178
+    },
+    "executionInfo": {
+     "elapsed": 249,
+     "status": "ok",
+     "timestamp": 1695832257063,
+     "user": {
+      "displayName": "Amit Sangani",
+      "userId": "11552178012079240149"
+     },
+     "user_tz": 420
+    },
+    "id": "tO5HnB56ys6t",
+    "outputId": "f222d35b-626f-4dc1-b7af-a156a0f3d58b"
+   },
+   "outputs": [],
+   "source": [
+    "bot_arch()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "r4DyTLD5ys6t"
+   },
+   "source": [
+    "### **4.1 - Chat conversation**\n",
+    "* LLMs are stateless\n",
+    "* Single Turn\n",
+    "\n",
+    "* Multi Turn (Memory)\n",
+    "\n"
+   ]
+  },
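+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The diagram below (rendered with the `mem_context` helper defined in the presentation-layer cell at the top) shows how the user prompt, instructions, examples, and memory are combined into the context that is tokenized, embedded, and fed to the LLM."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mem_context()"
+   ]
+  },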
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "EMM_egWMys6u"
+   },
+   "outputs": [],
+   "source": [
+    "# example of single turn chat\n",
+    "prompt_chat = \"What is the average lifespan of a Llama?\"\n",
+    "output = ChatCompletion(prompt=prompt_chat, system_prompt=\"answer the last question in few words\")\n",
+    "md(output)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "sZ7uVKDYucgi"
+   },
+   "outputs": [],
+   "source": [
+    "# example without previous context. LLM's are stateless and cannot understand \"they\" without previous context\n",
+    "prompt_chat = \"What animal family are they?\"\n",
+    "output = ChatCompletion(prompt=prompt_chat, system_prompt=\"answer the last question in few words\")\n",
+    "md(output)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "WQl3wmfbyBQ1"
+   },
+   "source": [
+    "Chat app requires us to send in previous context to LLM to get in valid responses. Below is an example of Multi-turn chat."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "t7SZe5fT3HG3"
+   },
+   "outputs": [],
+   "source": [
+    "# example of multi-turn chat, with storing previous context\n",
+    "prompt_chat = \"\"\"\n",
+    "User: What is the average lifespan of a Llama?\n",
+    "Assistant: Sure! The average lifespan of a llama is around 20-30 years.\n",
+    "User: What animal family are they?\n",
+    "\"\"\"\n",
+    "output = ChatCompletion(prompt=prompt_chat, system_prompt=\"answer the last question\")\n",
+    "md(output)"
+   ]
+  },
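+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As an aside, a cleaner way to carry memory is to keep a running list of messages in the chat format the API already expects, instead of concatenating strings. The sketch below reuses the `client` and `llama2_13b` defined earlier; `chat_with_memory` is an illustrative helper name for this notebook, not part of the OctoAI SDK."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Illustrative sketch: multi-turn memory kept as a running message list.\n",
+    "# Reuses the OctoAI client and model name defined above.\n",
+    "chat_history = [\n",
+    "    {\"role\": \"system\", \"content\": \"answer the last question in a few words\"}\n",
+    "]\n",
+    "\n",
+    "def chat_with_memory(user_message):\n",
+    "    chat_history.append({\"role\": \"user\", \"content\": user_message})\n",
+    "    output = client.chat.completions.create(\n",
+    "        messages=chat_history,\n",
+    "        model=llama2_13b,\n",
+    "        max_tokens=1000\n",
+    "    )\n",
+    "    reply = output.choices[0].message.content\n",
+    "    # store the assistant's reply so follow-up questions have context\n",
+    "    chat_history.append({\"role\": \"assistant\", \"content\": reply})\n",
+    "    return reply\n",
+    "\n",
+    "md(chat_with_memory(\"What is the average lifespan of a Llama?\"))\n",
+    "md(chat_with_memory(\"What animal family are they?\"))"
+   ]
+  },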
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "moXnmJ_xyD10"
+   },
+   "source": [
+    "### **4.2 - Prompt Engineering**\n",
+    "* Prompt engineering refers to the science of designing effective prompts to get desired responses\n",
+    "\n",
+    "* Helps reduce hallucination\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "t-v-FeZ4ztTB"
+   },
+   "source": [
+    "#### **4.2.1 - In-Context Learning (e.g. Zero-shot, Few-shot)**\n",
+    " * In-context learning - specific method of prompt engineering where demonstration of task are provided as part of prompt.\n",
+    "  1. Zero-shot learning - model is performing tasks without any\n",
+    "input examples.\n",
+    "  2. Few or “N-Shot” Learning - model is performing and behaving based on input examples in user's prompt."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "6W71MFNZyRkQ"
+   },
+   "outputs": [],
+   "source": [
+    "# Zero-shot example. To get positive/negative/neutral sentiment, we need to give examples in the prompt\n",
+    "prompt = '''\n",
+    "Classify: I saw a Gecko.\n",
+    "Sentiment: ?\n",
+    "'''\n",
+    "output = ChatCompletion(prompt, system_prompt=\"one word response\")\n",
+    "md(output)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "MCQRjf1Y1RYJ"
+   },
+   "outputs": [],
+   "source": [
+    "# By giving examples to Llama, it understands the expected output format.\n",
+    "\n",
+    "prompt = '''\n",
+    "Classify: I love Llamas!\n",
+    "Sentiment: Positive\n",
+    "Classify: I dont like Snakes.\n",
+    "Sentiment: Negative\n",
+    "Classify: I saw a Gecko.\n",
+    "Sentiment:'''\n",
+    "\n",
+    "output = ChatCompletion(prompt, system_prompt=\"One word response\")\n",
+    "md(output)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "8UmdlTmpDZxA"
+   },
+   "outputs": [],
+   "source": [
+    "# another zero-shot learning\n",
+    "prompt = '''\n",
+    "QUESTION: Vicuna?\n",
+    "ANSWER:'''\n",
+    "\n",
+    "output = ChatCompletion(prompt, system_prompt=\"one word response\")\n",
+    "md(output)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "M_EcsUo1zqFD"
+   },
+   "outputs": [],
+   "source": [
+    "# Another few-shot learning example with formatted prompt.\n",
+    "\n",
+    "prompt = '''\n",
+    "QUESTION: Llama?\n",
+    "ANSWER: Yes\n",
+    "QUESTION: Alpaca?\n",
+    "ANSWER: Yes\n",
+    "QUESTION: Rabbit?\n",
+    "ANSWER: No\n",
+    "QUESTION: Vicuna?\n",
+    "ANSWER:'''\n",
+    "\n",
+    "output = ChatCompletion(prompt, system_prompt=\"one word response\")\n",
+    "md(output)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "mbr124Y197xl"
+   },
+   "source": [
+    "#### **4.2.2 - Chain of Thought**\n",
+    "\"Chain of thought\" enables complex reasoning through logical step by step thinking and generates meaningful and contextually relevant responses."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "Xn8zmLBQzpgj"
+   },
+   "outputs": [],
+   "source": [
+    "# Standard prompting\n",
+    "prompt = '''\n",
+    "Llama started with 5 tennis balls. It buys 2 more cans of tennis balls. Each can has 3 tennis balls. How many tennis balls does Llama have now?\n",
+    "'''\n",
+    "\n",
+    "output = ChatCompletion(prompt, system_prompt=\"provide short answer\")\n",
+    "md(output)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "lKNOj79o1Kwu"
+   },
+   "outputs": [],
+   "source": [
+    "# Chain-Of-Thought prompting\n",
+    "prompt = '''\n",
+    "Llama started with 5 tennis balls. It buys 2 more cans of tennis balls. Each can has 3 tennis balls. How many tennis balls does Llama have now?\n",
+    "Let's think step by step.\n",
+    "'''\n",
+    "\n",
+    "output = ChatCompletion(prompt, system_prompt=\"provide short answer\")\n",
+    "md(output)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "C7tDW-AH770Y"
+   },
+   "source": [
+    "### **4.3 - Retrieval Augmented Generation (RAG)**\n",
+    "* Prompt Eng Limitations - Knowledge cutoff & lack of specialized data\n",
+    "\n",
+    "* Retrieval Augmented Generation(RAG) allows us to retrieve snippets of information from external data sources and augment it to the user's prompt to get tailored responses from Llama 2.\n",
+    "\n",
+    "For our demo, we are going to download an external PDF file from a URL and query against the content in the pdf file to get contextually relevant information back with the help of Llama!\n",
+    "\n",
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 259
+    },
+    "executionInfo": {
+     "elapsed": 329,
+     "status": "ok",
+     "timestamp": 1695832267093,
+     "user": {
+      "displayName": "Amit Sangani",
+      "userId": "11552178012079240149"
+     },
+     "user_tz": 420
+    },
+    "id": "Fl1LPltpRQD9",
+    "outputId": "4410c9bf-3559-4a05-cebb-a5731bb094c1"
+   },
+   "outputs": [],
+   "source": [
+    "rag_arch()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "JJaGMLl_4vYm"
+   },
+   "source": [
+    "#### **4.3.1 - LangChain**\n",
+    "LangChain is a framework that helps make it easier to implement RAG."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "aoqU3KTcHTWN"
+   },
+   "outputs": [],
+   "source": [
+    "# langchain setup\n",
+    "from langchain.llms.octoai_endpoint import OctoAIEndpoint\n",
+    "# Use the Llama 2 model hosted on OctoAI\n",
+    "# Temperature: Adjusts randomness of outputs, greater than 1 is random and 0 is deterministic, 0.75 is a good starting value\n",
+    "# top_p: When decoding text, samples from the top p percentage of most likely tokens; lower to ignore less likely tokens\n",
+    "# max_new_tokens: Maximum number of tokens to generate. A word is generally 2-3 tokens\n",
+    "llama_model = OctoAIEndpoint(\n",
+    "    endpoint_url=\"https://text.octoai.run/v1/chat/completions\",\n",
+    "    model_kwargs={\n",
+    "        \"model\": llama2_13b,\n",
+    "        \"messages\": [\n",
+    "            {\n",
+    "                \"role\": \"system\",\n",
+    "                \"content\": \"You are a helpful, respectful and honest assistant.\"\n",
+    "            }\n",
+    "        ],\n",
+    "        \"max_tokens\": 1000,\n",
+    "        \"top_p\": 1,\n",
+    "        \"temperature\": 0.75\n",
+    "    },\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "gAV2EkZqcruF"
+   },
+   "outputs": [],
+   "source": [
+    "# Step 1: load the external data source. In our case, we will load Meta’s “Responsible Use Guide” pdf document.\n",
+    "from langchain.document_loaders import OnlinePDFLoader\n",
+    "loader = OnlinePDFLoader(\"https://ai.meta.com/static-resource/responsible-use-guide/\")\n",
+    "documents = loader.load()\n",
+    "\n",
+    "# Step 2: Get text splits from document\n",
+    "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
+    "text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)\n",
+    "all_splits = text_splitter.split_documents(documents)\n",
+    "\n",
+    "# Step 3: Use the embedding model\n",
+    "from langchain.vectorstores import FAISS\n",
+    "from langchain.embeddings import OctoAIEmbeddings\n",
+    "embeddings = OctoAIEmbeddings(endpoint_url=\"https://text.octoai.run/v1/embeddings\")\n",
+    "\n",
+    "# Step 4: Use vector store to store embeddings\n",
+    "vectorstore = FAISS.from_documents(all_splits, embeddings)"
+   ]
+  },
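+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Before wiring the retriever into a chain, it can help to sanity-check the vector store directly. The optional snippet below uses LangChain's standard `similarity_search` API to show which document chunks would be retrieved for a question."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional sanity check: query the vector store directly to inspect which\n",
+    "# chunks of the PDF are most similar to a question, before involving the LLM.\n",
+    "question = \"What is Meta's approach to responsible AI?\"\n",
+    "docs = vectorstore.similarity_search(question, k=3)\n",
+    "for doc in docs:\n",
+    "    print(doc.page_content[:200])\n",
+    "    print(\"---\")"
+   ]
+  },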
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "K2l8S5tBxlkc"
+   },
+   "source": [
+    "#### **4.3.2 - LangChain Q&A Retriever**\n",
+    "* ConversationalRetrievalChain\n",
+    "\n",
+    "* Query the Source documents\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "NmEhBe3Kiyre"
+   },
+   "outputs": [],
+   "source": [
+    "# Query against your own data\n",
+    "from langchain.chains import ConversationalRetrievalChain\n",
+    "chain = ConversationalRetrievalChain.from_llm(llama_model, vectorstore.as_retriever(), return_source_documents=True)\n",
+    "\n",
+    "chat_history = []\n",
+    "query = \"How is Meta approaching open science in two short sentences?\"\n",
+    "result = chain.invoke({\"question\": query, \"chat_history\": chat_history})\n",
+    "md(result['answer'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "CelLHIvoy2Ke"
+   },
+   "outputs": [],
+   "source": [
+    "# This time your previous question and answer will be included as a chat history which will enable the ability\n",
+    "# to ask follow up questions.\n",
+    "chat_history = [(query, result[\"answer\"])]\n",
+    "query = \"How is it benefiting the world?\"\n",
+    "result = chain({\"question\": query, \"chat_history\": chat_history})\n",
+    "md(result['answer'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "TEvefAWIJONx"
+   },
+   "source": [
+    "## **5 - Fine-Tuning Models**\n",
+    "\n",
+    "* Limitatons of Prompt Eng and RAG\n",
+    "* Fine-Tuning Arch\n",
+    "* Types (PEFT, LoRA, QLoRA)\n",
+    "* Using PyTorch for Pre-Training & Fine-Tuning\n",
+    "\n",
+    "* Evals + Quality\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 79
+    },
+    "executionInfo": {
+     "elapsed": 327,
+     "status": "ok",
+     "timestamp": 1695832272878,
+     "user": {
+      "displayName": "Amit Sangani",
+      "userId": "11552178012079240149"
+     },
+     "user_tz": 420
+    },
+    "id": "0a9CvJ8YcTzV",
+    "outputId": "56a6d573-a195-4e3c-834d-a3b23485186c"
+   },
+   "outputs": [],
+   "source": [
+    "fine_tuned_arch()"
+   ]
+  },
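+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To make the types above concrete, here is a minimal LoRA sketch using the Hugging Face `peft` library. This is illustrative only: it assumes `transformers` and `peft` are installed (they are not part of this notebook's dependencies), that you have approved access to the Llama 2 weights on Hugging Face, and it stops before the actual training loop."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Minimal LoRA sketch (illustrative; assumes transformers + peft are installed\n",
+    "# and you have approved access to the Llama 2 weights on Hugging Face).\n",
+    "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
+    "from peft import LoraConfig, get_peft_model\n",
+    "\n",
+    "base_model = \"meta-llama/Llama-2-7b-hf\"\n",
+    "tokenizer = AutoTokenizer.from_pretrained(base_model)\n",
+    "model = AutoModelForCausalLM.from_pretrained(base_model)\n",
+    "\n",
+    "# LoRA freezes the base weights and trains small low-rank adapter matrices,\n",
+    "# so only a tiny fraction of parameters is updated during fine-tuning\n",
+    "lora_config = LoraConfig(\n",
+    "    r=8,                                  # rank of the adapter matrices\n",
+    "    lora_alpha=32,                        # scaling factor for adapter updates\n",
+    "    target_modules=[\"q_proj\", \"v_proj\"],  # attention projections to adapt\n",
+    "    lora_dropout=0.05,\n",
+    "    task_type=\"CAUSAL_LM\"\n",
+    ")\n",
+    "model = get_peft_model(model, lora_config)\n",
+    "model.print_trainable_parameters()  # typically well under 1% of all parameters"
+   ]
+  },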
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "_8lcgdZa8onC"
+   },
+   "source": [
+    "## **6 - Responsible AI**\n",
+    "\n",
+    "* Power + Responsibility\n",
+    "* Hallucinations\n",
+    "* Input & Output Safety\n",
+    "* Red-teaming (simulating real-world cyber attackers)\n",
+    "* [Responsible Use Guide](https://ai.meta.com/llama/responsible-use-guide/)\n",
+    "\n"
+   ]
+  },
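+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As a toy illustration of input/output safety, the sketch below wraps the `ChatCompletion` helper from earlier in a naive LLM-based check. A production system should use a dedicated safety classifier or moderation service; `is_safe` and `safe_completion` are hypothetical names for this notebook."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Toy input/output safety gate built on the ChatCompletion helper above.\n",
+    "# Illustrative only; real applications should use a dedicated safety model.\n",
+    "def is_safe(text):\n",
+    "    verdict = ChatCompletion(\n",
+    "        prompt=\"Classify the following text as safe or unsafe: \" + text,\n",
+    "        system_prompt=\"Respond with exactly one word: safe or unsafe\"\n",
+    "    )\n",
+    "    return \"unsafe\" not in verdict.lower()\n",
+    "\n",
+    "def safe_completion(prompt):\n",
+    "    if not is_safe(prompt):  # input safety check\n",
+    "        return \"Sorry, I cannot help with that request.\"\n",
+    "    output = ChatCompletion(prompt, system_prompt=\"You are a helpful, respectful and honest assistant.\")\n",
+    "    if not is_safe(output):  # output safety check\n",
+    "        return \"The generated response was filtered.\"\n",
+    "    return output\n",
+    "\n",
+    "md(safe_completion(\"What is the average lifespan of a Llama?\"))"
+   ]
+  },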
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "pbqb006R-T_k"
+   },
+   "source": [
+    "##**7 - Conclusion**\n",
+    "* Active research on LLMs and Llama\n",
+    "* Leverage the power of Llama and its open community\n",
+    "* Safety and responsible use is paramount!\n",
+    "\n",
+    "* Call-To-Action\n",
+    "  * [Replicate Free Credits](https://replicate.fyi/connect2023) for Connect attendees!\n",
+    "  * This notebook is available through Llama Github recipes\n",
+    "  * Use Llama in your projects and give us feedback\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "gSz5dTMxp7xo"
+   },
+   "source": [
+    "#### **Resources**\n",
+    "- [GitHub - Llama 2](https://github.com/facebookresearch/llama)\n",
+    "- [Github - LLama 2 Recipes](https://github.com/facebookresearch/llama-recipes)\n",
+    "- [Llama 2](https://ai.meta.com/llama/)\n",
+    "- [Research Paper](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/)\n",
+    "- [Model Card](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md)\n",
+    "- [Responsible Use Guide](https://ai.meta.com/llama/responsible-use-guide/)\n",
+    "- [Acceptable Use Policy](https://ai.meta.com/llama/use-policy/)\n",
+    "- [OctoAI](https://octoai.cloud/)\n",
+    "- [LangChain](https://www.langchain.com/)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "V7aI6fhZp-KC"
+   },
+   "source": [
+    "#### **Authors & Contact**\n",
+    "  * asangani@meta.com, [Amit Sangani | LinkedIn](https://www.linkedin.com/in/amitsangani/)\n",
+    "  * mohsena@meta.com, [Mohsen Agsen | LinkedIn](https://www.linkedin.com/in/mohsen-agsen-62a9791/)\n",
+    "  * Adapted to run on OctoAI by Thierry Moreau - tmoreau@octo.ai"
+   ]
+  }
+ ],
+ "metadata": {
+  "colab": {
+   "collapsed_sections": [
+    "ioVMNcTesSEk"
+   ],
+   "machine_shape": "hm",
+   "provenance": [],
+   "toc_visible": true
+  },
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}