From 410d8cf486b0e113660ed8bc1fe3087ef601090a Mon Sep 17 00:00:00 2001
From: Jeff Tang <jeff.x.tang@gmail.com>
Date: Tue, 10 Oct 2023 17:16:26 -0700
Subject: [PATCH] update based on PR feedback

---
 llama-demo-apps/BreakingNews.ipynb    | 14 +++-----------
 llama-demo-apps/HelloLlamaCloud.ipynb | 27 ++++++++++++---------------
 llama-demo-apps/Llama2_Gradio.ipynb   |  2 +-
 llama-demo-apps/README.md             | 21 ++++++++++++++++++---
 llama-demo-apps/StructuredLlama.ipynb | 18 +++---------------
 llama-demo-apps/VideoSummary.ipynb    | 16 ++++------------
 llama-demo-apps/streamlit_llama2.py   |  4 ++--
 7 files changed, 43 insertions(+), 59 deletions(-)

diff --git a/llama-demo-apps/BreakingNews.ipynb b/llama-demo-apps/BreakingNews.ipynb
index 8d8fd450..fea9ef4b 100644
--- a/llama-demo-apps/BreakingNews.ipynb
+++ b/llama-demo-apps/BreakingNews.ipynb
@@ -82,25 +82,17 @@
 },
 {
 "cell_type": "code",
- "execution_count": 4,
+ "execution_count": null,
 "id": "c12fc2cb",
 "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Init param `input` is deprecated, please use `model_kwargs` instead.\n"
- ]
- }
- ],
+ "outputs": [],
 "source": [
 "# set llm to be using Llama2 hosted on Replicate\n",
 "llama2_13b_chat = \"meta/llama-2-13b-chat:f4e2de70d66816a838a89eeeb621910adffb0dd0baba3976c96980970978018d\"\n",
 "\n",
 "llm = Replicate(\n",
 " model=llama2_13b_chat,\n",
- " input={\"temperature\": 0.01, \"max_length\": 2000, \"top_p\": 1},\n",
+ " model_kwargs={\"temperature\": 0.01, \"top_p\": 1, \"max_new_tokens\":500}\n",
 ")"
 ]
 },
diff --git a/llama-demo-apps/HelloLlamaCloud.ipynb b/llama-demo-apps/HelloLlamaCloud.ipynb
index 8d914cf6..ad564aae 100644
--- a/llama-demo-apps/HelloLlamaCloud.ipynb
+++ b/llama-demo-apps/HelloLlamaCloud.ipynb
@@ -20,7 +20,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
- "!pip install langchain replicate sentence-transformers"
+ "!pip install langchain replicate sentence-transformers chromadb"
 ]
 },
 {
@@ -47,25 +47,17 @@
 },
 {
 "cell_type": "code",
- "execution_count": 3,
+ "execution_count": null,
 "id": "ad536adb",
 "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Init param `input` is deprecated, please use `model_kwargs` instead.\n"
- ]
- }
- ],
+ "outputs": [],
 "source": [
 "from langchain.llms import Replicate\n",
 "\n",
 "llama2_13b = \"meta/llama-2-13b-chat:f4e2de70d66816a838a89eeeb621910adffb0dd0baba3976c96980970978018d\"\n",
 "llm = Replicate(\n",
 " model=llama2_13b,\n",
- " input={\"temperature\": 0.01, \"max_length\": 500, \"top_p\": 1},\n",
+ " model_kwargs={\"temperature\": 0.01, \"top_p\": 1, \"max_new_tokens\":500}\n",
 ")"
 ]
 },
@@ -220,7 +212,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
- "# there're more 30 vector stores (DBs) supported by LangChain. Chroma is light-weight and in memory so it's easy to get started with\n",
+ "# there're more than 30 vector stores (DBs) supported by LangChain. Chroma is light-weight and in memory so it's easy to get started with\n",
 "# other vector stores can be used to store large amount of data - see https://python.langchain.com/docs/integrations/vectorstores\n",
 "from langchain.vectorstores import Chroma\n",
 "\n",
@@ -238,7 +230,9 @@
 "metadata": {},
 "outputs": [],
 "source": [
- "# split the loaded documents into chunks \n",
+ "# split the loaded documents into chunks. \n",
+ "# in general, use larger chunk sizes for highly structured text such as code and smaller size for \n",
+ "# less structured text. You may need to experiment with different chunk sizes and overlap values to find out the best numbers.\n",
 "text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)\n",
 "all_splits = text_splitter.split_documents(docs)\n",
 "\n",
@@ -387,7 +381,10 @@
 "chat_history.append((followup, followup_answer[\"answer\"]))\n",
 "more_followup = \"what tasks can it assist with?\"\n",
 "more_followup_answer = chat_chain({\"question\": more_followup, \"chat_history\": chat_history})\n",
- "print(more_followup_answer['answer'])"
+ "print(more_followup_answer['answer'])\n",
+ "\n",
+ "# results get cut off - you may set \"max_new_tokens\" in the Replicate call above to a larger number (like 1000 below) to avoid the cut off\n",
+ "# model_kwargs={\"temperature\": 0.01, \"top_p\": 1, \"max_new_tokens\": 1000}"
 ]
 }
 ],
diff --git a/llama-demo-apps/Llama2_Gradio.ipynb b/llama-demo-apps/Llama2_Gradio.ipynb
index 9b8e1856..d23d95f7 100644
--- a/llama-demo-apps/Llama2_Gradio.ipynb
+++ b/llama-demo-apps/Llama2_Gradio.ipynb
@@ -55,7 +55,7 @@
 "\n",
 "llm = Replicate(\n",
 " model=llama2_13b_chat,\n",
- " input={\"temperature\": 0.01, \"max_length\": 2000, \"top_p\": 1},\n",
+ " model_kwargs={\"temperature\": 0.01, \"top_p\": 1, \"max_new_tokens\":500}\n",
 ")\n",
 "\n",
 "\n",
diff --git a/llama-demo-apps/README.md b/llama-demo-apps/README.md
index c240c65a..ef86b035 100644
--- a/llama-demo-apps/README.md
+++ b/llama-demo-apps/README.md
@@ -1,8 +1,8 @@
 # Llama2 Demo Apps

-This folder showcases the Llama2-powered apps. If you need a general understanding of GenAI, Llama2, prompt engineering and RAG, be sure to first check the [Getting to know Llama 2 notebook](https://github.com/facebookresearch/llama-recipes/blob/main/examples/Getting_to_know_Llama.ipynb) and its Meta Connect video [here](https://www.facebook.com/watch/?v=662153709222699).
+This folder showcases Llama2-powered demo apps. If you need a general understanding of GenAI, Llama2, prompt engineering and RAG, be sure to first check the [Getting to know Llama 2 notebook](https://github.com/facebookresearch/llama-recipes/blob/main/examples/Getting_to_know_Llama.ipynb) and its Meta Connect video [here](https://www.facebook.com/watch/?v=662153709222699).

-Here we start with three quickstart demos showing how to run Llama2 locally on a Mac, remotely in the cloud, and on a Google Colab to ask Llama2 general questions or questions about unstructured data not trained for the model.
+We start with three quickstart demos showing how to run Llama2 locally on a Mac, remotely in the cloud, and on a Google Colab to ask Llama2 general questions or questions about unstructured data not trained for the model.

 We then show three demos that ask Llama2 to summarize a YouTube video, to answer questions about structured data stored in a database, and to answer questions about live search results.

@@ -10,6 +10,21 @@ We also show how to build quick web UI for Llama2 demo apps using Streamlit and

 More advanced Llama2 demo apps will be coming soon.

+## Setting Up Environment
+
+The quickest way to test run the notebook demo apps on your local machine is to create a Conda environment and start running the Jupyter notebook as follows:
+```
+conda create -n llama-demo-apps python=3.8
+conda activate llama-demo-apps
+pip install jupyter
+cd <your_work_folder>
+git clone https://github.com/facebookresearch/llama-recipes
+cd llama-recipes/llama-demo-apps
+jupyter notebook
+```
+
+You can also upload the notebooks to Google Colab.
+
 ## HelloLlama - Quickstart in Running Llama2 (Almost) Everywhere*

 The first three demo apps show:
@@ -19,7 +34,7 @@ The first three demo apps show:
 * how to ask follow up questions to Llama by sending previous questions and answers as the context along with the new question, hence performing multi-turn chat or conversation with Llama.

 ### [Running Llama2 Locally on Mac](HelloLlamaLocal.ipynb)
-To run Llama2 locally on Mac using [llama-cpp-python](https://github.com/abetlen/llama-cpp-python), first open the notebook `HelloLlamaLocal`. Then replace `<path-to-ggml-model-q4_0.gguf>` in the notebook `HelloLlamaLocal` with the path either to your downloaded quantized model file [here](https://drive.google.com/file/d/1afPv3HOy73BE2MoYCgYJvBDeQNa9rZbj/view?usp=sharing), or to the `ggml-model-q4_0.gguf` file built with the following commands:
+To run Llama2 locally on Mac using [llama-cpp-python](https://github.com/abetlen/llama-cpp-python), first open the notebook `HelloLlamaLocal`. Then replace `<path-to-ggml-model-q4_0.gguf>` in the notebook `HelloLlamaLocal` with the path either to your downloaded quantized model file [here](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GGUF/resolve/main/llama-2-7b-chat.Q4_0.gguf), or to the `ggml-model-q4_0.gguf` file built with the following commands:
 ```
 git clone https://github.com/ggerganov/llama.cpp
 cd llama.cpp
diff --git a/llama-demo-apps/StructuredLlama.ipynb b/llama-demo-apps/StructuredLlama.ipynb
index ec3dbd1f..ddc3aa29 100644
--- a/llama-demo-apps/StructuredLlama.ipynb
+++ b/llama-demo-apps/StructuredLlama.ipynb
@@ -57,18 +57,10 @@
 },
 {
 "cell_type": "code",
- "execution_count": 3,
+ "execution_count": null,
 "id": "9dcd744c",
 "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Init param `input` is deprecated, please use `model_kwargs` instead.\n"
- ]
- }
- ],
+ "outputs": [],
 "source": [
 "llama2_13b_chat = \"meta/llama-2-13b-chat:f4e2de70d66816a838a89eeeb621910adffb0dd0baba3976c96980970978018d\"\n",
 "\n",
@@ -76,7 +68,7 @@
 "# \"Sure! Here's the SQL query for the given input question: \" before the SQL query; otherwise custom parsing will be needed.\n",
 "llm = Replicate(\n",
 " model=llama2_13b_chat,\n",
- " input={\"temperature\": 0.01, \"max_length\": 500, \"top_p\": 1, \"system_prompt\": \"Given an input question, convert it to a SQL query. No pre-amble.\"},\n",
+ " model_kwargs={\"temperature\": 0.01, \"top_p\": 1, \"max_new_tokens\":500, \"system_prompt\": \"Given an input question, convert it to a SQL query. No pre-amble.\"},\n",
 ")"
 ]
 },
@@ -89,10 +81,6 @@
 "source": [
 "db = SQLDatabase.from_uri(\"sqlite:///nba_roster.db\", sample_rows_in_table_info= 0)\n",
 "\n",
- "# use the default sqlite prompt defined in \n",
- "# https://github.com/langchain-ai/langchain/blob/33eb5f8300cd21c91a2f8d10c62197637931fa0a/libs/langchain/langchain/chains/sql_database/prompt.py#L211\n",
- "# db_chain = SQLDatabaseChain.from_llm(llm, db, verbose=True)\n",
- "\n",
 "# customize the default sqlite prompt defined in the link above\n",
 "PROMPT_SUFFIX = \"\"\"\n",
 "Only use the following tables:\n",
diff --git a/llama-demo-apps/VideoSummary.ipynb b/llama-demo-apps/VideoSummary.ipynb
index 7d6fab95..edcab0b3 100644
--- a/llama-demo-apps/VideoSummary.ipynb
+++ b/llama-demo-apps/VideoSummary.ipynb
@@ -8,7 +8,7 @@
 "## This demo app shows:\n",
 "* how to use LangChain's YoutubeLoader to retrieve the caption in a YouTube video;\n",
 "* how to ask Llama to summarize the content (per the Llama's input size limit) of the video in a naive way using LangChain's stuff method;\n",
- "* how to bypass the limit of Llama's max input token size by using more sophisticated way using LangChain's map_reduce and refine methods."
+ "* how to bypass the limit of Llama's max input token size by using a more sophisticated way using LangChain's map_reduce and refine methods - see [here](https://python.langchain.com/docs/use_cases/summarization) for more info."
 ]
 },
 {
@@ -94,18 +94,10 @@
 },
 {
 "cell_type": "code",
- "execution_count": 5,
+ "execution_count": null,
 "id": "adf8cf3d",
 "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Init param `input` is deprecated, please use `model_kwargs` instead.\n"
- ]
- }
- ],
+ "outputs": [],
 "source": [
 "# set llm to be Llama2-13b model; if you use local Llama, just set llm accordingly - see the HelloLlamaLocal notebook\n",
 "from langchain.llms import Replicate\n",
@@ -113,7 +105,7 @@
 "llama2_13b = \"meta/llama-2-13b-chat:f4e2de70d66816a838a89eeeb621910adffb0dd0baba3976c96980970978018d\"\n",
 "llm = Replicate(\n",
 " model=llama2_13b,\n",
- " input={\"temperature\": 0.01, \"max_length\": 500, \"top_p\": 1},\n",
+ " model_kwargs={\"temperature\": 0.01, \"top_p\": 1, \"max_new_tokens\":500}\n",
 ")"
 ]
 },
diff --git a/llama-demo-apps/streamlit_llama2.py b/llama-demo-apps/streamlit_llama2.py
index 1f88a555..1d8404fd 100644
--- a/llama-demo-apps/streamlit_llama2.py
+++ b/llama-demo-apps/streamlit_llama2.py
@@ -12,11 +12,11 @@
 def generate_response(input_text):
     llm = Replicate(
         model=llama2_13b_chat,
-        input={"temperature": 0.01, "max_length": 2000, "top_p": 1},
+        model_kwargs={"temperature": 0.01, "top_p": 1, "max_new_tokens":500}
     )
     st.info(llm(input_text))
 
 with st.form("my_form"):
     text = st.text_area("Enter text:", "What is Generative AI?")
     submitted = st.form_submit_button("Submit")
-    generate_response(text)
\ No newline at end of file
+    generate_response(text)
-- 
GitLab
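
The recurring change in this patch is moving the Replicate generation parameters from the deprecated `input` argument to `model_kwargs` in LangChain's `Replicate` wrapper. A minimal sketch of the updated setup, assuming `langchain` and `replicate` are installed and `REPLICATE_API_TOKEN` is set in the environment; the model string and parameter values are the ones used in the patch, while the final call is illustrative only:

```
from langchain.llms import Replicate

# model version string used throughout the patched notebooks
llama2_13b_chat = "meta/llama-2-13b-chat:f4e2de70d66816a838a89eeeb621910adffb0dd0baba3976c96980970978018d"

llm = Replicate(
    model=llama2_13b_chat,
    # generation parameters now go through model_kwargs instead of the deprecated `input`
    model_kwargs={"temperature": 0.01, "top_p": 1, "max_new_tokens": 500},
)

# illustrative prompt; raise max_new_tokens (e.g. to 1000) if answers get cut off
print(llm("What is Generative AI?"))
```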
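The chunk-size guidance added to HelloLlamaCloud.ipynb can be tried with the same splitter and vector store that notebook uses. A rough sketch, assuming `docs` holds the documents loaded earlier in that notebook and that a sentence-transformers embedding (`HuggingFaceEmbeddings`) is used — the embedding choice here is an assumption, not something the patch specifies:

```
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

# docs is assumed to be the list of documents loaded earlier in the notebook.
# The patch uses chunk_size=1000 and chunk_overlap=20; larger chunks tend to suit
# highly structured text such as code, smaller chunks suit less structured text.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
all_splits = text_splitter.split_documents(docs)

# Chroma is lightweight and in-memory, which makes it easy to get started with
vectordb = Chroma.from_documents(documents=all_splits, embedding=HuggingFaceEmbeddings())
```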