From 410d8cf486b0e113660ed8bc1fe3087ef601090a Mon Sep 17 00:00:00 2001
From: Jeff Tang <jeff.x.tang@gmail.com>
Date: Tue, 10 Oct 2023 17:16:26 -0700
Subject: [PATCH] update based on PR feedback

---
 llama-demo-apps/BreakingNews.ipynb    | 14 +++-----------
 llama-demo-apps/HelloLlamaCloud.ipynb | 27 ++++++++++++---------------
 llama-demo-apps/Llama2_Gradio.ipynb   |  2 +-
 llama-demo-apps/README.md             | 21 ++++++++++++++++++---
 llama-demo-apps/StructuredLlama.ipynb | 18 +++---------------
 llama-demo-apps/VideoSummary.ipynb    | 16 ++++------------
 llama-demo-apps/streamlit_llama2.py   |  4 ++--
 7 files changed, 43 insertions(+), 59 deletions(-)

diff --git a/llama-demo-apps/BreakingNews.ipynb b/llama-demo-apps/BreakingNews.ipynb
index 8d8fd450..fea9ef4b 100644
--- a/llama-demo-apps/BreakingNews.ipynb
+++ b/llama-demo-apps/BreakingNews.ipynb
@@ -82,25 +82,17 @@
 },
 {
 "cell_type": "code",
- "execution_count": 4,
+ "execution_count": null,
 "id": "c12fc2cb",
 "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Init param `input` is deprecated, please use `model_kwargs` instead.\n"
- ]
- }
- ],
+ "outputs": [],
 "source": [
 "# set llm to be using Llama2 hosted on Replicate\n",
 "llama2_13b_chat = \"meta/llama-2-13b-chat:f4e2de70d66816a838a89eeeb621910adffb0dd0baba3976c96980970978018d\"\n",
 "\n",
 "llm = Replicate(\n",
 " model=llama2_13b_chat,\n",
- " input={\"temperature\": 0.01, \"max_length\": 2000, \"top_p\": 1},\n",
+ " model_kwargs={\"temperature\": 0.01, \"top_p\": 1, \"max_new_tokens\":500}\n",
 ")"
 ]
 },
diff --git a/llama-demo-apps/HelloLlamaCloud.ipynb b/llama-demo-apps/HelloLlamaCloud.ipynb
index 8d914cf6..ad564aae 100644
--- a/llama-demo-apps/HelloLlamaCloud.ipynb
+++ b/llama-demo-apps/HelloLlamaCloud.ipynb
@@ -20,7 +20,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
- "!pip install langchain replicate sentence-transformers"
+ "!pip install langchain replicate sentence-transformers chromadb"
 ]
 },
 {
@@ -47,25 +47,17 @@
 },
 {
 "cell_type": "code",
- "execution_count": 3,
+ "execution_count": null,
 "id": "ad536adb",
 "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Init param `input` is deprecated, please use `model_kwargs` instead.\n"
- ]
- }
- ],
+ "outputs": [],
 "source": [
 "from langchain.llms import Replicate\n",
 "\n",
 "llama2_13b = \"meta/llama-2-13b-chat:f4e2de70d66816a838a89eeeb621910adffb0dd0baba3976c96980970978018d\"\n",
 "llm = Replicate(\n",
 " model=llama2_13b,\n",
- " input={\"temperature\": 0.01, \"max_length\": 500, \"top_p\": 1},\n",
+ " model_kwargs={\"temperature\": 0.01, \"top_p\": 1, \"max_new_tokens\":500}\n",
 ")"
 ]
 },
@@ -220,7 +212,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
- "# there're more 30 vector stores (DBs) supported by LangChain. Chroma is light-weight and in memory so it's easy to get started with\n",
+ "# there're more than 30 vector stores (DBs) supported by LangChain. Chroma is light-weight and in memory so it's easy to get started with\n",
 "# other vector stores can be used to store large amount of data - see https://python.langchain.com/docs/integrations/vectorstores\n",
 "from langchain.vectorstores import Chroma\n",
 "\n",
@@ -238,7 +230,9 @@
 "metadata": {},
 "outputs": [],
 "source": [
- "# split the loaded documents into chunks \n",
+ "# split the loaded documents into chunks. \n",
+ "# in general, use larger chunk sizes for highly structured text such as code and smaller size for \n",
+ "# less structured text. You may need to experiment with different chunk sizes and overlap values to find out the best numbers.\n",
 "text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)\n",
 "all_splits = text_splitter.split_documents(docs)\n",
 "\n",
@@ -387,7 +381,10 @@
 "chat_history.append((followup, followup_answer[\"answer\"]))\n",
 "more_followup = \"what tasks can it assist with?\"\n",
 "more_followup_answer = chat_chain({\"question\": more_followup, \"chat_history\": chat_history})\n",
- "print(more_followup_answer['answer'])"
+ "print(more_followup_answer['answer'])\n",
+ "\n",
+ "# results get cut off - you may set \"max_new_tokens\" in the Replicate call above to a larger number (like 1000 below) to avoid the cut off\n",
+ "# model_kwargs={\"temperature\": 0.01, \"top_p\": 1, \"max_new_tokens\": 1000}"
 ]
 }
 ],
diff --git a/llama-demo-apps/Llama2_Gradio.ipynb b/llama-demo-apps/Llama2_Gradio.ipynb
index 9b8e1856..d23d95f7 100644
--- a/llama-demo-apps/Llama2_Gradio.ipynb
+++ b/llama-demo-apps/Llama2_Gradio.ipynb
@@ -55,7 +55,7 @@
 "\n",
 "llm = Replicate(\n",
 " model=llama2_13b_chat,\n",
- " input={\"temperature\": 0.01, \"max_length\": 2000, \"top_p\": 1},\n",
+ " model_kwargs={\"temperature\": 0.01, \"top_p\": 1, \"max_new_tokens\":500}\n",
 ")\n",
 "\n",
 "\n",
diff --git a/llama-demo-apps/README.md b/llama-demo-apps/README.md
index c240c65a..ef86b035 100644
--- a/llama-demo-apps/README.md
+++ b/llama-demo-apps/README.md
@@ -1,8 +1,8 @@
 # Llama2 Demo Apps

-This folder showcases the Llama2-powered apps. If you need a general understanding of GenAI, Llama2, prompt engineering and RAG, be sure to first check the [Getting to know Llama 2 notebook](https://github.com/facebookresearch/llama-recipes/blob/main/examples/Getting_to_know_Llama.ipynb) and its Meta Connect video [here](https://www.facebook.com/watch/?v=662153709222699).
+This folder showcases Llama2-powered demo apps. If you need a general understanding of GenAI, Llama2, prompt engineering and RAG, be sure to first check the [Getting to know Llama 2 notebook](https://github.com/facebookresearch/llama-recipes/blob/main/examples/Getting_to_know_Llama.ipynb) and its Meta Connect video [here](https://www.facebook.com/watch/?v=662153709222699).

-Here we start with three quickstart demos showing how to run Llama2 locally on a Mac, remotely in the cloud, and on a Google Colab to ask Llama2 general questions or questions about unstructured data not trained for the model.
+We start with three quickstart demos showing how to run Llama2 locally on a Mac, remotely in the cloud, and on a Google Colab to ask Llama2 general questions or questions about unstructured data not trained for the model.

 We then show three demos that ask Llama2 to summarize a YouTube video, to answer questions about structured data stored in a database, and to answer questions about live search results.

@@ -10,6 +10,21 @@ We also show how to build quick web UI for Llama2 demo apps using Streamlit and

 More advanced Llama2 demo apps will be coming soon.

+## Setting Up Environment
+
+The quickest way to test run the notebook demo apps on your local machine is to create a Conda environment and start running the Jupyter notebook as follows:
+```
+conda create -n llama-demo-apps python=3.8
+conda activate llama-demo-apps
+pip install jupyter
+cd <your_work_folder>
+git clone https://github.com/facebookresearch/llama-recipes
+cd llama-recipes/llama-demo-apps
+jupyter notebook
+```
+
+You can also upload the notebooks to Google Colab.
+
 ## HelloLlama - Quickstart in Running Llama2 (Almost) Everywhere*

 The first three demo apps show:
@@ -19,7 +34,7 @@ The first three demo apps show:
 * how to ask follow up questions to Llama by sending previous questions and answers as the context along with the new question, hence performing multi-turn chat or conversation with Llama.

 ### [Running Llama2 Locally on Mac](HelloLlamaLocal.ipynb)
-To run Llama2 locally on Mac using [llama-cpp-python](https://github.com/abetlen/llama-cpp-python), first open the notebook `HelloLlamaLocal`. Then replace `<path-to-ggml-model-q4_0.gguf>` in the notebook `HelloLlamaLocal` with the path either to your downloaded quantized model file [here](https://drive.google.com/file/d/1afPv3HOy73BE2MoYCgYJvBDeQNa9rZbj/view?usp=sharing), or to the `ggml-model-q4_0.gguf` file built with the following commands:
+To run Llama2 locally on Mac using [llama-cpp-python](https://github.com/abetlen/llama-cpp-python), first open the notebook `HelloLlamaLocal`. Then replace `<path-to-ggml-model-q4_0.gguf>` in the notebook `HelloLlamaLocal` with the path either to your downloaded quantized model file [here](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GGUF/resolve/main/llama-2-7b-chat.Q4_0.gguf), or to the `ggml-model-q4_0.gguf` file built with the following commands:
 ```
 git clone https://github.com/ggerganov/llama.cpp
 cd llama.cpp
diff --git a/llama-demo-apps/StructuredLlama.ipynb b/llama-demo-apps/StructuredLlama.ipynb
index ec3dbd1f..ddc3aa29 100644
--- a/llama-demo-apps/StructuredLlama.ipynb
+++ b/llama-demo-apps/StructuredLlama.ipynb
@@ -57,18 +57,10 @@
 },
 {
 "cell_type": "code",
- "execution_count": 3,
+ "execution_count": null,
 "id": "9dcd744c",
 "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Init param `input` is deprecated, please use `model_kwargs` instead.\n"
- ]
- }
- ],
+ "outputs": [],
 "source": [
 "llama2_13b_chat = \"meta/llama-2-13b-chat:f4e2de70d66816a838a89eeeb621910adffb0dd0baba3976c96980970978018d\"\n",
 "\n",
@@ -76,7 +68,7 @@
 "# \"Sure! Here's the SQL query for the given input question: \" before the SQL query; otherwise custom parsing will be needed.\n",
 "llm = Replicate(\n",
 " model=llama2_13b_chat,\n",
- " input={\"temperature\": 0.01, \"max_length\": 500, \"top_p\": 1, \"system_prompt\": \"Given an input question, convert it to a SQL query. No pre-amble.\"},\n",
+ " model_kwargs={\"temperature\": 0.01, \"top_p\": 1, \"max_new_tokens\":500, \"system_prompt\": \"Given an input question, convert it to a SQL query. No pre-amble.\"},\n",
 ")"
 ]
 },
@@ -89,10 +81,6 @@
 "source": [
 "db = SQLDatabase.from_uri(\"sqlite:///nba_roster.db\", sample_rows_in_table_info= 0)\n",
 "\n",
- "# use the default sqlite prompt defined in \n",
- "# https://github.com/langchain-ai/langchain/blob/33eb5f8300cd21c91a2f8d10c62197637931fa0a/libs/langchain/langchain/chains/sql_database/prompt.py#L211\n",
- "# db_chain = SQLDatabaseChain.from_llm(llm, db, verbose=True)\n",
- "\n",
 "# customize the default sqlite prompt defined in the link above\n",
 "PROMPT_SUFFIX = \"\"\"\n",
 "Only use the following tables:\n",
diff --git a/llama-demo-apps/VideoSummary.ipynb b/llama-demo-apps/VideoSummary.ipynb
index 7d6fab95..edcab0b3 100644
--- a/llama-demo-apps/VideoSummary.ipynb
+++ b/llama-demo-apps/VideoSummary.ipynb
@@ -8,7 +8,7 @@
 "## This demo app shows:\n",
 "* how to use LangChain's YoutubeLoader to retrieve the caption in a YouTube video;\n",
 "* how to ask Llama to summarize the content (per the Llama's input size limit) of the video in a naive way using LangChain's stuff method;\n",
- "* how to bypass the limit of Llama's max input token size by using more sophisticated way using LangChain's map_reduce and refine methods."
+ "* how to bypass the limit of Llama's max input token size by using a more sophisticated way using LangChain's map_reduce and refine methods - see [here](https://python.langchain.com/docs/use_cases/summarization) for more info."
 ]
 },
 {
@@ -94,18 +94,10 @@
 },
 {
 "cell_type": "code",
- "execution_count": 5,
+ "execution_count": null,
 "id": "adf8cf3d",
 "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Init param `input` is deprecated, please use `model_kwargs` instead.\n"
- ]
- }
- ],
+ "outputs": [],
 "source": [
 "# set llm to be Llama2-13b model; if you use local Llama, just set llm accordingly - see the HelloLlamaLocal notebook\n",
 "from langchain.llms import Replicate\n",
@@ -113,7 +105,7 @@
 "llama2_13b = \"meta/llama-2-13b-chat:f4e2de70d66816a838a89eeeb621910adffb0dd0baba3976c96980970978018d\"\n",
 "llm = Replicate(\n",
 " model=llama2_13b,\n",
- " input={\"temperature\": 0.01, \"max_length\": 500, \"top_p\": 1},\n",
+ " model_kwargs={\"temperature\": 0.01, \"top_p\": 1, \"max_new_tokens\":500}\n",
 ")"
 ]
 },
diff --git a/llama-demo-apps/streamlit_llama2.py b/llama-demo-apps/streamlit_llama2.py
index 1f88a555..1d8404fd 100644
--- a/llama-demo-apps/streamlit_llama2.py
+++ b/llama-demo-apps/streamlit_llama2.py
@@ -12,11 +12,11 @@
 def generate_response(input_text):
     llm = Replicate(
         model=llama2_13b_chat,
-        input={"temperature": 0.01, "max_length": 2000, "top_p": 1},
+        model_kwargs={"temperature": 0.01, "top_p": 1, "max_new_tokens":500}
     )
     st.info(llm(input_text))
 
 with st.form("my_form"):
     text = st.text_area("Enter text:", "What is Generative AI?")
     submitted = st.form_submit_button("Submit")
-    generate_response(text)
\ No newline at end of file
+    generate_response(text)
-- 
GitLab
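
The recurring change in this patch is moving the Replicate generation parameters from the deprecated `input` argument to `model_kwargs` in LangChain's `Replicate` wrapper. A minimal sketch of the updated setup, assuming `langchain` and `replicate` are installed and `REPLICATE_API_TOKEN` is set in the environment; the model string and parameter values are the ones used in the patch, while the final call is illustrative only:

```
from langchain.llms import Replicate

# model version string used throughout the patched notebooks
llama2_13b_chat = "meta/llama-2-13b-chat:f4e2de70d66816a838a89eeeb621910adffb0dd0baba3976c96980970978018d"

llm = Replicate(
    model=llama2_13b_chat,
    # generation parameters now go through model_kwargs instead of the deprecated `input`
    model_kwargs={"temperature": 0.01, "top_p": 1, "max_new_tokens": 500},
)

# illustrative prompt; raise max_new_tokens (e.g. to 1000) if answers get cut off
print(llm("What is Generative AI?"))
```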
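The chunk-size guidance added to HelloLlamaCloud.ipynb can be tried with the same splitter and vector store that notebook uses. A rough sketch, assuming `docs` holds the documents loaded earlier in that notebook and that a sentence-transformers embedding (`HuggingFaceEmbeddings`) is used — the embedding choice here is an assumption, not something the patch specifies:

```
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

# docs is assumed to be the list of documents loaded earlier in the notebook.
# The patch uses chunk_size=1000 and chunk_overlap=20; larger chunks tend to suit
# highly structured text such as code, smaller chunks suit less structured text.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
all_splits = text_splitter.split_documents(docs)

# Chroma is lightweight and in-memory, which makes it easy to get started with
vectordb = Chroma.from_documents(documents=all_splits, embedding=HuggingFaceEmbeddings())
```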