diff --git a/llama-demo-apps/Llama2_Gradio.ipynb b/llama-demo-apps/Llama2_Gradio.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..1a1ecc6f2e41a2264b7519f1bed28003b21a2700
--- /dev/null
+++ b/llama-demo-apps/Llama2_Gradio.ipynb
@@ -0,0 +1,104 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "928041cc",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Init param `input` is deprecated, please use `model_kwargs` instead.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Running on local URL: http://127.0.0.1:7860\n",
+      "\n",
+      "To create a public link, set `share=True` in `launch()`.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div><iframe src=\"http://127.0.0.1:7860/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": []
+     },
+     "execution_count": 1,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from langchain.schema import AIMessage, HumanMessage\n",
+    "import gradio as gr\n",
+    "from langchain.llms import Replicate\n",
+    "import os\n",
+    "\n",
+    "os.environ[\"REPLICATE_API_TOKEN\"] = \"<your replicate api token>\"\n",
+    "\n",
+    "llama2_13b_chat = \"meta/llama-2-13b-chat:f4e2de70d66816a838a89eeeb621910adffb0dd0baba3976c96980970978018d\"\n",
+    "\n",
+    "llm = Replicate(\n",
+    "    model=llama2_13b_chat,\n",
+    "    input={\"temperature\": 0.01, \"max_length\": 2000, \"top_p\": 1},\n",
+    ")\n",
+    "\n",
+    "\n",
+    "def predict(message, history):\n",
+    "    history_langchain_format = []  # chat history as LangChain messages\n",
+    "    for human, ai in history:\n",
+    "        history_langchain_format.append(HumanMessage(content=human))\n",
+    "        history_langchain_format.append(AIMessage(content=ai))\n",
+    "    history_langchain_format.append(HumanMessage(content=message))  # built for chat models; unused by the plain llm call below\n",
+    "    response = llm(message)  # the Replicate llm wrapper takes a single prompt string\n",
+    "    return response\n",
+    "\n",
+    "gr.ChatInterface(predict).launch()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5f21f5a4",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.18"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/llama-demo-apps/README.md b/llama-demo-apps/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1fd67dca8a02c2f1868375ad1ce05b7c25a6e12e
--- /dev/null
+++ b/llama-demo-apps/README.md
@@ -0,0 +1,141 @@
+# Llama2 Demo Apps
+
+This folder showcases Llama2-powered demo apps.
+
+## HelloLlama - Quickstart in Running Llama2
+
+This demo app shows how to use [LangChain](https://github.com/langchain-ai/langchain), an open-source framework for building LLM apps, to quickly build Llama2-powered apps: ask Llama2 general or custom-data-specific natural language questions and get answers back, in both single-turn QA mode and multi-turn chat mode.
+It has three versions:
+
+### Running Llama2 locally on Mac
+To run Llama2 locally on a Mac using [llama-cpp-python](https://github.com/abetlen/llama-cpp-python), open a Terminal and execute the commands below to install the required packages and launch the notebook, then run each cell - notice that the cells starting with `from langchain.chains import ConversationalRetrievalChain` show how to have a multi-turn dialog with the chat history passed to the next question.
+
+```
+conda create -n llama_demo_apps python=3.8
+conda activate llama_demo_apps
+CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install llama-cpp-python
+pip install langchain
+pip install sentence-transformers
+pip install docarray
+pip install jupyter
+cd <your_work_folder>
+git clone https://github.com/facebookresearch/llama-recipes
+cd llama-recipes/llama-demo-apps
+jupyter notebook
+```
+
+Then, in the launched browser, select the notebook `HelloLlamaLocal.ipynb` and run each cell - before running cell #3, you need to first download the 6GB quantized Llama2-13b-chat model file [here](https://drive.google.com/file/d/1afPv3HOy73BE2MoYCgYJvBDeQNa9rZbj/view?usp=sharing), then replace `<path-to-llama-2-13b-chat-ggml-model-q4_0.gguf>` with the path to your downloaded `ggml-model-q4_0.gguf` file.
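+
+For reference, the core of that cell looks roughly like the sketch below - a minimal sketch, not the exact notebook code; the model path and question are placeholders:
+
+```
+# Minimal sketch: load the quantized Llama2 model with LangChain's llama-cpp-python wrapper.
+from langchain.llms import LlamaCpp
+
+llm = LlamaCpp(
+    model_path="<path-to-llama-2-13b-chat-ggml-model-q4_0.gguf>",
+    n_ctx=2048,        # context window size in tokens
+    n_gpu_layers=1,    # offload computation to Metal on Apple Silicon
+    temperature=0.0,   # deterministic answers for QA
+)
+print(llm("Who wrote the book Innovator's Dilemma?"))
+```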
+
+### Running Llama2 in Google Colab
+To run Llama2 in Google Colab using [llama-cpp-python](https://github.com/abetlen/llama-cpp-python), open the Colab notebook [here](https://colab.research.google.com/drive/1-uBXt4L-6HNS2D8Iny2DwUpVS4Ub7jnk?usp=sharing), then download the quantized Llama2-13b-chat model [here](https://drive.google.com/file/d/1afPv3HOy73BE2MoYCgYJvBDeQNa9rZbj/view?usp=sharing) and upload it, along with the `nba.csv` file in this repo, to your Google Drive so you can access those files in cells #6 and #14. Then run each cell. Note that on the Colab T4 GPU, the inference in cell #18 took more than 20 minutes to return; running the notebook locally on an M1 MBP took about 20 seconds.
+
+### Running Llama2 Hosted in the Cloud
+[The cloud version](HelloLlamaCloud.ipynb) uses LangChain with Llama2 hosted in the cloud on [Replicate](https://replicate.com). The demo shows how to use LangChain to ask Llama2 questions about **unstructured** data stored in a PDF.
+
+<a id="replicate_note"></a>**Note on using Replicate:** To run the demo apps, you'll need to first sign in to Replicate with your GitHub account, then create a free API token [here](https://replicate.com/account/api-tokens) that you can use for a while. After the free trial ends, you'll need to enter billing info to continue using Llama2 hosted on Replicate - according to Replicate's [Run time and cost](https://replicate.com/meta/llama-2-13b-chat) page for the Llama2-13b-chat model used in our demo apps, the model "costs $0.000725 per second. Predictions typically complete within 10 seconds." This means each call to the Llama2-13b-chat model costs less than $0.01 if it completes within 10 seconds. If you want to incur no costs at all, see the section "Running Llama2 locally on Mac" above.
+
+## [NBA2023-24](StructuredLlama.ipynb): Ask Llama2 about Structured Data
+This demo app shows how to use LangChain and Llama2 to let users ask questions about **structured** data stored in a SQL DB. As the 2023-24 NBA season is around the corner, we use the NBA roster info saved in a SQLite DB to show you how to ask Llama2 questions about your favorite teams or players - the sketch below illustrates the core pattern.
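+
+This is a minimal sketch of the LangChain SQL-chain pattern, not the exact notebook code - the `nba_roster.db` file name and the question are placeholders, and it assumes `pip install langchain langchain-experimental replicate`:
+
+```
+# Minimal sketch: let Llama2 answer questions about a SQLite DB via LangChain.
+from langchain.llms import Replicate
+from langchain.utilities import SQLDatabase
+from langchain_experimental.sql import SQLDatabaseChain
+
+llm = Replicate(
+    model="meta/llama-2-13b-chat:f4e2de70d66816a838a89eeeb621910adffb0dd0baba3976c96980970978018d",
+    input={"temperature": 0.01, "max_length": 500, "top_p": 1},
+)
+
+# LangChain reads the DB schema, prompts Llama2 to turn the question into
+# SQL, runs the query, then has Llama2 phrase the final answer.
+db = SQLDatabase.from_uri("sqlite:///nba_roster.db")  # placeholder DB file name
+db_chain = SQLDatabaseChain.from_llm(llm, db, verbose=True)
+db_chain.run("How many players are on the Golden State Warriors roster?")
+```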
+
+## [VideoSummary](VideoSummary.ipynb): Summarize a YouTube Video
+This demo app uses Llama2 to return a text summary of a YouTube video.
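+
+The approach is roughly the sketch below - a minimal sketch, not the exact notebook code; it assumes `pip install langchain replicate youtube-transcript-api`, and the video URL is a placeholder:
+
+```
+# Minimal sketch: fetch a YouTube transcript with LangChain, then summarize it with Llama2.
+from langchain.llms import Replicate
+from langchain.document_loaders import YoutubeLoader
+from langchain.chains.summarize import load_summarize_chain
+
+llm = Replicate(
+    model="meta/llama-2-13b-chat:f4e2de70d66816a838a89eeeb621910adffb0dd0baba3976c96980970978018d",
+    input={"temperature": 0.01, "max_length": 2000, "top_p": 1},
+)
+
+# load the video's transcript as LangChain documents
+loader = YoutubeLoader.from_youtube_url("https://www.youtube.com/watch?v=<video-id>")
+docs = loader.load()
+
+# "stuff" the whole transcript into a single summarization prompt (fine for short videos)
+chain = load_summarize_chain(llm, chain_type="stuff")
+print(chain.run(docs))
+```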
+
+## [BreakingNews](LiveSearch.ipynb): Ask Llama2 about Live Data
+This demo app shows how to perform live data augmented generation tasks with Llama2 and [LlamaIndex](https://github.com/run-llama/llama_index), another leading open-source framework for building LLM apps: it uses the [You.com search API](https://documentation.you.com/quickstart) to get breaking news and ask Llama2 about it.
+
+## Quick Web UI for Llama2 Chat
+If you prefer to see Llama2 in action in a web UI instead of the notebooks above, you can try one of the following two methods:
+
+### Running [Streamlit](https://streamlit.io/) with Llama2
+Open a Terminal and run the following commands:
+```
+pip install streamlit langchain replicate
+git clone https://github.com/facebookresearch/llama-recipes
+cd llama-recipes/llama-demo-apps
+```
+
+Replace the `<your replicate api token>` in `streamlit_llama2.py` with your API token created [here](https://replicate.com/account/api-tokens) - for more info, see the note [above](#replicate_note).
+
+Then run the command `streamlit run streamlit_llama2.py`. You'll see the following UI in your browser, where you can enter a question, click Submit, and see Llama2's answer:
+
+![](llama2-streamlit.png)
+![](llama2-streamlit2.png)
+
+### Running [Gradio](https://www.gradio.app/) with Llama2
+
+To query Llama2 and see its answers in the Gradio UI, both from the notebook and from the web, launch the notebook `Llama2_Gradio.ipynb` and replace the `<your replicate api token>` with your API token created [here](https://replicate.com/account/api-tokens) - for more info, see the note [above](#replicate_note).
+
+Then enter your question and click Submit. You'll see the following UI in the notebook, or in a browser at http://127.0.0.1:7860:
+
+![](llm_log1.png)
+
+## LICENSE
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
diff --git a/llama-demo-apps/llama2-streamlit.png b/llama-demo-apps/llama2-streamlit.png
new file mode 100644
index 0000000000000000000000000000000000000000..74f886295df9310b9ed981a6032275cfbf1d3482
Binary files /dev/null and b/llama-demo-apps/llama2-streamlit.png differ
diff --git a/llama-demo-apps/llama2-streamlit2.png b/llama-demo-apps/llama2-streamlit2.png
new file mode 100644
index 0000000000000000000000000000000000000000..b0c464cb7b1ac09cde854af16158c5462dab7038
Binary files /dev/null and b/llama-demo-apps/llama2-streamlit2.png differ
diff --git a/llama-demo-apps/llm_log1.png b/llama-demo-apps/llm_log1.png
new file mode 100644
index 0000000000000000000000000000000000000000..118d0ca2d644fb985d25f583d1f8d29f249c9133
Binary files /dev/null and b/llama-demo-apps/llm_log1.png differ
diff --git a/llama-demo-apps/streamlit_llama2.py b/llama-demo-apps/streamlit_llama2.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f88a555fd467304bdc9873d1def947c04a4c2ad
--- /dev/null
+++ b/llama-demo-apps/streamlit_llama2.py
@@ -0,0 +1,23 @@
+import streamlit as st
+from langchain.llms import Replicate
+import os
+
+st.title("Llama2-powered Streamlit App")
+
+with st.sidebar:
+    os.environ["REPLICATE_API_TOKEN"] = "<your replicate api token>"
+
+def generate_response(input_text):
+    llama2_13b_chat = "meta/llama-2-13b-chat:f4e2de70d66816a838a89eeeb621910adffb0dd0baba3976c96980970978018d"
+
+    llm = Replicate(
+        model=llama2_13b_chat,
+        input={"temperature": 0.01, "max_length": 2000, "top_p": 1},
+    )
+    st.info(llm(input_text))  # display Llama2's answer in an info box
+
+with st.form("my_form"):
+    text = st.text_area("Enter text:", "What is Generative AI?")
+    submitted = st.form_submit_button("Submit")
+    if submitted:  # only query the model after the user clicks Submit
+        generate_response(text)
\ No newline at end of file
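
Note: the stderr output captured in `Llama2_Gradio.ipynb` above shows LangChain warning that the `input` init param of the `Replicate` wrapper is deprecated. The demo code passes the generation parameters via `input=`; to silence the warning, an equivalent setup - assuming a LangChain version that accepts `model_kwargs`, as the warning itself suggests - is:

```
# Same Replicate setup as in the demo code, but passing the generation
# parameters via model_kwargs, per the deprecation warning.
from langchain.llms import Replicate

llama2_13b_chat = "meta/llama-2-13b-chat:f4e2de70d66816a838a89eeeb621910adffb0dd0baba3976c96980970978018d"
llm = Replicate(
    model=llama2_13b_chat,
    model_kwargs={"temperature": 0.01, "max_length": 2000, "top_p": 1},
)
```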