From e1dd1a7b382fc40013731580c2a8fc10cee3f63a Mon Sep 17 00:00:00 2001 From: Pia Papanna <ppapanna@meta.com> Date: Thu, 27 Jun 2024 15:55:31 -0700 Subject: [PATCH] use_cases folder update --- recipes/use_cases/README.md | 12 ++++++------ ...ag-agent.ipynb => langgraph_rag_agent.ipynb} | 0 ...al.ipynb => langgraph_rag_agent_local.ipynb} | 0 ...ipynb => langgraph_tool_calling_agent.ipynb} | 0 .../use_cases/{ => coding}/text2sql/csv2db.py | 0 recipes/use_cases/{ => coding}/text2sql/nba.txt | 0 .../{ => coding}/text2sql/nba_roster.db | Bin .../text2sql/structured_llama.ipynb} | 0 .../use_cases/{ => coding}/text2sql/txt2csv.py | 0 .../RAG_chatbot/RAG_Chatbot_Example.ipynb | 0 .../data/Llama Getting Started Guide.pdf | Bin .../RAG_chatbot/requirements.txt | 0 .../vectorstore/db_faiss/index.faiss | Bin .../RAG_chatbot/vectorstore/db_faiss/index.pkl | Bin ...mongodb_llama3_huggingface_open_source.ipynb | 0 .../messenger_llama/llama_messenger.py | 0 .../messenger_llama/messenger_llama3.md | 0 .../sales_bot/Musical_instruments_reviews.csv | 0 .../sales_bot/SalesBot.ipynb | 0 .../whatsapp_llama/llama_chatbot.py | 0 .../whatsapp_llama/whatsapp_llama3.md | 0 .../{LiveData.ipynb => live_data.ipynb} | 0 recipes/{ => use_cases}/multilingual/README.md | 16 ++++++++-------- .../multilingual/extend_tokenizer.py | 0 .../multilingual/img/phase1_eval_loss.png} | Bin .../multilingual/img/phase1_train_loss.png} | Bin .../multilingual/img/phase2_eval_loss.png} | Bin .../multilingual/img/phase2_train_loss.png} | Bin .../multilingual/prepare_data.py | 0 .../multilingual/train_tokenizer.py | 0 .../{VideoSummary.ipynb => video_summary.ipynb} | 0 31 files changed, 14 insertions(+), 14 deletions(-) rename recipes/use_cases/agents/langchain/{langgraph-rag-agent.ipynb => langgraph_rag_agent.ipynb} (100%) rename recipes/use_cases/agents/langchain/{langgraph-rag-agent-local.ipynb => langgraph_rag_agent_local.ipynb} (100%) rename recipes/use_cases/agents/langchain/{langgraph-tool-calling-agent.ipynb => langgraph_tool_calling_agent.ipynb} (100%) rename recipes/use_cases/{ => coding}/text2sql/csv2db.py (100%) rename recipes/use_cases/{ => coding}/text2sql/nba.txt (100%) rename recipes/use_cases/{ => coding}/text2sql/nba_roster.db (100%) rename recipes/use_cases/{text2sql/StructuredLlama.ipynb => coding/text2sql/structured_llama.ipynb} (100%) rename recipes/use_cases/{ => coding}/text2sql/txt2csv.py (100%) rename recipes/use_cases/{chatbots => customerservice_chatbots}/RAG_chatbot/RAG_Chatbot_Example.ipynb (100%) rename recipes/use_cases/{chatbots => customerservice_chatbots}/RAG_chatbot/data/Llama Getting Started Guide.pdf (100%) rename recipes/use_cases/{chatbots => customerservice_chatbots}/RAG_chatbot/requirements.txt (100%) rename recipes/use_cases/{chatbots => customerservice_chatbots}/RAG_chatbot/vectorstore/db_faiss/index.faiss (100%) rename recipes/use_cases/{chatbots => customerservice_chatbots}/RAG_chatbot/vectorstore/db_faiss/index.pkl (100%) rename recipes/use_cases/{chatbots => customerservice_chatbots}/RAG_chatbot/vectorstore/mongodb/rag_mongodb_llama3_huggingface_open_source.ipynb (100%) rename recipes/use_cases/{chatbots => customerservice_chatbots}/messenger_llama/llama_messenger.py (100%) rename recipes/use_cases/{chatbots => customerservice_chatbots}/messenger_llama/messenger_llama3.md (100%) rename recipes/use_cases/{chatbots => customerservice_chatbots}/sales_bot/Musical_instruments_reviews.csv (100%) rename recipes/use_cases/{chatbots => customerservice_chatbots}/sales_bot/SalesBot.ipynb (100%) rename recipes/use_cases/{chatbots => customerservice_chatbots}/whatsapp_llama/llama_chatbot.py (100%) rename recipes/use_cases/{chatbots => customerservice_chatbots}/whatsapp_llama/whatsapp_llama3.md (100%) rename recipes/use_cases/{LiveData.ipynb => live_data.ipynb} (100%) rename recipes/{ => use_cases}/multilingual/README.md (94%) rename recipes/{ => use_cases}/multilingual/extend_tokenizer.py (100%) rename recipes/{multilingual/imgs/phase1-eval-loss.png => use_cases/multilingual/img/phase1_eval_loss.png} (100%) rename recipes/{multilingual/imgs/phase1-train-loss.png => use_cases/multilingual/img/phase1_train_loss.png} (100%) rename recipes/{multilingual/imgs/phase2-eval-loss.png => use_cases/multilingual/img/phase2_eval_loss.png} (100%) rename recipes/{multilingual/imgs/phase2-train-loss.png => use_cases/multilingual/img/phase2_train_loss.png} (100%) rename recipes/{ => use_cases}/multilingual/prepare_data.py (100%) rename recipes/{ => use_cases}/multilingual/train_tokenizer.py (100%) rename recipes/use_cases/{VideoSummary.ipynb => video_summary.ipynb} (100%) diff --git a/recipes/use_cases/README.md b/recipes/use_cases/README.md index 49b55d73..2ba91347 100644 --- a/recipes/use_cases/README.md +++ b/recipes/use_cases/README.md @@ -1,22 +1,22 @@ -## [VideoSummary](VideoSummary.ipynb): Ask Llama 3 to Summarize a Long YouTube Video (using Replicate or [OctoAI](../3p_integration/octoai/VideoSummary.ipynb)) +## [VideoSummary](video_summary.ipynb): Ask Llama 3 to Summarize a Long YouTube Video (using Replicate or [OctoAI](../3p_integration/octoai/video_summary.ipynb)) This demo app uses Llama 3 to return a text summary of a YouTube video. It shows how to retrieve the caption of a YouTube video and how to ask Llama to summarize the content in different ways, from the simplest naive way that works for short text to more advanced methods of using LangChain's map_reduce and refine to overcome the 8K context length limit of Llama 3. -## [NBA2023-24](./text2sql/StructuredLlama.ipynb): Ask Llama 3 about Structured Data +## [NBA2023-24](./coding/text2sql/structured_llama.ipynb): Ask Llama 3 about Structured Data This demo app shows how to use LangChain and Llama 3 to let users ask questions about **structured** data stored in a SQL DB. As the 2023-24 NBA season is entering the playoff, we use the NBA roster info saved in a SQLite DB to show you how to ask Llama 3 questions about your favorite teams or players. ## [live_data](live_data.ipynb): Ask Llama 3 about Live Data (using Replicate or [OctoAI](../3p_integration/octoai/live_data.ipynb)) This demo app shows how to perform live data augmented generation tasks with Llama 3, [LlamaIndex](https://github.com/run-llama/llama_index), another leading open-source framework for building LLM apps, and the [Tavily](https://tavily.com) live search API. -## [WhatsApp Chatbot](./chatbots/whatsapp_llama/whatsapp_llama3.md): Building a Llama 3 Enabled WhatsApp Chatbot +## [WhatsApp Chatbot](./customerservice_chatbots/whatsapp_llama/whatsapp_llama3.md): Building a Llama 3 Enabled WhatsApp Chatbot This step-by-step tutorial shows how to use the [WhatsApp Business API](https://developers.facebook.com/docs/whatsapp/cloud-api/overview) to build a Llama 3 enabled WhatsApp chatbot. -## [Messenger Chatbot](./chatbots/messenger_llama/messenger_llama3.md): Building a Llama 3 Enabled Messenger Chatbot +## [Messenger Chatbot](./customerservice_chatbots/messenger_llama/messenger_llama3.md): Building a Llama 3 Enabled Messenger Chatbot This step-by-step tutorial shows how to use the [Messenger Platform](https://developers.facebook.com/docs/messenger-platform/overview) to build a Llama 3 enabled Messenger chatbot. -### RAG Chatbot Example (running [locally](./chatbots/RAG_chatbot/RAG_Chatbot_Example.ipynb) or on [OctoAI](../3p_integration/octoai/RAG_Chatbot_example/RAG_Chatbot_Example.ipynb)) +### RAG Chatbot Example (running [locally](./customerservice_chatbots/RAG_chatbot/RAG_Chatbot_Example.ipynb) or on [OctoAI](../3p_integration/octoai/RAG_Chatbot_example/RAG_Chatbot_Example.ipynb)) A complete example of how to build a Llama 3 chatbot hosted on your browser that can answer questions based on your own data using retrieval augmented generation (RAG). You can run Llama2 locally if you have a good enough GPU or on OctoAI if you follow the note [here](../README.md#octoai_note). -## [Sales Bot](./chatbots/sales_bot/SalesBot.ipynb): Sales Bot with Llama3 - A Summarization and RAG Use Case +## [Sales Bot](./customerservice_chatbots/sales_bot/SalesBot.ipynb): Sales Bot with Llama3 - A Summarization and RAG Use Case An summarization + RAG use case built around the Amazon product review Kaggle dataset to build a helpful Music Store Sales Bot. The summarization and RAG are built on top of Llama models hosted on OctoAI, and the vector database is hosted on Weaviate Cloud Services. ## [Media Generation](./MediaGen.ipynb): Building a Video Generation Pipeline with Llama3 diff --git a/recipes/use_cases/agents/langchain/langgraph-rag-agent.ipynb b/recipes/use_cases/agents/langchain/langgraph_rag_agent.ipynb similarity index 100% rename from recipes/use_cases/agents/langchain/langgraph-rag-agent.ipynb rename to recipes/use_cases/agents/langchain/langgraph_rag_agent.ipynb diff --git a/recipes/use_cases/agents/langchain/langgraph-rag-agent-local.ipynb b/recipes/use_cases/agents/langchain/langgraph_rag_agent_local.ipynb similarity index 100% rename from recipes/use_cases/agents/langchain/langgraph-rag-agent-local.ipynb rename to recipes/use_cases/agents/langchain/langgraph_rag_agent_local.ipynb diff --git a/recipes/use_cases/agents/langchain/langgraph-tool-calling-agent.ipynb b/recipes/use_cases/agents/langchain/langgraph_tool_calling_agent.ipynb similarity index 100% rename from recipes/use_cases/agents/langchain/langgraph-tool-calling-agent.ipynb rename to recipes/use_cases/agents/langchain/langgraph_tool_calling_agent.ipynb diff --git a/recipes/use_cases/text2sql/csv2db.py b/recipes/use_cases/coding/text2sql/csv2db.py similarity index 100% rename from recipes/use_cases/text2sql/csv2db.py rename to recipes/use_cases/coding/text2sql/csv2db.py diff --git a/recipes/use_cases/text2sql/nba.txt b/recipes/use_cases/coding/text2sql/nba.txt similarity index 100% rename from recipes/use_cases/text2sql/nba.txt rename to recipes/use_cases/coding/text2sql/nba.txt diff --git a/recipes/use_cases/text2sql/nba_roster.db b/recipes/use_cases/coding/text2sql/nba_roster.db similarity index 100% rename from recipes/use_cases/text2sql/nba_roster.db rename to recipes/use_cases/coding/text2sql/nba_roster.db diff --git a/recipes/use_cases/text2sql/StructuredLlama.ipynb b/recipes/use_cases/coding/text2sql/structured_llama.ipynb similarity index 100% rename from recipes/use_cases/text2sql/StructuredLlama.ipynb rename to recipes/use_cases/coding/text2sql/structured_llama.ipynb diff --git a/recipes/use_cases/text2sql/txt2csv.py b/recipes/use_cases/coding/text2sql/txt2csv.py similarity index 100% rename from recipes/use_cases/text2sql/txt2csv.py rename to recipes/use_cases/coding/text2sql/txt2csv.py diff --git a/recipes/use_cases/chatbots/RAG_chatbot/RAG_Chatbot_Example.ipynb b/recipes/use_cases/customerservice_chatbots/RAG_chatbot/RAG_Chatbot_Example.ipynb similarity index 100% rename from recipes/use_cases/chatbots/RAG_chatbot/RAG_Chatbot_Example.ipynb rename to recipes/use_cases/customerservice_chatbots/RAG_chatbot/RAG_Chatbot_Example.ipynb diff --git a/recipes/use_cases/chatbots/RAG_chatbot/data/Llama Getting Started Guide.pdf b/recipes/use_cases/customerservice_chatbots/RAG_chatbot/data/Llama Getting Started Guide.pdf similarity index 100% rename from recipes/use_cases/chatbots/RAG_chatbot/data/Llama Getting Started Guide.pdf rename to recipes/use_cases/customerservice_chatbots/RAG_chatbot/data/Llama Getting Started Guide.pdf diff --git a/recipes/use_cases/chatbots/RAG_chatbot/requirements.txt b/recipes/use_cases/customerservice_chatbots/RAG_chatbot/requirements.txt similarity index 100% rename from recipes/use_cases/chatbots/RAG_chatbot/requirements.txt rename to recipes/use_cases/customerservice_chatbots/RAG_chatbot/requirements.txt diff --git a/recipes/use_cases/chatbots/RAG_chatbot/vectorstore/db_faiss/index.faiss b/recipes/use_cases/customerservice_chatbots/RAG_chatbot/vectorstore/db_faiss/index.faiss similarity index 100% rename from recipes/use_cases/chatbots/RAG_chatbot/vectorstore/db_faiss/index.faiss rename to recipes/use_cases/customerservice_chatbots/RAG_chatbot/vectorstore/db_faiss/index.faiss diff --git a/recipes/use_cases/chatbots/RAG_chatbot/vectorstore/db_faiss/index.pkl b/recipes/use_cases/customerservice_chatbots/RAG_chatbot/vectorstore/db_faiss/index.pkl similarity index 100% rename from recipes/use_cases/chatbots/RAG_chatbot/vectorstore/db_faiss/index.pkl rename to recipes/use_cases/customerservice_chatbots/RAG_chatbot/vectorstore/db_faiss/index.pkl diff --git a/recipes/use_cases/chatbots/RAG_chatbot/vectorstore/mongodb/rag_mongodb_llama3_huggingface_open_source.ipynb b/recipes/use_cases/customerservice_chatbots/RAG_chatbot/vectorstore/mongodb/rag_mongodb_llama3_huggingface_open_source.ipynb similarity index 100% rename from recipes/use_cases/chatbots/RAG_chatbot/vectorstore/mongodb/rag_mongodb_llama3_huggingface_open_source.ipynb rename to recipes/use_cases/customerservice_chatbots/RAG_chatbot/vectorstore/mongodb/rag_mongodb_llama3_huggingface_open_source.ipynb diff --git a/recipes/use_cases/chatbots/messenger_llama/llama_messenger.py b/recipes/use_cases/customerservice_chatbots/messenger_llama/llama_messenger.py similarity index 100% rename from recipes/use_cases/chatbots/messenger_llama/llama_messenger.py rename to recipes/use_cases/customerservice_chatbots/messenger_llama/llama_messenger.py diff --git a/recipes/use_cases/chatbots/messenger_llama/messenger_llama3.md b/recipes/use_cases/customerservice_chatbots/messenger_llama/messenger_llama3.md similarity index 100% rename from recipes/use_cases/chatbots/messenger_llama/messenger_llama3.md rename to recipes/use_cases/customerservice_chatbots/messenger_llama/messenger_llama3.md diff --git a/recipes/use_cases/chatbots/sales_bot/Musical_instruments_reviews.csv b/recipes/use_cases/customerservice_chatbots/sales_bot/Musical_instruments_reviews.csv similarity index 100% rename from recipes/use_cases/chatbots/sales_bot/Musical_instruments_reviews.csv rename to recipes/use_cases/customerservice_chatbots/sales_bot/Musical_instruments_reviews.csv diff --git a/recipes/use_cases/chatbots/sales_bot/SalesBot.ipynb b/recipes/use_cases/customerservice_chatbots/sales_bot/SalesBot.ipynb similarity index 100% rename from recipes/use_cases/chatbots/sales_bot/SalesBot.ipynb rename to recipes/use_cases/customerservice_chatbots/sales_bot/SalesBot.ipynb diff --git a/recipes/use_cases/chatbots/whatsapp_llama/llama_chatbot.py b/recipes/use_cases/customerservice_chatbots/whatsapp_llama/llama_chatbot.py similarity index 100% rename from recipes/use_cases/chatbots/whatsapp_llama/llama_chatbot.py rename to recipes/use_cases/customerservice_chatbots/whatsapp_llama/llama_chatbot.py diff --git a/recipes/use_cases/chatbots/whatsapp_llama/whatsapp_llama3.md b/recipes/use_cases/customerservice_chatbots/whatsapp_llama/whatsapp_llama3.md similarity index 100% rename from recipes/use_cases/chatbots/whatsapp_llama/whatsapp_llama3.md rename to recipes/use_cases/customerservice_chatbots/whatsapp_llama/whatsapp_llama3.md diff --git a/recipes/use_cases/LiveData.ipynb b/recipes/use_cases/live_data.ipynb similarity index 100% rename from recipes/use_cases/LiveData.ipynb rename to recipes/use_cases/live_data.ipynb diff --git a/recipes/multilingual/README.md b/recipes/use_cases/multilingual/README.md similarity index 94% rename from recipes/multilingual/README.md rename to recipes/use_cases/multilingual/README.md index d4fb7c97..2f6341b5 100644 --- a/recipes/multilingual/README.md +++ b/recipes/use_cases/multilingual/README.md @@ -5,10 +5,10 @@ Please read more about OpenHathi [here](https://www.sarvam.ai/blog/announcing-op ## Data The original OpenHathi model uses a combination of [Sangraha](https://huggingface.co/datasets/ai4bharat/sangraha) and Wikipedia as its primary data sources. If the reader is interested in using these sources, they would also have to preprocess the data: clean, filter, and deduplicate. See [Setu](https://github.com/AI4Bharat/setu) for an easy way to do this at scale. -In this tutorial, we will use the [Varta](https://huggingface.co/datasets/rahular/varta) dataset which contains 40M+ news articles taken from [DailyHunt](https://m.dailyhunt.in/). Since this data is already high-quality, we can skip the pre-processing step mentioned above. We will use the Hindi subset here, but you can add any other language present in the dataset by only passing the right language code (advanced users can also tweak the code to add multiple languages at once). +In this tutorial, we will use the [Varta](https://huggingface.co/datasets/rahular/varta) dataset which contains 40M+ news articles taken from [DailyHunt](https://m.dailyhunt.in/). Since this data is already high-quality, we can skip the pre-processing step mentioned above. We will use the Hindi subset here, but you can add any other language present in the dataset by only passing the right language code (advanced users can also tweak the code to add multiple languages at once). ## Tokenizer -Our first step towards augmenting a new language to an LLM is creating a better tokenizer. We define 'better' in terms of fertility score or the number of in-language tokens present in the tokenizer. Note that we should add new tokens without disturbing the original vocabulary, and therefore creating a better tokenizer usually involves 2 steps: (i) building a new, in-language only tokenizer, and (ii) merging this new tokenizer with the original. +Our first step towards augmenting a new language to an LLM is creating a better tokenizer. We define 'better' in terms of fertility score or the number of in-language tokens present in the tokenizer. Note that we should add new tokens without disturbing the original vocabulary, and therefore creating a better tokenizer usually involves 2 steps: (i) building a new, in-language only tokenizer, and (ii) merging this new tokenizer with the original. ### Building the in-language tokenizer For this, we will first download and prepare the data for training the tokenizer: @@ -62,7 +62,7 @@ Note: OpenHathi's final data mixture also contains monolingual data and romanize We can easily create data for both phases using any translation model. OpenHathi uses [IndicTrans2](https://github.com/AI4Bharat/IndicTrans2). We provide sample code for both phases below. ### Phase 1 -With the assumption that we don't have source-native data, let us first get some English data to translate. +With the assumption that we don't have source-native data, let us first get some English data to translate. ``` from datasets import load_dataset @@ -118,7 +118,7 @@ phase2_ds.save_to_disk("data/phase2") ``` ### Train -Finally, we can start finetuning Llama2 on these datasets by following the [finetuning recipes](https://github.com/meta-llama/llama-recipes/tree/main/recipes/finetuning). Remember to pass the new tokenizer path as an argument to the script: `--tokenizer_name=./extended_tokenizer`. +Finally, we can start finetuning Llama2 on these datasets by following the [finetuning recipes](https://github.com/meta-llama/llama-recipes/tree/main/recipes/quickstart/finetuning). Remember to pass the new tokenizer path as an argument to the script: `--tokenizer_name=./extended_tokenizer`. OpenHathi was trained on 64 A100 80GB GPUs. Here are the hyperparameters used and other training details: - maximum learning rate: 2e-4 @@ -141,16 +141,16 @@ The resulting (partial) loss plots from the OpenHathi training are shown below: Phase 1: train loss - + Phase 1: eval loss - + Phase 2: train loss - + Phase 2: eval loss - + diff --git a/recipes/multilingual/extend_tokenizer.py b/recipes/use_cases/multilingual/extend_tokenizer.py similarity index 100% rename from recipes/multilingual/extend_tokenizer.py rename to recipes/use_cases/multilingual/extend_tokenizer.py diff --git a/recipes/multilingual/imgs/phase1-eval-loss.png b/recipes/use_cases/multilingual/img/phase1_eval_loss.png similarity index 100% rename from recipes/multilingual/imgs/phase1-eval-loss.png rename to recipes/use_cases/multilingual/img/phase1_eval_loss.png diff --git a/recipes/multilingual/imgs/phase1-train-loss.png b/recipes/use_cases/multilingual/img/phase1_train_loss.png similarity index 100% rename from recipes/multilingual/imgs/phase1-train-loss.png rename to recipes/use_cases/multilingual/img/phase1_train_loss.png diff --git a/recipes/multilingual/imgs/phase2-eval-loss.png b/recipes/use_cases/multilingual/img/phase2_eval_loss.png similarity index 100% rename from recipes/multilingual/imgs/phase2-eval-loss.png rename to recipes/use_cases/multilingual/img/phase2_eval_loss.png diff --git a/recipes/multilingual/imgs/phase2-train-loss.png b/recipes/use_cases/multilingual/img/phase2_train_loss.png similarity index 100% rename from recipes/multilingual/imgs/phase2-train-loss.png rename to recipes/use_cases/multilingual/img/phase2_train_loss.png diff --git a/recipes/multilingual/prepare_data.py b/recipes/use_cases/multilingual/prepare_data.py similarity index 100% rename from recipes/multilingual/prepare_data.py rename to recipes/use_cases/multilingual/prepare_data.py diff --git a/recipes/multilingual/train_tokenizer.py b/recipes/use_cases/multilingual/train_tokenizer.py similarity index 100% rename from recipes/multilingual/train_tokenizer.py rename to recipes/use_cases/multilingual/train_tokenizer.py diff --git a/recipes/use_cases/VideoSummary.ipynb b/recipes/use_cases/video_summary.ipynb similarity index 100% rename from recipes/use_cases/VideoSummary.ipynb rename to recipes/use_cases/video_summary.ipynb -- GitLab