Commit 68639a67 authored by Jerry Liu, committed by GitHub

add subdoc summary pack (#10934)

parent a607f97e
python_sources()
poetry_requirements(
    name="poetry",
)
GIT_ROOT ?= $(shell git rev-parse --show-toplevel)
help: ## Show all Makefile targets.
	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}'

format: ## Run code autoformatters (black).
	pre-commit install
	git ls-files | xargs pre-commit run black --files

lint: ## Run linters: pre-commit (black, ruff, codespell) and mypy
	pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files

test: ## Run tests via pytest.
	pytest tests

watch-docs: ## Build and watch documentation.
	sphinx-autobuild docs/ docs/_build/html --open-browser --watch $(GIT_ROOT)/llama_index/
# LlamaIndex Packs Integration: Subdoc-Summary
This LlamaPack provides an advanced technique for injecting each chunk with "sub-document" metadata. This context augmentation technique is helpful for both retrieving relevant context and for synthesizing correct answers.
It is a step beyond simply adding a summary of the document as metadata to each chunk. Within a long document, there can be multiple distinct themes, and we want each chunk to be grounded in global but relevant context.
This technique was inspired by our "Practical Tips and Tricks" video: https://www.youtube.com/watch?v=ZP1F9z-S7T0.
## Installation
```bash
pip install llama-index llama-index-packs-subdoc-summary
```
## CLI Usage
You can download LlamaPacks directly using `llamaindex-cli`, which is installed with the `llama-index` Python package:
```bash
llamaindex-cli download-llamapack SubDocSummaryPack --download-dir ./subdoc_summary_pack
```
You can then inspect the files at `./subdoc_summary_pack` and use them as a template for your own project.
## Code Usage
You can download the pack to the `./subdoc_summary_pack` directory:
```python
from llama_index.core.llama_pack import download_llama_pack
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

# download and install dependencies
SubDocSummaryPack = download_llama_pack(
    "SubDocSummaryPack", "./subdoc_summary_pack"
)

# You can use any llama-hub loader to get documents!
subdoc_summary_pack = SubDocSummaryPack(
    documents,
    parent_chunk_size=8192,  # default
    child_chunk_size=512,  # default
    llm=OpenAI(model="gpt-3.5-turbo"),
    embed_model=OpenAIEmbedding(),
)
```
Initializing the pack will split documents into parent chunks and child chunks. It will inject parent chunk summaries into child chunks, and index the child chunks.
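To see what this injection produces, here is a minimal sketch (not part of the pack itself) that pulls the pack's retriever out of `get_modules()` and prints the `context_summary` metadata attached to each retrieved child chunk; it assumes the `subdoc_summary_pack` instance created above:

```python
# Minimal sketch: inspect the parent-chunk summary injected into each child chunk.
retriever = subdoc_summary_pack.get_modules()["vector_retriever"]
for node_with_score in retriever.retrieve("How was Llama2 pretrained?"):
    print(node_with_score.node.metadata["context_summary"])
    print(node_with_score.node.get_content()[:200])
```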
Running the pack will run the query engine over the vectorized child chunks.
```python
response = subdoc_summary_pack.run("<query>", similarity_top_k=2)
```
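Under the hood, `run` simply delegates to the vector query engine that the pack builds over the child chunks. If you need lower-level access, the same components are exposed through `get_modules()`; the snippet below is a brief sketch of that usage:

```python
# Access the components the pack builds at initialization time.
modules = subdoc_summary_pack.get_modules()
vector_index = modules["vector_index"]  # index over the summary-injected child chunks
vector_retriever = modules["vector_retriever"]  # default retriever over that index
vector_query_engine = modules["vector_query_engine"]  # the engine that `run` delegates to

# Equivalent to subdoc_summary_pack.run("<query>")
response = vector_query_engine.query("<query>")
```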
%% Cell type:markdown id:8dd0acdb-5aec-4129-8772-81f56d6b25cf tags:
# Sub-Document Summary Metadata Pack
This LlamaPack provides an advanced technique for injecting each chunk with "sub-document" metadata. This context augmentation technique is helpful for both retrieving relevant context and for synthesizing correct answers.
It is a step beyond simply adding a summary of the document as the metadata to each chunk. Within a long document, there can be multiple distinct themes, and we want each chunk to be grounded in global but relevant context.
%% Cell type:markdown id:66818da6-a3fb-4537-b30a-922a8a0ef99e tags:
## Setup Data
%% Cell type:code id:317a3207-1211-4a6a-bd7d-3ab14f399951 tags:
``` python
!mkdir -p 'data/'
!curl 'https://arxiv.org/pdf/2307.09288.pdf' -o 'data/llama2.pdf'
```
%% Cell type:code id:bf6ab9c0-c993-4ab2-8343-b294676d7550 tags:
``` python
from llama_index.core import SimpleDirectoryReader
documents = SimpleDirectoryReader("data").load_data()
```
%% Cell type:markdown id:98bfbe4b-539c-469c-82e6-1f823f28d5f4 tags:
## Run the Sub-Document Summary Metadata Pack
%% Cell type:code id:af4b815e-f5ce-406b-9dcb-5a23fc9f96db tags:
``` python
%pip install llama-index-packs-subdoc-summary llama-index-llms-openai llama-index-embeddings-openai
```
%% Cell type:code id:d619362b-ae45-4e47-b400-1c2ce7262496 tags:
``` python
from llama_index.packs.subdoc_summary import SubDocSummaryPack
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
subdoc_summary_pack = SubDocSummaryPack(
    documents,
    parent_chunk_size=8192,  # default
    child_chunk_size=512,  # default
    llm=OpenAI(model="gpt-3.5-turbo"),
    embed_model=OpenAIEmbedding(),
)
```
%% Cell type:code id:fb11a60d-d356-40c5-84c1-4135382bfbfd tags:
``` python
from IPython.display import Markdown, display
from llama_index.core.response.notebook_utils import display_source_node
response = subdoc_summary_pack.run("How was Llama2 pretrained?")
display(Markdown(str(response)))
for n in response.source_nodes:
    display_source_node(n, source_length=10000, metadata_mode="all")
```
%% Output
Llama 2 was pretrained using an optimized auto-regressive transformer with robust data cleaning, updated data mixes, training on 40% more total tokens, doubling the context length, and using grouped-query attention to improve inference scalability for larger models.
**Node ID:** 172a1344-d48d-443b-8383-677037570c06<br>**Similarity:** 0.8720929924174893<br>**Text:** page_label: 1
file_name: llama2.pdf
file_path: data/llama2.pdf
file_type: application/pdf
file_size: 13661300
creation_date: 2024-02-17
last_modified_date: 2024-02-17
last_accessed_date: 2024-02-17
context_summary: Llama 2 is a collection of pretrained and fine-tuned large language models optimized for dialogue use cases, ranging from 7 billion to 70 billion parameters. The models, known as Llama 2-Chat, have shown superior performance compared to open-source chat models on various benchmarks and are considered as potential alternatives to closed-source models.
Llama 2: Open Foundation and Fine-Tuned Chat Models
Hugo Touvron∗Louis Martin†Kevin Stone†
Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra
Prajjwal Bhargava Shruti Bhosale Dan Bikel Lukas Blecher Cristian Canton Ferrer Moya Chen
Guillem Cucurull David Esiobu Jude Fernandes Jeremy Fu Wenyin Fu Brian Fuller
Cynthia Gao Vedanuj Goswami Naman Goyal Anthony Hartshorn Saghar Hosseini Rui Hou
Hakan Inan Marcin Kardas Viktor Kerkez Madian Khabsa Isabel Kloumann Artem Korenev
Punit Singh Koura Marie-Anne Lachaux Thibaut Lavril Jenya Lee Diana Liskovich
Yinghai Lu Yuning Mao Xavier Martinet Todor Mihaylov Pushkar Mishra
Igor Molybog Yixin Nie Andrew Poulton Jeremy Reizenstein Rashi Rungta Kalyan Saladi
Alan Schelten Ruan Silva Eric Michael Smith Ranjan Subramanian Xiaoqing Ellen Tan Binh Tang
Ross Taylor Adina Williams Jian Xiang Kuan Puxin Xu Zheng Yan Iliyan Zarov Yuchen Zhang
Angela Fan Melanie Kambadur Sharan Narang Aurelien Rodriguez Robert Stojnic
Sergey Edunov Thomas Scialom∗
GenAI, Meta
Abstract
In this work, we develop and release Llama 2, a collection of pretrained and fine-tuned
large language models (LLMs) ranging in scale from 7 billion to 70 billion parameters.
Our fine-tuned LLMs, called Llama 2-Chat, are optimized for dialogue use cases. Our
models outperform open-source chat models on most benchmarks we tested, and based on
our human evaluations for helpfulness and safety, may be a suitable substitute for closed-
source models. We provide a detailed description of our approach to fine-tuning and safety
improvements of Llama 2-Chat in order to enable the community to build on our work and
contribute to the responsible development of LLMs.<br>
**Node ID:** dbbde2a7-d51c-4245-959d-ba97ba414b55<br>**Similarity:** 0.8700958215249326<br>**Text:** page_label: 5
file_name: llama2.pdf
file_path: data/llama2.pdf
file_type: application/pdf
file_size: 13661300
creation_date: 2024-02-17
last_modified_date: 2024-02-17
last_accessed_date: 2024-02-17
context_summary: Llama 2-Chat is developed through pretraining, supervised fine-tuning, and reinforcement learning with human feedback methodologies, focusing on refining the model iteratively. The training process involves using an optimized auto-regressive transformer, robust data cleaning, updated data mixes, and specific architectural enhancements like increased context length and grouped-query attention.
Figure 4: Training of Llama 2-Chat: This process begins with the pretraining of Llama 2 using publicly
available online sources. Following this, we create an initial version of Llama 2-Chat through the application
of supervised fine-tuning. Subsequently, the model is iteratively refined using Reinforcement Learning
with Human Feedback (RLHF) methodologies, specifically through rejection sampling and Proximal Policy
Optimization (PPO). Throughout the RLHF stage, the accumulation of iterative reward modeling data in
parallel with model enhancements is crucial to ensure the reward models remain within distribution.
2 Pretraining
To create the new family of Llama 2 models, we began with the pretraining approach described in Touvron et al.
(2023), using an optimized auto-regressive transformer, but made several changes to improve performance.
Specifically, we performed more robust data cleaning, updated our data mixes, trained on 40% more total
tokens, doubled the context length, and used grouped-query attention (GQA) to improve inference scalability
for our larger models. Table 1 compares the attributes of the new Llama 2 models with the Llama 1 models.
2.1 Pretraining Data
Our training corpus includes a new mix of data from publicly available sources, which does not include data
from Meta's products or services. We made an effort to remove data from certain sites known to contain a
high volume of personal information about private individuals. We trained on 2 trillion tokens of data as this
provides a good performance–cost trade-off, up-sampling the most factual sources in an effort to increase
knowledge and dampen hallucinations.
We performed a variety of pretraining data investigations so that users can better understand the potential
capabilities and limitations of our models; results can be found in Section 4.1.
2.2 Training Details
We adopt most of the pretraining setting and model architecture from Llama 1 .<br>
%% Cell type:code id:1181af9d-680f-4ba3-89e2-f88b12a89cc7 tags:
``` python
from IPython.display import Markdown, display
response = subdoc_summary_pack.run(
"What is the functionality of latest ChatGPT memory."
)
display(Markdown(str(response)))
for n in response.source_nodes:
    display_source_node(n, source_length=10000, metadata_mode="all")
```
%% Output
The latest ChatGPT model, equipped with Ghost Attention (GAtt), demonstrates strong multi-turn memory ability by consistently referring to defined attributes for up to 20 turns in a conversation. This integration of GAtt in the ChatGPT model allows for efficient long context attention beyond 2048 tokens, showcasing potential for robust performance in handling extended contexts.
**Node ID:** 005a3c23-8d97-4e5d-957e-98ad2dfb93ad<br>**Similarity:** 0.7923889627946064<br>**Text:** page_label: 54
file_name: llama2.pdf
file_path: data/llama2.pdf
file_type: application/pdf
file_size: 13661300
creation_date: 2024-02-17
last_modified_date: 2024-02-17
last_accessed_date: 2024-02-17
context_summary: Llama 2-Chat with GAtt consistently refers to defined attributes for up to 20 turns, showcasing strong multi-turn memory ability. The integration of GAtt in Llama 2-Chat enables efficient long context attention beyond 2048 tokens, indicating potential for robust performance in handling extended contexts.
Dialogue Turn Baseline + GAtt
2 100% 100%
4 10% 100%
6 0% 100%
20 0% 100%
Table 30: GAtt results. Llama 2-Chat with GAtt is able to refer to attributes 100% of the time, for up to 20
turns from our human evaluation. We limited the evaluated attributes to public figures and hobbies.
The attention now spans beyond 20 turns. We tested the model ability to remember the system arguments
through a human evaluation. The arguments (e.g. hobbies, persona) are defined during the first message, and
then from turn 2 to 20. We explicitly asked the model to refer to them (e.g. “What is your favorite hobby?”,
“What is your name?”), to measure the multi-turn memory ability of Llama 2-Chat. We report the results
in Table 30. Equipped with GAtt, Llama 2-Chat maintains 100% accuracy, always referring to the defined
attribute, and so, up to 20 turns (we did not extend the human evaluation more, and all the examples had
less than 4048 tokens in total over the turns). As a comparison, Llama 2-Chat without GAtt cannot anymore
refer to the attributes after only few turns: from 100% at turn t+1, to 10% at turn t+3 and then 0%.
GAtt Zero-shot Generalisation. We tried at inference time to set constraints not present in the training of
GAtt. For instance, “answer in one sentence only”, for which the model remained consistent, as illustrated in
Figure 28.
We applied first GAtt to Llama 1 , which was pretrained with a context length of 2048 tokens and then
fine-tuned with 4096 max length. We tested if GAtt works beyond 2048 tokens, and the model arguably
managed to understand attributes beyond this window. This promising result indicates that GAtt could be
adapted as an efficient technique for long context attention.
A.3.6 How Far Can Model-Based Evaluation Go?<br>
**Node ID:** 0b1719e9-d7fa-42af-890b-5eeb946857c5<br>**Similarity:** 0.7837282816384877<br>**Text:** page_label: 16
file_name: llama2.pdf
file_path: data/llama2.pdf
file_type: application/pdf
file_size: 13661300
creation_date: 2024-02-17
last_modified_date: 2024-02-17
last_accessed_date: 2024-02-17
context_summary: The text discusses the challenges faced in maintaining multi-turn consistency in dialogue systems and introduces a method called Ghost Attention (GAtt) to address these issues. GAtt involves incorporating instructions throughout a conversation to ensure dialogue control over multiple turns.
Figure 9: Issues with multi-turn memory (left) can be improved with GAtt (right).
We train for between 200 and 400 iterations for all our models, and use evaluations on held-out prompts for
early stopping. Each iteration of PPO on the 70B model takes on average ≈ 330 seconds. To train quickly with
large batch sizes, we use FSDP (Zhao et al., 2023). This was effective when using O(1) forward or backward
passes, but caused a large slowdown (≈ 20×) during generation, even when using a large batch size and KV
cache. We were able to mitigate this by consolidating the model weights to each node once before generation
and then freeing the memory after generation, resuming the rest of the training loop.
3.3 System Message for Multi-Turn Consistency
In a dialogue setup, some instructions should apply for all the conversation turns, e.g., to respond succinctly,
or to “act as” some public figure. When we provided such instructions to Llama 2-Chat, the subsequent
response should always respect the constraint. However, our initial RLHF models tended to forget the initial
instruction after a few turns of dialogue, as illustrated in Figure 9 (left).
To address these limitations, we propose Ghost Attention (GAtt), a very simple method inspired by Context
Distillation (Bai et al., 2022b) that hacks the fine-tuning data to help the attention focus in a multi-stage
process. GAtt enables dialogue control over multiple turns, as illustrated in Figure 9 (right).
GAtt Method. Assume we have access to a multi-turn dialogue dataset between two persons (e.g., a user
and an assistant), with a list of messages [u1, a1, . . . , un, an], where un and an correspond to the user and
assistant messages for turn n, respectively. Then, we define an instruction, inst, that should be respected
throughout the dialogue. For example, inst could be “act as.” We can then synthetically concatenate this
instruction to all the user messages of the conversation.
Next, we can sample from this synthetic data using the latest RLHF model.<br>
python_sources()
from llama_index.packs.subdoc_summary.base import SubDocSummaryPack
__all__ = ["SubDocSummaryPack"]
"""Subdoc Summary."""
from typing import Any, Dict, List, Optional, List
from llama_index.core.llama_pack import BaseLlamaPack
from llama_index.core.schema import Document
from llama_index.core.text_splitter import SentenceSplitter
from llama_index.core.utils import print_text
from llama_index.core import SummaryIndex, VectorStoreIndex
from llama_index.core.embeddings import BaseEmbedding
from llama_index.core.llms import LLM
DEFAULT_SUMMARY_PROMPT_STR = """\
Please give a concise summary of the context in 1-2 sentences.
"""
class SubDocSummaryPack(BaseLlamaPack):
"""Pack for injecting sub-doc metadata into each chunk."""
def __init__(
self,
documents: List[Document],
parent_chunk_size: int = 8192,
parent_chunk_overlap: int = 512,
child_chunk_size: int = 512,
child_chunk_overlap: int = 32,
summary_prompt_str: str = DEFAULT_SUMMARY_PROMPT_STR,
verbose: bool = False,
embed_model: Optional[BaseEmbedding] = None,
llm: Optional[LLM] = None,
) -> None:
"""Init params."""
self.parent_chunk_size = parent_chunk_size
self.child_chunk_size = child_chunk_size
self.parent_splitter = SentenceSplitter(
chunk_size=parent_chunk_size, chunk_overlap=parent_chunk_overlap
)
self.child_splitter = SentenceSplitter(
chunk_size=child_chunk_size, chunk_overlap=child_chunk_overlap
)
self.summary_prompt_str = summary_prompt_str
self.embed_model = embed_model
self.llm = llm
parent_nodes = self.parent_splitter.get_nodes_from_documents(documents)
all_child_nodes = []
# For each parent node, extract the child nodes and print the text
for idx, parent_node in enumerate(parent_nodes):
if verbose:
print_text(
f"> Processing parent chunk {idx + 1} of {len(parent_nodes)}\n",
color="blue",
)
# get summary
summary_index = SummaryIndex([parent_node])
summary_query_engine = summary_index.as_query_engine(
response_mode="tree_summarize"
)
parent_summary = summary_query_engine.query(DEFAULT_SUMMARY_PROMPT_STR)
if verbose:
print_text(f"Extracted summary: {parent_summary}\n", color="pink")
# attach summary to all child nodes
child_nodes = self.child_splitter.get_nodes_from_documents([parent_node])
for child_node in child_nodes:
child_node.metadata["context_summary"] = str(parent_summary)
all_child_nodes.extend(child_nodes)
# build vector index for child nodes
self.vector_index = VectorStoreIndex(
all_child_nodes, embed_model=self.embed_model
)
self.vector_retriever = self.vector_index.as_retriever()
self.vector_query_engine = self.vector_index.as_query_engine(llm=llm)
self.verbose = verbose
def get_modules(self) -> Dict[str, Any]:
"""Get modules."""
return {
"vector_index": self.vector_index,
"vector_retriever": self.vector_retriever,
"vector_query_engine": self.vector_query_engine,
}
def run(self, *args: Any, **kwargs: Any) -> Any:
"""Run the pipeline."""
return self.vector_query_engine.query(*args, **kwargs)
[build-system]
build-backend = "poetry.core.masonry.api"
requires = ["poetry-core"]
[tool.codespell]
check-filenames = true
check-hidden = true
# Feel free to un-skip examples, and experimental, you will just need to
# work through many typos (--write-changes and --interactive will help)
skip = "*.csv,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb"
[tool.llamahub]
classes = ["SubDocSummaryPack"]
contains_example = false
import_path = "llama_index.packs.subdoc_summary"
[tool.mypy]
disallow_untyped_defs = true
# Remove venv skip when integrated with pre-commit
exclude = ["_static", "build", "examples", "notebooks", "venv"]
ignore_missing_imports = true
python_version = "3.8"
[tool.poetry]
authors = ["Your Name <you@example.com>"]
description = "llama-index packs subdoc-summary implementation"
license = "MIT"
name = "llama-index-packs-subdoc-summary"
packages = [{include = "llama_index/"}]
readme = "README.md"
version = "0.1.0"
[tool.poetry.dependencies]
python = ">=3.8.1,<3.12"
llama-index-core = "^0.10.0"
[tool.poetry.group.dev.dependencies]
black = {extras = ["jupyter"], version = "<=23.9.1,>=23.7.0"}
codespell = {extras = ["toml"], version = ">=v2.2.6"}
ipython = "8.10.0"
jupyter = "^1.0.0"
mypy = "0.991"
pre-commit = "3.2.0"
pylint = "2.15.10"
pytest = "7.2.1"
pytest-mock = "3.11.1"
ruff = "0.0.292"
tree-sitter-languages = "^1.8.0"
types-Deprecated = ">=0.1.0"
types-PyYAML = "^6.0.12.12"
types-protobuf = "^4.24.0.4"
types-redis = "4.5.5.0"
types-requests = "2.28.11.8" # TODO: unpin when mypy>0.991
types-setuptools = "67.1.0.0"