diff --git a/recipes/quickstart/NotebookLlama/Step-1 PDF Pre-Processing Logic.ipynb b/recipes/quickstart/NotebookLlama/Step-1 PDF Pre-Processing Logic.ipynb
index 54e1ea05cd5dec6e69a20ac860947157ea8871d0..310edd7035e46f054f706624626a178d78fab3f6 100644
--- a/recipes/quickstart/NotebookLlama/Step-1 PDF Pre-Processing Logic.ipynb	
+++ b/recipes/quickstart/NotebookLlama/Step-1 PDF Pre-Processing Logic.ipynb	
@@ -279,11 +279,47 @@
     "\n",
     "Be very smart and aggressive with removing details, you will get a running portion of the text and keep returning the processed text.\n",
     "\n",
+    "PLEASE DO NOT ADD MARKDOWN FORMATTING, STOP ADDING SPECIAL CHARACTERS THAT MARKDOWN CAPATILISATION ETC LIKES\n",
+    "\n",
     "ALWAYS start your response directly with processed text and NO ACKNOWLEDGEMENTS about my questions ok?\n",
     "Here is the text:\n",
     "\"\"\""
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 54,
+   "id": "24e8a547-9d7c-4e2f-be9e-a3aea09cce76",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def create_word_bounded_chunks(text, target_chunk_size):\n",
+    "    \"\"\"\n",
+    "    Split text into chunks at word boundaries close to the target chunk size.\n",
+    "    \"\"\"\n",
+    "    words = text.split()\n",
+    "    chunks = []\n",
+    "    current_chunk = []\n",
+    "    current_length = 0\n",
+    "    \n",
+    "    for word in words:\n",
+    "        word_length = len(word) + 1  # +1 for the space\n",
+    "        if current_length + word_length > target_chunk_size and current_chunk:\n",
+    "            # Join the current chunk and add it to chunks\n",
+    "            chunks.append(' '.join(current_chunk))\n",
+    "            current_chunk = [word]\n",
+    "            current_length = word_length\n",
+    "        else:\n",
+    "            current_chunk.append(word)\n",
+    "            current_length += word_length\n",
+    "    \n",
+    "    # Add the last chunk if it exists\n",
+    "    if current_chunk:\n",
+    "        chunks.append(' '.join(current_chunk))\n",
+    "    \n",
+    "    return chunks"
+   ]
+  },
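+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "chunking-sanity-check",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Quick sanity check of create_word_bounded_chunks on a toy string (illustrative only,\n",
+    "# not part of the processing pipeline): each chunk should stay at or under the target\n",
+    "# size unless a single word is longer than the target itself.\n",
+    "sample_text = \"word \" * 50\n",
+    "sample_chunks = create_word_bounded_chunks(sample_text, 40)\n",
+    "print(len(sample_chunks), [len(chunk) for chunk in sample_chunks])"
+   ]
+  },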
   {
    "cell_type": "code",
    "execution_count": 22,
@@ -340,7 +376,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 51,
+   "execution_count": 55,
    "id": "a0183c47-339d-4041-ae83-77fc34931075",
    "metadata": {},
    "outputs": [],
@@ -348,6 +384,38 @@
     "INPUT_FILE = \"./extracted_text.txt\"  # Replace with your file path\n",
     "CHUNK_SIZE = 1000  # Adjust chunk size if needed\n",
     "\n",
+    "chunks = create_word_bounded_chunks(text, CHUNK_SIZE)\n",
+    "num_chunks = len(chunks)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 56,
+   "id": "bb36814f-9310-4734-bf54-e16a5032339e",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "101"
+      ]
+     },
+     "execution_count": 56,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "num_chunks"
+   ]
+  },
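+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "chunk-length-inspection",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional inspection (illustrative): check that no chunk exceeds CHUNK_SIZE and\n",
+    "# preview the first chunk before anything is sent to the model.\n",
+    "print(\"Longest chunk:\", max(len(chunk) for chunk in chunks), \"characters\")\n",
+    "print(\"First chunk preview:\", chunks[0][:200])"
+   ]
+  },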
+  {
+   "cell_type": "code",
+   "execution_count": 57,
+   "id": "447188d3-ebf0-42d5-940e-4d7e0d9dbf32",
+   "metadata": {},
+   "outputs": [],
+   "source": [
     "# Read the file\n",
     "with open(INPUT_FILE, 'r', encoding='utf-8') as file:\n",
     "    text = file.read()\n",
@@ -369,7 +437,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "982a87c01ac14a5187843c211d6add24",
+       "model_id": "b4470041255746c2b3f2fd838db40c6a",
        "version_major": 2,
        "version_minor": 0
       },
@@ -393,23 +461,19 @@
      "output_type": "stream",
      "text": [
       "INPUT TEXT:\n",
-      "1\n",
-      "A Survey on Knowledge Distillation of Large\n",
-      "Language Models\n",
-      "Xiaohan Xu1, Ming Li2, Chongyang Tao3, Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
-      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
-      "1The University of Hong Kong2University of Maryland3Microsoft\n",
-      "4University of Technology Sydney5Peking University6The University of Sydney\n",
-      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
-      "ckcheng@cs.hku.hk jl0725@connect.hku.hk\n",
-      "Abstract —In the era of Large Language Models (LLMs), Knowledge Distillati...\n",
+      "1 A Survey on Knowledge Distillation of Large Language Models Xiaohan Xu1, Ming Li2, Chongyang Tao3, Tao Shen4, Reynold Cheng1, Jinyang Li1, Can Xu5, Dacheng Tao6, Tianyi Zhou2 1The University of Hong Kong2University of Maryland3Microsoft 4University of Technology Sydney5Peking University6The University of Sydney {shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu ckcheng@cs.hku.hk jl0725@connect.hku.hk Abstract —In the era of Large Language Models (LLMs), Knowledge Distillati...\n",
       "\n",
       "PROCESSED TEXT:\n",
-      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
-      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
-      "1The University of Hong Kong2University of Maryland3Microsoft\n",
-      "4University of Technology Sydney5Peking University6The University of Sydney\n",
-      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
+      "ng Tao, Tao Shen, Reynold Cheng, Jinyang Li, Can Xu, Dacheng Tao, Tianyi Zhou**\n",
+      "\n",
+      "**The University of Hong Kong**\n",
+      "**University of Maryland**\n",
+      "**Microsoft**\n",
+      "**University of Technology Sydney**\n",
+      "**Peking University**\n",
+      "\n",
+      "**shawnxxh, chongyangtao, hishentao**@gmail.com\n",
+      "**minglii, tianyi**@umd.edu\n",
       "ckcheng@cs.hku.hk...\n",
       "==========================================================================================\n",
       "\n"
@@ -427,14 +491,220 @@
      "output_type": "stream",
      "text": [
       "INPUT TEXT:\n",
-      "ed knowledge to smaller models and its utility in model compression and self-\n",
-      "improvement. Our survey is meticulously structured around three foundational pillars: algorithm ,skill, and verticalization – providing\n",
-      "a comprehensive examination of KD mechanisms, the enhancement of specific cognitive abilities, and their practical implications\n",
-      "across diverse fields. Crucially, the survey navigates the intricate interplay between data augmentation (DA) and KD, illustrating how\n",
-      "DA emerges as a powerfu...\n",
+      "advanced knowledge to smaller models and its utility in model compression and self- improvement. Our survey is meticulously structured around three foundational pillars: algorithm ,skill, and verticalization – providing a comprehensive examination of KD mechanisms, the enhancement of specific cognitive abilities, and their practical implications across diverse fields. Crucially, the survey navigates the intricate interplay between data augmentation (DA) and KD, illustrating how DA emerges as a p...\n",
+      "\n",
+      "PROCESSED TEXT:\n",
+      "Our survey examines three foundational pillars: **algorithm**, **skill**, and **verticalization**, providing a comprehensive examination of Knowledge Distillation (KD) mechanisms, the enhancement of specific cognitive abilities, and their practical implications across diverse fields. Crucially, the survey navigates the intricate interplay between Data Augmentation (DA) and KD, illustrating how DA emerges as a powerful paradigm within the KD framework to bolster Large Language Models' (LLMs) perf...\n",
+      "==========================================================================================\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INPUT TEXT:\n",
+      "distillation and proposing future research directions. By bridging the gap between proprietary and open-source LLMs, this survey underscores the potential for more accessible, efficient, and powerful AI solutions. Most importantly, we firmly advocate for compliance with the legal terms that regulate the use of LLMs, ensuring ethical and lawful application of KD of LLMs. An associated Github repository is available at https://github.com/Tebmer/Awesome-Knowledge-Distillation-of-LLMs. Index Terms —...\n",
+      "\n",
+      "PROCESSED TEXT:\n",
+      "ful AI solutions....\n",
+      "==========================================================================================\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INPUT TEXT:\n",
+      "complexity, have un- locked new realms of possibility, from generating human- like text to offering sophisticated problem-solving capa- bilities. The core significance of these LLMs lies in their emergent abilities (Wei et al., 2022a,b; Xu et al., 2024a), a phenomenon where the models display capabilities beyond their explicit training objectives, enabling them to tackle a diverse array of tasks with remarkable proficiency. Their deep understanding of context, nuance, and the intrica- cies of hu...\n",
+      "\n",
+      "PROCESSED TEXT:\n",
+      "sophisticated problem-solving capabilities. The core significance of these LLMs lies in their emergent abilities, a phenomenon where the models display capabilities beyond their explicit training objectives, enabling them to tackle a diverse array of tasks with remarkable proficiency. Their deep understanding of context, nuance, and intricacies of human language enables them to excel in a wide array of applications, from creative content generation to problem-solving. For simplicity, we use prop...\n",
+      "==========================================================================================\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INPUT TEXT:\n",
+      "applications, promising to revolutionize industries, augment human creativity, and redefine our interaction with technology. Despite the remarkable capabilities of proprietary LLMs like GPT-4 and Gemini, they are not without their shortcom- ings, particularly when viewed in light of the advantages offered by open-source models. A significant drawback is their limited accessibility and higher cost (OpenAI et al., 2023). These proprietary models often come with substantial usage fees and restricte...\n",
+      "\n",
+      "PROCESSED TEXT:\n",
+      "teraction with technology. Despite remarkable capabilities of proprietary LLMs like GPT-4 and Gemini, they are not without their shortcomings, particularly when viewed in light of the advantages offered by open-source models.\n",
+      "\n",
+      "Limited accessibility and higher cost are significant drawbacks. Proprietary models often come with substantial usage fees and restricted access, making them less attainable for individuals and smaller organizations. Data privacy and security concerns also arise when using...\n",
+      "==========================================================================================\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INPUT TEXT:\n",
+      "applica- tions. The constraints of accessibility, cost, and adaptability thus present significant challenges in leveraging the full potential of proprietary LLMs. In contrast to proprietary LLMs, open-source modelsarXiv:2402.13116v3 [cs.CL] 8 Mar 2024 2 like LLaMA (Touvron et al., 2023) and Mistral (Jiang et al., 2023a) bring several notable advantages. One of the primary benefits of open-source models is their accessibility and adaptability. Without the constraints of licensing fees or restrict...\n",
+      "\n",
+      "PROCESSED TEXT:\n",
+      "challenges in leveraging the full potential of proprietary LLMs. In contrast to proprietary LLMs, open-source models arXiv:2402.13116v3 [cs.CL] 8 Mar 2024 2 like LLaMA (Touvron et al., 2023) and Mistral (Jiang et al., 2023a) bring several notable advantages. One of the primary benefits of open-source models is their accessibility and adaptability. Without the constraints of licensing fees or restrictive usage policies, these models are more readily available to a broader range of users, from ind...\n",
+      "==========================================================================================\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INPUT TEXT:\n",
+      "of drawbacks, primarily stemming from their relatively limited scale and resources compared to their proprietary counterparts. One of the most significant limitations is the smaller model scale, which often results in lower per- formance on real-world tasks with a bunch of instruc- tions (Zheng et al., 2023a). These models, with fewer pa- rameters, may struggle to capture the depth and breadth of knowledge embodied in larger models like GPT-4. Ad- ditionally, the pre-training investment in these...\n",
+      "\n",
+      "PROCESSED TEXT:\n",
+      "ietary counterparts.**\n",
+      "\n",
+      "**One of the most significant limitations is the smaller model scale, resulting in lower performance on real-world tasks with a multitude of instructions.**\n",
+      "\n",
+      "**These models, with fewer parameters, may struggle to capture the depth and breadth of knowledge embodied in larger models like GPT-4.**\n",
+      "\n",
+      "**Traditionally, the pre-training investment in these open-source models is typically less substantial.**\n",
+      "\n",
+      "**This reduced investment can lead to a narrower range of pre-training d...\n",
+      "==========================================================================================\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INPUT TEXT:\n",
+      "effectiveness in specialized applications. This limitation becomes particularly evident when these models are compared to the highly fine-tuned proprietary LLMs, which are often tailored to excel in a wide array of complex scenarios (OpenAI et al., 2023). Primarily, recognizing the disparities between propri- etary and open-source LLMs, KD techniques have surged as a means to bridge the performance gap between these models (Gou et al., 2021; Gupta and Agrawal, 2022). Knowl- edge distillation, in...\n",
+      "\n",
+      "PROCESSED TEXT:\n",
+      "models are compared to the highly fine-tuned proprietary LLMs, which are often tailored to excel in a wide array of complex scenarios (OpenAI et al., 2023). Primarily, recognizing the disparities between proprietary and open-source LLMs, knowledge distillation techniques have surged as a means to bridge the performance gap between these models (Gou et al., 2021; Gupta and Agrawal, 2022)....\n",
+      "==========================================================================================\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INPUT TEXT:\n",
+      "augmentation (DA) (Feng et al., 2021) has emerged as a prevalent paradigm to achieve knowledge distillation of LLMs, where a small seed of knowledge is used to prompt the LLM to generate more data with respect to a specific skill or domain (Taori et al., 2023). Secondly, KD still retains its fundamental role in compressing LLMs, making them more efficient without significant loss in performance. (Gu et al., 2024; Agarwal et al., 2024). More recently, the strategy of employing open-source LLMs as...\n",
+      "\n",
+      "PROCESSED TEXT:\n",
+      "e a seed of knowledge is used to prompt LLMs to generate data concerning a specific skill or domain (Taori et al., 2023). Secondly, KD still retains its role in compressing LLMs, making them more efficient without loss in performance. (Gu et al., 2024; Agarwal et al., 2024). Recently, the strategy of using open-source LLMs as teachers for their own self-improvement has emerged as a promising approach, enhancing their capabilities significantly (Yuan et al., 2024a; Chen et al., 2024a)....\n",
+      "==========================================================================================\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INPUT TEXT:\n",
+      "trend of self-improvement via self-generated knowledge. A key aspect of the knowledge distillation is the en- hancement of skills such as advanced context following (e.g., in-context learning (Huang et al., 2022a) and in- struction following (Taori et al., 2023)), improved align- ment with user intents (e.g., human values/principles (Cui et al., 2023a), and thinking patterns like chain-of-thought (CoT) (Mukherjee et al., 2023)), and NLP task specialization (e.g., semantic understanding (Ding et ...\n",
+      "\n",
+      "PROCESSED TEXT:\n",
+      "enabling the enhancement of skills such as advanced context following and instruction following, alignment with user intents and thinking patterns like chain-of-thought, and NLP task specialization....\n",
+      "==========================================================================================\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INPUT TEXT:\n",
+      "performance by learning from the proprietary models that have been extensively trained and fine-tuned in these areas. The benefits of knowledge distillation in the era of LLMs are multifaceted and transformative (Gu et al., 2024). Through a suite of distillation techniques, the gap between proprietary and open-source models is significantly nar- rowed (Chiang et al., 2023; Xu et al., 2023a) and even filled (Zhao et al., 2023a). This process not only streamlines computational requirements but als...\n",
+      "\n",
+      "PROCESSED TEXT:\n",
+      "ned in these areas, the benefits of knowledge distillation in the era of LLMs are multifaceted and transformative, through a suite of distillation techniques, the gap between proprietary and open-source models is narrowed, and environmental sustainability of AI operations is enhanced, as open-source models become more proficient in less computational overhead, fostering a more accessible and equitable AI landscape, where smaller entities and individual researchers gain access to state-of-the-art...\n",
+      "==========================================================================================\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INPUT TEXT:\n",
+      "catalyzing innovation and growth across various industries and research domains. The escalating need for a comprehensive survey on the knowledge distillation of LLMs stems from the rapidly evolving landscape of AI (OpenAI et al., 2023; Team et al., 2023) and the increasing complexity of these models. As AI continues to penetrate various sectors, the ability to effi- ciently and effectively distill knowledge from proprietary LLMs to open-source ones becomes not just a technical aspiration but a p...\n",
       "\n",
       "PROCESSED TEXT:\n",
-      "ulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of specific cognitive abilities, and their practical implications across diverse fields. Crucially, the survey navigates the intricate interplay between data augmentation (DA) and knowledge distillation, illustrating how DA emerges as a powerful paradigm within the knowledge distillation framework to bolster larg...\n",
+      "ed for a comprehensive survey on the knowledge distillation of LLMs stems from the rapidly evolving landscape of AI and the increasing complexity of these models....\n",
       "==========================================================================================\n",
       "\n"
      ]
@@ -451,17 +721,34 @@
      "output_type": "stream",
      "text": [
       "INPUT TEXT:\n",
-      "on and\n",
-      "proposing future research directions. By bridging the gap between proprietary and open-source LLMs, this survey underscores the\n",
-      "potential for more accessible, efficient, and powerful AI solutions. Most importantly, we firmly advocate for compliance with the legal\n",
-      "terms that regulate the use of LLMs, ensuring ethical and lawful application of KD of LLMs. An associated Github repository is available\n",
-      "at https://github.com/Tebmer/Awesome-Knowledge-Distillation-of-LLMs.\n",
-      "Index Terms —Large lang...\n",
+      "SupervisedFine-tuningX,Y preferenceRankOptimizationy,1y,2y3y1y2y3≻≻rank…… DataCuration X,YrawdatasynthesizefeedbackFeedback input outputSelf-Knowledge outputinputinput YlabelLabelingExpansion X,YdemonstrationsexpandFeature featureinput,outputextractSec.4Sec.5 Sec.3.1Sec.3.2 Fig. 2: An overview of this survey on knowledge distillation of large language models. Note that ‘Section’ is abbreviated as ‘Sec.’ in this figure. RM S(·)denotes the student reward model. the growing demand for more accessib...\n",
       "\n",
       "PROCESSED TEXT:\n",
-      "ce LLMs, this survey underscores the potential for more accessible, efficient, and powerful AI solutions. Most importantly, we firmly advocate for compliance with the legal terms that regulate the use of LLMs, ensuring ethical and lawful application of knowledge distillation. An associated Github repository is available at https://github.com/Tebmer/Awesome-Knowledge-Distillation-of-LLMs.\n",
+      "ynthesizefeedbackFeedback input outputSelf-Knowledge outputinputinput YlabelLabelingExpansion X,YdemonstrationexponentialexpandFeature featureinput,outputsec.4sec.5 sec.3.1sec.3.2 fig. 2: An overview of this survey on knowledge distillation of large language models....\n",
+      "==========================================================================================\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INPUT TEXT:\n",
+      "gaps in current techniques and proposing direc- tions for future research. Survey Organization. The remainder of this survey is orga- nized into several comprehensive sections, each designed to offer a deep dive into the multifaceted aspects of knowledge distillation within the realm ofLLMs. Following this intro- duction, §2 provides a foundational overview of knowledge distillation, comparing traditional techniques with those emerging in the era of LLMs and highlighting the role of data augment...\n",
+      "\n",
+      "PROCESSED TEXT:\n",
+      "arch. Survey Organization. The remainder of this survey is orga- nized into several comprehensive sections, each designed to offer a deep dive into the multifaceted aspects of knowledge distillation within the realm ofLLMs.\n",
       "\n",
-      "Index Terms —Large language models, knowledge distillation, data augmentation, skill distillation, supervise...\n",
+      "Following this intro- duction, §2 provides a foundational overview of knowledge distillation, comparing traditional techniques with those emerging in the era of LLMs and highlighting the role of data augmentation (DA) in this context.\n",
+      "\n",
+      "§3 delves into approaches to elicit kno...\n",
       "==========================================================================================\n",
       "\n"
      ]
@@ -478,19 +765,10 @@
      "output_type": "stream",
      "text": [
       "INPUT TEXT:\n",
-      " have un-\n",
-      "locked new realms of possibility, from generating human-\n",
-      "like text to offering sophisticated problem-solving capa-\n",
-      "bilities. The core significance of these LLMs lies in their\n",
-      "emergent abilities (Wei et al., 2022a,b; Xu et al., 2024a), a\n",
-      "phenomenon where the models display capabilities beyond\n",
-      "their explicit training objectives, enabling them to tackle a\n",
-      "diverse array of tasks with remarkable proficiency. Their\n",
-      "deep understanding of context, nuance, and the intrica-\n",
-      "cies of human languag...\n",
+      "includes discus- sions on natural language understanding (NLU), genera- tion (NLG), information retrieval, recommendation systems, and the evaluation of text generation. In §5, we ventureinto domain-specific vertical distillation, showcasing how knowledge distillation techniques are applied within spe- cialized fields such as law, healthcare, finance, and science, illustrating the practical implications and transformative impact of these approaches. The survey suggests open problems in §6, ident...\n",
       "\n",
       "PROCESSED TEXT:\n",
-      "ergent** abilities, a phenomenon where the models display capabilities beyond their explicit training objectives, enabling them to tackle a diverse array of tasks with remarkable proficiency. Their **deep** understanding of context, nuance, and intricacies of human language enables them to excel in a wide array of applications, from creative content generation to complex problem-solving....\n",
+      "mmendation systems, and the evaluation of text generation....\n",
       "==========================================================================================\n",
       "\n"
      ]
@@ -507,22 +785,10 @@
      "output_type": "stream",
      "text": [
       "INPUT TEXT:\n",
-      "g to revolutionize industries,\n",
-      "augment human creativity, and redefine our interaction with\n",
-      "technology.\n",
-      "Despite the remarkable capabilities of proprietary LLMs\n",
-      "like GPT-4 and Gemini, they are not without their shortcom-\n",
-      "ings, particularly when viewed in light of the advantages\n",
-      "offered by open-source models. A significant drawback is\n",
-      "their limited accessibility and higher cost (OpenAI et al.,\n",
-      "2023). These proprietary models often come with substantial\n",
-      "usage fees and restricted access, making them ...\n",
+      "process of transferring knowledge from a large, complex model (teacher) to a smaller, more efficient model (student) (Gou et al., 2021). This technique is pivotal in mitigating the challenges posed by the computational demands and resource constraints of deploying large-scale models in practical applications. Historically, knowledge distillation techniques, prior to the era of LLMs, primarily concentrated on transferring knowledge from complex, often cumbersome neural net- works to more compact ...\n",
       "\n",
       "PROCESSED TEXT:\n",
-      "ogy.\n",
-      "Despite the remarkable capabilities of proprietary LLMs like GPT-4 and Gemini, they are not without their shortcomings, particularly when viewed in light of the advantages offered by open-source models.\n",
-      "A significant drawback is their limited accessibility and higher cost (OpenAI et al., 2023). These models often come with substantial usage fees and restricted access, making them less attainable for individuals and smaller organizations.\n",
-      "In terms of data privacy and security, using these pr...\n",
+      "Gou et al., 2021). This technique is pivotal in mitigating the challenges posed by the computational demands and resource constraints of deploying large-scale models in practical applications....\n",
       "==========================================================================================\n",
       "\n"
      ]
@@ -539,23 +805,10 @@
      "output_type": "stream",
      "text": [
       "INPUT TEXT:\n",
-      "straints of accessibility, cost, and adaptability\n",
-      "thus present significant challenges in leveraging the full\n",
-      "potential of proprietary LLMs.\n",
-      "In contrast to proprietary LLMs, open-source modelsarXiv:2402.13116v3  [cs.CL]  8 Mar 2024\n",
-      "2\n",
-      "like LLaMA (Touvron et al., 2023) and Mistral (Jiang et al.,\n",
-      "2023a) bring several notable advantages. One of the primary\n",
-      "benefits of open-source models is their accessibility and\n",
-      "adaptability. Without the constraints of licensing fees or\n",
-      "restrictive usage policies, t...\n",
+      "Mammoth (Yue et al., 2023a), Mixed Distill (Chenglin et al., 2023) ExpansionSelf-Instruct (Wang et al., 2022a), Alpaca (Taori et al., 2023), Code Alpaca (Chaudhary, 2023) Self-Align (Sun et al., 2024b), WizardLM (Xu et al., 2023a), WizardCoder (Luo et al., 2023a), WizardMath (Luo et al., 2023b), AugGPT (Dai et al., 2023a), TDG (He et al., 2023b) CurationUltraChat (Ding et al., 2023b), Phi-1 (Gunasekar et al., 2023), Phi-1.5 (Li et al., 2023a), Phi-2 (Mar, 2023), Magicoder (Wei et al., 2023), Wav...\n",
       "\n",
       "PROCESSED TEXT:\n",
-      "aging the full\n",
-      "potential of proprietary LLMs. In contrast to proprietary LLMs, open-source models\n",
-      "arXiv:2402.13116v3  [cs.CL]  8 Mar 2024\n",
-      "2\n",
-      "like LLaMA (Touvron et al., 2023) and Mistral (Jiang et al., 2023a) bring several notable advantages. One of the primary benefits of open-source models is their accessibility and adaptability. Without the constraints of licensing fees or restrictive usage policies, these models are more readily available to a broader range of users, from individual researche...\n",
+      ", Alpaca (T et al., 2023), Code Alpaca (C et al., 2023) Self-Align (S et al., 2024b), WizardLM (X et al., 2023a), WizardCoder (L et al., 2023a), WizardMath (L et al., 2023b), AugGPT (D et al., 2023a), TDG (H et al., 2023b) CurationUltraChat (D et al., 2023b), Phi-1 (G et al., 2023), Phi-1.5 (L et al., 2023a), Phi-2 (M, 2023), Magicoder (W et al., 2023), WaveCoder (Y et al., 2024) ZeroGen (Y et al., 2022), SunGen (G et al., 2023a), InPars (B et al., 2022) FeatureBabyLlama (T and Tastet, 2023), Mi...\n",
       "==========================================================================================\n",
       "\n"
      ]
@@ -572,19 +825,10 @@
      "output_type": "stream",
      "text": [
       "INPUT TEXT:\n",
-      "y stemming from their relatively\n",
-      "limited scale and resources compared to their proprietary\n",
-      "counterparts. One of the most significant limitations is\n",
-      "the smaller model scale, which often results in lower per-\n",
-      "formance on real-world tasks with a bunch of instruc-\n",
-      "tions (Zheng et al., 2023a). These models, with fewer pa-\n",
-      "rameters, may struggle to capture the depth and breadth\n",
-      "of knowledge embodied in larger models like GPT-4. Ad-\n",
-      "ditionally, the pre-training investment in these open-source\n",
-      "models is...\n",
+      "(Chen et al., 2023a), GKD (Agarwal et al., 2024) Self-KnowledgeSelf-Instruct (Wang et al., 2022a), Self-Align (Sun et al., 2024b), RLCD (Yang et al., 2024a), ImpDistill (Jung et al., 2023), LMSI (Huang et al., 2023a), ReST (Gulcehre et al., 2023), Self-Rewarding (Yuan et al., 2024a), Baize (Xu et al., 2023b), STaR (Zelikman et al., 2022) DistillationSupervised Fine-TuningAlpaca (Taori et al., 2023), Vicuna (Chiang et al., 2023), WizardLM (Xu et al., 2023a), Self-Instruct (Wang et al., 2022a), Ba...\n",
       "\n",
       "PROCESSED TEXT:\n",
-      "parts. One of the most significant limitations is the smaller model scale, which often results in lower performance on real-world tasks with a bunch of instructions (Zheng et al., 2023a). These models, with fewer parameters, may struggle to capture the depth and breadth of knowledge embodied in larger models like GPT-4. Additionally, the pre-training investment in these open-source models is typically less substantial. This reduced investment can lead to a narrower range of pre-training data, po...\n",
+      "Self-Align (Sun et al., 2024b), RLCD (Yang et al., 2024a), ImpDistill (Jung et al., 2023), LMSI (Huang et al., 2023a), ReST (Gulcehre et al., 2023), Self-Rewarding (Yuan et al., 2024a), Baize (Xu et al., 2023b), STaR (Zelikman et al., 2022) DistillationSupervised Fine-TuningAlpaca (Taori et al., 2023), Vicuna (Chiang et al., 2023), WizardLM (Xu et al., 2023a), Self-Instruct (Wang et al., 2022a), Baize (Xu et al., 2023b), STaR (Zelikman et al., 2022), Divergence and SimilarityDistilGPT (Sanh et a...\n",
       "==========================================================================================\n",
       "\n"
      ]
@@ -601,26 +845,70 @@
      "output_type": "stream",
      "text": [
       "INPUT TEXT:\n",
-      "ized applications. This\n",
-      "limitation becomes particularly evident when these models\n",
-      "are compared to the highly fine-tuned proprietary LLMs,\n",
-      "which are often tailored to excel in a wide array of complex\n",
-      "scenarios (OpenAI et al., 2023).\n",
-      "Primarily, recognizing the disparities between propri-\n",
-      "etary and open-source LLMs, KD techniques have surged\n",
-      "as a means to bridge the performance gap between these\n",
-      "models (Gou et al., 2021; Gupta and Agrawal, 2022). Knowl-\n",
-      "edge distillation, in this context, involves ...\n",
+      "al., 2023), CycleAlign (Hong et al., 2023), Skill DistillationContext FollowingInstruction FollowingSelf-Instruct (Wang et al., 2022a), Alpaca (Taori et al., 2023), Vicuna (Chiang et al., 2023), WizardLM (Xu et al., 2023a), Orca (Mukherjee et al., 2023), Orca 2 (Mitra et al., 2023), WizardMath (Luo et al., 2023b), Llama-GPT4 (Peng et al., 2023a), Multi-turn DialogueVicuna (Chiang et al., 2023), Baize (Xu et al., 2023b), UltraLLaMA (Ding et al., 2023b), CAMEL (Li et al., 2023b), OpenChat (Wang et...\n",
       "\n",
       "PROCESSED TEXT:\n",
-      "===========================\n",
+      "struct (Wang et al., 2022a), Alpaca (Taori et al., 2023), Vicuna (Chiang et al., 2023), WizardLM (Xu et al., 2023a), Orca (Mukherjee et al., 2023), Orca 2 (Mitra et al., 2023), WizardMath (Luo et al., 2023b), Llama-GPT4 (Peng et al., 2023a), Multi-turn DialogueVicuna (Chiang et al., 2023), Baize (Xu et al., 2023b), UltraLLaMA (Ding et al., 2023b), CAMEL (Li et al., 2023b), OpenChat (Wang et al., 2023c), Zephyr (Tunstall et al., 2023), RAG Capbility KARD (Kang et al., 2023a), SAIL (Luo et al., 20...\n",
+      "==========================================================================================\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INPUT TEXT:\n",
+      "(Lee et al., 2023a), Zephy (Tunstall et al., 2023), UltraFeedback (Cui et al., 2023a), ValueCAI (Bai et al., 2022a), Align Honesty (Yang et al., 2023a), SANDBOX (Liu et al., 2023b), Self-Align (Sun et al., 2024b), UltraFeedback (Cui et al., 2023a), RLCD (Yang et al., 2024a) AgentTool UsingToolformer (Schick et al., 2023), Graph-ToolFormer (Zhang, 2023), Gorilla (Patil et al., 2023), ToolAlpaca (Tang et al., 2023a), ToolLLM (Qin et al., 2023a), CRAFT (Yuan et al., 2023a), Confucius (Gao et al., 2...\n",
       "\n",
-      "When it comes to recognizing the differences between proprietary and open-source LLMs, it becomes particularly evident when comparing them to highly fine-tuned proprietary models. These models are often tailored to excel in a wide array of complex scenarios.\n",
+      "PROCESSED TEXT:\n",
+      "i et al., 2022a), Align Honesty (Yang et al., 2023a), SANDBOX (Liu et al., 2023b), Self-Align (Sun et al., 2024b), UltraFeedback (Cui et al., 2023a), RLCD (Yang et al., 2024a) AgentToolformer (Schick et al., 2023), Graph-Toolformer (Zhang, 2023), Gorilla (Patil et al., 2023), ToolAlpaca (Tang et al., 2023a), ToolLLM (Qin et al., 2023a), CRAFT (Yuan et al., 2023a), Confucius (Gao et al., 2023b), MLLM-Tool (Wang et al., 2024), α-UMi (Shen et al., 2024), PlanningFireAct (Chen et al., 2023b), AgentT...\n",
+      "==========================================================================================\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INPUT TEXT:\n",
+      "2022), NLGInheritSumm (Xu et al., 2023c), RECOMP (Xu et al., 2024b), MaRio (Ramnath et al., 2023), ID (Jung et al., 2023), GPT-3 Labeling (Wang et al., 2021b), BioGPT (Guo et al., 2023a), ChatGPT NMT (Yang and Nicolai, 2023), Information RetrievalQUILL (Srinivasan et al., 2022), Promptgator (Dai et al., 2023b), InPars (Bonifacio et al., 2022), AugTriever (Meng et al., 2023), (Sun et al., 2023a), RankVicuna (Pradeep et al., 2023a), RankZephyr (Pradeep et al., 2023b), ExaRanker (Ferraretto et al.,...\n",
       "\n",
-      "**The Limitation of Proprietary Models**\n",
-      "------------------------------------\n",
+      "PROCESSED TEXT:\n",
+      "al., 2023 GPT-3 Labeling Wang et al., 2021b BioGPT Guo et al., 2023a ChatGPT NMT Yang and Nicolai, 2023 Information RetrievalQUILL Srinivasan et al., 2022 Promptgator Dai et al., 2023b InPars Bonifacio et al., 2022 AugTriever Meng et al., 2023 RankVicuna Pradeep et al., 2023a RankZephyr Pradeep et al., 2023b ExaRanker Ferraretto et al., 2023 Recommendation NDR Mysore et al., 2023 InstrcutRec Zhang et al., 2023b ONCE Liu et al., 2023c Text Generation Evaluation PandaLM Wang et al., 2023b Promethe...\n",
+      "==========================================================================================\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INPUT TEXT:\n",
+      "al., 2024), Code Clean (Jain et al., 2023), Multi-ModalityLLaVA (Liu et al., 2023e), SVIT (Zhao et al., 2023b), LVIS-Instruct4V (Wang et al., 2023e), Shikra (Chen et al., 2023c), LSKD (Park et al., 2023), DetGPT (Pi et al., 2023; Zhao et al., 2023c), LRV (Liu et al., 2023f), NExT-GPT (Wu et al., 2023b), Valley (Luo et al., 2023d), ILuvUI (Jiang et al., 2023d), StableLLaVA (Li et al., 2023c), PointLLM (Xu et al., 2023e), Verticalization DistillationLaw (Huang et al., 2023b; Cui et al., 2023b); Me...\n",
       "\n",
-      "Recognizing the disparities between proprietary and open-source LLMs is crucial. The fine-tuning process for proprietary models is a...\n",
+      "PROCESSED TEXT:\n",
+      "al., 2023b), LVIS-Instruct4V (Wang et al., 2023e), Shikra (Chen et al., 2023c), LSKD (Park et al., 2023), DetGPT (Pi et al., 2023; Zhao et al., 2023c), LRV (Liu et al., 2023f), NExT-GPT (Wu et al., 2023b), Valley (Luo et al., 2023d), ILuvUI (Jiang et al., 2023d), StableLLaVA (Li et al., 2023c), PointLLM (Xu et al., 2023e), Verticalization DistillationLaw (Huang et al., 2023b; Cui et al., 2023b); Medical & Healthcare (Zhang et al., 2023c; Chen et al., 2023d); Finance (Zhang and Yang, 2023); Scien...\n",
       "==========================================================================================\n",
       "\n"
      ]
@@ -629,6 +917,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
+      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
       "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
      ]
     },
@@ -637,19 +926,17 @@
      "output_type": "stream",
      "text": [
       "INPUT TEXT:\n",
-      "t al., 2021) has emerged as a\n",
-      "prevalent paradigm to achieve knowledge distillation of\n",
-      "LLMs, where a small seed of knowledge is used to prompt\n",
-      "the LLM to generate more data with respect to a specific\n",
-      "skill or domain (Taori et al., 2023). Secondly, KD still retains\n",
-      "its fundamental role in compressing LLMs, making them\n",
-      "more efficient without significant loss in performance. (Gu\n",
-      "et al., 2024; Agarwal et al., 2024). More recently, the strategy\n",
-      "of employing open-source LLMs as teachers for their own\n",
-      "s...\n",
+      "earlier methods involved training a smaller student network to mimic the output of a larger teacher network, often through techniques like soft target training, where the student learns from the softened softmax output of the teacher. Please refer to the survey (Gou et al., 2021) for more details on general knowledge distillation techniques in AI and DL. In contrast, the advent of LLMs has revolutionized the knowledge distillation landscape. The current era of knowledge distillation in LLMs shif...\n",
+      "\n",
+      "PROCESSED TEXT:\n",
+      "r network, often through techniques like soft target training, where the student learns from the softened softmax output of the teacher. This approach has been refined to the current era of knowledge distillation in LLMs, where the focus shifts from mere architecture compression to knowledge elicitation and transfer....\n",
+      "==========================================================================================\n",
+      "\n",
+      "INPUT TEXT:\n",
+      "replicate the output behavior of the teacher model or reduce the model size , the current focus in LLM-based knowledge distillation is to extract and transfer the rich, nuanced understanding that these models have developed. The key to this modern approach lies in heuristic and carefully designed prompts, which are used to elicit specific knowledge (Ding et al., 2023b) or capabilities (Chaudhary, 2023) from the LLMs. These prompts are crafted to tap into the LLM’s understanding and capabilities ...\n",
       "\n",
       "PROCESSED TEXT:\n",
-      "ed of knowledge is used to prompt the LLM to generate more data with respect to a specific skill or domain. Secondly, KD retains its fundamental role in compressing LLMs, making them more efficient without significant loss in performance. (Gu et al., 2024; Agarwal et al., 2024). More recently, the strategy of employing open-source LLMs as teachers for their own self-improvement has emerged as a promising approach, enhancing their capabilities significantly. Figure 1 provides an illustration of t...\n",
+      "...\n",
       "==========================================================================================\n",
       "\n"
      ]
@@ -666,35 +953,72 @@
      "output_type": "stream",
      "text": [
       "INPUT TEXT:\n",
+      "of LLMs, where the models exhibit capabilities beyond their explicit training objectives. Furthermore, this era of knowledge distillation also em- phasizes the transfer of more abstract qualities such as reasoning patterns (Mitra et al., 2023), preference align- ment (Cui et al., 2023a), and value alignment (Sun et al., 2024b). This is in stark contrast to the earlier focus on output replication (Taori et al., 2023), indicating a shift towards a more holistic and comprehensive transfer of cognit...\n",
       "\n",
-      "via self-generated knowledge.\n",
-      "A key aspect of the knowledge distillation is the en-\n",
-      "hancement of skills such as advanced context following\n",
-      "(e.g., in-context learning (Huang et al., 2022a) and in-\n",
-      "struction following (Taori et al., 2023)), improved align-\n",
-      "ment with user intents (e.g., human values/principles (Cui\n",
-      "et al., 2023a), and thinking patterns like chain-of-thought\n",
-      "(CoT) (Mukherjee et al., 2023)), and NLP task specialization\n",
-      "(e.g., semantic understanding (Ding et al., 2023a), and code\n",
-      "gen...\n",
+      "PROCESSED TEXT:\n",
+      "ize the transfer of more abstract qualities such as reasoning patterns, preference alignment, and value alignment. This shift towards a more holistic and comprehensive transfer of cognitive capabilities is in stark contrast to the earlier focus on output replication, indicating a greater emphasis on the development of more complex and nuanced thought processes. The current techniques involve not just the replication of outputs, but also the emulation of the thought processes and decision-making ...\n",
+      "==========================================================================================\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INPUT TEXT:\n",
+      "LLMs, Data Augmentation (DA) (Wang et al., 2022a; Ye et al., 2022) emerges as a critical paradigm integral to the process of knowledge distillation. Unlike traditional DA techniques such as paraphrasing (Gangal et al., 2022) orback-translation (Longpre et al., 2019), which primarily aim at expanding the training dataset in a somewhat mechanical manner. DA within the context of LLMs focuses on the generation of novel, context-rich training data tailored to specific domains and skills. This innova...\n",
+      "\n",
+      "PROCESSED TEXT:\n",
+      "istillation, driving innovation in the field of Large Language Models (LLMs) through the generation of novel, context-rich training data tailored to specific domains and skills. This is distinct from traditional DA techniques such as paraphrasing and back-translation, which primarily aim at expanding the training dataset in a mechanical manner....\n",
+      "==========================================================================================\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INPUT TEXT:\n",
+      "as a potent mechanism for bridging the knowl- edge and capability gap between proprietary and open- source models. Through DA, LLMs are prompted to create targeted, high-quality datasets that are not merely larger in volume but are also rich in diversity and specificity. This approach enables the distillation process to be more effec- tive, ensuring that the distilled models not only replicate the teacher model’s output behavior but also embody its deep-seated understanding and cognitive strateg...\n",
+      "\n",
+      "PROCESSED TEXT:\n",
+      "high-quality, diverse datasets that not only increase volume but also richness and specificity, enabling the distillation process to be more effective. This approach ensures that the models replicate the teacher model's output behavior and embody its deep-seated understanding and cognitive strategies.\n",
+      "\n",
+      "The significance of DA for achieving KD in the LLM era cannot be overstated. It acts as a force multiplier, enabling distilled models to acquire and refine capabilities that would otherwise requir...\n",
+      "==========================================================================================\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INPUT TEXT:\n",
+      "pivotal shift towards a more efficient, sustainable, and accessible approach to harnessing the power of LLMs. It empowers open-source models with the ability to approximate the contextual adeptness, ethical alignment, and deep semantic insights characteristic of their proprietary counterparts, thereby democratizing access to advanced AI capabilities and fostering innovation across a broader spectrum of applications and users. 2.3 Survey Scope Building on the discussions introduced earlier, this ...\n",
       "\n",
       "PROCESSED TEXT:\n",
-      "standing\n",
-      "and in-context learning\n",
-      "improved alignment with user intents\n",
-      "human values and principles\n",
-      "thinking patterns like chain-of-thought\n",
-      "and NLP task specialization\n",
-      "semantic understanding\n",
-      "code generation\n",
-      "these skills are crucial for a wide range of applications\n",
-      "from casual conversations to complex problem-solving\n",
-      "in specialized domains\n",
-      "in vertical domains like healthcare\n",
-      "law\n",
-      "science\n",
-      "where accuracy and contextual knowledge are paramount\n",
-      "knowledge distillation enables open-source models to improv...\n",
+      "pproach to harnessing the power of LLMs empowers open-source models with the ability to approximate contextual adeptness, ethical alignment, and deep semantic insights characteristic of proprietary counterparts, democratizing access to advanced AI capabilities and fostering innovation across a broader spectrum of applications and users. Survey aims to comprehensively explore the landscape of knowledge distillation within the context of LLMs, following a meticulously structured taxonomy. Scope de...\n",
       "==========================================================================================\n",
       "\n"
      ]
@@ -711,25 +1035,10 @@
      "output_type": "stream",
      "text": [
       "INPUT TEXT:\n",
-      "rom the\n",
-      "proprietary models that have been extensively trained and\n",
-      "fine-tuned in these areas.\n",
-      "The benefits of knowledge distillation in the era of\n",
-      "LLMs are multifaceted and transformative (Gu et al., 2024).\n",
-      "Through a suite of distillation techniques, the gap between\n",
-      "proprietary and open-source models is significantly nar-\n",
-      "rowed (Chiang et al., 2023; Xu et al., 2023a) and even\n",
-      "filled (Zhao et al., 2023a). This process not only streamlines\n",
-      "computational requirements but also enhances the environ-\n",
-      "m...\n",
+      "distillation. KD Algorithms. This segment focuses on the technical foundations and methodologies of knowledge distillation. It includes an in-depth exploration of the processes involved in constructing knowledge from teacher models (e.g., pro- prietary LLMs) and integrating this knowledge into student models (e.g., open-source LLMs). Under the umbrella of ‘knowledge ’, we delve into strategies such as labeling (Hsieh et al., 2023), expansion (Taori et al., 2023), curation (Gu- nasekar et al., 20...\n",
       "\n",
       "PROCESSED TEXT:\n",
-      "of knowledge distillation in the era of LLMs are multifaceted and transformative.\n",
-      "Through a suite of distillation techniques, the gap between proprietary and open-source models is significantly narrowed.\n",
-      "This process streamlines computational requirements and enhances environmental sustainability of AI operations.\n",
-      "Open-source models become more proficient with lesser computational overhead.\n",
-      "Furthermore, knowledge distillation fosters an accessible and equitable AI landscape.\n",
-      "Smaller entities and...\n",
+      "f knowledge distillation. It explores processes involved in constructing knowledge from teacher models and integrating this knowledge into student models. Strategies include labeling, expansion, curation, feature understanding, feedback mechanisms, and self-knowledge generation....\n",
       "==========================================================================================\n",
       "\n"
      ]
@@ -746,22 +1055,10 @@
      "output_type": "stream",
      "text": [
       "INPUT TEXT:\n",
-      " growth across various industries\n",
-      "and research domains.\n",
-      "The escalating need for a comprehensive survey on the\n",
-      "knowledge distillation of LLMs stems from the rapidly\n",
-      "evolving landscape of AI (OpenAI et al., 2023; Team et al.,\n",
-      "2023) and the increasing complexity of these models. As AI\n",
-      "continues to penetrate various sectors, the ability to effi-\n",
-      "ciently and effectively distill knowledge from proprietary\n",
-      "LLMs to open-source ones becomes not just a technical\n",
-      "aspiration but a practical necessity. This ...\n",
+      "et al., 2023a), and rank optimization strategies (Tunstall et al., 2023). This analysis aims to illuminate how these algorithms facilitate the trans- fer of knowledge, ensuring that open-source models can replicate and, in some cases, surpass the capabilities of their proprietary counterparts. Skill Distillation. This facet examines the specific compe- tencies and capabilities enhanced through KD. It encom- passes detailed discussions on context following (Taori et al., 2023; Luo et al., 2023c),...\n",
       "\n",
       "PROCESSED TEXT:\n",
-      "ey on the\n",
-      "knowledge distillation of LLMs stems from the rapidly evolving landscape of AI\n",
-      "and the increasing complexity of these models\n",
-      "as ai continues to penetrate various sectors the ability to efficiently and effectively distill knowledge from proprietary lls to open-source ones becomes not just a technical aspiration but a practical necessity...\n",
+      "luminate how these algorithms facilitate knowledge transfer, ensuring that open-source models can replicate and, in some cases, surpass proprietary counterparts. Skill Distillation. This aspect examines the specific competencies and capabilities enhanced through Knowledge Distillation. It covers detailed discussions on context following (Taori et al., 2023; Luo et al., 2023c) and retrieval-augmented generation (RAG) capabilities. In the realm of alignment (Mitra et al., 2023; Tunstall et al., 20...\n",
       "==========================================================================================\n",
       "\n"
      ]
@@ -778,33 +1075,12 @@
      "output_type": "stream",
      "text": [
       "INPUT TEXT:\n",
-      "eRankOptimizationy,1y,2y3y1y2y3≻≻rank……\n",
-      "DataCuration\n",
-      "X,YrawdatasynthesizefeedbackFeedback\n",
-      "input\n",
-      "outputSelf-Knowledge\n",
-      "outputinputinput\n",
-      "YlabelLabelingExpansion\n",
-      "X,YdemonstrationsexpandFeature\n",
-      "featureinput,outputextractSec.4Sec.5\n",
-      "Sec.3.1Sec.3.2\n",
-      "Fig. 2: An overview of this survey on knowledge distillation of large language models. Note that ‘Section’ is abbreviated\n",
-      "as ‘Sec.’ in this figure. RM S(·)denotes the student reward model.\n",
-      "the growing demand for more accessible, cost-effective, and\n",
-      "adaptable ...\n",
+      "lan- guage generation (NLG), information retrieval, recommen- dation systems, text generation evaluation, and code gen- eration. Finally, the survey addresses multi-modality (Liu et al., 2023e; Zhao et al., 2023b), exploring how KD enhances LLMs’ ability to interpret and integrate multiple forms of input, enriching their utility and applicability across various contexts. Verticalization Distillation. This section assesses the ap- plication of KD across diverse vertical domains, offering insights...\n",
       "\n",
       "PROCESSED TEXT:\n",
-      "utSelf-Knowledge\n",
-      "labelingExpansion\n",
-      "X,YdemonstrationsexpandFeature\n",
-      "featureinput,outputextractSec.4Sec.5\n",
-      "Sec.3.1Sec.3.2\n",
-      "Fig. 2: An overview of this survey on knowledge distillation of large language models. Note that ‘Section’ is abbreviated\n",
-      "as ‘Sec.’ in this figure. RM S(·)denotes the student reward model.\n",
-      "the growing demand for more accessible, cost-effective, and\n",
-      "adaptable AI solutions that can cater to a diverse range\n",
-      "of applications and users. A survey in this field is vital\n",
-      "for synthesizing ...\n",
+      "nd Code Generation.\"\n",
+      "\n",
+      "\"Final Recommendation Systems\"...\n",
       "==========================================================================================\n",
       "\n"
      ]
@@ -821,19 +1097,12 @@
      "output_type": "stream",
      "text": [
       "INPUT TEXT:\n",
-      "posing direc-\n",
-      "tions for future research.\n",
-      "Survey Organization. The remainder of this survey is orga-\n",
-      "nized into several comprehensive sections, each designed to\n",
-      "offer a deep dive into the multifaceted aspects of knowledge\n",
-      "distillation within the realm ofLLMs. Following this intro-\n",
-      "duction, §2 provides a foundational overview of knowledge\n",
-      "distillation, comparing traditional techniques with those\n",
-      "emerging in the era of LLMs and highlighting the role of\n",
-      "data augmentation (DA) in this context. §3 del...\n",
+      "meet the nuanced demands of different industries, thus contributing to the broader AI and ML ecosystem. By navigating through these facets, this survey en- deavors to provide an extensive and nuanced analysis of knowledge distillation in the era of LLMs. It serves as a guide for researchers, practitioners, and enthusiasts in the field, shedding light on current methodologies, challenges, and opportunities for innovation in this rapidly evolving domain. Declaration. This survey represents our ear...\n",
       "\n",
       "PROCESSED TEXT:\n",
-      "s organized into several comprehensive sections, each designed to offer a deep dive into the multifaceted aspects of knowledge distillation within the realm of LLMs. Following this introduction, §2 provides a foundational overview of knowledge distillation, comparing traditional techniques with those emerging in the era of LLMs and highlighting the role of data augmentation (DA) in this context. §3 delves into the approaches to elicit knowledge from teacher LLMs and core distillation algorithms,...\n",
+      "stem. By navigating through these facets, this survey aims to provide an extensive and nuanced analysis of knowledge distillation in the era of LLMs. It serves as a guide for researchers, practitioners, and enthusiasts in the field, shedding light on current methodologies, challenges, and opportunities for innovation in this rapidly evolving domain.\n",
+      "\n",
+      "This survey represents our earnest effort to provide a comprehensive and insightful overview of knowledge distillation techniques applied to LLMs, ...\n",
       "==========================================================================================\n",
       "\n"
      ]
@@ -850,17 +1119,10 @@
      "output_type": "stream",
      "text": [
       "INPUT TEXT:\n",
-      "guage understanding (NLU), genera-\n",
-      "tion (NLG), information retrieval, recommendation systems,\n",
-      "and the evaluation of text generation. In §5, we ventureinto domain-specific vertical distillation, showcasing how\n",
-      "knowledge distillation techniques are applied within spe-\n",
-      "cialized fields such as law, healthcare, finance, and science,\n",
-      "illustrating the practical implications and transformative\n",
-      "impact of these approaches. The survey suggests open\n",
-      "problems in §6, identifying current challenges and gaps in...\n",
+      "foundational paradigms of knowledge dis- tillation, highlighting key methodologies and their impacts across a range of applications. 2.4 Distillation Pipeline in LLM Era SeedKnowledgeSkill/Domain TeacherLLMKnowledgeElicitationStudentModelDistillationAlgorithmsteer driveGeneratedKnowledgeLearningObjectivetrain Fig. 4: An illustration of a general pipeline to distill knowl- edge from a large language model to a student model. The general distillation pipeline of LLMs is a structured and methodical...\n",
       "\n",
       "PROCESSED TEXT:\n",
-      "lves the process of identifying, selecting, and combining the most relevant knowledge from a dataset to create a distilled representation of the original information....\n",
+      "across a range of applications. Distillation Pipeline in LLM Era....\n",
       "==========================================================================================\n",
       "\n"
      ]
@@ -877,26 +1139,10 @@
      "output_type": "stream",
      "text": [
       "INPUT TEXT:\n",
-      "large, complex model (teacher) to a\n",
-      "smaller, more efficient model (student) (Gou et al., 2021).\n",
-      "This technique is pivotal in mitigating the challenges posed\n",
-      "by the computational demands and resource constraints of\n",
-      "deploying large-scale models in practical applications.\n",
-      "Historically, knowledge distillation techniques, prior to\n",
-      "the era of LLMs, primarily concentrated on transferring\n",
-      "knowledge from complex, often cumbersome neural net-\n",
-      "works to more compact and efficient architectures (Sanh\n",
-      "et al.,...\n",
+      "seen in Figure 2. I. Target Skill or Domain Steering Teacher LLM. The first stage involves directing the teacher LLM towards a specific target skill or domain. This is achieved through care- fully crafted instructions or templates that guide the LLM’s focus. These instructions are designed to elicit responses that demonstrate the LLM’s proficiency in a particular area, be it a specialized domain like healthcare or law, or a skill such as reasoning or language understanding. The objective here is...\n",
       "\n",
       "PROCESSED TEXT:\n",
-      "ue is pivotal in mitigating the challenges posed\n",
-      "by the computational demands and resource constraints of\n",
-      "deploying large-scale models in practical applications.\n",
-      "Historically, knowledge distillation techniques, prior\n",
-      "to the era of LLMs, primarily concentrated on transferring\n",
-      "knowledge from complex, often cumbersome neural net-\n",
-      "works to more compact and efficient architectures (Sanh\n",
-      "et al., 2019; Kim and Rush, 2016)....\n",
+      "wards a specific target skill or domain. This is achieved through carefully crafted instructions or templates that guide the LLM's focus. These instructions are designed to elicit responses that demonstrate the LLM's proficiency in a particular area, be it a specialized domain like healthcare or law, or a skill such as reasoning or language understanding. The objective is to utilize the teacher LLM's extensive training and nuanced capabilities to generate outputs that are rich in the specific kn...\n",
       "==========================================================================================\n",
       "\n"
      ]
@@ -913,22 +1159,10 @@
      "output_type": "stream",
      "text": [
       "INPUT TEXT:\n",
-      " (Chenglin et al., 2023)\n",
-      "ExpansionSelf-Instruct (Wang et al., 2022a), Alpaca (Taori et al., 2023), Code Alpaca (Chaudhary, 2023)\n",
-      "Self-Align (Sun et al., 2024b), WizardLM (Xu et al., 2023a), WizardCoder (Luo et al., 2023a),\n",
-      "WizardMath (Luo et al., 2023b), AugGPT (Dai et al., 2023a), TDG (He et al., 2023b)\n",
-      "CurationUltraChat (Ding et al., 2023b), Phi-1 (Gunasekar et al., 2023), Phi-1.5 (Li et al., 2023a),\n",
-      "Phi-2 (Mar, 2023), Magicoder (Wei et al., 2023), WaveCoder (Yu et al., 2024)\n",
-      "ZeroGen (Ye et al...\n",
+      "to generate more elaborate and detailed outputs based on this initial infor- mation. The seed knowledge is crucial as it provides a foundation upon which the teacher model can build and expand, thereby creating more comprehensive and in-depth knowledge examples. III. Generation of Distillation Knowledge. In response to the seed knowledge and steering instructions, the teacher LLM generates knowledge examples. These examples are predominantly in the form of question-and-answer (QA) dialogues or n...\n",
       "\n",
       "PROCESSED TEXT:\n",
-      "Code Alpaca (Chaudhary, 2023)\n",
-      "Self-Align (Sun et al., 2024b), WizardLM (Xu et al., 2023a), WizardCoder (Luo et al., 2023a),\n",
-      "WizardMath (Luo et al., 2023b), AugGPT (Dai et al., 2023a), TDG (He et al., 2023b)\n",
-      "CurationUltraChat (Ding et al., 2023b), Phi-1 (Gunasekar et al., 2023), Phi-1.5 (Li et al., 2023a),\n",
-      "Phi-2 (Mar, 2023), Magicoder (Wei et al., 2023), WaveCoder (Yu et al., 2024)\n",
-      "ZeroGen (Ye et al., 2022), SunGen (Gao et al., 2023a), InPars (Bonifacio et al., 2022)\n",
-      "FeatureBabyLlama (Timiryasov ...\n",
+      "ild and expand, creating more comprehensive and in-depth knowledge examples. These examples are predominantly in the form of question-and-answer (QA) dialogues or narrative explanations, aligning with the natural language processing/understanding capabilities of the 7 LLM. In certain specialized cases, the outputs may include logits or hidden features, although this is less common due to the complexity and specific requirements of such data forms. The generated knowledge examples constitute the ...\n",
       "==========================================================================================\n",
       "\n"
      ]
@@ -945,22 +1179,10 @@
      "output_type": "stream",
      "text": [
       "INPUT TEXT:\n",
-      "2024)\n",
-      "Self-KnowledgeSelf-Instruct (Wang et al., 2022a), Self-Align (Sun et al., 2024b), RLCD (Yang et al., 2024a),\n",
-      "ImpDistill (Jung et al., 2023), LMSI (Huang et al., 2023a), ReST (Gulcehre et al., 2023),\n",
-      "Self-Rewarding (Yuan et al., 2024a), Baize (Xu et al., 2023b), STaR (Zelikman et al., 2022)\n",
-      "DistillationSupervised Fine-TuningAlpaca (Taori et al., 2023), Vicuna (Chiang et al., 2023), WizardLM (Xu et al., 2023a),\n",
-      "Self-Instruct (Wang et al., 2022a), Baize (Xu et al., 2023b), STaR (Zelikman et a...\n",
+      "Specific Learn- ing Objective. The final stage involves the utilization of the generated knowledge examples to train the student model. This training is guided by a loss function that aligns with the learning objectives. The loss function quantifies the student model’s performance in replicating or adapting the knowledge from the teacher model. By minimizing this loss, the student model learns to emulate the target skills or domain knowledge of the teacher, thereby acquiring similar capabilities...\n",
       "\n",
       "PROCESSED TEXT:\n",
-      "et al., 2024a), \n",
-      "ImpDistill (Jung et al., 2023), LMSI (Huang et al., 2023a), \n",
-      "ReST (Gulcehre et al., 2023), Self-Rewarding (Yuan et al., 2024a), Baize (Xu et al., 2023b), \n",
-      "STaR (Zelikman et al., 2022), \n",
-      "DistillationSupervised Fine-TuningAlpaca (Taori et al., 2023), Vicuna (Chiang et al., 2023), \n",
-      "WizardLM (Xu et al., 2023a), \n",
-      "Self-Instruct (Wang et al., 2022a), Baize (Xu et al., 2023b), STaR (Zelikman et al., 2022), \n",
-      "Divergence and SimilarityDistilGPT (Sanh et al., 2019), f-Distill (Wen et al., 2...\n",
+      "of. the. generated. knowledge. examples. to. train. the. student. model. This. training. is. guided. by. a. loss. function. that. aligns. with. the. learning. objectives. The. loss. function. quantifies. the. student. model’s. performance. in. replicating. or. adapting. the. knowledge. from. the. teacher. model. By. minimizing. this. loss. the. student. model. learns. to. emulate. the. target. skills. or. domain. knowledge. of. the. teacher. thereby. acquiring. similar. capabilities. The. proces...\n",
       "==========================================================================================\n",
       "\n"
      ]
@@ -977,21 +1199,10 @@
      "output_type": "stream",
      "text": [
       "INPUT TEXT:\n",
-      "kill\n",
-      "DistillationContext FollowingInstruction FollowingSelf-Instruct (Wang et al., 2022a), Alpaca (Taori et al., 2023), Vicuna (Chiang et al., 2023),\n",
-      "WizardLM (Xu et al., 2023a), Orca (Mukherjee et al., 2023), Orca 2 (Mitra et al., 2023),\n",
-      "WizardMath (Luo et al., 2023b), Llama-GPT4 (Peng et al., 2023a),\n",
-      "Multi-turn DialogueVicuna (Chiang et al., 2023), Baize (Xu et al., 2023b), UltraLLaMA (Ding et al., 2023b),\n",
-      "CAMEL (Li et al., 2023b), OpenChat (Wang et al., 2023c), Zephyr (Tunstall et al., 2023),...\n",
+      "domain to steer the LLM and elicit knowledge, s∼ S denotes an example of the seed knowledge, upon which the LLM can explore to generate novel knowledge, Parse( o, s)stands for to parse the distillation example ( e.g., (x, y)) from the teacher LLM’s output o(plus the input sin some cases), andpTrepresents the teacher LLM with parameters θT. Given the datasets D(kd) Ibuilt for distillation, we then define a learning objective as L=X ILI(D(kd) I;θS), (2) whereP Idenotes there could be multiple task...\n",
       "\n",
       "PROCESSED TEXT:\n",
-      "et al., 2023), Vicuna (Chiang et al., 2023),\n",
-      "WizardLM (Xu et al., 2023a), Orca (Mukherjee et al., 2023), Orca 2 (Mitra et al., 2023),\n",
-      "WizardMath (Luo et al., 2023b), Llama-GPT4 (Peng et al., 2023a),\n",
-      "Multi-turn DialogueVicuna (Chiang et al., 2023), Baize (Xu et al., 2023b), UltraLLaMA (Ding et al., 2023b),\n",
-      "CAMEL (Li et al., 2023b), OpenChat (Wang et al., 2023c), Zephyr (Tunstall et al., 2023),\n",
-      "RAG Capbility KARD (Kang et al., 2023a), SAIL (Luo et al., 2023c), Self-RAG (Asai et al., 2023),\n",
-      "Alignme...\n",
+      "which the LLM can explore to generate novel knowledge, Parse( o, s)stands for to parse the distillation example ( e.g., (x, y)) from the teacher LLM’s output o(plus the input sin some cases), andpTrepresents the teacher LLM with parameters θT. Given the datasets D(kd) Ibuilt for distillation, we then define a learning objective as L(XIL;θS), (2) where Idenotes there could be multiple tasks or skills being distilled into one student model, LI(·;·)stands for a specific learning objective, and θSpa...\n",
       "==========================================================================================\n",
       "\n"
      ]
@@ -1008,20 +1219,10 @@
      "output_type": "stream",
      "text": [
       "INPUT TEXT:\n",
-      "2023), UltraFeedback (Cui et al., 2023a),\n",
-      "ValueCAI (Bai et al., 2022a), Align Honesty (Yang et al., 2023a), SANDBOX (Liu et al., 2023b),\n",
-      "Self-Align (Sun et al., 2024b), UltraFeedback (Cui et al., 2023a), RLCD (Yang et al., 2024a)\n",
-      "AgentTool UsingToolformer (Schick et al., 2023), Graph-ToolFormer (Zhang, 2023), Gorilla (Patil et al., 2023),\n",
-      "ToolAlpaca (Tang et al., 2023a), ToolLLM (Qin et al., 2023a), CRAFT (Yuan et al., 2023a),\n",
-      "Confucius (Gao et al., 2023b), MLLM-Tool (Wang et al., 2024), α-UMi (...\n",
+      "it is categorized into two principal steps: ‘Knowledge,’ focusing on eliciting knowledge from teacher LLMs (Eq.1), and ‘Distillation,’ centered on injecting this knowledge into student models (Eq.2). We will elaborate on these two processes in the subsequent sections. 3.1 Knowledge This section focuses on the approaches to elicit knowledge from teacher LLMs. According to the manners to acquire knowledge, we divided them into Labeling ,Expansion ,DataCuration ,Feature ,Feedback , and Self-Knowled...\n",
       "\n",
       "PROCESSED TEXT:\n",
-      ", 2023a), SANDBOX (Liu et al., 2023b),\n",
-      "Self-Align (Sun et al., 2024b), UltraFeedback (Cui et al., 2023a), RLCD (Yang et al., 2024a)\n",
-      "AgentToolformer (Schick et al., 2023), Graph-ToolFormer (Zhang, 2023), Gorilla (Patil et al., 2023),\n",
-      "ToolAlpaca (Tang et al., 2023a), ToolLLM (Qin et al., 2023a), CRAFT (Yuan et al., 2023a),\n",
-      "Confucius (Gao et al., 2023b), MLLM-Tool (Wang et al., 2024), α-UMi (Shen et al., 2024),\n",
-      "PlanningFireAct (Chen et al., 2023b), AgentTuning (Zeng et al., 2023a), Lumos (Yin et al...\n",
+      "o Labeling, Expansion, DataCuration, Feature, and Feedback....\n",
       "==========================================================================================\n",
       "\n"
      ]
@@ -1038,22 +1239,10 @@
      "output_type": "stream",
      "text": [
       "INPUT TEXT:\n",
-      "OMP (Xu et al., 2024b), MaRio (Ramnath et al., 2023),\n",
-      "ID (Jung et al., 2023), GPT-3 Labeling (Wang et al., 2021b), BioGPT (Guo et al., 2023a),\n",
-      "ChatGPT NMT (Yang and Nicolai, 2023),\n",
-      "Information RetrievalQUILL (Srinivasan et al., 2022), Promptgator (Dai et al., 2023b), InPars (Bonifacio et al., 2022),\n",
-      "AugTriever (Meng et al., 2023), (Sun et al., 2023a), RankVicuna (Pradeep et al., 2023a),\n",
-      "RankZephyr (Pradeep et al., 2023b), ExaRanker (Ferraretto et al., 2023),\n",
-      "Recommendation NDR (Mysore et al., 20...\n",
+      "dataset and feeding it into LLMs to obtain the desired generations. Moreover, the generation of yis controllable through the predefined Iandc. This process can be formulated as follows: D(lab)={x, y|x∼ X, y∼pT(y|I⊕c⊕x)}. (3) Input xcould be sourced from existing NLP task datasets, which serve as typical reservoirs for distillation efforts. Numerous works have sought to harness the capa- bilities of powerful LLMs as teachers for annotating dataset samples across a range of tasks. For instance, ef...\n",
       "\n",
       "PROCESSED TEXT:\n",
-      "l., 2021b), BioGPT (Guo et al., 2023a),\n",
-      "ChatGPT NMT (Yang and Nicolai, 2023),\n",
-      "Information RetrievalQUILL (Srinivasan et al., 2022), Promptgator (Dai et al., 2023b), InPars (Bonifacio et al., 2022),\n",
-      "AugTriever (Meng et al., 2023), (Sun et al., 2023a), RankVicuna (Pradeep et al., 2023a),\n",
-      "RankZephyr (Pradeep et al., 2023b), ExaRanker (Ferraretto et al., 2023),\n",
-      "Recommendation NDR (Mysore et al., 2023), InstrcutRec (Zhang et al., 2023b), ONCE (Liu et al., 2023c),\n",
-      "Text Generation EvaluationPandaLM (Wa...\n",
+      "s controllable through the predefined Iandc. This process can be formulated as follows: D(lab)={x, y|x∼ X, y∼pT(y|I⊕c⊕x)}....\n",
       "==========================================================================================\n",
       "\n"
      ]
@@ -1070,18 +1259,10 @@
      "output_type": "stream",
      "text": [
       "INPUT TEXT:\n",
-      "lti-ModalityLLaVA (Liu et al., 2023e), SVIT (Zhao et al., 2023b), LVIS-Instruct4V (Wang et al., 2023e), Shikra (Chen et al., 2023c),\n",
-      "LSKD (Park et al., 2023), DetGPT (Pi et al., 2023; Zhao et al., 2023c), LRV (Liu et al., 2023f), NExT-GPT (Wu et al., 2023b),\n",
-      "Valley (Luo et al., 2023d), ILuvUI (Jiang et al., 2023d), StableLLaVA (Li et al., 2023c), PointLLM (Xu et al., 2023e),\n",
-      "Verticalization\n",
-      "DistillationLaw (Huang et al., 2023b; Cui et al., 2023b); Medical & Healthcare (Zhang et al., 2023c; Chen ...\n",
+      "al., 2023; Li et al., 2022; Ho et al., 2023; Magister et al., 2023; Fu et al., 2023; Ramnath et al., 2023; Li et al., 2023d; Liu et al., 2023g), among others. Rather than concentrating on specific tasks, many current works focus on labeling outputs based on instructions, thereby teaching student models to solve tasks in a more flexible way by following in- structions. Collections of various NLP tasks, complemented by instructional templates, serve as valuable input sources forx. For instance, FL...\n",
       "\n",
       "PROCESSED TEXT:\n",
-      "23e), Shikra (Chen et al., 2023c),\n",
-      "LSKD (Park et al., 2023), DetGPT (Pi et al., 2023; Zhao et al., 2023c), LRV (Liu et al., 2023f), NExT-GPT (Wu et al., 2023b),\n",
-      "Valley (Luo et al., 2023d), ILuvUI (Jiang et al., 2023d), StableLLaVA (Li et al., 2023c), PointLLM (Xu et al., 2023e),\n",
-      "Verticalization\n",
-      "DistillationLaw (Huang et al., 2023b; Cui et al., 2023b); Medical & Healthcare (Zhang et al., 2023c; Chen et al., 2023d); Finance (Zhang and Yang, 2023)...\n",
+      "., 2023; Li et al., 2023d; Liu et al., 2023g), often requiring multiple iterations of model training and fine-tuning, to achieve satisfactory results. These efforts have led to the development of various NLP models, which can be used for a range of applications, such as language translation, sentiment analysis, and text summarization....\n",
       "==========================================================================================\n",
       "\n"
      ]
@@ -1098,25 +1279,15 @@
      "output_type": "stream",
      "text": [
       "INPUT TEXT:\n",
-      "network to mimic the\n",
-      "output of a larger teacher network, often through techniques\n",
-      "like soft target training, where the student learns from\n",
-      "the softened softmax output of the teacher. Please refer to\n",
-      "the survey (Gou et al., 2021) for more details on general\n",
-      "knowledge distillation techniques in AI and DL.\n",
-      "In contrast, the advent of LLMs has revolutionized\n",
-      "the knowledge distillation landscape. The current era of\n",
-      "knowledge distillation in LLMs shifts the focus from mere\n",
-      "architecture compression to t...\n",
+      "powerful LLMs, like ShareGPT. Additionally, Xu et al. (2023b) and Anand et al. (2023) label the real questions sampled from forums like Quora and Stack Overflow. Moreover, the process of labeling could be guided by instructions Ior demonstrations c. A commonly used in- struction type for guiding labeling is chain-of-thought (CoT) prompt (Hsieh et al., 2023; Fu et al., 2023; Magister et al., 2023). Mukherjee et al. (2023) add multiple system messages (e.g. “You must generate a detailed and long a...\n",
       "\n",
       "PROCESSED TEXT:\n",
-      "training, where the student learns from the softened softmax output of the teacher.\n",
-      "refer to the survey (Gou et al., 2021) for more details on general knowledge distillation techniques in AI and DL.\n",
-      "In contrast, the advent of LLMs has revolutionized\n",
-      "the knowledge distillation landscape.\n",
-      "The current era of knowledge distillation in LLMs shifts the focus from mere\n",
-      "architecture compression to the more nuanced process of knowledge elicitation and transfer\n",
-      "Taori et al., 2023; Chaudhary, 2023; Tunstal...\n",
+      "iled and long answers to questions. \n",
+      "Anand et al. (2023) suggest using multiple system messages to elicit rich signals. \n",
+      "Yue et al. (2023a) and Chenglin et al. (2023) propose a hybrid approach combining CoT and knowledge of system messages. \n",
+      "Fu et al. (2023) and Hsieh et al. (2023) demonstrate the effectiveness of using chain-of-thought (CoT) prompts for labeling. \n",
+      "Xu et al. (2023b) and Magister et al. (2023) show the importance of adding guidance prompts to improve labeling accuracy. \n",
+      "Anand et ...\n",
       "==========================================================================================\n",
       "\n"
      ]
@@ -1133,18 +1304,10 @@
      "output_type": "stream",
      "text": [
       "INPUT TEXT:\n",
-      "r reduce the model size , the current focus in LLM-based\n",
-      "knowledge distillation is to extract and transfer the rich,\n",
-      "nuanced understanding that these models have developed.\n",
-      "The key to this modern approach lies in heuristic and\n",
-      "carefully designed prompts, which are used to elicit specific\n",
-      "knowledge (Ding et al., 2023b) or capabilities (Chaudhary,\n",
-      "2023) from the LLMs. These prompts are crafted to tap\n",
-      "into the LLM’s understanding and capabilities in various\n",
-      "domains, ranging from natural language un...\n",
+      "Generate≻≻𝑦\" 𝑦! 𝑦# 𝑥 𝑥& CorrectExpand𝑐 Fig. 5: An illustration of different knowledge elicitation methods from teacher LLMs. Labeling : The teacher generates the output from the input; Expansion : The teacher generates samples similar to the given demonstrations through in- context learning; Data Curation : The teacher synthesizes data according to meta-information, such as a topic or an entity; Feature : Feed the data into the teacher and extract its internal knowledge, such as logits and featu...\n",
       "\n",
       "PROCESSED TEXT:\n",
-      "tract nuanced understanding from LLMs.\"...\n",
+      "he teacher generates the output from the input; Expansion : The teacher generates samples similar to the given demonstrations through in- context learning; Data Curation : The teacher synthesizes data according to meta-information, such as a topic or an entity; Feature : Feed the data into the teacher and extract its internal knowledge, such as logits and features; Feedback : The teacher provides feedback on the student’s generations, such as preferences, corrections, expansions of challenging s...\n",
       "==========================================================================================\n",
       "\n"
      ]
@@ -1161,19 +1324,10 @@
      "output_type": "stream",
      "text": [
       "INPUT TEXT:\n",
-      "their explicit training objectives.\n",
-      "Furthermore, this era of knowledge distillation also em-\n",
-      "phasizes the transfer of more abstract qualities such as\n",
-      "reasoning patterns (Mitra et al., 2023), preference align-\n",
-      "ment (Cui et al., 2023a), and value alignment (Sun et al.,\n",
-      "2024b). This is in stark contrast to the earlier focus on output\n",
-      "replication (Taori et al., 2023), indicating a shift towards\n",
-      "a more holistic and comprehensive transfer of cognitive\n",
-      "capabilities. The current techniques involve not j...\n",
+      "Overflow. 3.1.2 Expansion While the labeling approach is simple and effective, it faces certain limitations. Primarily, it is constrained by the scale and variety of the input data. In real-world applications, especially those involving user conversations, there are also concerns regarding the privacy of the data involved. To address these limitations, various expansion methods have been proposed (Wang et al., 2022a; Taori et al., 2023; Chaud- hary, 2023; Si et al., 2023; Ji et al., 2023a; Luo e...\n",
       "\n",
       "PROCESSED TEXT:\n",
-      "ch as \n",
-      "reasoning patterns, preference alignment, and value alignment. This shift towards a more holistic and comprehensive transfer of cognitive capabilities. The current techniques involve not just the replication of outputs, but also the emulation of thought processes and decision-making patterns of the teacher model. This involves complex strategies like chain-of-thought prompting, where the student model is trained to learn the reasoning process of the teacher, thereby enhancing its problem-...\n",
+      "es certain limitations. Primarily, it is constrained by the scale and variety of the input data. In real-world applications, especially those involving user conversations, there are also concerns regarding the privacy of the data involved. Various expansion methods have been proposed to address these limitations. These methods take the demonstrations as seed knowledge and aim to expand a large-scale and varied data by in-context learning....\n",
       "==========================================================================================\n",
       "\n"
      ]
@@ -1190,25 +1344,10 @@
      "output_type": "stream",
      "text": [
       "INPUT TEXT:\n",
-      " al., 2022) emerges as a critical paradigm integral\n",
-      "to the process of knowledge distillation. Unlike traditional\n",
-      "DA techniques such as paraphrasing (Gangal et al., 2022) orback-translation (Longpre et al., 2019), which primarily aim\n",
-      "at expanding the training dataset in a somewhat mechanical\n",
-      "manner. DA within the context of LLMs focuses on the\n",
-      "generation of novel, context-rich training data tailored to\n",
-      "specific domains and skills. This innovation is driven by the\n",
-      "unique capabilities of LLMs to ge...\n",
+      "the existing dataset, in the expansion approach, both x andyare generated by teacher LLMs. This process can be formulated as follows: D(exp)={(x, y)|x∼pT(x|I⊕c), y∼pT(y|I⊕x)}.(4) In this formulation, xand yrepresent the new input- output pairs generated by the teacher LLM. The input x is generated based on a set of input-output demonstrations c. The output yis then generated in response to the new input xunder the guidance of an instruction I. Note thatthe demonstrations could be predefined or d...\n",
       "\n",
       "PROCESSED TEXT:\n",
-      "e traditional\n",
-      "DA techniques such as paraphrasing (Gangal et al., 2022) or back-translation (Longpre et al., 2019), which primarily aim\n",
-      "at expanding the training dataset in a somewhat mechanical\n",
-      "manner. DA within the context of LLMs focuses on the\n",
-      "generation of novel, context-rich training data tailored to\n",
-      "specific domains and skills. This innovation is driven by the\n",
-      "unique capabilities of LLMs to generate coherent, diverse,\n",
-      "and intricate data samples that closely mimic the nuanced\n",
-      "understanding ...\n",
+      "is the set of input-output demonstrations, and I is the instruction set....\n",
       "==========================================================================================\n",
       "\n"
      ]
@@ -1225,20 +1364,30 @@
      "output_type": "stream",
      "text": [
       "INPUT TEXT:\n",
-      "capability gap between proprietary and open-\n",
-      "source models. Through DA, LLMs are prompted to create\n",
-      "targeted, high-quality datasets that are not merely larger in\n",
-      "volume but are also rich in diversity and specificity. This\n",
-      "approach enables the distillation process to be more effec-\n",
-      "tive, ensuring that the distilled models not only replicate\n",
-      "the teacher model’s output behavior but also embody its\n",
-      "deep-seated understanding and cognitive strategies.\n",
-      "The significance and necessity of DA for achieving...\n",
+      "subsequent expansion iterations. Subsequently, Taori et al. (2023) applies this ex- pansion method to a more powerful teacher LLM, text- davinci-003, to distill 52K high-quality data. To improve the diversity and coverage during expansion, Wu et al. (2023c) and (Sun et al., 2024b) prompt the teacher LLM to generate instructions corresponding to some specific topics. Xu et al. (2023a) propose an Evol-Instruct method to ex- pand the instructions from two dimensions: difficulty (e.g. rewriting the ...\n",
       "\n",
       "PROCESSED TEXT:\n",
-      "e targeted, high-quality datasets that are not merely larger in volume but also rich in diversity and specificity. This approach enables the distillation process to be more effective, ensuring that the distilled models replicate the teacher model’s output behavior and embody its deep-seated understanding and cognitive strategies.\n",
+      "o a more powerful teacher LLM, text- davinci-003, to distill 52K high-quality data. To improve the diversity and coverage during expansion, Wu et al. (2023c) and (Sun et al., 2024b) prompt the teacher LLM to generate instructions corresponding to some specific topics. Xu et al. (2023a) propose an Evol-Instruct method to expand the instructions from two dimensions: difficulty (e.g. rewriting the question to be more complex) and diversity (e.g. generating more long-tailed instructions). This Evol-...\n",
+      "==========================================================================================\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INPUT TEXT:\n",
+      "multi- ple conceptually similar, but semantically varied, samples to improve classification performance. Similarly, TDG (He et al., 2023b) proposes the Targeted Data Generation (TDG) framework, which automatically identifies challenging sub- groups within data and generates new samples for these subgroups using LLMs through in-context learning. In summary, the expansion method leverages the in- 9 context learning strengths of LLMs to produce more var- ied and extensive datasets with both inputs ...\n",
       "\n",
-      "DA is a key factor in achieving knowledge discovery in the LLM era, as it enables the distilled models to acquire and refine capabilities that would otherwise require ...\n",
+      "PROCESSED TEXT:\n",
+      "and generates new samples for these subgroups using LLMs through in-context learning, leveraging the strengths of LLMs in contextualized learning. The expansion method produces varied and extensive datasets, but the quality and diversity of the generated data heavily rely on teacher LLMs and initial seed demonstrations. This dependence can lead to biased datasets and homogeneity issues, where the generated samples may be similar, limiting the diversity sought after....\n",
       "==========================================================================================\n",
       "\n"
      ]
@@ -1255,30 +1404,10 @@
      "output_type": "stream",
      "text": [
       "INPUT TEXT:\n",
-      "ssible approach to harnessing\n",
-      "the power of LLMs. It empowers open-source models with\n",
-      "the ability to approximate the contextual adeptness, ethical\n",
-      "alignment, and deep semantic insights characteristic of their\n",
-      "proprietary counterparts, thereby democratizing access to\n",
-      "advanced AI capabilities and fostering innovation across a\n",
-      "broader spectrum of applications and users.\n",
-      "2.3 Survey Scope\n",
-      "Building on the discussions introduced earlier, this survey\n",
-      "aims to comprehensively explore the landscape of knowl...\n",
+      "data. 3.1.3 Data Curation The pursuit of high-quality and scalable data generation in knowledge distillation from LLMs has led to the emergence of the Data Curation approach. This method arises in re- sponse to the limitations observed in both the Labeling and Expansion approaches. These methods often yield data of variable quality and face constraints in quantity. In Labeling, the seed knowledge is sourced from task datasets, leading to potential noise and dirty data. Meanwhile, in Expansion, t...\n",
       "\n",
       "PROCESSED TEXT:\n",
-      "fields, including but not limited to\n",
-      "- Developing AI models that can learn and improve over time\n",
-      "- Enhancing human-AI collaboration and interaction\n",
-      "- Improving language understanding and generation capabilities\n",
-      "- Enabling the creation of high-quality, explainable AI models\n",
-      "2.3.1 Survey Scope\n",
-      "This survey aims to investigate the current state of knowledge distillation within the context of LLMs\n",
-      "in three primary areas:\n",
-      "- KD Algorithms\n",
-      "- Skill Distillation\n",
-      "- Verticalization Distillation\n",
-      "Each facet w...\n",
+      "ion from LLMs has led to the emergence of the Data Curation approach. This method addresses the limitations of both Labeling and Expansion approaches by curating high-quality or large-scale data....\n",
       "==========================================================================================\n",
       "\n"
      ]
@@ -1295,27 +1424,10 @@
      "output_type": "stream",
      "text": [
       "INPUT TEXT:\n",
-      "ions and methodologies of knowledge distillation. It\n",
-      "includes an in-depth exploration of the processes involved\n",
-      "in constructing knowledge from teacher models (e.g., pro-\n",
-      "prietary LLMs) and integrating this knowledge into student\n",
-      "models (e.g., open-source LLMs). Under the umbrella of\n",
-      "‘knowledge ’, we delve into strategies such as labeling (Hsieh\n",
-      "et al., 2023), expansion (Taori et al., 2023), curation (Gu-\n",
-      "nasekar et al., 2023), feature understanding (Agarwal et al.,\n",
-      "6\n",
-      "2024), feedback mechanisms (...\n",
+      "approach to synthesize data from scratch. Numerous diverse meta- information, such as topics or knowledge points, could be incorporated into this process to generate controllable x andy. Thus, this process can be meticulously controlled to yield datasets that are not only large in scale but also of high quality. The formulation for Data Curation can be represented as: D(cur)={(x, y)|x∼pT(x|I⊕m), y∼pT(y|I⊕x)}.(5) In this formulation, mrepresents the diverse meta- information used to guide the syn...\n",
       "\n",
       "PROCESSED TEXT:\n",
-      "onstructing knowledge \n",
-      "from teacher models (e.g., proprietary LLMs) and integrating this \n",
-      "knowledge into student models (e.g., open-source LLMs)\n",
-      "strategies such as labeling, expansion, curation, feature \n",
-      "understanding, feedback mechanisms, and self-knowledge generation \n",
-      "under the umbrella of 'knowledge' we delve into\n",
-      "strategies such as supervised fine-tuning, divergence minimization, \n",
-      "reinforcement learning techniques, and rank optimization strategies\n",
-      "knowledge distillation seeks to uncover vari...\n",
+      "ata into the process to generate controllable outputs. This can be achieved by representing the formulation as D(cur)={(x, y) | x∼pT(x|I⊕m), y∼pT(y|I⊕x)}. Here, mrepresents the diverse metadata used to guide the synthesis of x, and Iis the instruction guiding teacher LLMs to generate x or y. Different studies primarily vary in their source and method of leveraging metadata. UltraChat (Ding et al., 2023b) effectively demonstrates the process of curating high-quality and diverse data by distilling...\n",
       "==========================================================================================\n",
       "\n"
      ]
@@ -1332,25 +1444,35 @@
      "output_type": "stream",
      "text": [
       "INPUT TEXT:\n",
-      "This analysis\n",
-      "aims to illuminate how these algorithms facilitate the trans-\n",
-      "fer of knowledge, ensuring that open-source models can\n",
-      "replicate and, in some cases, surpass the capabilities of their\n",
-      "proprietary counterparts.\n",
-      "Skill Distillation. This facet examines the specific compe-\n",
-      "tencies and capabilities enhanced through KD. It encom-\n",
-      "passes detailed discussions on context following (Taori et al.,\n",
-      "2023; Luo et al., 2023c), with subtopics like instruction\n",
-      "following and retrieval-augmented generat...\n",
+      "the World , they explore 30 meta-topics like ”Technology” and ”Food and Drink.” the teacher LLMs then use this meta-information to distill a broad array of instructions and conversations, achieving a substantial scale of 1.5 million instances. UltraChat stands out with its lexical and topical diversity. The UltraLLaMA model, fine- tuned on this data, consistently surpasses other open-source models. Another notable series, phi(Gunasekar et al., 2023; Li et al., 2023a; Mar, 2023), focuses on disti...\n",
       "\n",
       "PROCESSED TEXT:\n",
-      "models to replicate and potentially surpass proprietary counterparts in various capabilities.\n",
+      "and conversations to distill a substantial scale of 1.5 million instances, leveraging Meta-LLMs....\n",
+      "==========================================================================================\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INPUT TEXT:\n",
+      "tokens of Python exercises with solutions. Remarkably, thephi-1 model, despite its smaller size, outperforms nearly all open-source models on coding benchmarks like Hu- manEval and MBPP while being 10 times smaller in model size and 100 times smaller in dataset size. MFTCoder (Liu et al., 2023d) utilizes hundreds of Python knowledge points as meta-information to create a CodeExercise Dataset. In contrast, Magicoder (Wei et al., 2023) and WaveCoder (Yu et al., 2024) get raw code collections from ...\n",
       "\n",
-      "**Distillation**\n",
-      "This facet focuses on the specific strengths and capabilities enhanced through Knowledge Distillation. It includes discussions on context and following (Taori et al., 2023; Luo et al., 2023c), instruction following and retrieval-augmented generation (RAG) capabilities.\n",
+      "PROCESSED TEXT:\n",
+      "=\n",
       "\n",
-      "**Alignment**\n",
-      "In the realm of alignment, a survey investigates thinking patterns, persona/preference modeling, and va...\n",
+      "1. The phi-1 model outperforms open-source models on coding benchmarks like HumanEval and MBPP while being 10 times smaller in model size and 100 times smaller in dataset size.\n",
+      "2. MFTCoder (Liu et al., 2023) utilizes hundreds of Python knowledge points as meta-information to create a CodeExercise Dataset.\n",
+      "3. Magicoder (Wei et al., 2023) and WaveCoder (Yu et al., 2024) generate instructional data using open-source code collections from datasets.\n",
+      "4. In NLU tasks, studies (Ye et al., 2022; Gao e...\n",
       "==========================================================================================\n",
       "\n"
      ]
@@ -1367,20 +1489,10 @@
      "output_type": "stream",
      "text": [
       "INPUT TEXT:\n",
-      ", text generation evaluation, and code gen-\n",
-      "eration. Finally, the survey addresses multi-modality (Liu\n",
-      "et al., 2023e; Zhao et al., 2023b), exploring how KD enhances\n",
-      "LLMs’ ability to interpret and integrate multiple forms of\n",
-      "input, enriching their utility and applicability across various\n",
-      "contexts.\n",
-      "Verticalization Distillation. This section assesses the ap-\n",
-      "plication of KD across diverse vertical domains, offering\n",
-      "insights into how distilled LLMs can be tailored for spe-\n",
-      "cialized fields such as La...\n",
+      "et al., 2022; Meng et al., 2023). In conclusion, Data Curation through teacher LLMs has emerged as a promising technique for synthesizing datasets that are not only high-quality and diverse but also large in scale. The success of models like phi-1 in specialized domains underscores the efficacy of this method. The ability to create synthetic datasets will become a crucial technical skill and a key area of focus in AI (Li et al., 2023a). 3.1.4 Feature The previously discussed knowledge elicitatio...\n",
       "\n",
       "PROCESSED TEXT:\n",
-      "iu et al., 2023; Zhao et al., 2023b), exploring how KD enhances LLM's ability to interpret and integrate multiple forms of input, enriching their utility and applicability across various contexts.\n",
-      "Verticalization Distillation. This section assesses the application of KD across diverse vertical domains, offering insights into how distilled LLMs can be tailored for specialized fields such as Law (LAW, 2023), Medical & Healthcare (Wang et al., 2023a), Finance (Zhang and Yang, 2023), Science (Zhang ...\n",
+      "omising technique for synthesizing high-quality and diverse datasets at large scale. The success of models like phi-1 in specialized domains suggests its efficacy. The ability to create synthetic datasets is a crucial technical skill and a key area of focus in AI....\n",
       "==========================================================================================\n",
       "\n"
      ]
@@ -1397,21 +1509,10 @@
      "output_type": "stream",
      "text": [
       "INPUT TEXT:\n",
-      "roader AI and ML ecosystem.\n",
-      "By navigating through these facets, this survey en-\n",
-      "deavors to provide an extensive and nuanced analysis of\n",
-      "knowledge distillation in the era of LLMs. It serves as a\n",
-      "guide for researchers, practitioners, and enthusiasts in the\n",
-      "field, shedding light on current methodologies, challenges,\n",
-      "and opportunities for innovation in this rapidly evolving\n",
-      "domain.\n",
-      "Declaration. This survey represents our earnest effort to\n",
-      "provide a comprehensive and insightful overview of knowl-\n",
-      "edg...\n",
+      "with fewer than 1 billion parameters (cf. Gou et al. (2021) for detail). However, recent research has begun to explore white-box distillation in the context of generative LLMs (Timiryasov and Tastet, 2023; Liang et al., 2023a; Gu et al., 2024; Agarwal et al., 2024; Liu et al., 2023a; Wen et al., 2023; Wan et al., 2024a; Zhao and Zhu, 2023; Qin et al., 2023b; Boizard et al., 2024; Zhong et al., 2024). The typical method for acquiring this feature knowledge involves teacher LLMs annotating the out...\n",
       "\n",
       "PROCESSED TEXT:\n",
-      "n extensive and nuanced analysis of knowledge distillation in the era of LLMs. It serves as a guide for researchers, practitioners, and enthusiasts in the field, shedding light on current methodologies, challenges, and opportunities for innovation in this rapidly evolving domain.\n",
-      "Declaration. This survey represents our earnest effort to provide a comprehensive and insightful overview of knowledge distillation techniques applied to LLMs, focusing on algorithms, skill enhancement, and domain-speci...\n",
+      "). recent research has begun to explore white-box distillation in the context of generative LLMs (Timiryasov and Tastet, 2023; Liang et al., 2023a; Gu et al., 2024; Agarwal et al., 2024; Liu et al., 2023a; Wen et al., 2023; Wan et al., 2024a; Zhao and Zhu, 2023; Qin et al., 2023b; Boizard et al., 2024; Zhong et al., 2024)....\n",
       "==========================================================================================\n",
       "\n"
      ]
@@ -1428,33 +1529,10 @@
      "output_type": "stream",
      "text": [
       "INPUT TEXT:\n",
-      " their impacts\n",
-      "across a range of applications.\n",
-      "2.4 Distillation Pipeline in LLM Era\n",
-      "SeedKnowledgeSkill/Domain\n",
-      "TeacherLLMKnowledgeElicitationStudentModelDistillationAlgorithmsteer\n",
-      "driveGeneratedKnowledgeLearningObjectivetrain\n",
-      "Fig. 4: An illustration of a general pipeline to distill knowl-\n",
-      "edge from a large language model to a student model.\n",
-      "The general distillation pipeline of LLMs is a structured\n",
-      "and methodical process aimed at transferring knowledge\n",
-      "from a sophisticated teacher model to a less ...\n",
+      "(such as output distri- bution) from the teacher LLM. 10 The most straightforward method to elicit feature knowl- edge of teacher is to label a fixed dataset of sequences with token-level probability distributions (Sanh et al., 2019; Wen et al., 2023). To leverage the rich semantic and syntactic knowledge in intermediate layers of the teacher model, TED (Liang et al., 2023a) designs task-aware layer-wise distillation. They align the student’s hidden representations with those of the teacher at e...\n",
       "\n",
       "PROCESSED TEXT:\n",
-      "skill or domain to be learned.**\n",
-      "\n",
-      "their impacts\n",
-      "across a range of applications.\n",
-      "\n",
-      "2.4 Distillation Pipeline in LLM Era\n",
-      "SeedKnowledgeSkill/Domain\n",
-      "TeacherLLMKnowledgeElicitationStudentModelDistillationAlgorithmsteer\n",
-      "driveGeneratedKnowledgeLearningObjectivetrain\n",
-      "Fig. 4: An illustration of a general pipeline to distill knowl-\n",
-      "edge from a large language model to a student model.\n",
-      "The general distillation pipeline of LLMs is a structured\n",
-      "and methodical process aimed at transferring knowledge\n",
-      "from a soph...\n",
+      "r is to label a fixed dataset of sequences with token-level probability distributions (Sanh et al., 2019; Wen et al., 2023). To leverage the rich semantic and syntactic knowledge in intermediate layers of the teacher model, TED (Liang et al., 2023a) designs task-aware layer-wise distillation. They align the student’s hidden representations with those of the teacher at each layer, selectively extracting knowledge pertinent to the target task. Gu et al. (2024) and Agarwal et al. (2024) introduce a...\n",
       "==========================================================================================\n",
       "\n"
      ]
@@ -1471,30 +1549,10 @@
      "output_type": "stream",
      "text": [
       "INPUT TEXT:\n",
-      "lves directing the teacher LLM towards a\n",
-      "specific target skill or domain. This is achieved through care-\n",
-      "fully crafted instructions or templates that guide the LLM’s\n",
-      "focus. These instructions are designed to elicit responses\n",
-      "that demonstrate the LLM’s proficiency in a particular area,\n",
-      "be it a specialized domain like healthcare or law, or a skill\n",
-      "such as reasoning or language understanding. The objective\n",
-      "here is to utilize the teacher LLM’s extensive training and\n",
-      "nuanced capabilities to generate ...\n",
+      "distilling feature knowledge from teacher LLMs have been proposed (Tao et al., 2022a; Liu et al., 2023a; Kim et al., 2023b). These methods aim to preserve the original output distribution when quantizing the LLMs, ensuring minimal loss of performance. Additionally, feature knowledge could serve as a potent source for multi-teacher knowledge distil- lation. Timiryasov and Tastet (2023) leverages an ensemble of GPT-2 and LLaMA as teacher models to extract output distributions. Similarly, FuseLLM (...\n",
       "\n",
       "PROCESSED TEXT:\n",
-      "the LLM’s focus\n",
-      "elicit responses that demonstrate the LLM’s proficiency\n",
-      "in a particular area\n",
-      "specialized domain like healthcare or law\n",
-      "or a skill\n",
-      "such as reasoning or language understanding\n",
-      "objective\n",
-      "to utilize the teacher LLM’s extensive training and nuanced capabilities\n",
-      "to generate outputs that are rich in the specific knowledge or skills desired for the student model\n",
-      "seed knowledge\n",
-      "typically comprises\n",
-      "a small dataset or specific data clues relevant to the elicit\n",
-      "skill or domain knowledge from...\n",
+      "2023a; Kim et al., 2023b). These methods aim to preserve the original output distribution when quantizing the LLMs, ensuring minimal loss of performance....\n",
       "==========================================================================================\n",
       "\n"
      ]
@@ -1511,22 +1569,10 @@
      "output_type": "stream",
      "text": [
       "INPUT TEXT:\n",
-      " seed knowledge is crucial as it provides a\n",
-      "foundation upon which the teacher model can build and\n",
-      "expand, thereby creating more comprehensive and in-depth\n",
-      "knowledge examples.\n",
-      "III. Generation of Distillation Knowledge. In response\n",
-      "to the seed knowledge and steering instructions, the teacher\n",
-      "LLM generates knowledge examples. These examples are\n",
-      "predominantly in the form of question-and-answer (QA)\n",
-      "dialogues or narrative explanations, aligning with the nat-\n",
-      "ural language processing/understanding cap...\n",
+      "knowledge from teacher LLMs, such as output distributions and intermediate layer features, white- box approaches enable a more nuanced transfer of informa- tion. While showing promise, especially in smaller models, its application is not suitable for black-box LLMs where internal parameters are inaccessible. Furthermore, student models distilled from white-box LLMs may underperform compared to their black-box counterparts, as the black-box teacher LLMs (e.g. GPT-4) tend to be more powerful. 3.1....\n",
       "\n",
       "PROCESSED TEXT:\n",
-      "ately creating a more comprehensive and in-depth knowledge base.\n",
-      "III. Distillation of Knowledge. The teacher LLM generates knowledge examples, primarily in the form of question-and-answer dialogues or narrative explanations, aligning with the language processing capabilities of the model.\n",
-      "In certain specialized cases, the outputs may include logits or hidden features, although this is less common due to the complexity of the data form.\n",
-      "The generated knowledge examples form the core of the distil...\n",
+      "ox approaches enable a more nuanced transfer of information. While showing promise, especially in smaller models, its application is not suitable for black-box LLMs where internal parameters are inaccessible. Furthermore, student models distilled from white-box LLMs may underperform compared to their black-box counterparts, as black-box teacher LLMs tend to be more powerful. 3.1.5 Feedback Most previous works focus on one-way knowledge transfer from the teacher to the student for imitation, with...\n",
       "==========================================================================================\n",
       "\n"
      ]
@@ -1543,18 +1589,10 @@
      "output_type": "stream",
      "text": [
       "INPUT TEXT:\n",
-      "ge examples to train the student\n",
-      "model. This training is guided by a loss function that aligns\n",
-      "with the learning objectives. The loss function quantifies\n",
-      "the student model’s performance in replicating or adapting\n",
-      "the knowledge from the teacher model. By minimizing this\n",
-      "loss, the student model learns to emulate the target skills or\n",
-      "domain knowledge of the teacher, thereby acquiring similar\n",
-      "capabilities. The process involves iteratively adjusting the\n",
-      "student model’s parameters to reduce the discre...\n",
+      "through Reinforcement Learning from AI Feedback (RLAIF) (Bai et al., 2022a). Here is a generalized formulation for eliciting feedback knowledge: D(fb)={(x, y, ϕ fb(x, y;θT))|x∼ X, y∼pS(y|x)}, (7) where ydenotes the output generated by the student model in response to x, and ϕfb(·;θT))represents providing feedback from teacher LLMs. This operation evaluates thestudent’s output ygiven the input x, by offering assess- ment, corrective information, or other forms of guidance. This feedback knowledge...\n",
       "\n",
       "PROCESSED TEXT:\n",
-      "xt (o) and combined with the seed knowledge (s) to produce the output (o')...\n",
+      "...\n",
       "==========================================================================================\n",
       "\n"
      ]
@@ -1571,33 +1609,14 @@
      "output_type": "stream",
      "text": [
       "INPUT TEXT:\n",
-      "ch the LLM can\n",
-      "explore to generate novel knowledge, Parse( o, s)stands for\n",
-      "to parse the distillation example ( e.g., (x, y)) from the\n",
-      "teacher LLM’s output o(plus the input sin some cases),\n",
-      "andpTrepresents the teacher LLM with parameters θT.\n",
-      "Given the datasets D(kd)\n",
-      "Ibuilt for distillation, we then define\n",
-      "a learning objective as\n",
-      "L=X\n",
-      "ILI(D(kd)\n",
-      "I;θS), (2)\n",
-      "whereP\n",
-      "Idenotes there could be multiple tasks or skills\n",
-      "being distilled into one student model, LI(·;·)stands for a\n",
-      "specific learning objective, ...\n",
+      "2023; Lee et al., 2023a). Preference, as previously discussed, represents a notable form of feedback knowledge from teacher models. Various knowledge of preferences could be distilled from teachers by prompting it with specific criteria. Bai et al. (2022a) in- troduce RLAIF for distilling harmlessness preferences from LLMs. This involves using an SFT-trained LLM to generate response pairs for each prompt, then ranking them for harmlessness to create a preference dataset. This dataset is distille...\n",
       "\n",
       "PROCESSED TEXT:\n",
-      "on example ( e.g., (x, y)) from the\n",
-      "teacher LLM’s output o(plus the input sin some cases),\n",
-      "andpTrepresents the teacher LLM with parameters θT.\n",
-      "Given the datasets D(kd)\n",
-      "Ibuilt for distillation, we then define\n",
-      "a learning objective as\n",
-      "L=X\n",
-      "ILI(D(kd)\n",
-      "I;θS), (2)\n",
-      "where...\n",
+      "wledge from teacher models. Various knowledge of preferences could be distilled from teachers by prompting it with specific criteria.\n",
+      "\n",
+      "Bai et al. (2022a) introduce RLAIF for distilling harmlessness preferences from LLMs. This involves using an SFT-trained LLM to generate response pairs for each prompt, then ranking them for harmlessness to create a preference dataset. This dataset is distilled into a Preference Model (PM), which then guides the RL training of a more harmless LLM policy.\n",
+      "\n",
+      "Wizard-...\n",
       "==========================================================================================\n",
       "\n"
      ]
@@ -1614,27 +1633,10 @@
      "output_type": "stream",
      "text": [
       "INPUT TEXT:\n",
-      " LLMs (Eq.1), and ‘Distillation,’\n",
-      "centered on injecting this knowledge into student models\n",
-      "(Eq.2). We will elaborate on these two processes in the\n",
-      "subsequent sections.\n",
-      "3.1 Knowledge\n",
-      "This section focuses on the approaches to elicit knowledge\n",
-      "from teacher LLMs. According to the manners to acquire\n",
-      "knowledge, we divided them into Labeling ,Expansion ,DataCuration ,Feature ,Feedback , and Self-Knowledge . Figure 5\n",
-      "shows an illustration of these knowledge elicitation meth-\n",
-      "ods.\n",
-      "3.1.1 Labeling\n",
-      "Labeling...\n",
+      "various instructions and models to produce comparative data. Then, GPT-4 is used to score candidates from various aspects of preference, including instruction-following, truthfulness, honesty and helpfulness. Beyond merely assessing student generations, teachers can also furnish extensive feedback on instances where students underperform. In Lion (Jiang et al., 2023b), teacher model pinpoints instructions that pose challenges to the student model, generating new, more difficult instructions aime...\n",
       "\n",
       "PROCESSED TEXT:\n",
-      "aborate on these two processes in the\n",
-      "subsequent sections.\n",
-      "3.1 Knowledge\n",
-      "This section focuses on the approaches to elicit knowledge\n",
-      "from teacher LLMs. According to the manners to acquire\n",
-      "knowledge, we divided them into Labeling,Expansion,DataCuration,Feature,Feedback, and Self-Knowledge. Figure 5\n",
-      "shows an illustration of these knowledge elicitation methods....\n",
+      "es from various aspects of preference, including instruction-following, truthfulness, honesty and helpfulness. Beyond merely assessing student generations, teachers can also furnish extensive feedback on instances where students underperform....\n",
       "==========================================================================================\n",
       "\n"
      ]
@@ -1651,61 +1653,71 @@
      "output_type": "stream",
      "text": [
       "INPUT TEXT:\n",
-      "lable through the\n",
-      "predefined Iandc. This process can be formulated as\n",
-      "follows:\n",
-      "D(lab)={x, y|x∼ X, y∼pT(y|I⊕c⊕x)}. (3)\n",
-      "Input xcould be sourced from existing NLP task\n",
-      "datasets, which serve as typical reservoirs for distillation\n",
-      "efforts. Numerous works have sought to harness the capa-\n",
-      "bilities of powerful LLMs as teachers for annotating dataset\n",
-      "samples across a range of tasks. For instance, efforts in\n",
-      "natural language understanding involve using LLMs to cat-\n",
-      "egorize text (Gilardi et al., 2023; Ding...\n",
+      "teacher model’s distribution over the student’s generations can itself act as a form of feedback. MiniLLM (Gu et al., 2024) and GKD (Agarwal et al., 2024) present an innovative strategy wherein the student model initially generates sequences, followed by teacher model producing an output distribution as feedback. This method leverages the teacher’s insight to directly inform and refine the student model’s learning process. 3.1.6 Self-Knowledge The knowledge could also be elicited from the studen...\n",
       "\n",
       "PROCESSED TEXT:\n",
-      ", y∼pT(y|I⊕c⊕x)}. (3)\n",
-      "Input xcould be sourced from existing NLP task\n",
-      "datasets, which serve as typical reservoirs for distillation\n",
-      "efforts. Numerous works have sought to harness the\n",
-      "capabilities of powerful LLMs as teachers for annotating dataset\n",
-      "samples across a range of tasks. For instance, efforts in\n",
-      "natural language understanding involve using LLMs to\n",
-      "categorize text (Gilardi et al., 2023; Ding et al., 2023a; He et al.,\n",
-      "2023a), while in natural language generation, LLMs assist\n",
-      "in generating s...\n",
+      "iniLLM and GKD present an innovative strategy wherein the student model generates sequences, followed by teacher model producing an output distribution as feedback. This method leverages the teacher’s insight to directly inform and refine the student model’s learning process. 3.1.6 Self-Knowledge The knowledge can also be elicited from the student itself, which we refer to as Self-Knowledge. In this setting, the same model acts both as the teacher and the student, iteratively improving itself by...\n",
       "==========================================================================================\n",
       "\n"
      ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
+     ]
     }
    ],
    "source": [
-    "processed_text = \"\"  # Initialize complete processed text\n",
     "with open(output_file, 'w', encoding='utf-8') as out_file:\n",
-    "    for chunk_num in tqdm(range(num_chunks), desc=\"Processing chunks\"):\n",
-    "        # Get chunk with overlap\n",
-    "        start_idx = chunk_num * CHUNK_SIZE\n",
-    "        end_idx = start_idx + CHUNK_SIZE\n",
-    "        \n",
-    "        chunk = text[start_idx:end_idx]\n",
-    "        \n",
+    "    for chunk_num, chunk in enumerate(tqdm(chunks, desc=\"Processing chunks\")):\n",
     "        # Process chunk and append to complete text\n",
     "        processed_chunk = process_chunk(chunk, chunk_num)\n",
     "        processed_text += processed_chunk + \"\\n\"\n",
     "        \n",
     "        # Write chunk immediately to file\n",
     "        out_file.write(processed_chunk + \"\\n\")\n",
-    "        \n",
-    "        # Force flush the file to disk\n",
     "        out_file.flush()"
    ]
   },
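+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5e0d8c21-3f7a-4b2e-9c6d-81a4f0b7e2a9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional sanity check: a minimal sketch that is NOT part of the original pipeline.\n",
+    "# It assumes `chunks`, `text`, and `CHUNK_SIZE` from the cells above and simply\n",
+    "# reports chunk-size statistics and whether re-joining the chunks preserves\n",
+    "# every word of the extracted text.\n",
+    "lengths = [len(c) for c in chunks]\n",
+    "print(f\"{len(chunks)} chunks | min/avg/max length: \"\n",
+    "      f\"{min(lengths)}/{sum(lengths) // len(lengths)}/{max(lengths)} chars \"\n",
+    "      f\"(target {CHUNK_SIZE})\")\n",
+    "print(\"Word count preserved:\", sum(len(c.split()) for c in chunks) == len(text.split()))"
+   ]
+  },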
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 53,
    "id": "89ef51a7-f13f-49a4-8f73-9ac8ce75319d",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Processing complete!\n",
+      "Input file: ./extracted_text.txt\n",
+      "Output file: clean_extracted_text.txt\n",
+      "Total chunks processed: 101\n",
+      "\n",
+      "Preview of final processed text:\n",
+      "\n",
+      "BEGINNING:\n",
+      "Tao Shen4, Reynold Cheng1, Jinyang Li1,\n",
+      "Can Xu5, Dacheng Tao6, Tianyi Zhou2\n",
+      "1The University of Hong Kong2University of Maryland3Microsoft\n",
+      "4University of Technology Sydney5Peking University6The University of Sydney\n",
+      "{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu\n",
+      "ckcheng@cs.hku.hk\n",
+      "ulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of specific cognitive abilities, and their practical implications across diverse fields. Crucially, the survey navigates the intricate interplay between data augmentation (DA) and knowledge distillation, illustrating how DA emerges as a powerful paradigm within the knowledge distillation framework to bolster large language models' performance. By leveraging DA to generate context-rich, skill-specific training data, knowledge distillation transcends traditional boundaries, enabling open-source models to app\n",
+      "\n",
+      "...\n",
+      "\n",
+      "END:\n",
+      "se style but not the reasoning process. To improve, new methods are proposed that not only imitate the response but also novel thinking patterns.\n",
+      "del that has been fine-tuned to continuously revise its own answer until it provides a high-quality response in a single inference. During training, it utilizes both the final response and feedback chain as the fitting target. This pattern, response with the revision process, shows a promising performance gain. Following SelFee, Reflection-Tuning (Li et al., 2023e, 2024d) also utilizes the reflection process as the learning pattern. Noticing the lack of reasoning imitation of the previous methods, Orca (Mukherjee et al., 2023) first proposes Explanation tuning, which aims to learn the reasoning steps, including explanation traces, step-by-step thought processes, and other complex instructions, from the teacher model, rather than just the vanilla styles. Extensive experiments verify the effectiveness of distilling with this thinking pattern.\n",
+      "\n",
+      "\n"
+     ]
+    }
+   ],
    "source": [
     "print(f\"\\nProcessing complete!\")\n",
     "print(f\"Input file: {INPUT_FILE}\")\n",