From af5df1d08378b9fde319f3011f6e6c9c2337b2ad Mon Sep 17 00:00:00 2001
From: Alex Yang <himself65@outlook.com>
Date: Wed, 10 Apr 2024 09:26:26 -0500
Subject: [PATCH] feat: add `llm-stream` event (#707)

---
 apps/docs/docs/recipes/cost-analysis.mdx      |   2 +-
 examples/package.json                         |   1 +
 examples/recipes/cost-analysis.ts             |  24 +-
 packages/core/e2e/fixtures/llm/open_ai.ts     |  13 +-
 packages/core/e2e/node/snapshot/agent.snap    |  31 +-
 .../core/e2e/node/snapshot/gpt-4-turbo.snap   |  31 +-
 packages/core/e2e/node/snapshot/llm.snap      | 424 +++++++++++++++++-
 .../snapshot/queryEngine_subquestion.snap     |  33 +-
 packages/core/e2e/node/utils.ts               |  39 +-
 .../core/src/callbacks/CallbackManager.ts     |   7 +-
 packages/core/src/llm/LLM.ts                  |   2 +-
 packages/core/src/llm/anthropic.ts            |   5 +-
 packages/core/src/llm/mistral.ts              |   1 +
 packages/core/src/llm/ollama.ts               |   1 +
 packages/core/src/llm/open_ai.ts              |   1 +
 packages/core/src/llm/types.ts                |  11 +-
 packages/core/src/llm/utils.ts                |  14 +-
 pnpm-lock.yaml                                |   3 +
 18 files changed, 561 insertions(+), 82 deletions(-)

diff --git a/apps/docs/docs/recipes/cost-analysis.mdx b/apps/docs/docs/recipes/cost-analysis.mdx
index eb5eeb3e7..ed9b0fe1d 100644
--- a/apps/docs/docs/recipes/cost-analysis.mdx
+++ b/apps/docs/docs/recipes/cost-analysis.mdx
@@ -6,7 +6,7 @@ This page shows how to track LLM cost using APIs.
 
 The callback manager is a class that manages the callback functions.
 
-You can register `llm-start`, and `llm-end` callbacks to the callback manager for tracking the cost.
+You can register `llm-start`, `llm-end`, and `llm-stream` callbacks to the callback manager for tracking the cost.
 
 import CodeBlock from "@theme/CodeBlock";
 import CodeSource from "!raw-loader!../../../../examples/recipes/cost-analysis";
diff --git a/examples/package.json b/examples/package.json
index 6837aa22e..36fd832b7 100644
--- a/examples/package.json
+++ b/examples/package.json
@@ -19,6 +19,7 @@
   "devDependencies": {
     "@types/node": "^18.19.31",
     "ts-node": "^10.9.2",
+    "tsx": "^4.7.2",
     "typescript": "^5.4.4"
   },
   "scripts": {
diff --git a/examples/recipes/cost-analysis.ts b/examples/recipes/cost-analysis.ts
index cf8d102b4..79b162c3b 100644
--- a/examples/recipes/cost-analysis.ts
+++ b/examples/recipes/cost-analysis.ts
@@ -6,7 +6,8 @@ import { extractText } from "llamaindex/llm/utils";
 const encoding = encodingForModel("gpt-4-0125-preview");
 
 const llm = new OpenAI({
-  model: "gpt-4-0125-preview",
+  // currently is "gpt-4-turbo-2024-04-09"
+  model: "gpt-4-turbo",
 });
 
 let tokenCount = 0;
@@ -19,18 +20,25 @@ Settings.callbackManager.on("llm-start", (event) => {
   console.log("Token count:", tokenCount);
   // https://openai.com/pricing
   // $10.00 / 1M tokens
-  console.log(`Price: $${(tokenCount / 1_000_000) * 10}`);
+  console.log(`Total Price: $${(tokenCount / 1_000_000) * 10}`);
 });
-Settings.callbackManager.on("llm-end", (event) => {
-  const { response } = event.detail.payload;
-  tokenCount += encoding.encode(extractText(response.message.content)).length;
-  console.log("Token count:", tokenCount);
+
+Settings.callbackManager.on("llm-stream", (event) => {
+  const { chunk } = event.detail.payload;
+  const { delta } = chunk;
+  tokenCount += encoding.encode(extractText(delta)).length;
+  if (tokenCount > 20) {
+    // This is just an example, you can set your own limit or handle it differently
+    throw new Error("Token limit exceeded!");
+  }
+});
+Settings.callbackManager.on("llm-end", () => {
   // https://openai.com/pricing
   // $30.00 / 1M tokens
-  console.log(`Price: $${(tokenCount / 1_000_000) * 30}`);
+  console.log(`Total Price: $${(tokenCount / 1_000_000) * 30}`);
 });
 
-const question = "Hello, how are you?";
+const question = "Hello, how are you? Please response about 50 tokens.";
 console.log("Question:", question);
 void llm
   .chat({
diff --git a/packages/core/e2e/fixtures/llm/open_ai.ts b/packages/core/e2e/fixtures/llm/open_ai.ts
index d8ee4661e..24dfaee2f 100644
--- a/packages/core/e2e/fixtures/llm/open_ai.ts
+++ b/packages/core/e2e/fixtures/llm/open_ai.ts
@@ -52,19 +52,12 @@ export class OpenAI implements LLM {
       }
 
       if (llmCompleteMockStorage.llmEventEnd.length > 0) {
-        const response =
-          llmCompleteMockStorage.llmEventEnd.shift()!["response"];
+        const { id, response } = llmCompleteMockStorage.llmEventEnd.shift()!;
         if (params.stream) {
-          const content = response.message.content as string;
-          // maybe this is not the correct way to split the content, but it's good enough for now
-          const tokens = content.split("");
           return {
             [Symbol.asyncIterator]: async function* () {
-              const delta = tokens.shift();
-              if (delta) {
-                yield {
-                  delta,
-                } as ChatResponseChunk;
+              while (llmCompleteMockStorage.llmEventStream.at(-1)?.id === id) {
+                yield llmCompleteMockStorage.llmEventStream.shift()!["chunk"];
               }
             },
           };
diff --git a/packages/core/e2e/node/snapshot/agent.snap b/packages/core/e2e/node/snapshot/agent.snap
index c6ab51720..4571ee496 100644
--- a/packages/core/e2e/node/snapshot/agent.snap
+++ b/packages/core/e2e/node/snapshot/agent.snap
@@ -1,7 +1,7 @@
 {
   "llmEventStart": [
     {
-      "id": "88a7035b-b493-4e95-8902-666ede936fb6",
+      "id": "HIDDEN",
       "messages": [
         {
           "content": "What is the weather in San Francisco?",
@@ -10,7 +10,7 @@
       ]
     },
     {
-      "id": "79c7f739-cc24-4cf8-b605-89a460111da1",
+      "id": "HIDDEN",
       "messages": [
         {
           "content": "What is the weather in San Francisco?",
@@ -22,7 +22,7 @@
           "options": {
             "toolCalls": [
               {
-                "id": "call_kIqRf0PiYEa6uIQzI3wYZJkR",
+                "id": "HIDDEN",
                 "type": "function",
                 "function": {
                   "name": "Weather",
@@ -37,7 +37,7 @@
           "role": "tool",
           "options": {
             "name": "Weather",
-            "tool_call_id": "call_kIqRf0PiYEa6uIQzI3wYZJkR"
+            "tool_call_id": "HIDDEN"
           }
         }
       ]
@@ -45,12 +45,12 @@
   ],
   "llmEventEnd": [
     {
-      "id": "88a7035b-b493-4e95-8902-666ede936fb6",
+      "id": "HIDDEN",
       "response": {
         "raw": {
-          "id": "chatcmpl-9CMHEE56nQ0oLqGeOeJuky31TIHZY",
+          "id": "HIDDEN",
           "object": "chat.completion",
-          "created": 1712732596,
+          "created": 114514,
           "model": "gpt-3.5-turbo-0125",
           "choices": [
             {
@@ -60,7 +60,7 @@
                 "content": null,
                 "tool_calls": [
                   {
-                    "id": "call_kIqRf0PiYEa6uIQzI3wYZJkR",
+                    "id": "HIDDEN",
                     "type": "function",
                     "function": {
                       "name": "Weather",
@@ -78,7 +78,7 @@
             "completion_tokens": 15,
             "total_tokens": 64
           },
-          "system_fingerprint": "fp_b28b39ffa8"
+          "system_fingerprint": "HIDDEN"
         },
         "message": {
           "content": "",
@@ -86,7 +86,7 @@
           "options": {
             "toolCalls": [
               {
-                "id": "call_kIqRf0PiYEa6uIQzI3wYZJkR",
+                "id": "HIDDEN",
                 "type": "function",
                 "function": {
                   "name": "Weather",
@@ -99,12 +99,12 @@
       }
     },
     {
-      "id": "79c7f739-cc24-4cf8-b605-89a460111da1",
+      "id": "HIDDEN",
       "response": {
         "raw": {
-          "id": "chatcmpl-9CMHF8HQai1B6w1BpAd5GAHLKfbkR",
+          "id": "HIDDEN",
           "object": "chat.completion",
-          "created": 1712732597,
+          "created": 114514,
           "model": "gpt-3.5-turbo-0125",
           "choices": [
             {
@@ -122,7 +122,7 @@
             "completion_tokens": 14,
             "total_tokens": 92
           },
-          "system_fingerprint": "fp_b28b39ffa8"
+          "system_fingerprint": "HIDDEN"
         },
         "message": {
           "content": "The weather in San Francisco is currently 35 degrees and sunny.",
@@ -131,5 +131,6 @@
         }
       }
     }
-  ]
+  ],
+  "llmEventStream": []
 }
\ No newline at end of file
diff --git a/packages/core/e2e/node/snapshot/gpt-4-turbo.snap b/packages/core/e2e/node/snapshot/gpt-4-turbo.snap
index 8a93f3d8d..960c40f77 100644
--- a/packages/core/e2e/node/snapshot/gpt-4-turbo.snap
+++ b/packages/core/e2e/node/snapshot/gpt-4-turbo.snap
@@ -1,7 +1,7 @@
 {
   "llmEventStart": [
     {
-      "id": "3c5024e0-df1d-4a29-b491-9712324bd520",
+      "id": "HIDDEN",
       "messages": [
         {
           "content": "What is the weather in San Jose?",
@@ -10,7 +10,7 @@
       ]
     },
     {
-      "id": "860b61c3-3c3a-4301-8200-9d6c0668cae5",
+      "id": "HIDDEN",
       "messages": [
         {
           "content": "What is the weather in San Jose?",
@@ -22,7 +22,7 @@
           "options": {
             "toolCalls": [
               {
-                "id": "call_wlpohl1FXSCU9vV2CsjTPSWE",
+                "id": "HIDDEN",
                 "type": "function",
                 "function": {
                   "name": "Weather",
@@ -37,7 +37,7 @@
           "role": "tool",
           "options": {
             "name": "Weather",
-            "tool_call_id": "call_wlpohl1FXSCU9vV2CsjTPSWE"
+            "tool_call_id": "HIDDEN"
           }
         }
       ]
@@ -45,12 +45,12 @@
   ],
   "llmEventEnd": [
     {
-      "id": "3c5024e0-df1d-4a29-b491-9712324bd520",
+      "id": "HIDDEN",
       "response": {
         "raw": {
-          "id": "chatcmpl-9CQt20hfgKNlrbsbu47j40GzHzFUJ",
+          "id": "HIDDEN",
           "object": "chat.completion",
-          "created": 1712750316,
+          "created": 114514,
           "model": "gpt-3.5-turbo-0125",
           "choices": [
             {
@@ -60,7 +60,7 @@
                 "content": null,
                 "tool_calls": [
                   {
-                    "id": "call_wlpohl1FXSCU9vV2CsjTPSWE",
+                    "id": "HIDDEN",
                     "type": "function",
                     "function": {
                       "name": "Weather",
@@ -78,7 +78,7 @@
             "completion_tokens": 15,
             "total_tokens": 64
           },
-          "system_fingerprint": "fp_b28b39ffa8"
+          "system_fingerprint": "HIDDEN"
         },
         "message": {
           "content": "",
@@ -86,7 +86,7 @@
           "options": {
             "toolCalls": [
               {
-                "id": "call_wlpohl1FXSCU9vV2CsjTPSWE",
+                "id": "HIDDEN",
                 "type": "function",
                 "function": {
                   "name": "Weather",
@@ -99,12 +99,12 @@
       }
     },
     {
-      "id": "860b61c3-3c3a-4301-8200-9d6c0668cae5",
+      "id": "HIDDEN",
       "response": {
         "raw": {
-          "id": "chatcmpl-9CQt2PPpt5qL8wl3lipBYJXLZXeQi",
+          "id": "HIDDEN",
           "object": "chat.completion",
-          "created": 1712750316,
+          "created": 114514,
           "model": "gpt-3.5-turbo-0125",
           "choices": [
             {
@@ -122,7 +122,7 @@
             "completion_tokens": 14,
             "total_tokens": 92
           },
-          "system_fingerprint": "fp_b28b39ffa8"
+          "system_fingerprint": "HIDDEN"
         },
         "message": {
           "content": "The weather in San Jose is currently 45 degrees and sunny.",
@@ -131,5 +131,6 @@
         }
       }
     }
-  ]
+  ],
+  "llmEventStream": []
 }
\ No newline at end of file
diff --git a/packages/core/e2e/node/snapshot/llm.snap b/packages/core/e2e/node/snapshot/llm.snap
index 6074f1524..4fb4a33ba 100644
--- a/packages/core/e2e/node/snapshot/llm.snap
+++ b/packages/core/e2e/node/snapshot/llm.snap
@@ -1,7 +1,7 @@
 {
   "llmEventStart": [
     {
-      "id": "68846dc5-d099-4ab1-b987-3eb5376c9859",
+      "id": "HIDDEN",
       "messages": [
         {
           "content": "Hello",
@@ -10,7 +10,7 @@
       ]
     },
     {
-      "id": "de46b84e-7345-430f-b8fa-423354b630c9",
+      "id": "HIDDEN",
       "messages": [
         {
           "content": "hello",
@@ -21,12 +21,12 @@
   ],
   "llmEventEnd": [
     {
-      "id": "68846dc5-d099-4ab1-b987-3eb5376c9859",
+      "id": "HIDDEN",
       "response": {
         "raw": {
-          "id": "chatcmpl-9CMHDMY3CO1uI6P3JfJMNg94dhrbN",
+          "id": "HIDDEN",
           "object": "chat.completion",
-          "created": 1712732595,
+          "created": 114514,
           "model": "gpt-3.5-turbo-0125",
           "choices": [
             {
@@ -44,7 +44,7 @@
             "completion_tokens": 9,
             "total_tokens": 17
           },
-          "system_fingerprint": "fp_b28b39ffa8"
+          "system_fingerprint": "HIDDEN"
         },
         "message": {
           "content": "Hello! How can I assist you today?",
@@ -54,9 +54,199 @@
       }
     },
     {
-      "id": "de46b84e-7345-430f-b8fa-423354b630c9",
+      "id": "HIDDEN",
       "response": {
-        "raw": null,
+        "raw": [
+          {
+            "raw": {
+              "id": "HIDDEN",
+              "object": "chat.completion.chunk",
+              "created": 114514,
+              "model": "gpt-3.5-turbo-0125",
+              "system_fingerprint": "HIDDEN",
+              "choices": [
+                {
+                  "index": 0,
+                  "delta": {
+                    "content": "Hello"
+                  },
+                  "logprobs": null,
+                  "finish_reason": null
+                }
+              ]
+            },
+            "options": {},
+            "delta": "Hello"
+          },
+          {
+            "raw": {
+              "id": "HIDDEN",
+              "object": "chat.completion.chunk",
+              "created": 114514,
+              "model": "gpt-3.5-turbo-0125",
+              "system_fingerprint": "HIDDEN",
+              "choices": [
+                {
+                  "index": 0,
+                  "delta": {
+                    "content": "!"
+                  },
+                  "logprobs": null,
+                  "finish_reason": null
+                }
+              ]
+            },
+            "options": {},
+            "delta": "!"
+          },
+          {
+            "raw": {
+              "id": "HIDDEN",
+              "object": "chat.completion.chunk",
+              "created": 114514,
+              "model": "gpt-3.5-turbo-0125",
+              "system_fingerprint": "HIDDEN",
+              "choices": [
+                {
+                  "index": 0,
+                  "delta": {
+                    "content": " How"
+                  },
+                  "logprobs": null,
+                  "finish_reason": null
+                }
+              ]
+            },
+            "options": {},
+            "delta": " How"
+          },
+          {
+            "raw": {
+              "id": "HIDDEN",
+              "object": "chat.completion.chunk",
+              "created": 114514,
+              "model": "gpt-3.5-turbo-0125",
+              "system_fingerprint": "HIDDEN",
+              "choices": [
+                {
+                  "index": 0,
+                  "delta": {
+                    "content": " can"
+                  },
+                  "logprobs": null,
+                  "finish_reason": null
+                }
+              ]
+            },
+            "options": {},
+            "delta": " can"
+          },
+          {
+            "raw": {
+              "id": "HIDDEN",
+              "object": "chat.completion.chunk",
+              "created": 114514,
+              "model": "gpt-3.5-turbo-0125",
+              "system_fingerprint": "HIDDEN",
+              "choices": [
+                {
+                  "index": 0,
+                  "delta": {
+                    "content": " I"
+                  },
+                  "logprobs": null,
+                  "finish_reason": null
+                }
+              ]
+            },
+            "options": {},
+            "delta": " I"
+          },
+          {
+            "raw": {
+              "id": "HIDDEN",
+              "object": "chat.completion.chunk",
+              "created": 114514,
+              "model": "gpt-3.5-turbo-0125",
+              "system_fingerprint": "HIDDEN",
+              "choices": [
+                {
+                  "index": 0,
+                  "delta": {
+                    "content": " assist"
+                  },
+                  "logprobs": null,
+                  "finish_reason": null
+                }
+              ]
+            },
+            "options": {},
+            "delta": " assist"
+          },
+          {
+            "raw": {
+              "id": "HIDDEN",
+              "object": "chat.completion.chunk",
+              "created": 114514,
+              "model": "gpt-3.5-turbo-0125",
+              "system_fingerprint": "HIDDEN",
+              "choices": [
+                {
+                  "index": 0,
+                  "delta": {
+                    "content": " you"
+                  },
+                  "logprobs": null,
+                  "finish_reason": null
+                }
+              ]
+            },
+            "options": {},
+            "delta": " you"
+          },
+          {
+            "raw": {
+              "id": "HIDDEN",
+              "object": "chat.completion.chunk",
+              "created": 114514,
+              "model": "gpt-3.5-turbo-0125",
+              "system_fingerprint": "HIDDEN",
+              "choices": [
+                {
+                  "index": 0,
+                  "delta": {
+                    "content": " today"
+                  },
+                  "logprobs": null,
+                  "finish_reason": null
+                }
+              ]
+            },
+            "options": {},
+            "delta": " today"
+          },
+          {
+            "raw": {
+              "id": "HIDDEN",
+              "object": "chat.completion.chunk",
+              "created": 114514,
+              "model": "gpt-3.5-turbo-0125",
+              "system_fingerprint": "HIDDEN",
+              "choices": [
+                {
+                  "index": 0,
+                  "delta": {
+                    "content": "?"
+                  },
+                  "logprobs": null,
+                  "finish_reason": null
+                }
+              ]
+            },
+            "options": {},
+            "delta": "?"
+          }
+        ],
         "message": {
           "content": "Hello! How can I assist you today?",
           "role": "assistant",
@@ -64,5 +254,223 @@
         }
       }
     }
+  ],
+  "llmEventStream": [
+    {
+      "id": "HIDDEN",
+      "chunk": {
+        "raw": {
+          "id": "HIDDEN",
+          "object": "chat.completion.chunk",
+          "created": 114514,
+          "model": "gpt-3.5-turbo-0125",
+          "system_fingerprint": "HIDDEN",
+          "choices": [
+            {
+              "index": 0,
+              "delta": {
+                "content": "Hello"
+              },
+              "logprobs": null,
+              "finish_reason": null
+            }
+          ]
+        },
+        "options": {},
+        "delta": "Hello"
+      }
+    },
+    {
+      "id": "HIDDEN",
+      "chunk": {
+        "raw": {
+          "id": "HIDDEN",
+          "object": "chat.completion.chunk",
+          "created": 114514,
+          "model": "gpt-3.5-turbo-0125",
+          "system_fingerprint": "HIDDEN",
+          "choices": [
+            {
+              "index": 0,
+              "delta": {
+                "content": "!"
+              },
+              "logprobs": null,
+              "finish_reason": null
+            }
+          ]
+        },
+        "options": {},
+        "delta": "!"
+      }
+    },
+    {
+      "id": "HIDDEN",
+      "chunk": {
+        "raw": {
+          "id": "HIDDEN",
+          "object": "chat.completion.chunk",
+          "created": 114514,
+          "model": "gpt-3.5-turbo-0125",
+          "system_fingerprint": "HIDDEN",
+          "choices": [
+            {
+              "index": 0,
+              "delta": {
+                "content": " How"
+              },
+              "logprobs": null,
+              "finish_reason": null
+            }
+          ]
+        },
+        "options": {},
+        "delta": " How"
+      }
+    },
+    {
+      "id": "HIDDEN",
+      "chunk": {
+        "raw": {
+          "id": "HIDDEN",
+          "object": "chat.completion.chunk",
+          "created": 114514,
+          "model": "gpt-3.5-turbo-0125",
+          "system_fingerprint": "HIDDEN",
+          "choices": [
+            {
+              "index": 0,
+              "delta": {
+                "content": " can"
+              },
+              "logprobs": null,
+              "finish_reason": null
+            }
+          ]
+        },
+        "options": {},
+        "delta": " can"
+      }
+    },
+    {
+      "id": "HIDDEN",
+      "chunk": {
+        "raw": {
+          "id": "HIDDEN",
+          "object": "chat.completion.chunk",
+          "created": 114514,
+          "model": "gpt-3.5-turbo-0125",
+          "system_fingerprint": "HIDDEN",
+          "choices": [
+            {
+              "index": 0,
+              "delta": {
+                "content": " I"
+              },
+              "logprobs": null,
+              "finish_reason": null
+            }
+          ]
+        },
+        "options": {},
+        "delta": " I"
+      }
+    },
+    {
+      "id": "HIDDEN",
+      "chunk": {
+        "raw": {
+          "id": "HIDDEN",
+          "object": "chat.completion.chunk",
+          "created": 114514,
+          "model": "gpt-3.5-turbo-0125",
+          "system_fingerprint": "HIDDEN",
+          "choices": [
+            {
+              "index": 0,
+              "delta": {
+                "content": " assist"
+              },
+              "logprobs": null,
+              "finish_reason": null
+            }
+          ]
+        },
+        "options": {},
+        "delta": " assist"
+      }
+    },
+    {
+      "id": "HIDDEN",
+      "chunk": {
+        "raw": {
+          "id": "HIDDEN",
+          "object": "chat.completion.chunk",
+          "created": 114514,
+          "model": "gpt-3.5-turbo-0125",
+          "system_fingerprint": "HIDDEN",
+          "choices": [
+            {
+              "index": 0,
+              "delta": {
+                "content": " you"
+              },
+              "logprobs": null,
+              "finish_reason": null
+            }
+          ]
+        },
+        "options": {},
+        "delta": " you"
+      }
+    },
+    {
+      "id": "HIDDEN",
+      "chunk": {
+        "raw": {
+          "id": "HIDDEN",
+          "object": "chat.completion.chunk",
+          "created": 114514,
+          "model": "gpt-3.5-turbo-0125",
+          "system_fingerprint": "HIDDEN",
+          "choices": [
+            {
+              "index": 0,
+              "delta": {
+                "content": " today"
+              },
+              "logprobs": null,
+              "finish_reason": null
+            }
+          ]
+        },
+        "options": {},
+        "delta": " today"
+      }
+    },
+    {
+      "id": "HIDDEN",
+      "chunk": {
+        "raw": {
+          "id": "HIDDEN",
+          "object": "chat.completion.chunk",
+          "created": 114514,
+          "model": "gpt-3.5-turbo-0125",
+          "system_fingerprint": "HIDDEN",
+          "choices": [
+            {
+              "index": 0,
+              "delta": {
+                "content": "?"
+              },
+              "logprobs": null,
+              "finish_reason": null
+            }
+          ]
+        },
+        "options": {},
+        "delta": "?"
+      }
+    }
   ]
 }
\ No newline at end of file
diff --git a/packages/core/e2e/node/snapshot/queryEngine_subquestion.snap b/packages/core/e2e/node/snapshot/queryEngine_subquestion.snap
index 9f4549f40..4b86f5604 100644
--- a/packages/core/e2e/node/snapshot/queryEngine_subquestion.snap
+++ b/packages/core/e2e/node/snapshot/queryEngine_subquestion.snap
@@ -1,7 +1,7 @@
 {
   "llmEventStart": [
     {
-      "id": "6542b9be-50e9-4bd6-9e8b-94088c0d0a43",
+      "id": "HIDDEN",
       "messages": [
         {
           "content": "Given a user question, and a list of tools, output a list of relevant sub-questions that when composed can help answer the full user question:\n\n# Example 1\n<Tools>\n```json\n{\n    \"uber_10k\": \"Provides information about Uber financials for year 2021\",\n    \"lyft_10k\": \"Provides information about Lyft financials for year 2021\"\n}\n```\n\n<User Question>\nCompare and contrast the revenue growth and EBITDA of Uber and Lyft for year 2021\n\n<Output>\n```json\n[\n    {\n        \"subQuestion\": \"What is the revenue growth of Uber\",\n        \"toolName\": \"uber_10k\"\n    },\n    {\n        \"subQuestion\": \"What is the EBITDA of Uber\",\n        \"toolName\": \"uber_10k\"\n    },\n    {\n        \"subQuestion\": \"What is the revenue growth of Lyft\",\n        \"toolName\": \"lyft_10k\"\n    },\n    {\n        \"subQuestion\": \"What is the EBITDA of Lyft\",\n        \"toolName\": \"lyft_10k\"\n    }\n]\n```\n\n# Example 2\n<Tools>\n```json\n{\n    \"bill_gates_idea\": \"Get what Bill Gates idea from.\"\n}\n```\n\n<User Question>\nWhat did Bill Gates steal from?\n\n<Output>\n",
@@ -10,7 +10,7 @@
       ]
     },
     {
-      "id": "8e61829a-f816-47d5-95d5-fc93ff479085",
+      "id": "HIDDEN",
       "messages": [
         {
           "content": "Context information is below.\n---------------------\nBill Gates stole from Apple. Steve Jobs stole from Xerox.\n---------------------\nGiven the context information and not prior knowledge, answer the query.\nQuery: What is Bill Gates' idea\nAnswer:",
@@ -19,7 +19,7 @@
       ]
     },
     {
-      "id": "c4b73220-3108-4951-8280-2c6429247f7c",
+      "id": "HIDDEN",
       "messages": [
         {
           "content": "Context information is below.\n---------------------\nSub question: What is Bill Gates' idea\nResponse: Bill Gates' idea was to steal from Apple.\n---------------------\nGiven the context information and not prior knowledge, answer the query.\nQuery: What did Bill Gates steal from?\nAnswer:",
@@ -30,12 +30,12 @@
   ],
   "llmEventEnd": [
     {
-      "id": "6542b9be-50e9-4bd6-9e8b-94088c0d0a43",
+      "id": "HIDDEN",
       "response": {
         "raw": {
-          "id": "chatcmpl-9CMHF9kjM55jAPKnPfZ7Yj6nOZJNf",
+          "id": "HIDDEN",
           "object": "chat.completion",
-          "created": 1712732597,
+          "created": 114514,
           "model": "gpt-3.5-turbo-0125",
           "choices": [
             {
@@ -53,7 +53,7 @@
             "completion_tokens": 35,
             "total_tokens": 325
           },
-          "system_fingerprint": "fp_b28b39ffa8"
+          "system_fingerprint": "HIDDEN"
         },
         "message": {
           "content": "```json\n[\n    {\n        \"subQuestion\": \"What is Bill Gates' idea\",\n        \"toolName\": \"bill_gates_idea\"\n    }\n]\n```",
@@ -63,12 +63,12 @@
       }
     },
     {
-      "id": "8e61829a-f816-47d5-95d5-fc93ff479085",
+      "id": "HIDDEN",
       "response": {
         "raw": {
-          "id": "chatcmpl-9CMHGHagthzFopnyqU5uPFDNx1fnw",
+          "id": "HIDDEN",
           "object": "chat.completion",
-          "created": 1712732598,
+          "created": 114514,
           "model": "gpt-3.5-turbo-0125",
           "choices": [
             {
@@ -86,7 +86,7 @@
             "completion_tokens": 10,
             "total_tokens": 63
           },
-          "system_fingerprint": "fp_b28b39ffa8"
+          "system_fingerprint": "HIDDEN"
         },
         "message": {
           "content": "Bill Gates' idea was to steal from Apple.",
@@ -96,12 +96,12 @@
       }
     },
     {
-      "id": "c4b73220-3108-4951-8280-2c6429247f7c",
+      "id": "HIDDEN",
       "response": {
         "raw": {
-          "id": "chatcmpl-9CMHHL7EUvMjm1jTKZIThu76XNAiN",
+          "id": "HIDDEN",
           "object": "chat.completion",
-          "created": 1712732599,
+          "created": 114514,
           "model": "gpt-3.5-turbo-0125",
           "choices": [
             {
@@ -119,7 +119,7 @@
             "completion_tokens": 6,
             "total_tokens": 68
           },
-          "system_fingerprint": "fp_b28b39ffa8"
+          "system_fingerprint": "HIDDEN"
         },
         "message": {
           "content": "Bill Gates stole from Apple.",
@@ -128,5 +128,6 @@
         }
       }
     }
-  ]
+  ],
+  "llmEventStream": []
 }
\ No newline at end of file
diff --git a/packages/core/e2e/node/utils.ts b/packages/core/e2e/node/utils.ts
index 260e86e6b..352b8bdd6 100644
--- a/packages/core/e2e/node/utils.ts
+++ b/packages/core/e2e/node/utils.ts
@@ -1,4 +1,9 @@
-import { Settings, type LLMEndEvent, type LLMStartEvent } from "llamaindex";
+import {
+  Settings,
+  type LLMEndEvent,
+  type LLMStartEvent,
+  type LLMStreamEvent,
+} from "llamaindex";
 import { readFile, writeFile } from "node:fs/promises";
 import { join } from "node:path";
 import { type test } from "node:test";
@@ -7,11 +12,13 @@ import { fileURLToPath } from "node:url";
 type MockStorage = {
   llmEventStart: LLMStartEvent["detail"]["payload"][];
   llmEventEnd: LLMEndEvent["detail"]["payload"][];
+  llmEventStream: LLMStreamEvent["detail"]["payload"][];
 };
 
 export const llmCompleteMockStorage: MockStorage = {
   llmEventStart: [],
   llmEventEnd: [],
+  llmEventStream: [],
 };
 
 export const testRootDir = fileURLToPath(new URL(".", import.meta.url));
@@ -23,6 +30,7 @@ export async function mockLLMEvent(
   const newLLMCompleteMockStorage: MockStorage = {
     llmEventStart: [],
     llmEventEnd: [],
+    llmEventStream: [],
   };
 
   function captureLLMStart(event: LLMStartEvent) {
@@ -33,6 +41,10 @@ export async function mockLLMEvent(
     newLLMCompleteMockStorage.llmEventEnd.push(event.detail.payload);
   }
 
+  function captureLLMStream(event: LLMStreamEvent) {
+    newLLMCompleteMockStorage.llmEventStream.push(event.detail.payload);
+  }
+
   await readFile(join(testRootDir, "snapshot", `${snapshotName}.snap`), {
     encoding: "utf-8",
   })
@@ -44,6 +56,9 @@ export async function mockLLMEvent(
       result["llmEventStart"].forEach((event) => {
         llmCompleteMockStorage.llmEventStart.push(event);
       });
+      result["llmEventStream"].forEach((event) => {
+        llmCompleteMockStorage.llmEventStream.push(event);
+      });
     })
     .catch((error) => {
       if (error.code === "ENOENT") {
@@ -53,15 +68,25 @@ export async function mockLLMEvent(
     });
   Settings.callbackManager.on("llm-start", captureLLMStart);
   Settings.callbackManager.on("llm-end", captureLLMEnd);
+  Settings.callbackManager.on("llm-stream", captureLLMStream);
 
   t.after(async () => {
+    Settings.callbackManager.off("llm-stream", captureLLMStream);
     Settings.callbackManager.off("llm-end", captureLLMEnd);
     Settings.callbackManager.off("llm-start", captureLLMStart);
     // eslint-disable-next-line turbo/no-undeclared-env-vars
     if (process.env.UPDATE_SNAPSHOT === "1") {
+      const data = JSON.stringify(newLLMCompleteMockStorage, null, 2)
+        .replace(/"id": ".*"/g, `"id": "HIDDEN"`)
+        .replace(/"created": \d+/g, `"created": 114514`)
+        .replace(
+          /"system_fingerprint": ".*"/g,
+          '"system_fingerprint": "HIDDEN"',
+        )
+        .replace(/"tool_call_id": ".*"/g, '"tool_call_id": "HIDDEN"');
       await writeFile(
         join(testRootDir, "snapshot", `${snapshotName}.snap`),
-        JSON.stringify(newLLMCompleteMockStorage, null, 2),
+        data,
       );
       return;
     }
@@ -79,10 +104,20 @@ export async function mockLLMEvent(
         "New LLMStartEvent does not match, please update snapshot",
       );
     }
+
+    if (
+      newLLMCompleteMockStorage.llmEventStream.length !==
+      llmCompleteMockStorage.llmEventStream.length
+    ) {
+      throw new Error(
+        "New LLMStreamEvent does not match, please update snapshot",
+      );
+    }
   });
   // cleanup
   t.after(() => {
     llmCompleteMockStorage.llmEventEnd = [];
     llmCompleteMockStorage.llmEventStart = [];
+    llmCompleteMockStorage.llmEventStream = [];
   });
 }
diff --git a/packages/core/src/callbacks/CallbackManager.ts b/packages/core/src/callbacks/CallbackManager.ts
index f79f972fb..d596615ac 100644
--- a/packages/core/src/callbacks/CallbackManager.ts
+++ b/packages/core/src/callbacks/CallbackManager.ts
@@ -5,7 +5,11 @@ import {
   EventCaller,
   getEventCaller,
 } from "../internal/context/EventCaller.js";
-import type { LLMEndEvent, LLMStartEvent } from "../llm/types.js";
+import type {
+  LLMEndEvent,
+  LLMStartEvent,
+  LLMStreamEvent,
+} from "../llm/types.js";
 
 export class LlamaIndexCustomEvent<T = any> extends CustomEvent<T> {
   reason: EventCaller | null;
@@ -44,6 +48,7 @@ export interface LlamaIndexEventMaps {
   stream: CustomEvent<StreamCallbackResponse>;
   "llm-start": LLMStartEvent;
   "llm-end": LLMEndEvent;
+  "llm-stream": LLMStreamEvent;
 }
 
 //#region @deprecated remove in the next major version
diff --git a/packages/core/src/llm/LLM.ts b/packages/core/src/llm/LLM.ts
index 7c8ff98d7..dc37d3718 100644
--- a/packages/core/src/llm/LLM.ts
+++ b/packages/core/src/llm/LLM.ts
@@ -366,7 +366,7 @@ export class Portkey extends BaseLLM {
 
       idx_counter++;
 
-      yield { delta: part.choices[0].delta?.content ?? "" };
+      yield { raw: part, delta: part.choices[0].delta?.content ?? "" };
     }
     return;
   }
diff --git a/packages/core/src/llm/anthropic.ts b/packages/core/src/llm/anthropic.ts
index 9c471eebe..364525d7a 100644
--- a/packages/core/src/llm/anthropic.ts
+++ b/packages/core/src/llm/anthropic.ts
@@ -212,7 +212,10 @@ export class Anthropic extends BaseLLM {
       if (typeof content !== "string") continue;
 
       idx_counter++;
-      yield { delta: content };
+      yield {
+        raw: part,
+        delta: content,
+      };
     }
     return;
   }
diff --git a/packages/core/src/llm/mistral.ts b/packages/core/src/llm/mistral.ts
index 28a11ab41..d933327b5 100644
--- a/packages/core/src/llm/mistral.ts
+++ b/packages/core/src/llm/mistral.ts
@@ -137,6 +137,7 @@ export class MistralAI extends BaseLLM {
       idx_counter++;
 
       yield {
+        raw: part,
         delta: part.choices[0].delta.content ?? "",
       };
     }
diff --git a/packages/core/src/llm/ollama.ts b/packages/core/src/llm/ollama.ts
index 85e475cf2..c7b132bc8 100644
--- a/packages/core/src/llm/ollama.ts
+++ b/packages/core/src/llm/ollama.ts
@@ -14,6 +14,7 @@ import type {
 
 const messageAccessor = (data: any): ChatResponseChunk => {
   return {
+    raw: data,
     delta: data.message.content,
   };
 };
diff --git a/packages/core/src/llm/open_ai.ts b/packages/core/src/llm/open_ai.ts
index 6d6db4eb5..6279f924e 100644
--- a/packages/core/src/llm/open_ai.ts
+++ b/packages/core/src/llm/open_ai.ts
@@ -404,6 +404,7 @@ export class OpenAI extends BaseLLM<
       });
 
       yield {
+        raw: part,
         // add tool calls to final chunk
         options: toolCalls.length > 0 ? { toolCalls: toolCalls } : {},
         delta: choice.delta.content ?? "",
diff --git a/packages/core/src/llm/types.ts b/packages/core/src/llm/types.ts
index 1b27b1309..bbe8aa8e0 100644
--- a/packages/core/src/llm/types.ts
+++ b/packages/core/src/llm/types.ts
@@ -22,6 +22,13 @@ export type LLMEndEvent = LLMBaseEvent<
     response: ChatResponse;
   }
 >;
+export type LLMStreamEvent = LLMBaseEvent<
+  "llm-stream",
+  {
+    id: UUID;
+    chunk: ChatResponseChunk;
+  }
+>;
 
 /**
  * @internal
@@ -127,7 +134,7 @@ export interface ChatResponse<
   /**
    * Raw response from the LLM
    *
-   * It's possible that this is `null` if the LLM response an iterable of chunks
+   * If the LLM responds with an iterable of chunks, this will be an array of those chunks
    */
   raw: object | null;
 }
@@ -140,10 +147,12 @@ export type ChatResponseChunk<
 > =
   AdditionalMessageOptions extends Record<string, unknown>
     ? {
+        raw: object | null;
         delta: string;
         options?: AdditionalMessageOptions;
       }
     : {
+        raw: object | null;
         delta: string;
         options: AdditionalMessageOptions;
       };
diff --git a/packages/core/src/llm/utils.ts b/packages/core/src/llm/utils.ts
index b37cc05da..44f256cbc 100644
--- a/packages/core/src/llm/utils.ts
+++ b/packages/core/src/llm/utils.ts
@@ -2,6 +2,7 @@ import { AsyncLocalStorage, randomUUID } from "@llamaindex/env";
 import { getCallbackManager } from "../internal/settings/CallbackManager.js";
 import type {
   ChatResponse,
+  ChatResponseChunk,
   LLM,
   LLMChat,
   MessageContent,
@@ -83,14 +84,14 @@ export function wrapLLMEvent(
         [Symbol.asyncIterator]: response[Symbol.asyncIterator].bind(response),
       };
       response[Symbol.asyncIterator] = async function* () {
-        const finalResponse: ChatResponse = {
-          raw: null,
+        const finalResponse = {
+          raw: [] as ChatResponseChunk[],
           message: {
             content: "",
             role: "assistant",
             options: {},
           },
-        };
+        } satisfies ChatResponse;
         let firstOne = false;
         for await (const chunk of originalAsyncIterator) {
           if (!firstOne) {
@@ -105,6 +106,13 @@ export function wrapLLMEvent(
               ...chunk.options,
             };
           }
+          getCallbackManager().dispatchEvent("llm-stream", {
+            payload: {
+              id,
+              chunk,
+            },
+          });
+          finalResponse.raw.push(chunk);
           yield chunk;
         }
         snapshot(() => {
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index d64e6b9d4..1a020a65c 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -149,6 +149,9 @@ importers:
       ts-node:
         specifier: ^10.9.2
         version: 10.9.2(@types/node@18.19.31)(typescript@5.4.4)
+      tsx:
+        specifier: ^4.7.2
+        version: 4.7.2
       typescript:
         specifier: ^5.4.4
         version: 5.4.4
-- 
GitLab