From b3fec864132cfd0168de44ba630ff85ac039ee94 Mon Sep 17 00:00:00 2001 From: Yi Ding <yi.s.ding@gmail.com> Date: Mon, 7 Aug 2023 11:12:57 -0700 Subject: [PATCH] Llama2 fixes with 4 bit model support --- .changeset/short-crabs-breathe.md | 5 + .changeset/slow-frogs-search.md | 5 + apps/simple/gptllama.ts | 38 ++++++++ examples/gptllama.ts | 37 ++++++++ examples/package-lock.json | 110 ++++++---------------- packages/core/src/llm/LLM.ts | 148 +++++++++++++++++++++--------- 6 files changed, 218 insertions(+), 125 deletions(-) create mode 100644 .changeset/short-crabs-breathe.md create mode 100644 .changeset/slow-frogs-search.md create mode 100644 apps/simple/gptllama.ts create mode 100644 examples/gptllama.ts diff --git a/.changeset/short-crabs-breathe.md b/.changeset/short-crabs-breathe.md new file mode 100644 index 000000000..a37e9610d --- /dev/null +++ b/.changeset/short-crabs-breathe.md @@ -0,0 +1,5 @@ +--- +"llamaindex": patch +--- + +Add support for new Replicate 4 bit Llama2 models diff --git a/.changeset/slow-frogs-search.md b/.changeset/slow-frogs-search.md new file mode 100644 index 000000000..a39ca2e87 --- /dev/null +++ b/.changeset/slow-frogs-search.md @@ -0,0 +1,5 @@ +--- +"llamaindex": patch +--- + +Bug fixes for Llama2 Replicate diff --git a/apps/simple/gptllama.ts b/apps/simple/gptllama.ts new file mode 100644 index 000000000..7e4c91cc5 --- /dev/null +++ b/apps/simple/gptllama.ts @@ -0,0 +1,38 @@ +import { OpenAI, LlamaDeuce, ChatMessage, DeuceChatStrategy } from "llamaindex"; +// @ts-ignore +import * as readline from "node:readline/promises"; +import { stdin as input, stdout as output } from "node:process"; + +(async () => { + const gpt4 = new OpenAI({ model: "gpt-4", temperature: 0.9 }); + const l2 = new LlamaDeuce({ + model: "Llama-2-70b-chat-4bit", + temperature: 0.9, + }); + + const rl = readline.createInterface({ input, output }); + const start = await rl.question("Start: "); + const history: ChatMessage[] = [ + { + content: + "Prefer shorter answers. Keep your response to 100 words or less.", + role: "system", + }, + { content: start, role: "user" }, + ]; + + while (true) { + const next = history.length % 2 === 1 ? gpt4 : l2; + const r = await next.chat( + history.map(({ content, role }) => ({ + content, + role: next === l2 ? role : role === "user" ? "assistant" : "user", + })) + ); + history.push({ + content: r.message.content, + role: next === l2 ? "assistant" : "user", + }); + await rl.question((next === l2 ? "Llama: " : "GPT: ") + r.message.content); + } +})(); diff --git a/examples/gptllama.ts b/examples/gptllama.ts new file mode 100644 index 000000000..0ed1c93cb --- /dev/null +++ b/examples/gptllama.ts @@ -0,0 +1,37 @@ +import { OpenAI, LlamaDeuce, ChatMessage, DeuceChatStrategy } from "llamaindex"; +import * as readline from "node:readline/promises"; +import { stdin as input, stdout as output } from "node:process"; + +(async () => { + const gpt4 = new OpenAI({ model: "gpt-4", temperature: 0.9 }); + const l2 = new LlamaDeuce({ + model: "Llama-2-70b-chat-4bit", + temperature: 0.9, + }); + + const rl = readline.createInterface({ input, output }); + const start = await rl.question("Start: "); + const history: ChatMessage[] = [ + { + content: + "Prefer shorter answers. Keep your response to 100 words or less.", + role: "system", + }, + { content: start, role: "user" }, + ]; + + while (true) { + const next = history.length % 2 === 1 ? gpt4 : l2; + const r = await next.chat( + history.map(({ content, role }) => ({ + content, + role: next === l2 ? 
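+            // history is stored with Llama as the "assistant"; on GPT-4's turn the
+            // roles are flipped so each model sees its own earlier replies as "assistant"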
role : role === "user" ? "assistant" : "user", + })) + ); + history.push({ + content: r.message.content, + role: next === l2 ? "assistant" : "user", + }); + await rl.question((next === l2 ? "Llama: " : "GPT: ") + r.message.content); + } +})(); diff --git a/examples/package-lock.json b/examples/package-lock.json index 6d8339291..5b0baede8 100644 --- a/examples/package-lock.json +++ b/examples/package-lock.json @@ -15,9 +15,9 @@ } }, "node_modules/@anthropic-ai/sdk": { - "version": "0.5.9", - "resolved": "https://registry.npmjs.org/@anthropic-ai/sdk/-/sdk-0.5.9.tgz", - "integrity": "sha512-9/TYca4qSe0xG40LLNf5vemybw5JAKF5OE6Eiyc+O+h3+VGGPeOKo+1SHaWBP5zS7bGX2o3Ne6EonPWyh9oNqA==", + "version": "0.5.10", + "resolved": "https://registry.npmjs.org/@anthropic-ai/sdk/-/sdk-0.5.10.tgz", + "integrity": "sha512-P8xrIuTUO/6wDzcjQRUROXp4WSqtngbXaE4GpEu0PhEmnq/1Q8vbF1s0o7W07EV3j8zzRoyJxAKovUJtNXH7ew==", "dependencies": { "@types/node": "^18.11.18", "@types/node-fetch": "^2.6.4", @@ -30,9 +30,9 @@ } }, "node_modules/@types/node": { - "version": "18.17.1", - "resolved": "https://registry.npmjs.org/@types/node/-/node-18.17.1.tgz", - "integrity": "sha512-xlR1jahfizdplZYRU59JlUx9uzF1ARa8jbhM11ccpCJya8kvos5jwdm2ZAgxSCwOl0fq21svP18EVwPBXMQudw==" + "version": "18.17.3", + "resolved": "https://registry.npmjs.org/@types/node/-/node-18.17.3.tgz", + "integrity": "sha512-2x8HWtFk0S99zqVQABU9wTpr8wPoaDHZUcAkoTKH+nL7kPv3WUI9cRi/Kk5Mz4xdqXSqTkKP7IWNoQQYCnDsTA==" }, "node_modules/@types/node-fetch": { "version": "2.6.4", @@ -55,12 +55,10 @@ } }, "node_modules/agentkeepalive": { - "version": "4.3.0", - "resolved": "https://registry.npmjs.org/agentkeepalive/-/agentkeepalive-4.3.0.tgz", - "integrity": "sha512-7Epl1Blf4Sy37j4v9f9FjICCh4+KAQOyXgHEwlyBiAQLbhKdq/i2QQU3amQalS/wPhdPzDXPL5DMR5bkn+YeWg==", + "version": "4.5.0", + "resolved": "https://registry.npmjs.org/agentkeepalive/-/agentkeepalive-4.5.0.tgz", + "integrity": "sha512-5GG/5IbQQpC9FpkRGsSvZI5QYeSCzlJHdpBQntCsuTOxhKD8lqKhrleg2Yi7yvMIf82Ycmmqln9U8V9qwEiJew==", "dependencies": { - "debug": "^4.1.0", - "depd": "^2.0.0", "humanize-ms": "^1.2.1" }, "engines": { @@ -77,25 +75,6 @@ "resolved": "https://registry.npmjs.org/base-64/-/base-64-0.1.0.tgz", "integrity": "sha512-Y5gU45svrR5tI2Vt/X9GPd3L0HNIKzGu202EjxrXMpuc2V2CiKgemAbUUsqYmZJvPtCXoUKjNZwBJzsNScUbXA==" }, - "node_modules/base64-js": { - "version": "1.5.1", - "resolved": "https://registry.npmjs.org/base64-js/-/base64-js-1.5.1.tgz", - "integrity": "sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==", - "funding": [ - { - "type": "github", - "url": "https://github.com/sponsors/feross" - }, - { - "type": "patreon", - "url": "https://www.patreon.com/feross" - }, - { - "type": "consulting", - "url": "https://feross.org/support" - } - ] - }, "node_modules/charenc": { "version": "0.0.2", "resolved": "https://registry.npmjs.org/charenc/-/charenc-0.0.2.tgz", @@ -124,19 +103,11 @@ } }, "node_modules/debug": { - "version": "4.3.4", - "resolved": "https://registry.npmjs.org/debug/-/debug-4.3.4.tgz", - "integrity": "sha512-PRWFHuSU3eDtQJPvnNY7Jcket1j0t5OuOsFzPPzsekD52Zl8qUfFIPEiswXqIvHWGVHOgX+7G/vCNNhehwxfkQ==", + "version": "3.2.7", + "resolved": "https://registry.npmjs.org/debug/-/debug-3.2.7.tgz", + "integrity": "sha512-CFjzYYAi4ThfiQvizrFQevTTXHtnCqWfe7x1AhgEscTz6ZbLbfoLRLPugTQyBth6f8ZERVUSyWHFD/7Wu4t1XQ==", "dependencies": { - "ms": "2.1.2" - }, - "engines": { - "node": ">=6.0" - }, - "peerDependenciesMeta": { - "supports-color": { - "optional": true - } + "ms": "^2.1.1" } }, 
"node_modules/delayed-stream": { @@ -147,14 +118,6 @@ "node": ">=0.4.0" } }, - "node_modules/depd": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/depd/-/depd-2.0.0.tgz", - "integrity": "sha512-g7nH6P6dyDioJogAAGprGpCtVImJhpPk/roCzdb3fIh61/s/nPsfR6onyMwkCAR/OlC3yBC0lESvUoQEAssIrw==", - "engines": { - "node": ">= 0.8" - } - }, "node_modules/digest-fetch": { "version": "1.3.0", "resolved": "https://registry.npmjs.org/digest-fetch/-/digest-fetch-1.3.0.tgz", @@ -215,25 +178,16 @@ "resolved": "https://registry.npmjs.org/is-buffer/-/is-buffer-1.1.6.tgz", "integrity": "sha512-NcdALwpXkTm5Zvvbk7owOUSvVvBKDgKP5/ewfXEznmQFfs4ZRmanOeKBTjRVjka3QFoN6XJ+9F3USqfHqTaU5w==" }, - "node_modules/js-tiktoken": { - "version": "1.0.7", - "resolved": "https://registry.npmjs.org/js-tiktoken/-/js-tiktoken-1.0.7.tgz", - "integrity": "sha512-biba8u/clw7iesNEWLOLwrNGoBP2lA+hTaBLs/D45pJdUPFXyxD6nhcDVtADChghv4GgyAiMKYMiRx7x6h7Biw==", - "dependencies": { - "base64-js": "^1.5.1" - } - }, "node_modules/llamaindex": { - "version": "0.0.15", - "resolved": "https://registry.npmjs.org/llamaindex/-/llamaindex-0.0.15.tgz", - "integrity": "sha512-TgmszmioI2LqTTCLcg8Gk0cFb3rpoGjZS7d4XPS0gEs4VVL6m6hH6ipaVkVBkaeBVjKrgcbvqS4jvjznU4cDtQ==", + "version": "0.0.16", + "resolved": "https://registry.npmjs.org/llamaindex/-/llamaindex-0.0.16.tgz", + "integrity": "sha512-zQa3qGu0SuRrprd9yCo5m5zFBHS4gTyqXlwUaL/aJoiO1sv9o4SQ3xOEZNciWvliHohu5uIkaiOwB85JfZ5eVQ==", "dependencies": { - "@anthropic-ai/sdk": "^0.5.8", - "js-tiktoken": "^1.0.7", + "@anthropic-ai/sdk": "^0.5.9", "lodash": "^4.17.21", - "openai": "4.0.0-beta.6", + "openai": "4.0.0-beta.8", "pdf-parse": "^1.1.1", - "replicate": "^0.12.3", + "replicate": "^0.14.1", "tiktoken-node": "^0.0.6", "uuid": "^9.0.0", "wink-nlp": "^1.14.3" @@ -277,9 +231,9 @@ } }, "node_modules/ms": { - "version": "2.1.2", - "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.2.tgz", - "integrity": "sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w==" + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", + "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==" }, "node_modules/node-domexception": { "version": "1.0.0", @@ -324,9 +278,9 @@ } }, "node_modules/openai": { - "version": "4.0.0-beta.6", - "resolved": "https://registry.npmjs.org/openai/-/openai-4.0.0-beta.6.tgz", - "integrity": "sha512-sZscRgs6nhBgIt0qcK8XB2PGga6V0Qy9rQn/vKesox/laQDs9tMaOi6rsDhHq15aXQJPROfEB0K9SZoCuyWbSw==", + "version": "4.0.0-beta.8", + "resolved": "https://registry.npmjs.org/openai/-/openai-4.0.0-beta.8.tgz", + "integrity": "sha512-bXBxUK2B288GlLckjnpT+UgyyddP+f4JIT5/df95C6eeqQhf5UkuN41Z3uf/6unPvR8Tthm4jJje1fA0OgsDEw==", "dependencies": { "@types/node": "^18.11.18", "@types/node-fetch": "^2.6.4", @@ -350,18 +304,10 @@ "node": ">=6.8.1" } }, - "node_modules/pdf-parse/node_modules/debug": { - "version": "3.2.7", - "resolved": "https://registry.npmjs.org/debug/-/debug-3.2.7.tgz", - "integrity": "sha512-CFjzYYAi4ThfiQvizrFQevTTXHtnCqWfe7x1AhgEscTz6ZbLbfoLRLPugTQyBth6f8ZERVUSyWHFD/7Wu4t1XQ==", - "dependencies": { - "ms": "^2.1.1" - } - }, "node_modules/replicate": { - "version": "0.12.3", - "resolved": "https://registry.npmjs.org/replicate/-/replicate-0.12.3.tgz", - "integrity": "sha512-HVWKPoVhWVTONlWk+lUXmq9Vy2J8MxBJMtDBQq3dA5uq71ZzKTh0xvJfvzW4+VLBjhBeL7tkdua6hZJmKfzAPQ==", + "version": "0.14.1", + "resolved": "https://registry.npmjs.org/replicate/-/replicate-0.14.1.tgz", + "integrity": 
"sha512-3NpuNRbvXoEjY+n/ra24VfQyIRCdLub9GCrU51fFTrMaa6OjRvMC9jGDJSpGRXOLvID75mFgN577HEhA3XEFtg==", "engines": { "git": ">=2.11.0", "node": ">=16.6.0", diff --git a/packages/core/src/llm/LLM.ts b/packages/core/src/llm/LLM.ts index 92ed3577b..5374002e8 100644 --- a/packages/core/src/llm/LLM.ts +++ b/packages/core/src/llm/LLM.ts @@ -203,20 +203,42 @@ export class OpenAI implements LLM { } export const ALL_AVAILABLE_LLAMADEUCE_MODELS = { - "Llama-2-70b-chat": { + "Llama-2-70b-chat-old": { contextWindow: 4096, replicateApi: "replicate/llama70b-v2-chat:e951f18578850b652510200860fc4ea62b3b16fac280f83ff32282f87bbd2e48", + //^ Previous 70b model. This is also actually 4 bit, although not exllama. + }, + "Llama-2-70b-chat-4bit": { + contextWindow: 4096, + replicateApi: + "replicate/llama70b-v2-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1", + //^ Model is based off of exllama 4bit. }, "Llama-2-13b-chat": { contextWindow: 4096, replicateApi: "a16z-infra/llama13b-v2-chat:df7690f1994d94e96ad9d568eac121aecf50684a0b0963b25a41cc40061269e5", }, + //^ Last known good 13b non-quantized model. In future versions they add the SYS and INST tags themselves + "Llama-2-13b-chat-4bit": { + contextWindow: 4096, + replicateApi: + "a16z-infra/llama13b-v2-chat:2a7f981751ec7fdf87b5b91ad4db53683a98082e9ff7bfd12c8cd5ea85980a52", + }, "Llama-2-7b-chat": { contextWindow: 4096, replicateApi: "a16z-infra/llama7b-v2-chat:4f0a4744c7295c024a1de15e1a63c880d3da035fa1f49bfd344fe076074c8eea", + //^ Last (somewhat) known good 7b non-quantized model. In future versions they add the SYS and INST + // tags themselves + // https://github.com/replicate/cog-llama-template/commit/fa5ce83912cf82fc2b9c01a4e9dc9bff6f2ef137 + // Problem is that they fix the max_new_tokens issue in the same commit. :-( + }, + "Llama-2-7b-chat-4bit": { + contextWindow: 4096, + replicateApi: + "a16z-infra/llama7b-v2-chat:4f0b260b6a13eb53a6b1891f089d57c08f41003ae79458be5011303d81a394dc", }, }; @@ -226,6 +248,8 @@ export enum DeuceChatStrategy { METAWBOS = "metawbos", //^ This is not exactly right because SentencePiece puts the BOS and EOS token IDs in after tokenization // Unfortunately any string only API won't support these properly. + REPLICATE4BIT = "replicate4bit", + //^ To satisfy Replicate's 4 bit models' requirements where they also insert some INST tags } /** @@ -240,35 +264,46 @@ export class LlamaDeuce implements LLM { replicateSession: ReplicateSession; constructor(init?: Partial<LlamaDeuce>) { - this.model = init?.model ?? "Llama-2-70b-chat"; - this.chatStrategy = init?.chatStrategy ?? DeuceChatStrategy.META; + this.model = init?.model ?? "Llama-2-70b-chat-4bit"; + this.chatStrategy = + init?.chatStrategy ?? + (this.model.endsWith("4bit") + ? DeuceChatStrategy.REPLICATE4BIT + : DeuceChatStrategy.METAWBOS); // With BOS and EOS seems to work best this.temperature = init?.temperature ?? 0.01; // minimum temperature is 0.01 for Replicate endpoint this.topP = init?.topP ?? 1; - this.maxTokens = init?.maxTokens ?? undefined; // By default this means it's 500 tokens according to Replicate docs + this.maxTokens = + init?.maxTokens ?? + ALL_AVAILABLE_LLAMADEUCE_MODELS[this.model].contextWindow; // For Replicate, the default is 500 tokens which is too low. this.replicateSession = init?.replicateSession ?? 
new ReplicateSession(); } - mapMessagesToPrompt(messages: ChatMessage[]): string { + mapMessagesToPrompt(messages: ChatMessage[]) { if (this.chatStrategy === DeuceChatStrategy.A16Z) { return this.mapMessagesToPromptA16Z(messages); } else if (this.chatStrategy === DeuceChatStrategy.META) { return this.mapMessagesToPromptMeta(messages); } else if (this.chatStrategy === DeuceChatStrategy.METAWBOS) { - return this.mapMessagesToPromptMeta(messages, true); + return this.mapMessagesToPromptMeta(messages, { withBos: true }); + } else if (this.chatStrategy === DeuceChatStrategy.REPLICATE4BIT) { + return this.mapMessagesToPromptMeta(messages, { replicate4Bit: true }); } else { return this.mapMessagesToPromptMeta(messages); } } - mapMessagesToPromptA16Z(messages: ChatMessage[]): string { - return ( - messages.reduce((acc, message) => { - return ( - (acc && `${acc}\n\n`) + - `${this.mapMessageTypeA16Z(message.role)}${message.content}` - ); - }, "") + "\n\nAssistant:" - ); // Here we're differing from A16Z by omitting the space. Generally spaces at the end of prompts decrease performance due to tokenization + mapMessagesToPromptA16Z(messages: ChatMessage[]) { + return { + prompt: + messages.reduce((acc, message) => { + return ( + (acc && `${acc}\n\n`) + + `${this.mapMessageTypeA16Z(message.role)}${message.content}` + ); + }, "") + "\n\nAssistant:", + //^ Here we're differing from A16Z by omitting the space. Generally spaces at the end of prompts decrease performance due to tokenization + systemPrompt: undefined, + }; } mapMessageTypeA16Z(messageType: MessageType): string { @@ -284,7 +319,11 @@ export class LlamaDeuce implements LLM { } } - mapMessagesToPromptMeta(messages: ChatMessage[], withBos = false): string { + mapMessagesToPromptMeta( + messages: ChatMessage[], + opts?: { withBos?: boolean; replicate4Bit?: boolean } + ) { + const { withBos = false, replicate4Bit = false } = opts ?? {}; const DEFAULT_SYSTEM_PROMPT = `You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.`; @@ -297,37 +336,50 @@ If a question does not make any sense, or is not factually coherent, explain why const EOS = "</s>"; if (messages.length === 0) { - return ""; + return { prompt: "", systemPrompt: undefined }; } + messages = [...messages]; // so we can use shift without mutating the original array + + let systemPrompt = undefined; if (messages[0].role === "system") { const systemMessage = messages.shift()!; - const systemStr = `${B_SYS}${systemMessage.content}${E_SYS}`; + if (replicate4Bit) { + systemPrompt = systemMessage.content; + } else { + const systemStr = `${B_SYS}${systemMessage.content}${E_SYS}`; - if (messages[1].role !== "user") { - throw new Error( - "LlamaDeuce: if there is a system message, the second message must be a user message." - ); - } + // TS Bug: https://github.com/microsoft/TypeScript/issues/9998 + // @ts-ignore + if (messages[0].role !== "user") { + throw new Error( + "LlamaDeuce: if there is a system message, the second message must be a user message." 
+ ); + } - const userContent = messages[0].content; + const userContent = messages[0].content; - messages[0].content = `${systemStr}${userContent}`; + messages[0].content = `${systemStr}${userContent}`; + } } else { - messages[0].content = `${B_SYS}${DEFAULT_SYSTEM_PROMPT}${E_SYS}${messages[0].content}`; + if (!replicate4Bit) { + messages[0].content = `${B_SYS}${DEFAULT_SYSTEM_PROMPT}${E_SYS}${messages[0].content}`; + } } - return messages.reduce((acc, message, index) => { - if (index % 2 === 0) { - return ( - (withBos ? BOS : "") + - `${acc}${B_INST} ${message.content.trim()} ${E_INST}` - ); - } else { - return `${acc} ${message.content.trim()} ` + (withBos ? EOS : ""); // Yes, the EOS comes after the space. This is not a mistake. - } - }, ""); + return { + prompt: messages.reduce((acc, message, index) => { + if (index % 2 === 0) { + return `${acc}${ + withBos ? BOS : "" + }${B_INST} ${message.content.trim()} ${E_INST}`; + } else { + return `${acc} ${message.content.trim()} ` + (withBos ? EOS : ""); // Yes, the EOS comes after the space. This is not a mistake. + } + }, ""), + systemPrompt, + }; } async chat( @@ -337,21 +389,31 @@ If a question does not make any sense, or is not factually coherent, explain why const api = ALL_AVAILABLE_LLAMADEUCE_MODELS[this.model] .replicateApi as `${string}/${string}:${string}`; - const prompt = this.mapMessagesToPrompt(messages); + const { prompt, systemPrompt } = this.mapMessagesToPrompt(messages); - const response = await this.replicateSession.replicate.run(api, { + const replicateOptions: any = { input: { prompt, - system_prompt: "", // We are already sending the system prompt so set system prompt to empty. - max_new_tokens: this.maxTokens, + system_prompt: systemPrompt, temperature: this.temperature, top_p: this.topP, }, - }); + }; + + if (this.model.endsWith("4bit")) { + replicateOptions.input.max_new_tokens = this.maxTokens; + } else { + replicateOptions.input.max_length = this.maxTokens; + } + + const response = await this.replicateSession.replicate.run( + api, + replicateOptions + ); return { message: { - content: (response as Array<string>).join(""), - // We need to do this because Replicate returns a list of strings (for streaming functionality which is not exposed by the run function) + content: (response as Array<string>).join("").trimStart(), + //^ We need to do this because Replicate returns a list of strings (for streaming functionality which is not exposed by the run function) role: "assistant", }, }; -- GitLab
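Usage sketch (illustrative only, not part of the commit): with the patched LLM.ts above, mapMessagesToPrompt now returns an object carrying both the prompt and, for the 4-bit models, a separate systemPrompt that chat() forwards as Replicate's system_prompt input instead of inlining it with <<SYS>> tags:

import { LlamaDeuce } from "llamaindex";

// A model name ending in "4bit" auto-selects DeuceChatStrategy.REPLICATE4BIT,
// and maxTokens defaults to the model's 4096-token context window.
const deuce = new LlamaDeuce({ model: "Llama-2-70b-chat-4bit" });

const { prompt, systemPrompt } = deuce.mapMessagesToPrompt([
  { content: "You are a pirate.", role: "system" },
  { content: "Hello", role: "user" },
]);

console.log(prompt);       // "[INST] Hello [/INST]"
console.log(systemPrompt); // "You are a pirate."

The "4bit" suffix also switches the token cap: chat() passes maxTokens as max_new_tokens for the 4-bit models and as max_length for the older ones.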