diff --git a/.changeset/gorgeous-planets-chew.md b/.changeset/gorgeous-planets-chew.md
new file mode 100644
index 0000000000000000000000000000000000000000..9b6dabd448b7e71bbf89b2a467850d9a3b109adb
--- /dev/null
+++ b/.changeset/gorgeous-planets-chew.md
@@ -0,0 +1,5 @@
+---
+"llamaindex": patch
+---
+
+Breaking: changed the default temperature to 0.1, matching the new Python default (change by @logan-markewich)
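+
+If you relied on the previous default (0 for OpenAI and Anthropic), you can still set it explicitly, for example:
+
+```ts
+import { OpenAI } from "llamaindex";
+
+const llm = new OpenAI({ model: "gpt-3.5-turbo", temperature: 0 });
+```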
diff --git a/.changeset/short-crabs-breathe.md b/.changeset/short-crabs-breathe.md
new file mode 100644
index 0000000000000000000000000000000000000000..a37e9610d62d442648a45b64a64f7d13db90124b
--- /dev/null
+++ b/.changeset/short-crabs-breathe.md
@@ -0,0 +1,5 @@
+---
+"llamaindex": patch
+---
+
+Add support for the new Replicate 4-bit Llama 2 models
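+
+For example, to use one of the new 4-bit models (any of the `*-4bit` keys in `ALL_AVAILABLE_LLAMADEUCE_MODELS`):
+
+```ts
+import { LlamaDeuce } from "llamaindex";
+
+// 4-bit models automatically select the new REPLICATE4BIT chat strategy.
+const llm = new LlamaDeuce({ model: "Llama-2-70b-chat-4bit" });
+```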
diff --git a/.changeset/slow-frogs-search.md b/.changeset/slow-frogs-search.md
new file mode 100644
index 0000000000000000000000000000000000000000..a39ca2e878d818e33d76f2038e359d839036ddc2
--- /dev/null
+++ b/.changeset/slow-frogs-search.md
@@ -0,0 +1,5 @@
+---
+"llamaindex": patch
+---
+
+Bug fixes for the Llama 2 Replicate integration
diff --git a/apps/simple/gptllama.ts b/apps/simple/gptllama.ts
new file mode 100644
index 0000000000000000000000000000000000000000..7e4c91cc57fb3efce018d0d688e5fc367f522248
--- /dev/null
+++ b/apps/simple/gptllama.ts
@@ -0,0 +1,38 @@
+import { OpenAI, LlamaDeuce, ChatMessage } from "llamaindex";
+// @ts-ignore
+import * as readline from "node:readline/promises";
+import { stdin as input, stdout as output } from "node:process";
+
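+// In this example GPT-4 and Llama 2 chat with each other. The shared history is kept from Llama's
+// perspective (Llama speaks as "assistant", GPT-4 as "user"), so roles are flipped before each GPT-4 call.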
+(async () => {
+  const gpt4 = new OpenAI({ model: "gpt-4", temperature: 0.9 });
+  const l2 = new LlamaDeuce({
+    model: "Llama-2-70b-chat-4bit",
+    temperature: 0.9,
+  });
+
+  const rl = readline.createInterface({ input, output });
+  const start = await rl.question("Start: ");
+  const history: ChatMessage[] = [
+    {
+      content:
+        "Prefer shorter answers. Keep your response to 100 words or less.",
+      role: "system",
+    },
+    { content: start, role: "user" },
+  ];
+
+  while (true) {
+    const next = history.length % 2 === 1 ? gpt4 : l2;
+    const r = await next.chat(
+      history.map(({ content, role }) => ({
+        content,
+        role: next === l2 ? role : role === "user" ? "assistant" : "user",
+      }))
+    );
+    history.push({
+      content: r.message.content,
+      role: next === l2 ? "assistant" : "user",
+    });
+    await rl.question((next === l2 ? "Llama: " : "GPT: ") + r.message.content);
+  }
+})();
diff --git a/examples/gptllama.ts b/examples/gptllama.ts
new file mode 100644
index 0000000000000000000000000000000000000000..0ed1c93cb3f11b12e6162482203af5745398a88a
--- /dev/null
+++ b/examples/gptllama.ts
@@ -0,0 +1,37 @@
+import { OpenAI, LlamaDeuce, ChatMessage } from "llamaindex";
+import * as readline from "node:readline/promises";
+import { stdin as input, stdout as output } from "node:process";
+
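+// In this example GPT-4 and Llama 2 chat with each other. The shared history is kept from Llama's
+// perspective (Llama speaks as "assistant", GPT-4 as "user"), so roles are flipped before each GPT-4 call.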
+(async () => {
+  const gpt4 = new OpenAI({ model: "gpt-4", temperature: 0.9 });
+  const l2 = new LlamaDeuce({
+    model: "Llama-2-70b-chat-4bit",
+    temperature: 0.9,
+  });
+
+  const rl = readline.createInterface({ input, output });
+  const start = await rl.question("Start: ");
+  const history: ChatMessage[] = [
+    {
+      content:
+        "Prefer shorter answers. Keep your response to 100 words or less.",
+      role: "system",
+    },
+    { content: start, role: "user" },
+  ];
+
+  while (true) {
+    const next = history.length % 2 === 1 ? gpt4 : l2;
+    const r = await next.chat(
+      history.map(({ content, role }) => ({
+        content,
+        role: next === l2 ? role : role === "user" ? "assistant" : "user",
+      }))
+    );
+    history.push({
+      content: r.message.content,
+      role: next === l2 ? "assistant" : "user",
+    });
+    await rl.question((next === l2 ? "Llama: " : "GPT: ") + r.message.content);
+  }
+})();
diff --git a/examples/package-lock.json b/examples/package-lock.json
index 6d8339291f00b24e5e19b8648e5e7ba33212206f..5b0baede83cb16b4558feec153ea21e413f88998 100644
--- a/examples/package-lock.json
+++ b/examples/package-lock.json
@@ -15,9 +15,9 @@
       }
     },
     "node_modules/@anthropic-ai/sdk": {
-      "version": "0.5.9",
-      "resolved": "https://registry.npmjs.org/@anthropic-ai/sdk/-/sdk-0.5.9.tgz",
-      "integrity": "sha512-9/TYca4qSe0xG40LLNf5vemybw5JAKF5OE6Eiyc+O+h3+VGGPeOKo+1SHaWBP5zS7bGX2o3Ne6EonPWyh9oNqA==",
+      "version": "0.5.10",
+      "resolved": "https://registry.npmjs.org/@anthropic-ai/sdk/-/sdk-0.5.10.tgz",
+      "integrity": "sha512-P8xrIuTUO/6wDzcjQRUROXp4WSqtngbXaE4GpEu0PhEmnq/1Q8vbF1s0o7W07EV3j8zzRoyJxAKovUJtNXH7ew==",
       "dependencies": {
         "@types/node": "^18.11.18",
         "@types/node-fetch": "^2.6.4",
@@ -30,9 +30,9 @@
       }
     },
     "node_modules/@types/node": {
-      "version": "18.17.1",
-      "resolved": "https://registry.npmjs.org/@types/node/-/node-18.17.1.tgz",
-      "integrity": "sha512-xlR1jahfizdplZYRU59JlUx9uzF1ARa8jbhM11ccpCJya8kvos5jwdm2ZAgxSCwOl0fq21svP18EVwPBXMQudw=="
+      "version": "18.17.3",
+      "resolved": "https://registry.npmjs.org/@types/node/-/node-18.17.3.tgz",
+      "integrity": "sha512-2x8HWtFk0S99zqVQABU9wTpr8wPoaDHZUcAkoTKH+nL7kPv3WUI9cRi/Kk5Mz4xdqXSqTkKP7IWNoQQYCnDsTA=="
     },
     "node_modules/@types/node-fetch": {
       "version": "2.6.4",
@@ -55,12 +55,10 @@
       }
     },
     "node_modules/agentkeepalive": {
-      "version": "4.3.0",
-      "resolved": "https://registry.npmjs.org/agentkeepalive/-/agentkeepalive-4.3.0.tgz",
-      "integrity": "sha512-7Epl1Blf4Sy37j4v9f9FjICCh4+KAQOyXgHEwlyBiAQLbhKdq/i2QQU3amQalS/wPhdPzDXPL5DMR5bkn+YeWg==",
+      "version": "4.5.0",
+      "resolved": "https://registry.npmjs.org/agentkeepalive/-/agentkeepalive-4.5.0.tgz",
+      "integrity": "sha512-5GG/5IbQQpC9FpkRGsSvZI5QYeSCzlJHdpBQntCsuTOxhKD8lqKhrleg2Yi7yvMIf82Ycmmqln9U8V9qwEiJew==",
       "dependencies": {
-        "debug": "^4.1.0",
-        "depd": "^2.0.0",
         "humanize-ms": "^1.2.1"
       },
       "engines": {
@@ -77,25 +75,6 @@
       "resolved": "https://registry.npmjs.org/base-64/-/base-64-0.1.0.tgz",
       "integrity": "sha512-Y5gU45svrR5tI2Vt/X9GPd3L0HNIKzGu202EjxrXMpuc2V2CiKgemAbUUsqYmZJvPtCXoUKjNZwBJzsNScUbXA=="
     },
-    "node_modules/base64-js": {
-      "version": "1.5.1",
-      "resolved": "https://registry.npmjs.org/base64-js/-/base64-js-1.5.1.tgz",
-      "integrity": "sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==",
-      "funding": [
-        {
-          "type": "github",
-          "url": "https://github.com/sponsors/feross"
-        },
-        {
-          "type": "patreon",
-          "url": "https://www.patreon.com/feross"
-        },
-        {
-          "type": "consulting",
-          "url": "https://feross.org/support"
-        }
-      ]
-    },
     "node_modules/charenc": {
       "version": "0.0.2",
       "resolved": "https://registry.npmjs.org/charenc/-/charenc-0.0.2.tgz",
@@ -124,19 +103,11 @@
       }
     },
     "node_modules/debug": {
-      "version": "4.3.4",
-      "resolved": "https://registry.npmjs.org/debug/-/debug-4.3.4.tgz",
-      "integrity": "sha512-PRWFHuSU3eDtQJPvnNY7Jcket1j0t5OuOsFzPPzsekD52Zl8qUfFIPEiswXqIvHWGVHOgX+7G/vCNNhehwxfkQ==",
+      "version": "3.2.7",
+      "resolved": "https://registry.npmjs.org/debug/-/debug-3.2.7.tgz",
+      "integrity": "sha512-CFjzYYAi4ThfiQvizrFQevTTXHtnCqWfe7x1AhgEscTz6ZbLbfoLRLPugTQyBth6f8ZERVUSyWHFD/7Wu4t1XQ==",
       "dependencies": {
-        "ms": "2.1.2"
-      },
-      "engines": {
-        "node": ">=6.0"
-      },
-      "peerDependenciesMeta": {
-        "supports-color": {
-          "optional": true
-        }
+        "ms": "^2.1.1"
       }
     },
     "node_modules/delayed-stream": {
@@ -147,14 +118,6 @@
         "node": ">=0.4.0"
       }
     },
-    "node_modules/depd": {
-      "version": "2.0.0",
-      "resolved": "https://registry.npmjs.org/depd/-/depd-2.0.0.tgz",
-      "integrity": "sha512-g7nH6P6dyDioJogAAGprGpCtVImJhpPk/roCzdb3fIh61/s/nPsfR6onyMwkCAR/OlC3yBC0lESvUoQEAssIrw==",
-      "engines": {
-        "node": ">= 0.8"
-      }
-    },
     "node_modules/digest-fetch": {
       "version": "1.3.0",
       "resolved": "https://registry.npmjs.org/digest-fetch/-/digest-fetch-1.3.0.tgz",
@@ -215,25 +178,16 @@
       "resolved": "https://registry.npmjs.org/is-buffer/-/is-buffer-1.1.6.tgz",
       "integrity": "sha512-NcdALwpXkTm5Zvvbk7owOUSvVvBKDgKP5/ewfXEznmQFfs4ZRmanOeKBTjRVjka3QFoN6XJ+9F3USqfHqTaU5w=="
     },
-    "node_modules/js-tiktoken": {
-      "version": "1.0.7",
-      "resolved": "https://registry.npmjs.org/js-tiktoken/-/js-tiktoken-1.0.7.tgz",
-      "integrity": "sha512-biba8u/clw7iesNEWLOLwrNGoBP2lA+hTaBLs/D45pJdUPFXyxD6nhcDVtADChghv4GgyAiMKYMiRx7x6h7Biw==",
-      "dependencies": {
-        "base64-js": "^1.5.1"
-      }
-    },
     "node_modules/llamaindex": {
-      "version": "0.0.15",
-      "resolved": "https://registry.npmjs.org/llamaindex/-/llamaindex-0.0.15.tgz",
-      "integrity": "sha512-TgmszmioI2LqTTCLcg8Gk0cFb3rpoGjZS7d4XPS0gEs4VVL6m6hH6ipaVkVBkaeBVjKrgcbvqS4jvjznU4cDtQ==",
+      "version": "0.0.16",
+      "resolved": "https://registry.npmjs.org/llamaindex/-/llamaindex-0.0.16.tgz",
+      "integrity": "sha512-zQa3qGu0SuRrprd9yCo5m5zFBHS4gTyqXlwUaL/aJoiO1sv9o4SQ3xOEZNciWvliHohu5uIkaiOwB85JfZ5eVQ==",
       "dependencies": {
-        "@anthropic-ai/sdk": "^0.5.8",
-        "js-tiktoken": "^1.0.7",
+        "@anthropic-ai/sdk": "^0.5.9",
         "lodash": "^4.17.21",
-        "openai": "4.0.0-beta.6",
+        "openai": "4.0.0-beta.8",
         "pdf-parse": "^1.1.1",
-        "replicate": "^0.12.3",
+        "replicate": "^0.14.1",
         "tiktoken-node": "^0.0.6",
         "uuid": "^9.0.0",
         "wink-nlp": "^1.14.3"
@@ -277,9 +231,9 @@
       }
     },
     "node_modules/ms": {
-      "version": "2.1.2",
-      "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.2.tgz",
-      "integrity": "sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w=="
+      "version": "2.1.3",
+      "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz",
+      "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA=="
     },
     "node_modules/node-domexception": {
       "version": "1.0.0",
@@ -324,9 +278,9 @@
       }
     },
     "node_modules/openai": {
-      "version": "4.0.0-beta.6",
-      "resolved": "https://registry.npmjs.org/openai/-/openai-4.0.0-beta.6.tgz",
-      "integrity": "sha512-sZscRgs6nhBgIt0qcK8XB2PGga6V0Qy9rQn/vKesox/laQDs9tMaOi6rsDhHq15aXQJPROfEB0K9SZoCuyWbSw==",
+      "version": "4.0.0-beta.8",
+      "resolved": "https://registry.npmjs.org/openai/-/openai-4.0.0-beta.8.tgz",
+      "integrity": "sha512-bXBxUK2B288GlLckjnpT+UgyyddP+f4JIT5/df95C6eeqQhf5UkuN41Z3uf/6unPvR8Tthm4jJje1fA0OgsDEw==",
       "dependencies": {
         "@types/node": "^18.11.18",
         "@types/node-fetch": "^2.6.4",
@@ -350,18 +304,10 @@
         "node": ">=6.8.1"
       }
     },
-    "node_modules/pdf-parse/node_modules/debug": {
-      "version": "3.2.7",
-      "resolved": "https://registry.npmjs.org/debug/-/debug-3.2.7.tgz",
-      "integrity": "sha512-CFjzYYAi4ThfiQvizrFQevTTXHtnCqWfe7x1AhgEscTz6ZbLbfoLRLPugTQyBth6f8ZERVUSyWHFD/7Wu4t1XQ==",
-      "dependencies": {
-        "ms": "^2.1.1"
-      }
-    },
     "node_modules/replicate": {
-      "version": "0.12.3",
-      "resolved": "https://registry.npmjs.org/replicate/-/replicate-0.12.3.tgz",
-      "integrity": "sha512-HVWKPoVhWVTONlWk+lUXmq9Vy2J8MxBJMtDBQq3dA5uq71ZzKTh0xvJfvzW4+VLBjhBeL7tkdua6hZJmKfzAPQ==",
+      "version": "0.14.1",
+      "resolved": "https://registry.npmjs.org/replicate/-/replicate-0.14.1.tgz",
+      "integrity": "sha512-3NpuNRbvXoEjY+n/ra24VfQyIRCdLub9GCrU51fFTrMaa6OjRvMC9jGDJSpGRXOLvID75mFgN577HEhA3XEFtg==",
       "engines": {
         "git": ">=2.11.0",
         "node": ">=16.6.0",
diff --git a/packages/core/src/llm/LLM.ts b/packages/core/src/llm/LLM.ts
index 92ed3577bd0433c77faee6bc4d5296552220a874..08e5342c9e03a7cc4bb91711b2e05484c6c81033 100644
--- a/packages/core/src/llm/LLM.ts
+++ b/packages/core/src/llm/LLM.ts
@@ -93,7 +93,7 @@ export class OpenAI implements LLM {
 
   constructor(init?: Partial<OpenAI> & { azure?: AzureOpenAIConfig }) {
     this.model = init?.model ?? "gpt-3.5-turbo";
-    this.temperature = init?.temperature ?? 0;
+    this.temperature = init?.temperature ?? 0.1;
     this.topP = init?.topP ?? 1;
     this.maxTokens = init?.maxTokens ?? undefined;
 
@@ -203,20 +203,42 @@ export class OpenAI implements LLM {
 }
 
 export const ALL_AVAILABLE_LLAMADEUCE_MODELS = {
-  "Llama-2-70b-chat": {
+  "Llama-2-70b-chat-old": {
     contextWindow: 4096,
     replicateApi:
       "replicate/llama70b-v2-chat:e951f18578850b652510200860fc4ea62b3b16fac280f83ff32282f87bbd2e48",
+    //^ Previous 70b model. This is actually also 4-bit, although not exllama.
+  },
+  "Llama-2-70b-chat-4bit": {
+    contextWindow: 4096,
+    replicateApi:
+      "replicate/llama70b-v2-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1",
+    //^ Model is based on exllama 4-bit.
   },
   "Llama-2-13b-chat": {
     contextWindow: 4096,
     replicateApi:
       "a16z-infra/llama13b-v2-chat:df7690f1994d94e96ad9d568eac121aecf50684a0b0963b25a41cc40061269e5",
   },
+  //^ Last known good 13b non-quantized model. In later versions they add the SYS and INST tags themselves.
+  "Llama-2-13b-chat-4bit": {
+    contextWindow: 4096,
+    replicateApi:
+      "a16z-infra/llama13b-v2-chat:2a7f981751ec7fdf87b5b91ad4db53683a98082e9ff7bfd12c8cd5ea85980a52",
+  },
   "Llama-2-7b-chat": {
     contextWindow: 4096,
     replicateApi:
       "a16z-infra/llama7b-v2-chat:4f0a4744c7295c024a1de15e1a63c880d3da035fa1f49bfd344fe076074c8eea",
+    //^ Last (somewhat) known good 7b non-quantized model. In later versions they add the SYS and INST
+    // tags themselves:
+    // https://github.com/replicate/cog-llama-template/commit/fa5ce83912cf82fc2b9c01a4e9dc9bff6f2ef137
+    // Unfortunately, they fix the max_new_tokens issue in that same commit. :-(
+  },
+  "Llama-2-7b-chat-4bit": {
+    contextWindow: 4096,
+    replicateApi:
+      "a16z-infra/llama7b-v2-chat:4f0b260b6a13eb53a6b1891f089d57c08f41003ae79458be5011303d81a394dc",
   },
 };
 
@@ -226,6 +248,8 @@ export enum DeuceChatStrategy {
   METAWBOS = "metawbos",
   //^ This is not exactly right because SentencePiece puts the BOS and EOS token IDs in after tokenization
   // Unfortunately any string only API won't support these properly.
+  REPLICATE4BIT = "replicate4bit",
+  //^ To satisfy Replicate's 4-bit models' requirements, where they also insert some INST tags.
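+  // With REPLICATE4BIT the system message is passed separately (via systemPrompt) and the prompt
+  // only contains [INST] ... [/INST] turns, whereas META/METAWBOS inline the <<SYS>> block instead.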
 }
 
 /**
@@ -240,35 +264,46 @@ export class LlamaDeuce implements LLM {
   replicateSession: ReplicateSession;
 
   constructor(init?: Partial<LlamaDeuce>) {
-    this.model = init?.model ?? "Llama-2-70b-chat";
-    this.chatStrategy = init?.chatStrategy ?? DeuceChatStrategy.META;
-    this.temperature = init?.temperature ?? 0.01; // minimum temperature is 0.01 for Replicate endpoint
+    this.model = init?.model ?? "Llama-2-70b-chat-4bit";
+    this.chatStrategy =
+      init?.chatStrategy ??
+      (this.model.endsWith("4bit")
+        ? DeuceChatStrategy.REPLICATE4BIT // The newer A16Z/Replicate models handle the system message themselves.
+        : DeuceChatStrategy.METAWBOS); // BOS and EOS seem to work best, although all strategies have problems past a certain point.
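+    // e.g. new LlamaDeuce({ model: "Llama-2-13b-chat-4bit" }) picks the REPLICATE4BIT strategy automatically.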
+    this.temperature = init?.temperature ?? 0.1; // minimum temperature is 0.01 for Replicate endpoint
     this.topP = init?.topP ?? 1;
-    this.maxTokens = init?.maxTokens ?? undefined; // By default this means it's 500 tokens according to Replicate docs
+    this.maxTokens =
+      init?.maxTokens ??
+      ALL_AVAILABLE_LLAMADEUCE_MODELS[this.model].contextWindow; // For Replicate, the default is 500 tokens, which is too low.
     this.replicateSession = init?.replicateSession ?? new ReplicateSession();
   }
 
-  mapMessagesToPrompt(messages: ChatMessage[]): string {
+  mapMessagesToPrompt(messages: ChatMessage[]) {
     if (this.chatStrategy === DeuceChatStrategy.A16Z) {
       return this.mapMessagesToPromptA16Z(messages);
     } else if (this.chatStrategy === DeuceChatStrategy.META) {
       return this.mapMessagesToPromptMeta(messages);
     } else if (this.chatStrategy === DeuceChatStrategy.METAWBOS) {
-      return this.mapMessagesToPromptMeta(messages, true);
+      return this.mapMessagesToPromptMeta(messages, { withBos: true });
+    } else if (this.chatStrategy === DeuceChatStrategy.REPLICATE4BIT) {
+      return this.mapMessagesToPromptMeta(messages, { replicate4Bit: true });
     } else {
       return this.mapMessagesToPromptMeta(messages);
     }
   }
 
-  mapMessagesToPromptA16Z(messages: ChatMessage[]): string {
-    return (
-      messages.reduce((acc, message) => {
-        return (
-          (acc && `${acc}\n\n`) +
-          `${this.mapMessageTypeA16Z(message.role)}${message.content}`
-        );
-      }, "") + "\n\nAssistant:"
-    ); // Here we're differing from A16Z by omitting the space. Generally spaces at the end of prompts decrease performance due to tokenization
+  mapMessagesToPromptA16Z(messages: ChatMessage[]) {
+    return {
+      prompt:
+        messages.reduce((acc, message) => {
+          return (
+            (acc && `${acc}\n\n`) +
+            `${this.mapMessageTypeA16Z(message.role)}${message.content}`
+          );
+        }, "") + "\n\nAssistant:",
+      //^ Here we differ from A16Z by omitting the trailing space; spaces at the end of prompts generally decrease performance due to tokenization.
+      systemPrompt: undefined,
+    };
   }
 
   mapMessageTypeA16Z(messageType: MessageType): string {
@@ -284,7 +319,11 @@ export class LlamaDeuce implements LLM {
     }
   }
 
-  mapMessagesToPromptMeta(messages: ChatMessage[], withBos = false): string {
+  mapMessagesToPromptMeta(
+    messages: ChatMessage[],
+    opts?: { withBos?: boolean; replicate4Bit?: boolean }
+  ) {
+    const { withBos = false, replicate4Bit = false } = opts ?? {};
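+    // Under META/METAWBOS this produces the standard Llama 2 chat format, roughly:
+    //   <s>[INST] <<SYS>>\n{system}\n<</SYS>>\n\n{user} [/INST] {assistant} </s>...
+    // (BOS/EOS only when withBos is set). With replicate4Bit the system message is returned
+    // separately instead and the <<SYS>> wrapper is omitted.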
     const DEFAULT_SYSTEM_PROMPT = `You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
 
 If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.`;
@@ -297,37 +336,50 @@ If a question does not make any sense, or is not factually coherent, explain why
     const EOS = "</s>";
 
     if (messages.length === 0) {
-      return "";
+      return { prompt: "", systemPrompt: undefined };
     }
 
+    messages = [...messages]; // so we can use shift without mutating the original array
+
+    let systemPrompt = undefined;
     if (messages[0].role === "system") {
       const systemMessage = messages.shift()!;
 
-      const systemStr = `${B_SYS}${systemMessage.content}${E_SYS}`;
+      if (replicate4Bit) {
+        systemPrompt = systemMessage.content;
+      } else {
+        const systemStr = `${B_SYS}${systemMessage.content}${E_SYS}`;
 
-      if (messages[1].role !== "user") {
-        throw new Error(
-          "LlamaDeuce: if there is a system message, the second message must be a user message."
-        );
-      }
+        // TS Bug: https://github.com/microsoft/TypeScript/issues/9998
+        // @ts-ignore
+        if (messages[0].role !== "user") {
+          throw new Error(
+            "LlamaDeuce: if there is a system message, the second message must be a user message."
+          );
+        }
 
-      const userContent = messages[0].content;
+        const userContent = messages[0].content;
 
-      messages[0].content = `${systemStr}${userContent}`;
+        messages[0].content = `${systemStr}${userContent}`;
+      }
     } else {
-      messages[0].content = `${B_SYS}${DEFAULT_SYSTEM_PROMPT}${E_SYS}${messages[0].content}`;
+      if (!replicate4Bit) {
+        messages[0].content = `${B_SYS}${DEFAULT_SYSTEM_PROMPT}${E_SYS}${messages[0].content}`;
+      }
     }
 
-    return messages.reduce((acc, message, index) => {
-      if (index % 2 === 0) {
-        return (
-          (withBos ? BOS : "") +
-          `${acc}${B_INST} ${message.content.trim()} ${E_INST}`
-        );
-      } else {
-        return `${acc} ${message.content.trim()} ` + (withBos ? EOS : ""); // Yes, the EOS comes after the space. This is not a mistake.
-      }
-    }, "");
+    return {
+      prompt: messages.reduce((acc, message, index) => {
+        if (index % 2 === 0) {
+          return `${acc}${
+            withBos ? BOS : ""
+          }${B_INST} ${message.content.trim()} ${E_INST}`;
+        } else {
+          return `${acc} ${message.content.trim()} ` + (withBos ? EOS : ""); // Yes, the EOS comes after the space. This is not a mistake.
+        }
+      }, ""),
+      systemPrompt,
+    };
   }
 
   async chat(
@@ -337,21 +389,31 @@ If a question does not make any sense, or is not factually coherent, explain why
     const api = ALL_AVAILABLE_LLAMADEUCE_MODELS[this.model]
       .replicateApi as `${string}/${string}:${string}`;
 
-    const prompt = this.mapMessagesToPrompt(messages);
+    const { prompt, systemPrompt } = this.mapMessagesToPrompt(messages);
 
-    const response = await this.replicateSession.replicate.run(api, {
+    const replicateOptions: any = {
       input: {
         prompt,
-        system_prompt: "", // We are already sending the system prompt so set system prompt to empty.
-        max_new_tokens: this.maxTokens,
+        system_prompt: systemPrompt,
         temperature: this.temperature,
         top_p: this.topP,
       },
-    });
+    };
+
+    if (this.model.endsWith("4bit")) {
+      replicateOptions.input.max_new_tokens = this.maxTokens;
+    } else {
+      replicateOptions.input.max_length = this.maxTokens;
+    }
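+    //^ The pinned older (non-quantized) model versions use max_length, while the newer
+    // 4-bit model templates take max_new_tokens instead.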
+
+    const response = await this.replicateSession.replicate.run(
+      api,
+      replicateOptions
+    );
     return {
       message: {
-        content: (response as Array<string>).join(""),
-        // We need to do this because Replicate returns a list of strings (for streaming functionality which is not exposed by the run function)
+        content: (response as Array<string>).join("").trimStart(),
+        //^ Replicate returns a list of strings (for streaming functionality, which run() doesn't expose), so we join them and trim any leading whitespace.
         role: "assistant",
       },
     };
@@ -386,7 +448,7 @@ export class Anthropic implements LLM {
 
   constructor(init?: Partial<Anthropic>) {
     this.model = init?.model ?? "claude-2";
-    this.temperature = init?.temperature ?? 0;
+    this.temperature = init?.temperature ?? 0.1;
     this.topP = init?.topP ?? 0.999; // Per Ben Mann
     this.maxTokens = init?.maxTokens ?? undefined;