diff --git a/packages/core/src/llm/LLM.ts b/packages/core/src/llm/LLM.ts
deleted file mode 100644
index dc37d3718a436986cfb0336127ee6801ca2e3dd9..0000000000000000000000000000000000000000
--- a/packages/core/src/llm/LLM.ts
+++ /dev/null
@@ -1,373 +0,0 @@
-import { type StreamCallbackResponse } from "../callbacks/CallbackManager.js";
-
-import type { LLMOptions } from "portkey-ai";
-import { getCallbackManager } from "../internal/settings/CallbackManager.js";
-import { BaseLLM } from "./base.js";
-import type { PortkeySession } from "./portkey.js";
-import { getPortkeySession } from "./portkey.js";
-import { ReplicateSession } from "./replicate_ai.js";
-import type {
-  ChatMessage,
-  ChatResponse,
-  ChatResponseChunk,
-  LLMChatParamsNonStreaming,
-  LLMChatParamsStreaming,
-  LLMMetadata,
-  MessageType,
-} from "./types.js";
-import { extractText, wrapLLMEvent } from "./utils.js";
-
-export const ALL_AVAILABLE_LLAMADEUCE_MODELS = {
-  "Llama-2-70b-chat-old": {
-    contextWindow: 4096,
-    replicateApi:
-      "replicate/llama70b-v2-chat:e951f18578850b652510200860fc4ea62b3b16fac280f83ff32282f87bbd2e48",
-    //^ Previous 70b model. This is also actually 4 bit, although not exllama.
-  },
-  "Llama-2-70b-chat-4bit": {
-    contextWindow: 4096,
-    replicateApi:
-      "meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3",
-    //^ Model is based off of exllama 4bit.
-  },
-  "Llama-2-13b-chat-old": {
-    contextWindow: 4096,
-    replicateApi:
-      "a16z-infra/llama13b-v2-chat:df7690f1994d94e96ad9d568eac121aecf50684a0b0963b25a41cc40061269e5",
-  },
-  //^ Last known good 13b non-quantized model. In future versions they add the SYS and INST tags themselves
-  "Llama-2-13b-chat-4bit": {
-    contextWindow: 4096,
-    replicateApi:
-      "meta/llama-2-13b-chat:f4e2de70d66816a838a89eeeb621910adffb0dd0baba3976c96980970978018d",
-  },
-  "Llama-2-7b-chat-old": {
-    contextWindow: 4096,
-    replicateApi:
-      "a16z-infra/llama7b-v2-chat:4f0a4744c7295c024a1de15e1a63c880d3da035fa1f49bfd344fe076074c8eea",
-    //^ Last (somewhat) known good 7b non-quantized model. In future versions they add the SYS and INST
-    // tags themselves
-    // https://github.com/replicate/cog-llama-template/commit/fa5ce83912cf82fc2b9c01a4e9dc9bff6f2ef137
-    // Problem is that they fix the max_new_tokens issue in the same commit. :-(
-  },
-  "Llama-2-7b-chat-4bit": {
-    contextWindow: 4096,
-    replicateApi:
-      "meta/llama-2-7b-chat:13c3cdee13ee059ab779f0291d29054dab00a47dad8261375654de5540165fb0",
-  },
-};
-
-export enum DeuceChatStrategy {
-  A16Z = "a16z",
-  META = "meta",
-  METAWBOS = "metawbos",
-  //^ This is not exactly right because SentencePiece puts the BOS and EOS token IDs in after tokenization
-  // Unfortunately any string only API won't support these properly.
-  REPLICATE4BIT = "replicate4bit",
-  //^ To satisfy Replicate's 4 bit models' requirements where they also insert some INST tags
-  REPLICATE4BITWNEWLINES = "replicate4bitwnewlines",
-  //^ Replicate's documentation recommends using newlines: https://replicate.com/blog/how-to-prompt-llama
-}
-
-/**
- * Llama2 LLM implementation
- */
-export class LlamaDeuce extends BaseLLM {
-  model: keyof typeof ALL_AVAILABLE_LLAMADEUCE_MODELS;
-  chatStrategy: DeuceChatStrategy;
-  temperature: number;
-  topP: number;
-  maxTokens?: number;
-  replicateSession: ReplicateSession;
-
-  constructor(init?: Partial<LlamaDeuce>) {
-    super();
-    this.model = init?.model ?? "Llama-2-70b-chat-4bit";
-    this.chatStrategy =
-      init?.chatStrategy ??
-      (this.model.endsWith("4bit")
-        ? DeuceChatStrategy.REPLICATE4BITWNEWLINES // With the newer Replicate models they do the system message themselves.
-        : DeuceChatStrategy.METAWBOS); // With BOS and EOS seems to work best, although they all have problems past a certain point
-    this.temperature = init?.temperature ?? 0.1; // minimum temperature is 0.01 for Replicate endpoint
-    this.topP = init?.topP ?? 1;
-    this.maxTokens =
-      init?.maxTokens ??
-      ALL_AVAILABLE_LLAMADEUCE_MODELS[this.model].contextWindow; // For Replicate, the default is 500 tokens which is too low.
-    this.replicateSession = init?.replicateSession ?? new ReplicateSession();
-  }
-
-  get metadata() {
-    return {
-      model: this.model,
-      temperature: this.temperature,
-      topP: this.topP,
-      maxTokens: this.maxTokens,
-      contextWindow: ALL_AVAILABLE_LLAMADEUCE_MODELS[this.model].contextWindow,
-      tokenizer: undefined,
-    };
-  }
-
-  mapMessagesToPrompt(messages: ChatMessage[]) {
-    if (this.chatStrategy === DeuceChatStrategy.A16Z) {
-      return this.mapMessagesToPromptA16Z(messages);
-    } else if (this.chatStrategy === DeuceChatStrategy.META) {
-      return this.mapMessagesToPromptMeta(messages);
-    } else if (this.chatStrategy === DeuceChatStrategy.METAWBOS) {
-      return this.mapMessagesToPromptMeta(messages, { withBos: true });
-    } else if (this.chatStrategy === DeuceChatStrategy.REPLICATE4BIT) {
-      return this.mapMessagesToPromptMeta(messages, {
-        replicate4Bit: true,
-        withNewlines: true,
-      });
-    } else if (this.chatStrategy === DeuceChatStrategy.REPLICATE4BITWNEWLINES) {
-      return this.mapMessagesToPromptMeta(messages, {
-        replicate4Bit: true,
-        withNewlines: true,
-      });
-    } else {
-      return this.mapMessagesToPromptMeta(messages);
-    }
-  }
-
-  mapMessagesToPromptA16Z(messages: ChatMessage[]) {
-    return {
-      prompt:
-        messages.reduce((acc, message) => {
-          return (
-            (acc && `${acc}\n\n`) +
-            `${this.mapMessageTypeA16Z(message.role)}${message.content}`
-          );
-        }, "") + "\n\nAssistant:",
-      //^ Here we're differing from A16Z by omitting the space. Generally spaces at the end of prompts decrease performance due to tokenization
-      systemPrompt: undefined,
-    };
-  }
-
-  mapMessageTypeA16Z(messageType: MessageType): string {
-    switch (messageType) {
-      case "user":
-        return "User: ";
-      case "assistant":
-        return "Assistant: ";
-      case "system":
-        return "";
-      default:
-        throw new Error("Unsupported LlamaDeuce message type");
-    }
-  }
-
-  mapMessagesToPromptMeta(
-    messages: ChatMessage[],
-    opts?: {
-      withBos?: boolean;
-      replicate4Bit?: boolean;
-      withNewlines?: boolean;
-    },
-  ) {
-    const {
-      withBos = false,
-      replicate4Bit = false,
-      withNewlines = false,
-    } = opts ?? {};
-    const DEFAULT_SYSTEM_PROMPT = `You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
-
-If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.`;
-
-    const B_SYS = "<<SYS>>\n";
-    const E_SYS = "\n<</SYS>>\n\n";
-    const B_INST = "[INST]";
-    const E_INST = "[/INST]";
-    const BOS = "<s>";
-    const EOS = "</s>";
-
-    if (messages.length === 0) {
-      return { prompt: "", systemPrompt: undefined };
-    }
-
-    messages = [...messages]; // so we can use shift without mutating the original array
-
-    let systemPrompt = undefined;
-    if (messages[0].role === "system") {
-      const systemMessage = messages.shift()!;
-
-      if (replicate4Bit) {
-        systemPrompt = systemMessage.content;
-      } else {
-        const systemStr = `${B_SYS}${systemMessage.content}${E_SYS}`;
-
-        // TS Bug: https://github.com/microsoft/TypeScript/issues/9998
-        // @ts-ignore
-        if (messages[0].role !== "user") {
-          throw new Error(
-            "LlamaDeuce: if there is a system message, the second message must be a user message.",
-          );
-        }
-
-        const userContent = messages[0].content;
-
-        messages[0].content = `${systemStr}${userContent}`;
-      }
-    } else {
-      if (!replicate4Bit) {
-        messages[0].content = `${B_SYS}${DEFAULT_SYSTEM_PROMPT}${E_SYS}${messages[0].content}`;
-      }
-    }
-
-    return {
-      prompt: messages.reduce((acc, message, index) => {
-        const content = extractText(message.content);
-        if (index % 2 === 0) {
-          return (
-            `${acc}${withBos ? BOS : ""}${B_INST} ${content.trim()} ${E_INST}` +
-            (withNewlines ? "\n" : "")
-          );
-        } else {
-          return (
-            `${acc} ${content.trim()}` +
-            (withNewlines ? "\n" : " ") +
-            (withBos ? EOS : "")
-          ); // Yes, the EOS comes after the space. This is not a mistake.
-        }
-      }, ""),
-      systemPrompt,
-    };
-  }
-
-  chat(
-    params: LLMChatParamsStreaming,
-  ): Promise<AsyncIterable<ChatResponseChunk>>;
-  chat(params: LLMChatParamsNonStreaming): Promise<ChatResponse>;
-  @wrapLLMEvent
-  async chat(
-    params: LLMChatParamsNonStreaming | LLMChatParamsStreaming,
-  ): Promise<ChatResponse | AsyncIterable<ChatResponseChunk>> {
-    const { messages, stream } = params;
-    const api = ALL_AVAILABLE_LLAMADEUCE_MODELS[this.model]
-      .replicateApi as `${string}/${string}:${string}`;
-
-    const { prompt, systemPrompt } = this.mapMessagesToPrompt(messages);
-
-    const replicateOptions: any = {
-      input: {
-        prompt,
-        system_prompt: systemPrompt,
-        temperature: this.temperature,
-        top_p: this.topP,
-      },
-    };
-
-    if (this.model.endsWith("4bit")) {
-      replicateOptions.input.max_new_tokens = this.maxTokens;
-    } else {
-      replicateOptions.input.max_length = this.maxTokens;
-    }
-
-    //TODO: Add streaming for this
-    if (stream) {
-      throw new Error("Streaming not supported for LlamaDeuce");
-    }
-
-    //Non-streaming
-    const response = await this.replicateSession.replicate.run(
-      api,
-      replicateOptions,
-    );
-    return {
-      raw: response,
-      message: {
-        content: (response as Array<string>).join("").trimStart(),
-        //^ We need to do this because Replicate returns a list of strings (for streaming functionality which is not exposed by the run function)
-        role: "assistant",
-      },
-    };
-  }
-}
-
-export class Portkey extends BaseLLM {
-  apiKey?: string = undefined;
-  baseURL?: string = undefined;
-  mode?: string = undefined;
-  llms?: [LLMOptions] | null = undefined;
-  session: PortkeySession;
-
-  constructor(init?: Partial<Portkey>) {
-    super();
-    this.apiKey = init?.apiKey;
-    this.baseURL = init?.baseURL;
-    this.mode = init?.mode;
-    this.llms = init?.llms;
-    this.session = getPortkeySession({
-      apiKey: this.apiKey,
-      baseURL: this.baseURL,
-      llms: this.llms,
-      mode: this.mode,
-    });
-  }
-
-  get metadata(): LLMMetadata {
-    throw new Error("metadata not implemented for Portkey");
-  }
-
-  chat(
-    params: LLMChatParamsStreaming,
-  ): Promise<AsyncIterable<ChatResponseChunk>>;
-  chat(params: LLMChatParamsNonStreaming): Promise<ChatResponse>;
-  @wrapLLMEvent
-  async chat(
-    params: LLMChatParamsNonStreaming | LLMChatParamsStreaming,
-  ): Promise<ChatResponse | AsyncIterable<ChatResponseChunk>> {
-    const { messages, stream, additionalChatOptions } = params;
-    if (stream) {
-      return this.streamChat(messages, additionalChatOptions);
-    } else {
-      const bodyParams = additionalChatOptions || {};
-      const response = await this.session.portkey.chatCompletions.create({
-        messages: messages.map((message) => ({
-          content: extractText(message.content),
-          role: message.role,
-        })),
-        ...bodyParams,
-      });
-
-      const content = response.choices[0].message?.content ?? "";
-      const role = response.choices[0].message?.role || "assistant";
-      return { raw: response, message: { content, role: role as MessageType } };
-    }
-  }
-
-  async *streamChat(
-    messages: ChatMessage[],
-    params?: Record<string, any>,
-  ): AsyncIterable<ChatResponseChunk> {
-    const chunkStream = await this.session.portkey.chatCompletions.create({
-      messages: messages.map((message) => ({
-        content: extractText(message.content),
-        role: message.role,
-      })),
-      ...params,
-      stream: true,
-    });
-
-    //Indices
-    let idx_counter: number = 0;
-    for await (const part of chunkStream) {
-      //Increment
-      part.choices[0].index = idx_counter;
-      const is_done: boolean =
-        part.choices[0].finish_reason === "stop" ? true : false;
-      //onLLMStream Callback
-
-      const stream_callback: StreamCallbackResponse = {
-        index: idx_counter,
-        isDone: is_done,
-        // token: part,
-      };
-      getCallbackManager().dispatchEvent("stream", stream_callback);
-
-      idx_counter++;
-
-      yield { raw: part, delta: part.choices[0].delta?.content ?? "" };
-    }
-    return;
-  }
-}
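Nothing above is dropped for good: the LlamaDeuce class moves into replicate_ai.ts (renamed to ReplicateLLM) and the Portkey class moves into portkey.ts, both further down in this diff. For orientation, an illustrative sketch of the two prompt shapes the mapping code produces (the message contents are made up; only the framing tokens matter):

// Illustrative only: roughly what mapMessagesToPromptA16Z and
// mapMessagesToPromptMeta (with withBos: true) produce.
const a16zPrompt =
  "User: Hi\n\nAssistant: Hello!\n\nUser: Name a prime.\n\nAssistant:";
const metaPrompt =
  "<s>[INST] <<SYS>>\nYou are terse.\n<</SYS>>\n\nHi [/INST] Hello! </s>" +
  "<s>[INST] Name a prime. [/INST]";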
diff --git a/packages/core/src/llm/index.ts b/packages/core/src/llm/index.ts
index 2ab4c37d71f23af2203e5359aefdfd9cc919d21e..097b8afdd549514a3ea76740154f582a1f88996c 100644
--- a/packages/core/src/llm/index.ts
+++ b/packages/core/src/llm/index.ts
@@ -1,4 +1,3 @@
-export * from "./LLM.js";
 export { Anthropic } from "./anthropic.js";
 export { FireworksLLM } from "./fireworks.js";
 export { Groq } from "./groq.js";
@@ -9,5 +8,12 @@ export {
 } from "./mistral.js";
 export { Ollama } from "./ollama.js";
 export * from "./open_ai.js";
+export { Portkey } from "./portkey.js";
+export * from "./replicate_ai.js";
+// Note: The aliases below keep the Llama 2-specific names available (we use Replicate for Llama 2 support)
+export {
+  ReplicateChatStrategy as DeuceChatStrategy,
+  ReplicateLLM as LlamaDeuce,
+} from "./replicate_ai.js";
 export { TogetherLLM } from "./together.js";
 export * from "./types.js";
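Because the old export names are kept as aliases, existing imports keep compiling. A minimal sketch (the repo-relative import path is for illustration only; constructing the LLM expects REPLICATE_API_TOKEN to be set):

import {
  DeuceChatStrategy, // alias of ReplicateChatStrategy
  LlamaDeuce, // alias of ReplicateLLM
  ReplicateChatStrategy,
  ReplicateLLM,
} from "./llm/index.js";

// Old and new names resolve to the same class and enum, so this type-checks.
const llm: ReplicateLLM = new LlamaDeuce({
  chatStrategy: DeuceChatStrategy.METAWBOS,
});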
diff --git a/packages/core/src/llm/portkey.ts b/packages/core/src/llm/portkey.ts
index d0d90253e187440b0575421dcc99bf5f784542e7..bb3f47b129e20ca81a9c848a3746e57ff88ba733 100644
--- a/packages/core/src/llm/portkey.ts
+++ b/packages/core/src/llm/portkey.ts
@@ -1,7 +1,20 @@
 import { getEnv } from "@llamaindex/env";
 import _ from "lodash";
 import type { LLMOptions } from "portkey-ai";
-import { Portkey } from "portkey-ai";
+import { Portkey as OrigPortKey } from "portkey-ai";
+import { type StreamCallbackResponse } from "../callbacks/CallbackManager.js";
+import { getCallbackManager } from "../internal/settings/CallbackManager.js";
+import { BaseLLM } from "./base.js";
+import type {
+  ChatMessage,
+  ChatResponse,
+  ChatResponseChunk,
+  LLMChatParamsNonStreaming,
+  LLMChatParamsStreaming,
+  LLMMetadata,
+  MessageType,
+} from "./types.js";
+import { extractText, wrapLLMEvent } from "./utils.js";
 
 interface PortkeyOptions {
   apiKey?: string;
@@ -11,7 +24,7 @@ interface PortkeyOptions {
 }
 
 export class PortkeySession {
-  portkey: Portkey;
+  portkey: OrigPortKey;
 
   constructor(options: PortkeyOptions = {}) {
     if (!options.apiKey) {
@@ -22,13 +35,13 @@ export class PortkeySession {
       options.baseURL = getEnv("PORTKEY_BASE_URL") ?? "https://api.portkey.ai";
     }
 
-    this.portkey = new Portkey({});
+    this.portkey = new OrigPortKey({});
     this.portkey.llms = [{}];
     if (!options.apiKey) {
       throw new Error("Set Portkey ApiKey in PORTKEY_API_KEY env variable");
     }
 
-    this.portkey = new Portkey(options);
+    this.portkey = new OrigPortKey(options);
   }
 }
 
@@ -54,3 +67,92 @@ export function getPortkeySession(options: PortkeyOptions = {}) {
   }
   return session;
 }
+
+export class Portkey extends BaseLLM {
+  apiKey?: string = undefined;
+  baseURL?: string = undefined;
+  mode?: string = undefined;
+  llms?: [LLMOptions] | null = undefined;
+  session: PortkeySession;
+
+  constructor(init?: Partial<Portkey>) {
+    super();
+    this.apiKey = init?.apiKey;
+    this.baseURL = init?.baseURL;
+    this.mode = init?.mode;
+    this.llms = init?.llms;
+    this.session = getPortkeySession({
+      apiKey: this.apiKey,
+      baseURL: this.baseURL,
+      llms: this.llms,
+      mode: this.mode,
+    });
+  }
+
+  get metadata(): LLMMetadata {
+    throw new Error("metadata not implemented for Portkey");
+  }
+
+  chat(
+    params: LLMChatParamsStreaming,
+  ): Promise<AsyncIterable<ChatResponseChunk>>;
+  chat(params: LLMChatParamsNonStreaming): Promise<ChatResponse>;
+  @wrapLLMEvent
+  async chat(
+    params: LLMChatParamsNonStreaming | LLMChatParamsStreaming,
+  ): Promise<ChatResponse | AsyncIterable<ChatResponseChunk>> {
+    const { messages, stream, additionalChatOptions } = params;
+    if (stream) {
+      return this.streamChat(messages, additionalChatOptions);
+    } else {
+      const bodyParams = additionalChatOptions || {};
+      const response = await this.session.portkey.chatCompletions.create({
+        messages: messages.map((message) => ({
+          content: extractText(message.content),
+          role: message.role,
+        })),
+        ...bodyParams,
+      });
+
+      const content = response.choices[0].message?.content ?? "";
+      const role = response.choices[0].message?.role || "assistant";
+      return { raw: response, message: { content, role: role as MessageType } };
+    }
+  }
+
+  async *streamChat(
+    messages: ChatMessage[],
+    params?: Record<string, any>,
+  ): AsyncIterable<ChatResponseChunk> {
+    const chunkStream = await this.session.portkey.chatCompletions.create({
+      messages: messages.map((message) => ({
+        content: extractText(message.content),
+        role: message.role,
+      })),
+      ...params,
+      stream: true,
+    });
+
+    // Running index of streamed chunks.
+    let idx_counter: number = 0;
+    for await (const part of chunkStream) {
+      // Re-index each chunk with our own counter.
+      part.choices[0].index = idx_counter;
+      const is_done: boolean =
+        part.choices[0].finish_reason === "stop";
+
+      // Dispatch the stream callback so onLLMStream listeners see each chunk.
+      const stream_callback: StreamCallbackResponse = {
+        index: idx_counter,
+        isDone: is_done,
+        // token: part,
+      };
+      getCallbackManager().dispatchEvent("stream", stream_callback);
+
+      idx_counter++;
+
+      yield { raw: part, delta: part.choices[0].delta?.content ?? "" };
+    }
+    return;
+  }
+}
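For reviewers, a minimal usage sketch of the moved Portkey wrapper (assumes PORTKEY_API_KEY, and optionally PORTKEY_BASE_URL, are set in the environment; the repo-relative import path is for illustration only):

import { Portkey } from "./llm/portkey.js";

async function demo() {
  // Reads PORTKEY_API_KEY / PORTKEY_BASE_URL from the environment when not passed.
  const llm = new Portkey();

  // Non-streaming chat resolves to a single ChatResponse.
  const res = await llm.chat({
    messages: [{ role: "user", content: "Say hello." }],
  });
  console.log(res.message.content);

  // Streaming chat resolves to an AsyncIterable<ChatResponseChunk>.
  const stream = await llm.chat({
    messages: [{ role: "user", content: "Say hello again." }],
    stream: true,
  });
  for await (const chunk of stream) {
    process.stdout.write(chunk.delta);
  }
}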
diff --git a/packages/core/src/llm/replicate_ai.ts b/packages/core/src/llm/replicate_ai.ts
index 5c3d51f5602c72eee461a6cf5cf76c6915fa7f85..b369a1bf37c1d365c841d16036c4fa38c5423107 100644
--- a/packages/core/src/llm/replicate_ai.ts
+++ b/packages/core/src/llm/replicate_ai.ts
@@ -1,5 +1,15 @@
 import { getEnv } from "@llamaindex/env";
 import Replicate from "replicate";
+import { BaseLLM } from "./base.js";
+import type {
+  ChatMessage,
+  ChatResponse,
+  ChatResponseChunk,
+  LLMChatParamsNonStreaming,
+  LLMChatParamsStreaming,
+  MessageType,
+} from "./types.js";
+import { extractText, wrapLLMEvent } from "./utils.js";
 
 export class ReplicateSession {
   replicateKey: string | null = null;
@@ -20,12 +30,271 @@ export class ReplicateSession {
   }
 }
 
-let defaultReplicateSession: ReplicateSession | null = null;
+export const ALL_AVAILABLE_REPLICATE_MODELS = {
+  // TODO: add more models from replicate
+  "Llama-2-70b-chat-old": {
+    contextWindow: 4096,
+    replicateApi:
+      "replicate/llama70b-v2-chat:e951f18578850b652510200860fc4ea62b3b16fac280f83ff32282f87bbd2e48",
+    //^ Previous 70b model. This is also actually 4 bit, although not exllama.
+  },
+  "Llama-2-70b-chat-4bit": {
+    contextWindow: 4096,
+    replicateApi:
+      "meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3",
+    //^ Model is based on exllama 4bit.
+  },
+  "Llama-2-13b-chat-old": {
+    contextWindow: 4096,
+    replicateApi:
+      "a16z-infra/llama13b-v2-chat:df7690f1994d94e96ad9d568eac121aecf50684a0b0963b25a41cc40061269e5",
+  },
+  //^ Last known good 13b non-quantized model. In future versions they add the SYS and INST tags themselves
+  "Llama-2-13b-chat-4bit": {
+    contextWindow: 4096,
+    replicateApi:
+      "meta/llama-2-13b-chat:f4e2de70d66816a838a89eeeb621910adffb0dd0baba3976c96980970978018d",
+  },
+  "Llama-2-7b-chat-old": {
+    contextWindow: 4096,
+    replicateApi:
+      "a16z-infra/llama7b-v2-chat:4f0a4744c7295c024a1de15e1a63c880d3da035fa1f49bfd344fe076074c8eea",
+    //^ Last (somewhat) known good 7b non-quantized model. In future versions they add the SYS and INST
+    // tags themselves
+    // https://github.com/replicate/cog-llama-template/commit/fa5ce83912cf82fc2b9c01a4e9dc9bff6f2ef137
+    // Problem is that they fix the max_new_tokens issue in the same commit. :-(
+  },
+  "Llama-2-7b-chat-4bit": {
+    contextWindow: 4096,
+    replicateApi:
+      "meta/llama-2-7b-chat:13c3cdee13ee059ab779f0291d29054dab00a47dad8261375654de5540165fb0",
+  },
+};
 
-export function getReplicateSession(replicateKey: string | null = null) {
-  if (!defaultReplicateSession) {
-    defaultReplicateSession = new ReplicateSession(replicateKey);
+export enum ReplicateChatStrategy {
+  A16Z = "a16z",
+  META = "meta",
+  METAWBOS = "metawbos",
+  //^ This is not exactly right because SentencePiece inserts the BOS and EOS token IDs after tokenization.
+  // Unfortunately a string-only API can't carry these through properly.
+  REPLICATE4BIT = "replicate4bit",
+  //^ To satisfy the requirements of Replicate's 4-bit models, which also insert some INST tags themselves
+  REPLICATE4BITWNEWLINES = "replicate4bitwnewlines",
+  //^ Replicate's documentation recommends using newlines: https://replicate.com/blog/how-to-prompt-llama
+}
+
+/**
+ * Replicate LLM implementation (used here for Llama 2 models)
+ */
+export class ReplicateLLM extends BaseLLM {
+  model: keyof typeof ALL_AVAILABLE_REPLICATE_MODELS;
+  chatStrategy: ReplicateChatStrategy;
+  temperature: number;
+  topP: number;
+  maxTokens?: number;
+  replicateSession: ReplicateSession;
+
+  constructor(init?: Partial<ReplicateLLM>) {
+    super();
+    this.model = init?.model ?? "Llama-2-70b-chat-4bit";
+    this.chatStrategy =
+      init?.chatStrategy ??
+      (this.model.endsWith("4bit")
+        ? ReplicateChatStrategy.REPLICATE4BITWNEWLINES // The newer Replicate models handle the system message themselves.
+        : ReplicateChatStrategy.METAWBOS); // Adding BOS and EOS seems to work best, although every strategy has problems past a certain point.
+    this.temperature = init?.temperature ?? 0.1; // minimum temperature is 0.01 for Replicate endpoint
+    this.topP = init?.topP ?? 1;
+    this.maxTokens =
+      init?.maxTokens ??
+      ALL_AVAILABLE_REPLICATE_MODELS[this.model].contextWindow; // For Replicate, the default is 500 tokens which is too low.
+    this.replicateSession = init?.replicateSession ?? new ReplicateSession();
+  }
+
+  get metadata() {
+    return {
+      model: this.model,
+      temperature: this.temperature,
+      topP: this.topP,
+      maxTokens: this.maxTokens,
+      contextWindow: ALL_AVAILABLE_REPLICATE_MODELS[this.model].contextWindow,
+      tokenizer: undefined,
+    };
+  }
+
+  mapMessagesToPrompt(messages: ChatMessage[]) {
+    if (this.chatStrategy === ReplicateChatStrategy.A16Z) {
+      return this.mapMessagesToPromptA16Z(messages);
+    } else if (this.chatStrategy === ReplicateChatStrategy.META) {
+      return this.mapMessagesToPromptMeta(messages);
+    } else if (this.chatStrategy === ReplicateChatStrategy.METAWBOS) {
+      return this.mapMessagesToPromptMeta(messages, { withBos: true });
+    } else if (this.chatStrategy === ReplicateChatStrategy.REPLICATE4BIT) {
+      return this.mapMessagesToPromptMeta(messages, {
+        replicate4Bit: true,
+        withNewlines: true,
+      });
+    } else if (
+      this.chatStrategy === ReplicateChatStrategy.REPLICATE4BITWNEWLINES
+    ) {
+      return this.mapMessagesToPromptMeta(messages, {
+        replicate4Bit: true,
+        withNewlines: true,
+      });
+    } else {
+      return this.mapMessagesToPromptMeta(messages);
+    }
+  }
+
+  mapMessagesToPromptA16Z(messages: ChatMessage[]) {
+    return {
+      prompt:
+        messages.reduce((acc, message) => {
+          return (
+            (acc && `${acc}\n\n`) +
+            `${this.mapMessageTypeA16Z(message.role)}${message.content}`
+          );
+        }, "") + "\n\nAssistant:",
+      //^ Here we're differing from A16Z by omitting the space. Generally spaces at the end of prompts decrease performance due to tokenization
+      systemPrompt: undefined,
+    };
+  }
+
+  mapMessageTypeA16Z(messageType: MessageType): string {
+    switch (messageType) {
+      case "user":
+        return "User: ";
+      case "assistant":
+        return "Assistant: ";
+      case "system":
+        return "";
+      default:
+        throw new Error("Unsupported ReplicateLLM message type");
+    }
+  }
+
+  mapMessagesToPromptMeta(
+    messages: ChatMessage[],
+    opts?: {
+      withBos?: boolean;
+      replicate4Bit?: boolean;
+      withNewlines?: boolean;
+    },
+  ) {
+    const {
+      withBos = false,
+      replicate4Bit = false,
+      withNewlines = false,
+    } = opts ?? {};
+    const DEFAULT_SYSTEM_PROMPT = `You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
+
+If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.`;
+
+    const B_SYS = "<<SYS>>\n";
+    const E_SYS = "\n<</SYS>>\n\n";
+    const B_INST = "[INST]";
+    const E_INST = "[/INST]";
+    const BOS = "<s>";
+    const EOS = "</s>";
+
+    if (messages.length === 0) {
+      return { prompt: "", systemPrompt: undefined };
+    }
+
+    messages = [...messages]; // so we can use shift without mutating the original array
+
+    let systemPrompt = undefined;
+    if (messages[0].role === "system") {
+      const systemMessage = messages.shift()!;
+
+      if (replicate4Bit) {
+        systemPrompt = systemMessage.content;
+      } else {
+        const systemStr = `${B_SYS}${systemMessage.content}${E_SYS}`;
+
+        // TS Bug: https://github.com/microsoft/TypeScript/issues/9998
+        // @ts-ignore
+        if (messages[0].role !== "user") {
+          throw new Error(
+            "ReplicateLLM: if there is a system message, the second message must be a user message.",
+          );
+        }
+
+        const userContent = messages[0].content;
+
+        messages[0].content = `${systemStr}${userContent}`;
+      }
+    } else {
+      if (!replicate4Bit) {
+        messages[0].content = `${B_SYS}${DEFAULT_SYSTEM_PROMPT}${E_SYS}${messages[0].content}`;
+      }
+    }
+
+    return {
+      prompt: messages.reduce((acc, message, index) => {
+        const content = extractText(message.content);
+        if (index % 2 === 0) {
+          return (
+            `${acc}${withBos ? BOS : ""}${B_INST} ${content.trim()} ${E_INST}` +
+            (withNewlines ? "\n" : "")
+          );
+        } else {
+          return (
+            `${acc} ${content.trim()}` +
+            (withNewlines ? "\n" : " ") +
+            (withBos ? EOS : "")
+          ); // Yes, the EOS comes after the space. This is not a mistake.
+        }
+      }, ""),
+      systemPrompt,
+    };
   }
 
-  return defaultReplicateSession;
+  chat(
+    params: LLMChatParamsStreaming,
+  ): Promise<AsyncIterable<ChatResponseChunk>>;
+  chat(params: LLMChatParamsNonStreaming): Promise<ChatResponse>;
+  @wrapLLMEvent
+  async chat(
+    params: LLMChatParamsNonStreaming | LLMChatParamsStreaming,
+  ): Promise<ChatResponse | AsyncIterable<ChatResponseChunk>> {
+    const { messages, stream } = params;
+    const api = ALL_AVAILABLE_REPLICATE_MODELS[this.model]
+      .replicateApi as `${string}/${string}:${string}`;
+
+    const { prompt, systemPrompt } = this.mapMessagesToPrompt(messages);
+
+    const replicateOptions: any = {
+      input: {
+        prompt,
+        system_prompt: systemPrompt,
+        temperature: this.temperature,
+        top_p: this.topP,
+      },
+    };
+
+    if (this.model.endsWith("4bit")) {
+      replicateOptions.input.max_new_tokens = this.maxTokens;
+    } else {
+      replicateOptions.input.max_length = this.maxTokens;
+    }
+
+    //TODO: Add streaming for this
+    if (stream) {
+      throw new Error("Streaming not supported for ReplicateLLM");
+    }
+
+    //Non-streaming
+    const response = await this.replicateSession.replicate.run(
+      api,
+      replicateOptions,
+    );
+    return {
+      raw: response,
+      message: {
+        content: (response as Array<string>).join("").trimStart(),
+        //^ We need to do this because Replicate returns a list of strings (for streaming, which the run function does not expose)
+        role: "assistant",
+      },
+    };
+  }
 }
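Finally, a minimal sketch of the renamed class in use (assumes REPLICATE_API_TOKEN is set; the repo-relative import path and example messages are illustrative):

import { ReplicateChatStrategy, ReplicateLLM } from "./llm/replicate_ai.js";

async function demo() {
  const llm = new ReplicateLLM({
    model: "Llama-2-13b-chat-4bit",
    chatStrategy: ReplicateChatStrategy.REPLICATE4BITWNEWLINES,
    temperature: 0.1,
  });

  // With the REPLICATE4BIT* strategies the system message comes back separately
  // as systemPrompt and the prompt is just "[INST] ... [/INST]\n"; the META
  // strategies inline it as <<SYS>> ... <</SYS>> inside the first [INST] block.
  const { prompt, systemPrompt } = llm.mapMessagesToPrompt([
    { role: "system", content: "You are terse." },
    { role: "user", content: "Name one prime number." },
  ]);
  console.log({ prompt, systemPrompt });

  const response = await llm.chat({
    messages: [{ role: "user", content: "Name one prime number." }],
  });
  console.log(response.message.content);
}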