diff --git a/package.json b/package.json
index 8e9e9f49ac3503cdc6052638e4a1bc05c6037953..e177cc22af4e303b477911fd422ae619ee7a2135 100644
--- a/package.json
+++ b/package.json
@@ -17,7 +17,8 @@
     "prettier-plugin-tailwindcss": "^0.3.0",
     "ts-jest": "^29.1.0",
     "turbo": "latest",
-    "wink-nlp": "latest"
+    "wink-nlp": "latest",
+    "tiktoken-node": "latest"
   },
   "packageManager": "pnpm@7.15.0",
   "name": "llamascript"
diff --git a/packages/core/src/TextSplitter.ts b/packages/core/src/TextSplitter.ts
index cdcf8677b5193b53c436e21910912c5e20a35d07..1b040aa3db10a0fab25864a5cf92d1e12d46d9b8 100644
--- a/packages/core/src/TextSplitter.ts
+++ b/packages/core/src/TextSplitter.ts
@@ -2,217 +2,234 @@
 
 import { DEFAULT_CHUNK_SIZE, DEFAULT_CHUNK_OVERLAP } from "./constants";
 
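+// Container for one chunk of split text, plus (optionally) the number of
+// characters it overlaps with the previous chunk.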
+class TextSplit {
+  textChunk: string;
+  numCharOverlap: number | undefined;
 
-class SentenceSplitter {
-  private _separator: string;
-  private _chunk_size: number;
-  private _chunk_overlap: number;
+  constructor(textChunk: string, numCharOverlap: number | undefined = undefined) {
+    this.textChunk = textChunk;
+    this.numCharOverlap = numCharOverlap;
+  }
+}
+
+type SplitRep = [text: string, numTokens: number];
+
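+// Splits text into chunks of roughly chunkSize tokens, preferring to keep
+// whole sentences together and to carry about chunkOverlap tokens of
+// trailing context between consecutive chunks.
+//
+// Minimal usage sketch (sizes illustrative):
+//
+//   const splitter = new SentenceSplitter(256, 20);
+//   const chunks = splitter.splitText(longDocument);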
+export class SentenceSplitter {
+  private chunkSize: number;
+  private chunkOverlap: number;
   private tokenizer: any;
-  private _backup_separators: string[];
-  private _paragraph_separator: string | undefined;
-  private _chunking_tokenizer_fn: any;
-  private _secondary_chunking_regex: string;
+  private tokenizerDecoder: any;
+  private paragraphSeparator: string;
+  private chunkingTokenizerFn: any;
   // private _callback_manager: any;
 
   constructor(
-    separator: string = " ",
-    chunk_size: number = DEFAULT_CHUNK_SIZE,
-    chunk_overlap: number = DEFAULT_CHUNK_OVERLAP,
+    chunkSize: number = DEFAULT_CHUNK_SIZE,
+    chunkOverlap: number = DEFAULT_CHUNK_OVERLAP,
     tokenizer: any = null,
-    backup_separators: string[] = ["\n"],
-    paragraph_separator: string | undefined = "\n\n\n",
-    chunking_tokenizer_fn: any = undefined,
-    secondary_chunking_regex: string = "[^,.;。]+[,.;。]?",
+    tokenizerDecoder: any = null,
+    paragraphSeparator: string = "\n\n\n",
+    chunkingTokenizerFn: any = undefined,
     // callback_manager: any = undefined
   ) {
-    if (chunk_overlap > chunk_size) {
+    if (chunkOverlap > chunkSize) {
       throw new Error(
-        `Got a larger chunk overlap (${chunk_overlap}) than chunk size (${chunk_size}), should be smaller.`
+        `Got a larger chunk overlap (${chunkOverlap}) than chunk size (${chunkSize}), should be smaller.`
       );
     }
-    this._separator = separator;
-    this._chunk_size = chunk_size;
-    this._chunk_overlap = chunk_overlap;
-    this.tokenizer = tokenizer;
-    this._backup_separators = backup_separators;
+    this.chunkSize = chunkSize;
+    this.chunkOverlap = chunkOverlap;
     // this._callback_manager = callback_manager || new CallbackManager([]);
 
-    if (chunking_tokenizer_fn == undefined) {
+    if (chunkingTokenizerFn == undefined) {
 
       // define a callable mapping a string to a list of strings
-      const default_chunking_tokenizer_fn = (text: string) => {
-        var result = text.match(/[^.?!]+[.!?]+[\])'"`’”]*|.+/g);
-        return result
-      };
+      // match sentence-like spans ending in ., ! or ? (plus closing
+      // quotes/brackets), falling back to any remaining text
+      const defaultChunkingTokenizerFn = (text: string) => {
+        return text.match(/[^.?!]+[.!?]+[\])'"`’”]*|.+/g);
+      };
       
-      chunking_tokenizer_fn = default_chunking_tokenizer_fn;
+      chunkingTokenizerFn = defaultChunkingTokenizerFn;
     }
 
-    if (tokenizer == undefined) {
-      const tiktoken = require('tiktoken-node')
-      let enc = new tiktoken.getEncoding("gpt-2")
-      const default_tokenizer = (text: string) => {
-        return enc.encode(text)
-      }
-      tokenizer = default_tokenizer
+    if (tokenizer == undefined || tokenizerDecoder == undefined) {
+      const tiktoken = require('tiktoken-node')
+      // getEncoding is a factory function; calling it with `new` was a bug
+      const enc = tiktoken.getEncoding("gpt2")
+      const defaultTokenizer = (text: string) => {
+        return enc.encode(text)
+      }
+      // decode expects the token array produced by encode, not a string
+      const defaultTokenizerDecoder = (tokens: number[]) => {
+        return enc.decode(tokens)
+      }
+      tokenizer = defaultTokenizer
+      tokenizerDecoder = defaultTokenizerDecoder
+    }
+    this.tokenizer = tokenizer;
+    this.tokenizerDecoder = tokenizerDecoder;
+    
+    this.paragraphSeparator = paragraphSeparator;
+    this.chunkingTokenizerFn = chunkingTokenizerFn;
+
+  }
+
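+  // Example (token counts illustrative): with chunkSize = 512 and an
+  // extraInfoStr of ~20 tokens, the metadata, the "\n\n" separator, and the
+  // safety margin leave roughly 512 - 23 = 489 tokens for actual text.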
+  private getEffectiveChunkSize(extraInfoStr?: string): number {
+    // get "effective" chunk size by removing the metadata
+    let effectiveChunkSize;
+    if (extraInfoStr != undefined) {
+      const numExtraTokens = this.tokenizer(`${extraInfoStr}\n\n`).length + 1;
+      effectiveChunkSize = this.chunkSize - numExtraTokens;
+      if (effectiveChunkSize <= 0) {
+        throw new Error(
+          "Effective chunk size is non positive after considering extra_info"
+        );
+      }
+    } else {
+      effectiveChunkSize = this.chunkSize;
     }
+    return effectiveChunkSize;
     
-    this._paragraph_separator = paragraph_separator;
-    this._chunking_tokenizer_fn = chunking_tokenizer_fn;
-    this._secondary_chunking_regex = secondary_chunking_regex;
+  }
 
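+  // Example: with paragraphSeparator = "\n\n\n", "a\n\n\nb\n\n\nc" splits into
+  // ["a", "b", "c"]; when effectiveChunkSize is given, undersized paragraphs
+  // are merged with their successor (separator re-inserted) until big enough.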
+  getParagraphSplits(text: string, effectiveChunkSize?: number): string[] {
+    // get paragraph splits
+    let paragraphSplits: string[] = text.split(this.paragraphSeparator);
+    let idx = 0;
+    if (effectiveChunkSize == undefined) {
+      return paragraphSplits;
+    }
+
+    // merge paragraphs that are too small
+    while (idx < paragraphSplits.length) {
+      if (idx < paragraphSplits.length - 1 && paragraphSplits[idx].length < effectiveChunkSize) {
+        paragraphSplits[idx] = [paragraphSplits[idx], paragraphSplits[idx + 1]].join(this.paragraphSeparator);
+        paragraphSplits.splice(idx + 1, 1);
+      } else {
+        idx += 1;
+      }
+    }
+    return paragraphSplits;
+  }
+
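+  // Example: "It works. Really?" → ["It works.", "Really?"] under the
+  // default regex-based chunking tokenizer fn.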
+  getSentenceSplits(text: string, effectiveChunkSize?: number): string[] {
+    let paragraphSplits = this.getParagraphSplits(text, effectiveChunkSize);
+    // next, split each paragraph into sentences using the chunking tokenizer fn
+    let splits = [];
+    for (const parText of paragraphSplits) {
+      let sentenceSplits = this.chunkingTokenizerFn(parText);
+      for (const sentenceSplit of sentenceSplits) {
+        splits.push(sentenceSplit.trim());
+      }
+    }
+    return splits;
   }
 
-  splitText(text: string, extra_info_str?: string): string[] {
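+  // Example (illustrative): with effectiveChunkSize = 10, a 25-token
+  // sentence is sliced into 10-, 10-, and 5-token pieces and decoded back to
+  // text; note each forced piece is counted as effectiveChunkSize tokens.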
+  private processSentenceSplits(sentenceSplits: string[], effectiveChunkSize: number): SplitRep[] {
+    // Process sentence splits: check whether any sentence exceeds the chunk
+    // size; if one does, force-split it by tokenizer into chunk-sized pieces
+    let newSplits: SplitRep[] = [];
+    for (const split of sentenceSplits) {
+      let splitTokens = this.tokenizer(split);
+      const splitLen = splitTokens.length;
+      if (splitLen <= effectiveChunkSize) {
+        newSplits.push([split, splitLen]);
+      } else {
+        for (let i = 0; i < splitLen; i += effectiveChunkSize) {
+          const curSplit = this.tokenizerDecoder(splitTokens.slice(i, i + effectiveChunkSize));
+          newSplits.push([curSplit, effectiveChunkSize]);
+        }
+      }
+    }
+    return newSplits;
+  }
+
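+  // Single pass over the sentence splits: sentences accumulate in curDocList
+  // until the next one would overflow effectiveChunkSize; curDocBuffer keeps
+  // at most ~chunkOverlap trailing tokens to seed the next chunk with shared
+  // context.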
+  combineTextSplits(newSentenceSplits: SplitRep[], effectiveChunkSize: number): TextSplit[] {
+    // go through sentence splits and combine them into chunks that fit
+    // within the chunk size
+
+    // docs represents the final list of text chunks
+    let docs: TextSplit[] = [];
+    // curDocList represents the current list of sentence splits that
+    // will be merged into a chunk
+    let curDocList: string[] = [];
+    let bufferTokens = 0;
+    let curDocTokens = 0;
+    // curDocBuffer holds the trailing sentences carried over as overlap
+    let curDocBuffer: SplitRep[] = [];
+
+    for (let i = 0; i < newSentenceSplits.length; i++) {
+
+      // update buffer
+      curDocBuffer.push(newSentenceSplits[i]);
+      bufferTokens += newSentenceSplits[i][1] + 1;
+
+      while (bufferTokens > this.chunkOverlap) {
+        // remove the first element from curDocBuffer and subtract its
+        // token count (plus one for the joining space)
+        let firstElement = curDocBuffer.shift();
+        if (firstElement == undefined) {
+          throw new Error("firstElement is undefined");
+        }
+        bufferTokens -= firstElement[1] + 1;
+      }
+
+      // if adding newSentenceSplits[i] to the current doc would exceed
+      // effectiveChunkSize, flush curDocList to docs and restart from the buffer
+      if (curDocTokens + newSentenceSplits[i][1] > effectiveChunkSize) {
+        // push the current doc list to docs
+        docs.push(new TextSplit(curDocList.join(" ").trim()));
+        // re-seed the next doc from the overlap buffer
+        curDocTokens = 0;
+        curDocList = [];
+        for (let j = 0; j < curDocBuffer.length; j++) {
+          curDocList.push(curDocBuffer[j][0]);
+          curDocTokens += curDocBuffer[j][1] + 1;
+        }
+      }
+
+      curDocList.push(newSentenceSplits[i][0]);
+      curDocTokens += newSentenceSplits[i][1] + 1;
+    }
+    docs.push(new TextSplit(curDocList.join(" ").trim()));
+    return docs;
+  }
+
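+  // End-to-end pipeline: compute the effective chunk size (metadata aware),
+  // split into sentences, force-split oversized sentences by tokenizer, then
+  // merge into overlapping chunks.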
+  splitTextWithOverlaps(text: string, extraInfoStr?: string): TextSplit[] {
+    // Split incoming text and return chunks with overlap size.
+    // Has a preference for complete sentences, phrases, and minimal overlap.
+    if (text == "") {
+      return [];
+    }
+
+    let effectiveChunkSize = this.getEffectiveChunkSize(extraInfoStr);
+    let sentenceSplits = this.getSentenceSplits(text, effectiveChunkSize);
+
+    // check if any sentences exceed the chunk size; if they do,
+    // force-split them by tokenizer
+    let newSentenceSplits = this.processSentenceSplits(sentenceSplits, effectiveChunkSize);
+
+    // combine sentence splits into chunks of text that can then be returned
+    let combinedTextSplits = this.combineTextSplits(newSentenceSplits, effectiveChunkSize);
+
+    return combinedTextSplits;
+  }
+
+  splitText(text: string, extraInfoStr?: string): string[] {
-    const text_splits = this.splitTextWithOverlaps(text);
-    const chunks = text_splits.map((text_split) => text_split.text_chunk);
+    // forward extraInfoStr so metadata is accounted for in the chunk size
+    const textSplits = this.splitTextWithOverlaps(text, extraInfoStr);
+    const chunks = textSplits.map((textSplit) => textSplit.textChunk);
     return chunks;
   }
 
 
 }
 
-
-
-// class TokenTextSplitter {
-//   private _separator: string;
-//   private _chunk_size: number;
-//   private _chunk_overlap: number;
-//   private tokenizer: any;
-//   private _backup_separators: string[];
-//   private callback_manager: any;
-
-//   constructor(
-//     separator: string = " ",
-//     chunk_size: number = DEFAULT_CHUNK_SIZE,
-//     chunk_overlap: number = DEFAULT_CHUNK_OVERLAP,
-//     tokenizer: any = null,
-//     backup_separators: string[] = ["\n"]
-//     // callback_manager: any = null
-//   ) {
-//     if (chunk_overlap > chunk_size) {
-//       throw new Error(
-//         `Got a larger chunk overlap (${chunk_overlap}) than chunk size (${chunk_size}), should be smaller.`
-//       );
-//     }
-//     this._separator = separator;
-//     this._chunk_size = chunk_size;
-//     this._chunk_overlap = chunk_overlap;
-//     this.tokenizer = tokenizer || globals_helper.tokenizer;
-//     this._backup_separators = backup_separators;
-//     // this.callback_manager = callback_manager || new CallbackManager([]);
-//   }
-
-//   private _reduceChunkSize(
-//     start_idx: number,
-//     cur_idx: number,
-//     splits: string[]
-//   ): number {
-//     let current_doc_total = this.tokenizer(
-//       splits.slice(start_idx, cur_idx).join(this._separator)
-//     ).length;
-//     while (current_doc_total > this._chunk_size) {
-//       const percent_to_reduce =
-//         (current_doc_total - this._chunk_size) / current_doc_total;
-//       const num_to_reduce =
-//         parseInt(percent_to_reduce.toString()) * (cur_idx - start_idx) + 1;
-//       cur_idx -= num_to_reduce;
-//       current_doc_total = this.tokenizer(
-//         splits.slice(start_idx, cur_idx).join(this._separator)
-//       ).length;
-//     }
-//     return cur_idx;
-//   }
-
-//   _preprocessSplits(splits: Array<string>, chunk_size: number): Array<string> {
-//     const new_splits: Array<string> = [];
-//     for (const split of splits) {
-//       const num_cur_tokens = tokenizer(split).length;
-//       if (num_cur_tokens <= chunk_size) {
-//         new_splits.push(split);
-//       } else {
-//         let cur_splits: Array<string> = [split];
-//         if (backup_separators) {
-//           for (const sep of backup_separators) {
-//             if (split.includes(sep)) {
-//               cur_splits = split.split(sep);
-//               break;
-//             }
-//           }
-//         } else {
-//           cur_splits = [split];
-//         }
-
-//         const cur_splits2: Array<string> = [];
-//         for (const cur_split of cur_splits) {
-//           const num_cur_tokens = tokenizer(cur_split).length;
-//           if (num_cur_tokens <= chunk_size) {
-//             cur_splits2.push(cur_split);
-//           } else {
-//             // split cur_split according to chunk size of the token numbers
-//             const cur_split_chunks: Array<string> = [];
-//             let end_idx = cur_split.length;
-//             while (tokenizer(cur_split.slice(0, end_idx)).length > chunk_size) {
-//               for (let i = 1; i < end_idx; i++) {
-//                 const tmp_split = cur_split.slice(0, end_idx - i);
-//                 if (tokenizer(tmp_split).length <= chunk_size) {
-//                   cur_split_chunks.push(tmp_split);
-//                   cur_splits2.push(cur_split.slice(end_idx - i, end_idx));
-//                   end_idx = cur_split.length;
-//                   break;
-//                 }
-//               }
-//             }
-//             cur_split_chunks.push(cur_split);
-//             cur_splits2.push(...cur_split_chunks);
-//           }
-//         }
-//         new_splits.push(...cur_splits2);
-//       }
-//     }
-//     return new_splits;
-//   }
-
-//   _postprocessSplits(docs: TextSplit[]): TextSplit[] {
-//     const new_docs: TextSplit[] = [];
-//     for (const doc of docs) {
-//       if (doc.text_chunk.replace(" ", "") == "") {
-//         continue;
-//       }
-//       new_docs.push(doc);
-//     }
-//     return new_docs;
-//   }
-
-//   splitText(text: string, extra_info_str?: string): string[] {
-//     const text_splits = this.splitTextWithOverlaps(text);
-//     const chunks = text_splits.map((text_split) => text_split.text_chunk);
-//     return chunks;
-//   }
-
-//   splitTextWithOverlaps(text: string) {}
-
-//   truncateText(text: string, separator: string, chunk_size: number): string {
-//     if (text == "") {
-//       return "";
-//     }
-//     // First we naively split the large input into a bunch of smaller ones.
-//     let splits: string[] = text.split(separator);
-//     splits = preprocessSplits(splits, chunk_size);
-
-//     let start_idx = 0;
-//     let cur_idx = 0;
-//     let cur_total = 0;
-//     while (cur_idx < splits.length) {
-//       let cur_token = splits[cur_idx];
-//       let num_cur_tokens = Math.max(tokenizer(cur_token).length, 1);
-//       if (cur_total + num_cur_tokens > chunk_size) {
-//         cur_idx = reduce_chunk_size(start_idx, cur_idx, splits);
-//         break;
-//       }
-//       cur_total += num_cur_tokens;
-//       cur_idx += 1;
-//     }
-//     return splits.slice(start_idx, cur_idx).join(separator);
-//   }
-// }
diff --git a/packages/core/src/tests/TextSplitter.test.ts b/packages/core/src/tests/TextSplitter.test.ts
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..1cfd7abf614ffc4db55507b4052e677893dc7166 100644
--- a/packages/core/src/tests/TextSplitter.test.ts
+++ b/packages/core/src/tests/TextSplitter.test.ts
@@ -0,0 +1,43 @@
+import { SentenceSplitter } from "../TextSplitter";
+
+describe("SentenceSplitter", () => {
+  test("initializes", () => {
+    const sentenceSplitter = new SentenceSplitter();
+    expect(sentenceSplitter).toBeDefined();
+  });
+
+  test("splits paragraphs w/o effective chunk size", () => {
+    const sentenceSplitter = new SentenceSplitter(
+      undefined, undefined, undefined, undefined, "\n"
+    );
+    let splits = sentenceSplitter.getParagraphSplits("This is a paragraph.\nThis is another paragraph.", undefined);
+    expect(splits).toEqual(["This is a paragraph.", "This is another paragraph."]);
+  });
+
+  test("splits paragraphs with effective chunk size", () => {
+    const sentenceSplitter = new SentenceSplitter(
+      undefined, undefined, undefined, undefined, "\n"
+    );
+    let splits = sentenceSplitter.getParagraphSplits("This is a paragraph.\nThis is another paragraph.", 1000);
+    expect(splits).toEqual(["This is a paragraph.\nThis is another paragraph."]);
+  });
+  
+  test("splits sentences", () => {
+    const sentenceSplitter = new SentenceSplitter();
+    let splits = sentenceSplitter.getSentenceSplits("This is a sentence. This is another sentence.", undefined);
+    expect(splits).toEqual(["This is a sentence.", "This is another sentence."]);
+  });
+
+  test("overall split text", () => {
+    let sentenceSplitter = new SentenceSplitter(5, 0);
+    let splits = sentenceSplitter.splitText("This is a sentence. This is another sentence.");
+    expect(splits).toEqual(["This is a sentence.", "This is another sentence."]);
+
+    sentenceSplitter = new SentenceSplitter(1000);
+    splits = sentenceSplitter.splitText("This is a sentence. This is another sentence.");
+    expect(splits).toEqual(["This is a sentence. This is another sentence."]);
+  });
+
+});
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 159b6f8d3889030ddf6a328b632cb15912946cc5..c9bf581855ef97ab0ec043ad755aa795b0314cfb 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -29,6 +29,9 @@ importers:
       prettier-plugin-tailwindcss:
         specifier: ^0.3.0
         version: 0.3.0(prettier@2.8.8)
+      tiktoken-node:
+        specifier: latest
+        version: 0.0.6
       ts-jest:
         specifier: ^29.1.0
         version: 29.1.0(@babel/core@7.22.5)(jest@29.5.0)(typescript@4.9.5)
@@ -4755,6 +4758,11 @@ packages:
     resolution: {integrity: sha512-w89qg7PI8wAdvX60bMDP+bFoD5Dvhm9oLheFp5O4a2QF0cSBGsBX4qZmadPMvVqlLJBBci+WqGGOAPvcDeNSVg==}
     dev: true
 
+  /tiktoken-node@0.0.6:
+    resolution: {integrity: sha512-MiprfzPhoKhCflzl0Jyds0VKibAgUGHfJLvBCAXPpum6Lru6ZoKQGsl8lJP0B94LPpby2B2WveOB2tZVfEZQOQ==}
+    engines: {node: '>= 14'}
+    dev: true
+
   /title-case@2.1.1:
     resolution: {integrity: sha512-EkJoZ2O3zdCz3zJsYCsxyq2OC5hrxR9mfdd5I+w8h/tmFfeOxJ+vvkxsKxdmN0WtS9zLdHEgfgVOiMVgv+Po4Q==}
     dependencies: