From 68bf89b01b1c5123e37a42f34c537f7280b22bb0 Mon Sep 17 00:00:00 2001
From: Mateusz Charytoniuk <mateusz.charytoniuk@protonmail.com>
Date: Thu, 18 Jan 2024 22:02:15 +0100
Subject: [PATCH] feat: llama.cpp integration

---
 config.ini.example                            |   8 +-
 docs/pages/docs/changelog/index.md            |   4 +
 docs/pages/docs/features/ai/index.md          |  11 +
 .../pages/docs/features/ai/llama-cpp/index.md |  73 ++++++
 .../tutorials/basic-graphql-schema/index.md   |   2 +-
 .../tutorials/connect-to-llama-cpp/index.md   | 144 +++++++++++
 docs/pages/tutorials/hello-world/index.md     |   2 +-
 ...llamaGenerate.php => LlamaCppGenerate.php} |  25 +-
 src/Command/LlamaCppGenerate/Completion.php   |  32 +++
 src/Command/LlamaCppGenerate/Embedding.php    |  41 +++
 src/Command/LlamaCppHealth.php                |  34 +++
 src/Command/LlamaCppInfill.php                |  44 ++++
 src/Command/OllamaChat.php                    |  71 ------
 src/Command/OllamaGenerate/Completion.php     |  33 ---
 src/Command/OllamaGenerate/Embedding.php      |  51 ----
 src/JsonSerializer.php                        |  19 +-
 src/LlamaCppClient.php                        | 233 ++++++++++++++++++
 src/LlamaCppCompletionRequest.php             |  26 ++
 src/LlamaCppCompletionToken.php               |  20 ++
 ...guration.php => LlamaCppConfiguration.php} |   3 +-
 src/LlamaCppEmbedding.php                     |  15 ++
 ...ssage.php => LlamaCppEmbeddingRequest.php} |  10 +-
 src/LlamaCppHealthStatus.php                  |  12 +
 src/LlamaCppInfill.php                        |  21 ++
 src/LlamaCppInfillRequest.php                 |  27 ++
 ...inkBuilder.php => LlamaCppLinkBuilder.php} |   4 +-
 src/OllamaChatRequest.php                     |  30 ---
 src/OllamaChatRole.php                        |  12 -
 src/OllamaChatSession.php                     |  41 ---
 src/OllamaChatToken.php                       |  21 --
 src/OllamaClient.php                          | 184 --------------
 src/OllamaCompletionRequest.php               |  32 ---
 src/OllamaCompletionToken.php                 |  21 --
 src/OllamaEmbeddingRequest.php                |  27 --
 src/OllamaEmbeddingResponse.php               |  20 --
 src/OllamaRequestOptions.php                  |  27 --
 src/OllamaRequestStopDelimiter.php            |  23 --
 ....php => LlamaCppConfigurationProvider.php} |  22 +-
 38 files changed, 786 insertions(+), 639 deletions(-)
 create mode 100644 docs/pages/docs/features/ai/index.md
 create mode 100644 docs/pages/docs/features/ai/llama-cpp/index.md
 create mode 100644 docs/pages/tutorials/connect-to-llama-cpp/index.md
 rename src/Command/{OllamaGenerate.php => LlamaCppGenerate.php} (52%)
 create mode 100644 src/Command/LlamaCppGenerate/Completion.php
 create mode 100644 src/Command/LlamaCppGenerate/Embedding.php
 create mode 100644 src/Command/LlamaCppHealth.php
 create mode 100644 src/Command/LlamaCppInfill.php
 delete mode 100644 src/Command/OllamaChat.php
 delete mode 100644 src/Command/OllamaGenerate/Completion.php
 delete mode 100644 src/Command/OllamaGenerate/Embedding.php
 create mode 100644 src/LlamaCppClient.php
 create mode 100644 src/LlamaCppCompletionRequest.php
 create mode 100644 src/LlamaCppCompletionToken.php
 rename src/{OllamaConfiguration.php => LlamaCppConfiguration.php} (74%)
 create mode 100644 src/LlamaCppEmbedding.php
 rename src/{OllamaChatMessage.php => LlamaCppEmbeddingRequest.php} (53%)
 create mode 100644 src/LlamaCppHealthStatus.php
 create mode 100644 src/LlamaCppInfill.php
 create mode 100644 src/LlamaCppInfillRequest.php
 rename src/{OllamaLinkBuilder.php => LlamaCppLinkBuilder.php} (82%)
 delete mode 100644 src/OllamaChatRequest.php
 delete mode 100644 src/OllamaChatRole.php
 delete mode 100644 src/OllamaChatSession.php
 delete mode 100644 src/OllamaChatToken.php
 delete mode 100644 src/OllamaClient.php
 delete mode 100644 src/OllamaCompletionRequest.php
 delete mode 100644 src/OllamaCompletionToken.php
 delete mode 100644 src/OllamaEmbeddingRequest.php
 delete mode 100644 src/OllamaEmbeddingResponse.php
 delete mode 100644 src/OllamaRequestOptions.php
 delete mode 100644 src/OllamaRequestStopDelimiter.php
 rename src/SingletonProvider/ConfigurationProvider/{OllamaConfigurationProvider.php => LlamaCppConfigurationProvider.php} (64%)

diff --git a/config.ini.example b/config.ini.example
index cdb398fb..61f30836 100644
--- a/config.ini.example
+++ b/config.ini.example
@@ -15,14 +15,14 @@ default[log_queries] = false
 default[pool_prefill] = false
 default[pool_size] = 8
 
+[llamacpp]
+host = 127.0.0.1
+port = 8081
+
 [manifest]
 background_color = "#ffffff"
 theme_color = "#ffffff"
 
-[ollama]
-host = 127.0.0.1
-port = 11434
-
 [redis]
 default[db_index] = 0
 default[host] = 127.0.0.1
diff --git a/docs/pages/docs/changelog/index.md b/docs/pages/docs/changelog/index.md
index b702a1cb..a7445a8c 100644
--- a/docs/pages/docs/changelog/index.md
+++ b/docs/pages/docs/changelog/index.md
@@ -10,6 +10,10 @@ title: Changelog
 
 # Changelog
 
+## v0.14.0
+
+- Feature: added {{docs/features/ai/llama-cpp/index}} to integrate with LLMs 
+
 ## v0.11.1
 
 - Fix: translation files were incorrectly loaded
diff --git a/docs/pages/docs/features/ai/index.md b/docs/pages/docs/features/ai/index.md
new file mode 100644
index 00000000..4413ef18
--- /dev/null
+++ b/docs/pages/docs/features/ai/index.md
@@ -0,0 +1,11 @@
+---
+collections: 
+    - documents
+layout: dm:document
+parent: docs/features/index
+title: AI
+description: >
+    Use integration features to serve or use AI models.
+---
+
+{{docs/features/ai/*/index}}
diff --git a/docs/pages/docs/features/ai/llama-cpp/index.md b/docs/pages/docs/features/ai/llama-cpp/index.md
new file mode 100644
index 00000000..faa10672
--- /dev/null
+++ b/docs/pages/docs/features/ai/llama-cpp/index.md
@@ -0,0 +1,73 @@
+---
+collections: 
+    - documents
+layout: dm:document
+parent: docs/features/ai/index
+title: llama.cpp
+description: >
+    Use Resonance to connect with a llama.cpp server.
+---
+
+## llama.cpp
+
+[llama.cpp](https://github.com/ggerganov/llama.cpp) is an open source framework
+capable of running various LLM models.
+
+It has a built-in HTTP server that supports continuous batching and parallel
+requests, and is optimized for resource usage.
+
+You can use Resonance to connect with it and process LLM responses.
+
+## Usage
+
+You can also check the tutorial: {{tutorials/connect-to-llama-cpp/index}}
+
+## Configuration
+
+All you need to do is add a configuration section that specifies the llama.cpp
+server location:
+
+```ini
+[llamacpp]
+host = 127.0.0.1
+port = 8081
+```
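+
+If your llama.cpp server requires authentication, you can also set the
+optional `apiKey` option (it defaults to `null`); when present, it is sent
+as an `Authorization: Bearer` header. For example:
+
+```ini
+[llamacpp]
+host = 127.0.0.1
+port = 8081
+apiKey = your-secret-key
+```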
+
+## Programmatic Use
+
+In your class, you need to use {{docs/features/dependency-injection/index}} to
+inject `LlamaCppClient`:
+
+```php
+<?php
+
+namespace App;
+
+use Distantmagic\Resonance\Attribute\Singleton;
+use Distantmagic\Resonance\LlamaCppClient;
+use Distantmagic\Resonance\LlamaCppCompletionRequest;
+
+#[Singleton]
+class LlamaCppGenerate 
+{
+    public function __construct(protected LlamaCppClient $llamaCppClient) 
+    {
+    }
+
+    public function doSomething(): void
+    {
+        $request = new LlamaCppCompletionRequest('How to make a cat happy?');
+
+        $completion = $this->llamaCppClient->generateCompletion($request);
+
+        // each token is a chunk of text, usually a few characters returned
+        // from the model you are using
+        foreach ($completion as $token) {
+            swoole_error_log(SWOOLE_LOG_DEBUG, (string) $token);
+
+            if ($token->isLast) {
+                // ...do something else
+            }
+        }
+    }
+}
+```
diff --git a/docs/pages/tutorials/basic-graphql-schema/index.md b/docs/pages/tutorials/basic-graphql-schema/index.md
index 12538ef4..d6042c24 100644
--- a/docs/pages/tutorials/basic-graphql-schema/index.md
+++ b/docs/pages/tutorials/basic-graphql-schema/index.md
@@ -3,7 +3,7 @@ collections:
   - tutorials
 layout: dm:tutorial
 parent: tutorials/index
-title: Basic GraphQL Schema
+title: Building a Basic GraphQL Schema
 description: >
     Learn How to Build a Basic GraphQL Schema
 ---
diff --git a/docs/pages/tutorials/connect-to-llama-cpp/index.md b/docs/pages/tutorials/connect-to-llama-cpp/index.md
new file mode 100644
index 00000000..26041365
--- /dev/null
+++ b/docs/pages/tutorials/connect-to-llama-cpp/index.md
@@ -0,0 +1,144 @@
+---
+collections:
+  - tutorials
+layout: dm:tutorial
+parent: tutorials/index
+title: How to Serve LLM Completions (With llama.cpp)
+description: >
+    How to connect with llama.cpp and issue parallel requests for LLM 
+    completions and embeddings with Resonance.
+---
+
+## Preparations
+
+To start, you need to compile 
+[llama.cpp](https://github.com/ggerganov/llama.cpp). You can follow their 
+[README](https://github.com/ggerganov/llama.cpp/blob/master/README.md) for
+instructions.
+
+The server is compiled alongside other targets by default.
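+
+For example, on Linux, running `make` in the repository root typically builds
+the `server` binary along with the other targets (check their README for
+platform-specific options and GPU backends):
+
+```shell
+$ make
+```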
+
+Once you have the server binary built, we can continue.
+
+## Troubleshooting
+
+### Obtaining Open-Source LLM
+
+I recommend starting either with [llama2](https://ai.meta.com/llama/) or 
+[Mistral](https://mistral.ai/). You need to download the pretrained weights
+and convert them into GGUF format before they can be used with 
+[llama.cpp](https://github.com/ggerganov/llama.cpp).
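+
+As a sketch (the exact script name and flags depend on the llama.cpp version
+you checked out, so verify against their README), the conversion looks like
+this:
+
+```shell
+$ python3 convert.py ~/llama-2-7b-chat \
+    --outfile ./models/7B/ggml-model-f16.gguf
+```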
+
+### Starting the Server Without a GPU
+
+[llama.cpp](https://github.com/ggerganov/llama.cpp) supports CPU-only setups,
+so you don't have to do any additional configuration. Generation will be
+slow, but you will still get tokens.
+
+### Running With Low VRAM
+
+You can try quantization if you don't have enough VRAM on your GPU to run a
+specific model. Quantization reduces the memory the model needs, at the cost
+of some response quality. llama.cpp has a utility to quantize models:
+
+```shell
+$ ./quantize ./models/7B/ggml-model-f16.gguf ./models/7B/ggml-model-q4_0.gguf q4_0
+```
+
+10GB of VRAM is enough to run most quantized models.
+
+## Starting llama.cpp Server
+
+While writing this tutorial, I started the server with the following command:
+
+```shell
+$ ./server \
+    --model ~/llama-2-7b-chat/ggml-model-q4_0.gguf \
+    --n-gpu-layers 200000 \
+    --ctx-size 2048 \
+    --parallel 8 \
+    --cont-batching \
+    --mlock \
+    --port 8081
+```
+
+The `cont-batching` parameter is essential because it enables continuous
+batching, an optimization technique that allows handling multiple completion
+requests in parallel.
+
+Without it, even with multiple `parallel` slots, the server can answer only
+one request at a time.
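+
+Because `LlamaCppClient` streams responses inside Swoole coroutines, you can
+take advantage of that server-side parallelism from Resonance. A minimal
+sketch (assuming you are already in a coroutine context, with an injected
+`$llamaCppClient`):
+
+```php
+<?php
+
+use Distantmagic\Resonance\LlamaCppCompletionRequest;
+
+// Both completions run concurrently; with cont-batching enabled the server
+// processes them in parallel instead of queuing them.
+go(function () use ($llamaCppClient) {
+    $request = new LlamaCppCompletionRequest('First prompt');
+
+    foreach ($llamaCppClient->generateCompletion($request) as $token) {
+        // ...handle tokens of the first completion
+    }
+});
+
+go(function () use ($llamaCppClient) {
+    $request = new LlamaCppCompletionRequest('Second prompt');
+
+    foreach ($llamaCppClient->generateCompletion($request) as $token) {
+        // ...handle tokens of the second completion
+    }
+});
+```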
+
+## Configuring Resonance
+
+All you need to do is add a configuration section that specifies the llama.cpp
+server location:
+
+```ini
+[llamacpp]
+host = 127.0.0.1
+port = 8081
+```
+
+## Testing
+
+Resonance has built-in commands that connect to llama.cpp and issue requests.
+You can send a sample prompt through `llamacpp:completion`:
+
+```shell
+$ php ./bin/resonance.php llamacpp:completion "How to write a 'Hello, world' in PHP?"
+To write a "Hello, world" in PHP, you can use the following code:
+
+<?php
+  echo "Hello, world!";
+?>
+
+This will produce a simple "Hello, world!" message when executed.
+```
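+
+You can also check that Resonance can reach the server with the
+`llamacpp:health` command, which prints the server's health status (one of
+`ok`, `loading model` or `error`):
+
+```shell
+$ php ./bin/resonance.php llamacpp:health
+ok
+```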
+
+## Programmatic Use
+
+In your class, you need to use {{docs/features/dependency-injection/index}} to
+inject `LlamaCppClient`:
+
+```php
+<?php
+
+namespace App;
+
+use Distantmagic\Resonance\Attribute\Singleton;
+use Distantmagic\Resonance\LlamaCppClient;
+use Distantmagic\Resonance\LlamaCppCompletionRequest;
+
+#[Singleton]
+class LlamaCppGenerate 
+{
+    public function __construct(protected LlamaCppClient $llamaCppClient) 
+    {
+    }
+
+    public function doSomething(): void
+    {
+        $request = new LlamaCppCompletionRequest('How to make a cat happy?');
+
+        $completion = $this->llamaCppClient->generateCompletion($request);
+
+        // each token is a chunk of text, usually a few characters returned
+        // from the model you are using
+        foreach ($completion as $token) {
+            swoole_error_log(SWOOLE_LOG_DEBUG, (string) $token);
+
+            if ($token->isLast) {
+                // ...do something else
+            }
+        }
+    }
+}
+```
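+
+Generating embeddings follows the same pattern. A minimal sketch, reusing the
+injected client from the example above:
+
+```php
+<?php
+
+use Distantmagic\Resonance\LlamaCppEmbeddingRequest;
+
+$request = new LlamaCppEmbeddingRequest('How to make a cat happy?');
+
+// returns a LlamaCppEmbedding with a public array<float> $embedding property
+$embedding = $this->llamaCppClient->generateEmbedding($request);
+
+var_dump($embedding->embedding);
+```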
+
+## Summary
+
+In this tutorial, we went through how to start a
+[llama.cpp](https://github.com/ggerganov/llama.cpp) server and connect to it
+with Resonance.
diff --git a/docs/pages/tutorials/hello-world/index.md b/docs/pages/tutorials/hello-world/index.md
index 268fc59b..696d334e 100644
--- a/docs/pages/tutorials/hello-world/index.md
+++ b/docs/pages/tutorials/hello-world/index.md
@@ -3,7 +3,7 @@ collections:
   - tutorials
 layout: dm:tutorial
 parent: tutorials/index
-title: Hello, World!
+title: "'Hello, World' with Resonance"
 description: >
     Let's walk step by step through the basic Resonance project.
 ---
diff --git a/src/Command/OllamaGenerate.php b/src/Command/LlamaCppGenerate.php
similarity index 52%
rename from src/Command/OllamaGenerate.php
rename to src/Command/LlamaCppGenerate.php
index 9cbc8bb1..681b789e 100644
--- a/src/Command/OllamaGenerate.php
+++ b/src/Command/LlamaCppGenerate.php
@@ -5,19 +5,18 @@ declare(strict_types=1);
 namespace Distantmagic\Resonance\Command;
 
 use Distantmagic\Resonance\CoroutineCommand;
-use Distantmagic\Resonance\OllamaClient;
+use Distantmagic\Resonance\LlamaCppClient;
 use Distantmagic\Resonance\SwooleConfiguration;
 use Symfony\Component\Console\Input\InputArgument;
 use Symfony\Component\Console\Input\InputInterface;
-use Symfony\Component\Console\Input\InputOption;
 use Symfony\Component\Console\Output\OutputInterface;
 
-abstract class OllamaGenerate extends CoroutineCommand
+abstract class LlamaCppGenerate extends CoroutineCommand
 {
-    abstract protected function executeOllamaCommand(InputInterface $input, OutputInterface $output, string $model, string $prompt): int;
+    abstract protected function executeLlamaCppCommand(InputInterface $input, OutputInterface $output, string $prompt): int;
 
     public function __construct(
-        protected OllamaClient $ollamaClient,
+        protected LlamaCppClient $llamaCppClient,
         SwooleConfiguration $swooleConfiguration,
     ) {
         parent::__construct($swooleConfiguration);
@@ -25,26 +24,20 @@ abstract class OllamaGenerate extends CoroutineCommand
 
     protected function configure(): void
     {
-        $this->addArgument('prompt', InputArgument::REQUIRED);
-        $this->addOption(
-            default: 'mistral',
-            mode: InputOption::VALUE_REQUIRED,
-            name: 'model',
+        $this->addArgument(
+            name: 'prompt',
+            mode: InputArgument::OPTIONAL,
+            default: 'How to make a cat happy? Be brief, respond in 1 sentence.',
         );
     }
 
     protected function executeInCoroutine(InputInterface $input, OutputInterface $output): int
     {
-        /**
-         * @var string $model
-         */
-        $model = $input->getOption('model');
-
         /**
          * @var string $prompt
          */
         $prompt = $input->getArgument('prompt');
 
-        return $this->executeOllamaCommand($input, $output, $model, $prompt);
+        return $this->executeLlamaCppCommand($input, $output, $prompt);
     }
 }
diff --git a/src/Command/LlamaCppGenerate/Completion.php b/src/Command/LlamaCppGenerate/Completion.php
new file mode 100644
index 00000000..6ae0b135
--- /dev/null
+++ b/src/Command/LlamaCppGenerate/Completion.php
@@ -0,0 +1,32 @@
+<?php
+
+declare(strict_types=1);
+
+namespace Distantmagic\Resonance\Command\LlamaCppGenerate;
+
+use Distantmagic\Resonance\Attribute\ConsoleCommand;
+use Distantmagic\Resonance\Command;
+use Distantmagic\Resonance\Command\LlamaCppGenerate;
+use Distantmagic\Resonance\LlamaCppCompletionRequest;
+use Symfony\Component\Console\Input\InputInterface;
+use Symfony\Component\Console\Output\OutputInterface;
+
+#[ConsoleCommand(
+    name: 'llamacpp:completion',
+    description: 'Generate completion based on a prompt'
+)]
+final class Completion extends LlamaCppGenerate
+{
+    protected function executeLlamaCppCommand(InputInterface $input, OutputInterface $output, string $prompt): int
+    {
+        $request = new LlamaCppCompletionRequest($prompt);
+
+        $completion = $this->llamaCppClient->generateCompletion($request);
+
+        foreach ($completion as $token) {
+            $output->write((string) $token);
+        }
+
+        return Command::SUCCESS;
+    }
+}
diff --git a/src/Command/LlamaCppGenerate/Embedding.php b/src/Command/LlamaCppGenerate/Embedding.php
new file mode 100644
index 00000000..d1e76f8d
--- /dev/null
+++ b/src/Command/LlamaCppGenerate/Embedding.php
@@ -0,0 +1,41 @@
+<?php
+
+declare(strict_types=1);
+
+namespace Distantmagic\Resonance\Command\LlamaCppGenerate;
+
+use Distantmagic\Resonance\Attribute\ConsoleCommand;
+use Distantmagic\Resonance\Command;
+use Distantmagic\Resonance\Command\LlamaCppGenerate;
+use Distantmagic\Resonance\JsonSerializer;
+use Distantmagic\Resonance\LlamaCppClient;
+use Distantmagic\Resonance\LlamaCppEmbeddingRequest;
+use Distantmagic\Resonance\SwooleConfiguration;
+use Symfony\Component\Console\Input\InputInterface;
+use Symfony\Component\Console\Output\OutputInterface;
+
+#[ConsoleCommand(
+    name: 'llamacpp:embedding',
+    description: 'Generate embedding based on a prompt'
+)]
+final class Embedding extends LlamaCppGenerate
+{
+    public function __construct(
+        private JsonSerializer $jsonSerializer,
+        LlamaCppClient $llamaCppClient,
+        SwooleConfiguration $swooleConfiguration,
+    ) {
+        parent::__construct($llamaCppClient, $swooleConfiguration);
+    }
+
+    protected function executeLlamaCppCommand(InputInterface $input, OutputInterface $output, string $prompt): int
+    {
+        $request = new LlamaCppEmbeddingRequest($prompt);
+
+        $embedding = $this->llamaCppClient->generateEmbedding($request);
+
+        $output->writeln($this->jsonSerializer->serialize($embedding->embedding));
+
+        return Command::SUCCESS;
+    }
+}
diff --git a/src/Command/LlamaCppHealth.php b/src/Command/LlamaCppHealth.php
new file mode 100644
index 00000000..4c887b1d
--- /dev/null
+++ b/src/Command/LlamaCppHealth.php
@@ -0,0 +1,34 @@
+<?php
+
+declare(strict_types=1);
+
+namespace Distantmagic\Resonance\Command;
+
+use Distantmagic\Resonance\Attribute\ConsoleCommand;
+use Distantmagic\Resonance\Command;
+use Distantmagic\Resonance\CoroutineCommand;
+use Distantmagic\Resonance\LlamaCppClient;
+use Distantmagic\Resonance\SwooleConfiguration;
+use Symfony\Component\Console\Input\InputInterface;
+use Symfony\Component\Console\Output\OutputInterface;
+
+#[ConsoleCommand(
+    name: 'llamacpp:health',
+    description: 'Get server\'s health status'
+)]
+final class LlamaCppHealth extends CoroutineCommand
+{
+    public function __construct(
+        private LlamaCppClient $llamaCppClient,
+        SwooleConfiguration $swooleConfiguration,
+    ) {
+        parent::__construct($swooleConfiguration);
+    }
+
+    protected function executeInCoroutine(InputInterface $input, OutputInterface $output): int
+    {
+        $output->writeln($this->llamaCppClient->getHealth()->value);
+
+        return Command::SUCCESS;
+    }
+}
diff --git a/src/Command/LlamaCppInfill.php b/src/Command/LlamaCppInfill.php
new file mode 100644
index 00000000..30dbf0d1
--- /dev/null
+++ b/src/Command/LlamaCppInfill.php
@@ -0,0 +1,44 @@
+<?php
+
+declare(strict_types=1);
+
+namespace Distantmagic\Resonance\Command;
+
+use Distantmagic\Resonance\Attribute\ConsoleCommand;
+use Distantmagic\Resonance\Command;
+use Distantmagic\Resonance\CoroutineCommand;
+use Distantmagic\Resonance\JsonSerializer;
+use Distantmagic\Resonance\LlamaCppClient;
+use Distantmagic\Resonance\LlamaCppInfillRequest;
+use Distantmagic\Resonance\SwooleConfiguration;
+use Symfony\Component\Console\Input\InputInterface;
+use Symfony\Component\Console\Output\OutputInterface;
+
+#[ConsoleCommand(
+    name: 'llamacpp:infill',
+    description: 'Generate code infill'
+)]
+final class LlamaCppInfill extends CoroutineCommand
+{
+    public function __construct(
+        private JsonSerializer $jsonSerializer,
+        private LlamaCppClient $llamaCppClient,
+        SwooleConfiguration $swooleConfiguration,
+    ) {
+        parent::__construct($swooleConfiguration);
+    }
+
+    protected function executeInCoroutine(InputInterface $input, OutputInterface $output): int
+    {
+        $request = new LlamaCppInfillRequest(
+            before: '<?php // hello world',
+            after: '?>',
+        );
+
+        foreach ($this->llamaCppClient->generateInfill($request) as $token) {
+            $output->write((string) $token);
+        }
+
+        return Command::SUCCESS;
+    }
+}
diff --git a/src/Command/OllamaChat.php b/src/Command/OllamaChat.php
deleted file mode 100644
index 656210f4..00000000
--- a/src/Command/OllamaChat.php
+++ /dev/null
@@ -1,71 +0,0 @@
-<?php
-
-declare(strict_types=1);
-
-namespace Distantmagic\Resonance\Command;
-
-use Distantmagic\Resonance\Attribute\ConsoleCommand;
-use Distantmagic\Resonance\CoroutineCommand;
-use Distantmagic\Resonance\OllamaChatSession;
-use Distantmagic\Resonance\OllamaClient;
-use Distantmagic\Resonance\SwooleConfiguration;
-use Symfony\Component\Console\Command\Command;
-use Symfony\Component\Console\Helper\QuestionHelper;
-use Symfony\Component\Console\Input\InputInterface;
-use Symfony\Component\Console\Input\InputOption;
-use Symfony\Component\Console\Output\OutputInterface;
-use Symfony\Component\Console\Question\Question;
-
-#[ConsoleCommand(
-    name: 'ollama:chat',
-    description: 'Chat with LLM model through Ollama'
-)]
-final class OllamaChat extends CoroutineCommand
-{
-    public function __construct(
-        protected OllamaClient $ollamaClient,
-        SwooleConfiguration $swooleConfiguration,
-    ) {
-        parent::__construct($swooleConfiguration);
-    }
-
-    protected function configure(): void
-    {
-        $this->addOption(
-            default: 'mistral',
-            mode: InputOption::VALUE_REQUIRED,
-            name: 'model',
-        );
-    }
-
-    protected function executeInCoroutine(InputInterface $input, OutputInterface $output): int
-    {
-        /**
-         * @var string $model
-         */
-        $model = $input->getOption('model');
-
-        /**
-         * @var QuestionHelper $helper
-         */
-        $helper = $this->getHelper('question');
-        $userInputQuestion = new Question('> ');
-
-        $chatSession = new OllamaChatSession(
-            model: $model,
-            ollamaClient: $this->ollamaClient,
-        );
-
-        while (true) {
-            $userMessageContent = $helper->ask($input, $output, $userInputQuestion);
-
-            foreach ($chatSession->respond($userMessageContent) as $value) {
-                $output->write((string) $value);
-            }
-
-            $output->writeln('');
-        }
-
-        return Command::SUCCESS;
-    }
-}
diff --git a/src/Command/OllamaGenerate/Completion.php b/src/Command/OllamaGenerate/Completion.php
deleted file mode 100644
index 9d9d523b..00000000
--- a/src/Command/OllamaGenerate/Completion.php
+++ /dev/null
@@ -1,33 +0,0 @@
-<?php
-
-declare(strict_types=1);
-
-namespace Distantmagic\Resonance\Command\OllamaGenerate;
-
-use Distantmagic\Resonance\Attribute\ConsoleCommand;
-use Distantmagic\Resonance\Command;
-use Distantmagic\Resonance\Command\OllamaGenerate;
-use Distantmagic\Resonance\OllamaCompletionRequest;
-use Symfony\Component\Console\Input\InputInterface;
-use Symfony\Component\Console\Output\OutputInterface;
-
-#[ConsoleCommand(
-    name: 'ollama:completion',
-    description: 'Generate LLM completion'
-)]
-final class Completion extends OllamaGenerate
-{
-    protected function executeOllamaCommand(InputInterface $input, OutputInterface $output, string $model, string $prompt): int
-    {
-        $completionRequest = new OllamaCompletionRequest(
-            model: $model,
-            prompt: $prompt,
-        );
-
-        foreach ($this->ollamaClient->generateCompletion($completionRequest) as $token) {
-            $output->write((string) $token);
-        }
-
-        return Command::SUCCESS;
-    }
-}
diff --git a/src/Command/OllamaGenerate/Embedding.php b/src/Command/OllamaGenerate/Embedding.php
deleted file mode 100644
index 9c5ec24e..00000000
--- a/src/Command/OllamaGenerate/Embedding.php
+++ /dev/null
@@ -1,51 +0,0 @@
-<?php
-
-declare(strict_types=1);
-
-namespace Distantmagic\Resonance\Command\OllamaGenerate;
-
-use Distantmagic\Resonance\Attribute\ConsoleCommand;
-use Distantmagic\Resonance\Command;
-use Distantmagic\Resonance\Command\OllamaGenerate;
-use Distantmagic\Resonance\JsonSerializer;
-use Distantmagic\Resonance\OllamaClient;
-use Distantmagic\Resonance\OllamaEmbeddingRequest;
-use Distantmagic\Resonance\SwooleConfiguration;
-use Symfony\Component\Console\Input\InputInterface;
-use Symfony\Component\Console\Output\OutputInterface;
-
-#[ConsoleCommand(
-    name: 'ollama:embedding',
-    description: 'Generate LLM embedding'
-)]
-final class Embedding extends OllamaGenerate
-{
-    public function __construct(
-        private JsonSerializer $jsonSerializer,
-        OllamaClient $ollamaClient,
-        SwooleConfiguration $swooleConfiguration,
-    ) {
-        parent::__construct($ollamaClient, $swooleConfiguration);
-    }
-
-    protected function executeOllamaCommand(InputInterface $input, OutputInterface $output, string $model, string $prompt): int
-    {
-        $embeddingRequest = new OllamaEmbeddingRequest(
-            model: $model,
-            prompt: $prompt,
-        );
-
-        $embeddingResponse = $this
-            ->ollamaClient
-            ->generateEmbedding($embeddingRequest)
-        ;
-
-        $output->writeln(
-            $this
-                ->jsonSerializer
-                ->serialize($embeddingResponse)
-        );
-
-        return Command::SUCCESS;
-    }
-}
diff --git a/src/JsonSerializer.php b/src/JsonSerializer.php
index 7fb69fa9..be467ddf 100644
--- a/src/JsonSerializer.php
+++ b/src/JsonSerializer.php
@@ -5,11 +5,17 @@ declare(strict_types=1);
 namespace Distantmagic\Resonance;
 
 use Distantmagic\Resonance\Attribute\Singleton;
+use RuntimeException;
 
 #[Singleton]
 readonly class JsonSerializer
 {
-    public function __construct(private ApplicationConfiguration $applicationConfiguration) {}
+    public function __construct(private ApplicationConfiguration $applicationConfiguration)
+    {
+        if (!function_exists('swoole_substr_json_decode')) {
+            throw new RuntimeException('You need to compile Swoole with JSON support');
+        }
+    }
 
     public function serialize(mixed $data): string
     {
@@ -21,11 +27,14 @@ readonly class JsonSerializer
         );
     }
 
-    public function unserialize(string $data): mixed
-    {
-        return json_decode(
-            json: $data,
+    public function unserialize(
+        string $json,
+        int $offset = 0,
+    ): mixed {
+        return swoole_substr_json_decode(
             flags: JSON_THROW_ON_ERROR,
+            offset: $offset,
+            str: $json,
         );
     }
 }
diff --git a/src/LlamaCppClient.php b/src/LlamaCppClient.php
new file mode 100644
index 00000000..b0f88345
--- /dev/null
+++ b/src/LlamaCppClient.php
@@ -0,0 +1,233 @@
+<?php
+
+declare(strict_types=1);
+
+namespace Distantmagic\Resonance;
+
+use CurlHandle;
+use Distantmagic\Resonance\Attribute\Singleton;
+use Generator;
+use JsonSerializable;
+use RuntimeException;
+use Swoole\Coroutine\Channel;
+
+#[Singleton]
+readonly class LlamaCppClient
+{
+    // strlen('data: ')
+    public const COMPLETION_CHUNKED_DATA_PREFIX_LENGTH = 6;
+
+    public function __construct(
+        private JsonSerializer $jsonSerializer,
+        private LlamaCppConfiguration $llamaCppConfiguration,
+        private LlamaCppLinkBuilder $llamaCppLinkBuilder,
+    ) {}
+
+    /**
+     * @return Generator<LlamaCppCompletionToken>
+     */
+    public function generateCompletion(LlamaCppCompletionRequest $request): Generator
+    {
+        $curlHandle = $this->createCurlHandle();
+
+        curl_setopt($curlHandle, CURLOPT_POST, true);
+
+        $responseChunks = $this->streamResponse($curlHandle, $request, '/completion');
+
+        /**
+         * @var null|string
+         */
+        $previousContent = null;
+
+        foreach ($responseChunks as $responseChunk) {
+            /**
+             * @var object{
+             *   content: string,
+             *   stop: boolean,
+             * }
+             */
+            $unserializedToken = $this->jsonSerializer->unserialize(
+                json: $responseChunk,
+                offset: self::COMPLETION_CHUNKED_DATA_PREFIX_LENGTH,
+            );
+
+            if (is_string($previousContent)) {
+                yield new LlamaCppCompletionToken(
+                    content: $previousContent,
+                    isLast: $unserializedToken->stop,
+                );
+
+                $previousContent = null;
+            }
+
+            if (!$unserializedToken->stop) {
+                $previousContent = $unserializedToken->content;
+            }
+        }
+    }
+
+    public function generateEmbedding(LlamaCppEmbeddingRequest $request): LlamaCppEmbedding
+    {
+        $curlHandle = $this->createCurlHandle();
+
+        $requestData = json_encode($request);
+
+        curl_setopt($curlHandle, CURLOPT_POST, true);
+        curl_setopt($curlHandle, CURLOPT_POSTFIELDS, $requestData);
+        curl_setopt($curlHandle, CURLOPT_RETURNTRANSFER, true);
+        curl_setopt($curlHandle, CURLOPT_URL, $this->llamaCppLinkBuilder->build('/embedding'));
+
+        /**
+         * @var false|string $responseContent
+         */
+        $responseContent = curl_exec($curlHandle);
+
+        if (false === $responseContent) {
+            throw new CurlException($curlHandle);
+        }
+
+        $this->assertStatusCode($curlHandle, 200);
+
+        /**
+         * @var object{ embedding: array<float> } $responseData
+         */
+        $responseData = $this
+            ->jsonSerializer
+            ->unserialize($responseContent)
+        ;
+
+        return new LlamaCppEmbedding($responseData->embedding);
+    }
+
+    /**
+     * @return Generator<LlamaCppInfill>
+     */
+    public function generateInfill(LlamaCppInfillRequest $request): Generator
+    {
+        $curlHandle = $this->createCurlHandle();
+
+        curl_setopt($curlHandle, CURLOPT_POST, true);
+
+        $responseChunks = $this->streamResponse($curlHandle, $request, '/infill');
+
+        foreach ($responseChunks as $responseChunk) {
+            /**
+             * @var object{ content: string }
+             */
+            $token = $this->jsonSerializer->unserialize($responseChunk);
+
+            yield new LlamaCppInfill(
+                after: $request->after,
+                before: $request->before,
+                content: $token->content,
+            );
+        }
+    }
+
+    public function getHealth(): LlamaCppHealthStatus
+    {
+        $curlHandle = $this->createCurlHandle();
+
+        curl_setopt($curlHandle, CURLOPT_RETURNTRANSFER, true);
+        curl_setopt($curlHandle, CURLOPT_URL, $this->llamaCppLinkBuilder->build('/health'));
+
+        /**
+         * @var false|string $responseContent
+         */
+        $responseContent = curl_exec($curlHandle);
+
+        if (false === $responseContent) {
+            throw new CurlException($curlHandle);
+        }
+
+        $this->assertStatusCode($curlHandle, 200);
+
+        /**
+         * @var object{ status: string } $responseData
+         */
+        $responseData = $this
+            ->jsonSerializer
+            ->unserialize($responseContent)
+        ;
+
+        return LlamaCppHealthStatus::from($responseData->status);
+    }
+
+    private function assertStatusCode(CurlHandle $curlHandle, int $expectedStatusCode): void
+    {
+        /**
+         * @var int $statusCode
+         */
+        $statusCode = curl_getinfo($curlHandle, CURLINFO_RESPONSE_CODE);
+
+        if ($expectedStatusCode === $statusCode) {
+            return;
+        }
+
+        throw new RuntimeException(sprintf(
+            'curl request finished with unexpected status code: "%s"',
+            $statusCode,
+        ));
+    }
+
+    private function createCurlHandle(): CurlHandle
+    {
+        $curlHandle = curl_init();
+
+        /**
+         * @var array<string>
+         */
+        $headers = [
+            'Content-Type: application/json',
+        ];
+
+        if ($this->llamaCppConfiguration->apiKey) {
+            $headers[] = sprintf('Authorization: Bearer %s', $this->llamaCppConfiguration->apiKey);
+        }
+
+        curl_setopt($curlHandle, CURLOPT_HTTPHEADER, $headers);
+
+        return $curlHandle;
+    }
+
+    /**
+     * @return SwooleChannelIterator<string>
+     */
+    private function streamResponse(CurlHandle $curlHandle, JsonSerializable $request, string $path): SwooleChannelIterator
+    {
+        $channel = new Channel(1);
+        $requestData = json_encode($request);
+
+        $cid = go(function () use ($channel, $curlHandle, $path, $requestData) {
+            try {
+                curl_setopt($curlHandle, CURLOPT_POSTFIELDS, $requestData);
+                curl_setopt($curlHandle, CURLOPT_RETURNTRANSFER, false);
+                curl_setopt($curlHandle, CURLOPT_URL, $this->llamaCppLinkBuilder->build($path));
+                curl_setopt($curlHandle, CURLOPT_WRITEFUNCTION, static function (CurlHandle $curlHandle, string $data) use ($channel) {
+                    $channel->push($data);
+
+                    return strlen($data);
+                });
+
+                if (!curl_exec($curlHandle)) {
+                    throw new CurlException($curlHandle);
+                }
+
+                $this->assertStatusCode($curlHandle, 200);
+            } finally {
+                curl_setopt($curlHandle, CURLOPT_WRITEFUNCTION, null);
+
+                $channel->close();
+            }
+        });
+
+        if (!is_int($cid)) {
+            throw new RuntimeException('Unable to start a coroutine');
+        }
+
+        /**
+         * @var SwooleChannelIterator<string>
+         */
+        return new SwooleChannelIterator($channel);
+    }
+}
diff --git a/src/LlamaCppCompletionRequest.php b/src/LlamaCppCompletionRequest.php
new file mode 100644
index 00000000..53c54b86
--- /dev/null
+++ b/src/LlamaCppCompletionRequest.php
@@ -0,0 +1,26 @@
+<?php
+
+declare(strict_types=1);
+
+namespace Distantmagic\Resonance;
+
+use JsonSerializable;
+
+readonly class LlamaCppCompletionRequest implements JsonSerializable
+{
+    public function __construct(
+        public string $prompt,
+    ) {}
+
+    public function jsonSerialize(): array
+    {
+        return [
+            'prompt' => sprintf('[INST]%s[SYST]', $this->prompt),
+            'stop' => [
+                '[INST]',
+                '[SYST]',
+            ],
+            'stream' => true,
+        ];
+    }
+}
diff --git a/src/LlamaCppCompletionToken.php b/src/LlamaCppCompletionToken.php
new file mode 100644
index 00000000..6b3e9fb9
--- /dev/null
+++ b/src/LlamaCppCompletionToken.php
@@ -0,0 +1,20 @@
+<?php
+
+declare(strict_types=1);
+
+namespace Distantmagic\Resonance;
+
+use Stringable;
+
+readonly class LlamaCppCompletionToken implements Stringable
+{
+    public function __construct(
+        public string $content,
+        public bool $isLast,
+    ) {}
+
+    public function __toString(): string
+    {
+        return $this->content;
+    }
+}
diff --git a/src/OllamaConfiguration.php b/src/LlamaCppConfiguration.php
similarity index 74%
rename from src/OllamaConfiguration.php
rename to src/LlamaCppConfiguration.php
index 9796d667..1da4070f 100644
--- a/src/OllamaConfiguration.php
+++ b/src/LlamaCppConfiguration.php
@@ -4,9 +4,10 @@ declare(strict_types=1);
 
 namespace Distantmagic\Resonance;
 
-readonly class OllamaConfiguration
+readonly class LlamaCppConfiguration
 {
     public function __construct(
+        public ?string $apiKey,
         public string $host,
         public int $port,
         public string $scheme,
diff --git a/src/LlamaCppEmbedding.php b/src/LlamaCppEmbedding.php
new file mode 100644
index 00000000..a779d6f9
--- /dev/null
+++ b/src/LlamaCppEmbedding.php
@@ -0,0 +1,15 @@
+<?php
+
+declare(strict_types=1);
+
+namespace Distantmagic\Resonance;
+
+readonly class LlamaCppEmbedding
+{
+    /**
+     * @param array<float> $embedding
+     */
+    public function __construct(
+        public array $embedding,
+    ) {}
+}
diff --git a/src/OllamaChatMessage.php b/src/LlamaCppEmbeddingRequest.php
similarity index 53%
rename from src/OllamaChatMessage.php
rename to src/LlamaCppEmbeddingRequest.php
index a50091df..39985104 100644
--- a/src/OllamaChatMessage.php
+++ b/src/LlamaCppEmbeddingRequest.php
@@ -5,25 +5,17 @@ declare(strict_types=1);
 namespace Distantmagic\Resonance;
 
 use JsonSerializable;
-use Stringable;
 
-readonly class OllamaChatMessage implements JsonSerializable, Stringable
+readonly class LlamaCppEmbeddingRequest implements JsonSerializable
 {
     public function __construct(
         public string $content,
-        public OllamaChatRole $role,
     ) {}
 
-    public function __toString(): string
-    {
-        return $this->content;
-    }
-
     public function jsonSerialize(): array
     {
         return [
             'content' => $this->content,
-            'role' => $this->role->value,
         ];
     }
 }
diff --git a/src/LlamaCppHealthStatus.php b/src/LlamaCppHealthStatus.php
new file mode 100644
index 00000000..1dcff198
--- /dev/null
+++ b/src/LlamaCppHealthStatus.php
@@ -0,0 +1,12 @@
+<?php
+
+declare(strict_types=1);
+
+namespace Distantmagic\Resonance;
+
+enum LlamaCppHealthStatus: string
+{
+    case Error = 'error';
+    case LoadingModel = 'loading model';
+    case Ok = 'ok';
+}
diff --git a/src/LlamaCppInfill.php b/src/LlamaCppInfill.php
new file mode 100644
index 00000000..1284e1c2
--- /dev/null
+++ b/src/LlamaCppInfill.php
@@ -0,0 +1,21 @@
+<?php
+
+declare(strict_types=1);
+
+namespace Distantmagic\Resonance;
+
+use Stringable;
+
+readonly class LlamaCppInfill implements Stringable
+{
+    public function __construct(
+        public string $after,
+        public string $before,
+        public string $content,
+    ) {}
+
+    public function __toString(): string
+    {
+        return $this->content;
+    }
+}
diff --git a/src/LlamaCppInfillRequest.php b/src/LlamaCppInfillRequest.php
new file mode 100644
index 00000000..adfa49fc
--- /dev/null
+++ b/src/LlamaCppInfillRequest.php
@@ -0,0 +1,27 @@
+<?php
+
+declare(strict_types=1);
+
+namespace Distantmagic\Resonance;
+
+use JsonSerializable;
+
+readonly class LlamaCppInfillRequest implements JsonSerializable
+{
+    public function __construct(
+        public string $after,
+        public string $before,
+    ) {}
+
+    public function jsonSerialize(): array
+    {
+        return [
+            'infill_prefix' => $this->before,
+            'infill_suffix' => $this->after,
+
+            // the prompt field should not be mandatory; it's a bug:
+            // https://github.com/ggerganov/llama.cpp/issues/4027
+            'prompt' => 'prompt',
+        ];
+    }
+}
diff --git a/src/OllamaLinkBuilder.php b/src/LlamaCppLinkBuilder.php
similarity index 82%
rename from src/OllamaLinkBuilder.php
rename to src/LlamaCppLinkBuilder.php
index 281a2de6..cea1533f 100644
--- a/src/OllamaLinkBuilder.php
+++ b/src/LlamaCppLinkBuilder.php
@@ -7,10 +7,10 @@ namespace Distantmagic\Resonance;
 use Distantmagic\Resonance\Attribute\Singleton;
 
 #[Singleton]
-readonly class OllamaLinkBuilder
+readonly class LlamaCppLinkBuilder
 {
     public function __construct(
-        private OllamaConfiguration $ollamaConfiguration,
+        private LlamaCppConfiguration $ollamaConfiguration,
     ) {}
 
     public function build(string $path): string
diff --git a/src/OllamaChatRequest.php b/src/OllamaChatRequest.php
deleted file mode 100644
index c87c64eb..00000000
--- a/src/OllamaChatRequest.php
+++ /dev/null
@@ -1,30 +0,0 @@
-<?php
-
-declare(strict_types=1);
-
-namespace Distantmagic\Resonance;
-
-use JsonSerializable;
-
-readonly class OllamaChatRequest implements JsonSerializable
-{
-    /**
-     * @param array<OllamaChatMessage> $messages
-     */
-    public function __construct(
-        public string $model,
-        public array $messages,
-        public OllamaRequestOptions $options = new OllamaRequestOptions(),
-    ) {}
-
-    public function jsonSerialize(): array
-    {
-        return [
-            'model' => $this->model,
-            'messages' => $this->messages,
-            'options' => $this->options,
-            'raw' => true,
-            'stream' => true,
-        ];
-    }
-}
diff --git a/src/OllamaChatRole.php b/src/OllamaChatRole.php
deleted file mode 100644
index 0a66628a..00000000
--- a/src/OllamaChatRole.php
+++ /dev/null
@@ -1,12 +0,0 @@
-<?php
-
-declare(strict_types=1);
-
-namespace Distantmagic\Resonance;
-
-enum OllamaChatRole: string
-{
-    case Assistant = 'assistant';
-    case System = 'system';
-    case User = 'user';
-}
diff --git a/src/OllamaChatSession.php b/src/OllamaChatSession.php
deleted file mode 100644
index 2ff877d7..00000000
--- a/src/OllamaChatSession.php
+++ /dev/null
@@ -1,41 +0,0 @@
-<?php
-
-declare(strict_types=1);
-
-namespace Distantmagic\Resonance;
-
-use Ds\Set;
-use Generator;
-
-readonly class OllamaChatSession
-{
-    /**
-     * @var Set<OllamaChatMessage>
-     */
-    private Set $messages;
-
-    public function __construct(
-        public string $model,
-        public OllamaClient $ollamaClient,
-    ) {
-        $this->messages = new Set();
-    }
-
-    /**
-     * @return Generator<OllamaChatToken>
-     */
-    public function respond(string $userMessageContent): Generator
-    {
-        $this
-            ->messages
-            ->add(new OllamaChatMessage($userMessageContent, OllamaChatRole::User))
-        ;
-
-        $chatRequest = new OllamaChatRequest(
-            model: $this->model,
-            messages: $this->messages->toArray(),
-        );
-
-        yield from $this->ollamaClient->generateChatCompletion($chatRequest);
-    }
-}
diff --git a/src/OllamaChatToken.php b/src/OllamaChatToken.php
deleted file mode 100644
index 9e77141a..00000000
--- a/src/OllamaChatToken.php
+++ /dev/null
@@ -1,21 +0,0 @@
-<?php
-
-declare(strict_types=1);
-
-namespace Distantmagic\Resonance;
-
-use DateTimeImmutable;
-use Stringable;
-
-readonly class OllamaChatToken implements Stringable
-{
-    public function __construct(
-        public DateTimeImmutable $createdAt,
-        public OllamaChatMessage $message,
-    ) {}
-
-    public function __toString(): string
-    {
-        return (string) $this->message;
-    }
-}
diff --git a/src/OllamaClient.php b/src/OllamaClient.php
deleted file mode 100644
index 2d351aa0..00000000
--- a/src/OllamaClient.php
+++ /dev/null
@@ -1,184 +0,0 @@
-<?php
-
-declare(strict_types=1);
-
-namespace Distantmagic\Resonance;
-
-use CurlHandle;
-use DateTimeImmutable;
-use Distantmagic\Resonance\Attribute\Singleton;
-use Generator;
-use JsonSerializable;
-use Psr\Log\LoggerInterface;
-use RuntimeException;
-use Swoole\Coroutine\Channel;
-
-#[Singleton]
-readonly class OllamaClient
-{
-    private CurlHandle $ch;
-
-    public function __construct(
-        private JsonSerializer $jsonSerializer,
-        private LoggerInterface $logger,
-        private OllamaLinkBuilder $ollamaLinkBuilder,
-    ) {
-        $this->ch = curl_init();
-
-        curl_setopt($this->ch, CURLOPT_POST, true);
-        curl_setopt($this->ch, CURLOPT_HTTPHEADER, [
-            'Content-Type: application/json',
-        ]);
-    }
-
-    public function __destruct()
-    {
-        curl_close($this->ch);
-    }
-
-    /**
-     * @return Generator<OllamaChatToken>
-     */
-    public function generateChatCompletion(OllamaChatRequest $request): Generator
-    {
-        $channel = $this->streamJson($request, '/api/chat');
-
-        /**
-         * @var SwooleChannelIterator<object{ error: string }|object{
-         *   created_at: string,
-         *   message: object{
-         *     content: string,
-         *     role: string,
-         *   },
-         *   response: string,
-         * }>
-         */
-        $swooleChannelIterator = new SwooleChannelIterator($channel);
-
-        foreach ($swooleChannelIterator as $data) {
-            if (isset($data->error)) {
-                $this->logger->error($data->error);
-            } else {
-                yield new OllamaChatToken(
-                    createdAt: new DateTimeImmutable($data->created_at),
-                    message: new OllamaChatMessage(
-                        content: $data->message->content,
-                        role: OllamaChatRole::from($data->message->role),
-                    )
-                );
-            }
-        }
-    }
-
-    /**
-     * @return Generator<OllamaCompletionToken>
-     */
-    public function generateCompletion(OllamaCompletionRequest $request): Generator
-    {
-        $channel = $this->streamJson($request, '/api/generate');
-
-        /**
-         * @var SwooleChannelIterator<object{ created_at: string, response: string }>
-         */
-        $swooleChannelIterator = new SwooleChannelIterator($channel);
-
-        foreach ($swooleChannelIterator as $token) {
-            yield new OllamaCompletionToken(
-                createdAt: new DateTimeImmutable($token->created_at),
-                response: $token->response,
-            );
-        }
-    }
-
-    public function generateEmbedding(OllamaEmbeddingRequest $request): OllamaEmbeddingResponse
-    {
-        $requestData = json_encode($request);
-
-        curl_setopt($this->ch, CURLOPT_URL, $this->ollamaLinkBuilder->build('/api/embeddings'));
-        curl_setopt($this->ch, CURLOPT_POSTFIELDS, $requestData);
-        curl_setopt($this->ch, CURLOPT_RETURNTRANSFER, true);
-
-        /**
-         * @var false|string $responseContent
-         */
-        $responseContent = curl_exec($this->ch);
-
-        if (false === $responseContent) {
-            throw new CurlException($this->ch);
-        }
-
-        $this->assertStatusCode(200);
-
-        /**
-         * @var object{ embedding: array<float> } $responseData
-         */
-        $responseData = $this
-            ->jsonSerializer
-            ->unserialize($responseContent)
-        ;
-
-        return new OllamaEmbeddingResponse($responseData->embedding);
-    }
-
-    private function assertStatusCode(int $expectedStatusCode): void
-    {
-        /**
-         * @var int $statusCode
-         */
-        $statusCode = curl_getinfo($this->ch, CURLINFO_RESPONSE_CODE);
-
-        if ($expectedStatusCode === $statusCode) {
-            return;
-        }
-
-        throw new RuntimeException(sprintf(
-            'curl request finished with unexpected status code: "%s"',
-            $statusCode,
-        ));
-    }
-
-    private function streamJson(JsonSerializable $request, string $path): Channel
-    {
-        $channel = new Channel(1);
-        $requestData = json_encode($request);
-
-        $cid = go(function () use ($channel, $path, $requestData) {
-            try {
-                curl_setopt($this->ch, CURLOPT_URL, $this->ollamaLinkBuilder->build($path));
-                curl_setopt($this->ch, CURLOPT_POSTFIELDS, $requestData);
-                curl_setopt($this->ch, CURLOPT_RETURNTRANSFER, false);
-                curl_setopt($this->ch, CURLOPT_WRITEFUNCTION, function (CurlHandle $ch, string $data) use ($channel) {
-                    $dataChunks = explode("\n", $data);
-
-                    foreach ($dataChunks as $dataChunk) {
-                        if (!empty($dataChunk)) {
-                            $channel->push(
-                                $this
-                                    ->jsonSerializer
-                                    ->unserialize($dataChunk)
-                            );
-                        }
-                    }
-
-                    return strlen($data);
-                });
-
-                if (!curl_exec($this->ch)) {
-                    throw new CurlException($this->ch);
-                }
-
-                $this->assertStatusCode(200);
-            } finally {
-                curl_setopt($this->ch, CURLOPT_WRITEFUNCTION, null);
-
-                $channel->close();
-            }
-        });
-
-        if (!is_int($cid)) {
-            throw new RuntimeException('Unable to start a coroutine');
-        }
-
-        return $channel;
-    }
-}
diff --git a/src/OllamaCompletionRequest.php b/src/OllamaCompletionRequest.php
deleted file mode 100644
index 1215d4fe..00000000
--- a/src/OllamaCompletionRequest.php
+++ /dev/null
@@ -1,32 +0,0 @@
-<?php
-
-declare(strict_types=1);
-
-namespace Distantmagic\Resonance;
-
-use JsonSerializable;
-
-readonly class OllamaCompletionRequest implements JsonSerializable
-{
-    public function __construct(
-        public string $model,
-        public string $prompt,
-        public OllamaRequestOptions $options = new OllamaRequestOptions(),
-    ) {}
-
-    public function jsonSerialize(): array
-    {
-        return [
-            'model' => $this->model,
-            'options' => $this->options,
-            'prompt' => sprintf(
-                '%s%s%s',
-                $this->options->stopDelimiter->instructions,
-                $this->prompt,
-                $this->options->stopDelimiter->system,
-            ),
-            'raw' => true,
-            'stream' => true,
-        ];
-    }
-}
diff --git a/src/OllamaCompletionToken.php b/src/OllamaCompletionToken.php
deleted file mode 100644
index bb4947d2..00000000
--- a/src/OllamaCompletionToken.php
+++ /dev/null
@@ -1,21 +0,0 @@
-<?php
-
-declare(strict_types=1);
-
-namespace Distantmagic\Resonance;
-
-use DateTimeImmutable;
-use Stringable;
-
-readonly class OllamaCompletionToken implements Stringable
-{
-    public function __construct(
-        public DateTimeImmutable $createdAt,
-        public string $response,
-    ) {}
-
-    public function __toString(): string
-    {
-        return $this->response;
-    }
-}
diff --git a/src/OllamaEmbeddingRequest.php b/src/OllamaEmbeddingRequest.php
deleted file mode 100644
index cdfa7b47..00000000
--- a/src/OllamaEmbeddingRequest.php
+++ /dev/null
@@ -1,27 +0,0 @@
-<?php
-
-declare(strict_types=1);
-
-namespace Distantmagic\Resonance;
-
-use JsonSerializable;
-
-readonly class OllamaEmbeddingRequest implements JsonSerializable
-{
-    public function __construct(
-        public string $model,
-        public string $prompt,
-        public OllamaRequestOptions $options = new OllamaRequestOptions(),
-    ) {}
-
-    public function jsonSerialize(): array
-    {
-        return [
-            'model' => $this->model,
-            'options' => $this->options,
-            'prompt' => $this->prompt,
-            'raw' => true,
-            'stream' => true,
-        ];
-    }
-}
diff --git a/src/OllamaEmbeddingResponse.php b/src/OllamaEmbeddingResponse.php
deleted file mode 100644
index 955bc499..00000000
--- a/src/OllamaEmbeddingResponse.php
+++ /dev/null
@@ -1,20 +0,0 @@
-<?php
-
-declare(strict_types=1);
-
-namespace Distantmagic\Resonance;
-
-use JsonSerializable;
-
-readonly class OllamaEmbeddingResponse implements JsonSerializable
-{
-    /**
-     * @param array<float> $embedding
-     */
-    public function __construct(public array $embedding) {}
-
-    public function jsonSerialize(): array
-    {
-        return $this->embedding;
-    }
-}
diff --git a/src/OllamaRequestOptions.php b/src/OllamaRequestOptions.php
deleted file mode 100644
index 24a5a62c..00000000
--- a/src/OllamaRequestOptions.php
+++ /dev/null
@@ -1,27 +0,0 @@
-<?php
-
-declare(strict_types=1);
-
-namespace Distantmagic\Resonance;
-
-use JsonSerializable;
-
-readonly class OllamaRequestOptions implements JsonSerializable
-{
-    public function __construct(
-        public float $numPredict = -1,
-        public float $temperature = 0.5,
-        public OllamaRequestStopDelimiter $stopDelimiter = new OllamaRequestStopDelimiter(),
-    ) {}
-
-    public function jsonSerialize(): array
-    {
-        $ret = [];
-
-        $ret['num_predict'] = $this->numPredict;
-        $ret['stop'] = $this->stopDelimiter;
-        $ret['temperature'] = $this->temperature;
-
-        return $ret;
-    }
-}
diff --git a/src/OllamaRequestStopDelimiter.php b/src/OllamaRequestStopDelimiter.php
deleted file mode 100644
index 750b3f0b..00000000
--- a/src/OllamaRequestStopDelimiter.php
+++ /dev/null
@@ -1,23 +0,0 @@
-<?php
-
-declare(strict_types=1);
-
-namespace Distantmagic\Resonance;
-
-use JsonSerializable;
-
-readonly class OllamaRequestStopDelimiter implements JsonSerializable
-{
-    public function __construct(
-        public string $instructions = '[INST]',
-        public string $system = '[SYS]',
-    ) {}
-
-    public function jsonSerialize(): array
-    {
-        return [
-            $this->instructions,
-            $this->system,
-        ];
-    }
-}
diff --git a/src/SingletonProvider/ConfigurationProvider/OllamaConfigurationProvider.php b/src/SingletonProvider/ConfigurationProvider/LlamaCppConfigurationProvider.php
similarity index 64%
rename from src/SingletonProvider/ConfigurationProvider/OllamaConfigurationProvider.php
rename to src/SingletonProvider/ConfigurationProvider/LlamaCppConfigurationProvider.php
index 086d16d6..7aebce60 100644
--- a/src/SingletonProvider/ConfigurationProvider/OllamaConfigurationProvider.php
+++ b/src/SingletonProvider/ConfigurationProvider/LlamaCppConfigurationProvider.php
@@ -6,22 +6,23 @@ namespace Distantmagic\Resonance\SingletonProvider\ConfigurationProvider;
 
 use Distantmagic\Resonance\Attribute\Singleton;
 use Distantmagic\Resonance\JsonSchema;
-use Distantmagic\Resonance\OllamaConfiguration;
+use Distantmagic\Resonance\LlamaCppConfiguration;
 use Distantmagic\Resonance\SingletonProvider\ConfigurationProvider;
 
 /**
- * @template-extends ConfigurationProvider<OllamaConfiguration, object{
+ * @template-extends ConfigurationProvider<LlamaCppConfiguration, object{
+ *     apiKey: null|string,
  *     host: string,
  *     port: int,
  *     scheme: string,
  * }>
  */
-#[Singleton(provides: OllamaConfiguration::class)]
-final readonly class OllamaConfigurationProvider extends ConfigurationProvider
+#[Singleton(provides: LlamaCppConfiguration::class)]
+final readonly class LlamaCppConfigurationProvider extends ConfigurationProvider
 {
     protected function getConfigurationKey(): string
     {
-        return 'ollama';
+        return 'llamacpp';
     }
 
     protected function makeSchema(): JsonSchema
@@ -29,6 +30,12 @@ final readonly class OllamaConfigurationProvider extends ConfigurationProvider
         return new JsonSchema([
             'type' => 'object',
             'properties' => [
+                'apiKey' => [
+                    'type' => 'string',
+                    'minLength' => 1,
+                    'nullable' => true,
+                    'default' => null,
+                ],
                 'host' => [
                     'type' => 'string',
                     'minLength' => 1,
@@ -48,9 +55,10 @@ final readonly class OllamaConfigurationProvider extends ConfigurationProvider
         ]);
     }
 
-    protected function provideConfiguration($validatedData): OllamaConfiguration
+    protected function provideConfiguration($validatedData): LlamaCppConfiguration
     {
-        return new OllamaConfiguration(
+        return new LlamaCppConfiguration(
+            apiKey: $validatedData->apiKey,
             host: $validatedData->host,
             port: $validatedData->port,
             scheme: $validatedData->scheme,
-- 
GitLab