From e70da8ce29f6551349c320d96fbd55cb4d0dfd33 Mon Sep 17 00:00:00 2001
From: Mateusz Charytoniuk <mateusz.charytoniuk@protonmail.com>
Date: Thu, 25 Jan 2024 07:35:44 +0100
Subject: [PATCH] chore: sqlite-vss draft

---
 .../pages/docs/features/vector-store/index.md |  17 ++
 .../features/vector-store/sqlite-vss/index.md |  74 +++++++
 resources/css/docs-page-homepage.css          |   2 +-
 src/Command/StaticPagesMakeEmbeddings.php     | 186 ++++++++++++++++++
 src/LlamaCppEmbeddingRequest.php              |   3 +-
 src/LlmPromptTemplate.php                     |  16 ++
 src/LlmPromptTemplate/MistralInstructChat.php |  23 +++
 src/LlmPromptTemplate/Phi2Question.php        |  23 +++
 src/LlmPromptTemplate/Plain.php               |  20 ++
 src/SQLiteVSSConfiguration.php                |  17 ++
 src/SQLiteVSSConnectionBuilder.php            |  26 +++
 .../SQLiteVSSConfigurationProvider.php        |  51 +++++
 12 files changed, 455 insertions(+), 3 deletions(-)
 create mode 100644 docs/pages/docs/features/vector-store/index.md
 create mode 100644 docs/pages/docs/features/vector-store/sqlite-vss/index.md
 create mode 100644 src/Command/StaticPagesMakeEmbeddings.php
 create mode 100644 src/LlmPromptTemplate.php
 create mode 100644 src/LlmPromptTemplate/MistralInstructChat.php
 create mode 100644 src/LlmPromptTemplate/Phi2Question.php
 create mode 100644 src/LlmPromptTemplate/Plain.php
 create mode 100644 src/SQLiteVSSConfiguration.php
 create mode 100644 src/SQLiteVSSConnectionBuilder.php
 create mode 100644 src/SingletonProvider/ConfigurationProvider/SQLiteVSSConfigurationProvider.php

diff --git a/docs/pages/docs/features/vector-store/index.md b/docs/pages/docs/features/vector-store/index.md
new file mode 100644
index 00000000..c8cae071
--- /dev/null
+++ b/docs/pages/docs/features/vector-store/index.md
@@ -0,0 +1,17 @@
+---
+collections: 
+    - documents
+draft: true
+layout: dm:document
+parent: docs/features/index
+title: Vector Store
+description: >
+    Vector stores can be used to store and search large vectors, such as LLM
+    embeddings.
+---
+
+# Vector Store
+
+Explore Resonance's integrations with vector stores.
+
+{{docs/features/vector-store/*/index}}
diff --git a/docs/pages/docs/features/vector-store/sqlite-vss/index.md b/docs/pages/docs/features/vector-store/sqlite-vss/index.md
new file mode 100644
index 00000000..ec5380e4
--- /dev/null
+++ b/docs/pages/docs/features/vector-store/sqlite-vss/index.md
@@ -0,0 +1,74 @@
+---
+collections:
+    - documents
+draft: true
+layout: dm:document
+parent: docs/features/vector-store/index
+title: SQLite-VSS
+description: >
+    SQLite Extension for vector search
+---
+
+# SQLite-VSS
+
+[SQLite-VSS](https://github.com/asg017/sqlite-vss) is a SQLite extension for 
+vector search. It supports similarity search and uses 
+[FAISS](https://faiss.ai/) under the hood.
+
+# Usage
+
+## Installation
+
+You need the PHP `sqlite3` extension.
+
+To use SQLite with SQLite-VSS you need to load two additional extensions:
+`vector0.so` and `vss0.so`. Both are available on the
+[SQLite-VSS](https://github.com/asg017/sqlite-vss/releases) releases page.
+
+After downloading them, you need to enable SQLite extensions in your `php.ini` 
+file. Look for a similar section:
+
+```ini
+[sqlite3]
+; Directory pointing to SQLite3 extensions
+; https://php.net/sqlite3.extension-dir
+;sqlite3.extension_dir =
+```
+
+You must configure a directory in `sqlite3.extension_dir`, and then put those
+extensions into that directory.
+
+## Configuration
+
+You can configure the extension filenames. Both are resolved relative to the
+`sqlite3.extension_dir` directory and must be located inside that directory:
+
+```ini file:config.ini
+[sqlite-vss]
+extension_vector0 = vector0.so
+extension_vss0 = vss0.so
+```
+
+## Creating Database Object
+
+In your class you can then use `SQLiteVSSConnectionBuilder` to create an
+`SQLite3` instance with VSS extensions loaded:
+
+```php
+use Distantmagic\Resonance\SQLiteVSSConnectionBuilder;
+use SQLite3;
+
+class MyClass
+{
+    private SQLite3 $sqlite;
+
+    public function __construct(SQLiteVSSConnectionBuilder $builder)
+    {
+        $this->sqlite = $builder->buildConnection(':memory:');
+
+        // Sanity check: this query returns the installed SQLite-VSS version
+        // only when both extensions were loaded correctly.
+        $this->sqlite->query('SELECT vss_version()')->fetchArray();
+    }
+}
+```
diff --git a/resources/css/docs-page-homepage.css b/resources/css/docs-page-homepage.css
index 670b0145..2f2a0d6a 100644
--- a/resources/css/docs-page-homepage.css
+++ b/resources/css/docs-page-homepage.css
@@ -91,7 +91,7 @@ h2.homepage__example__title {
 }
 
 .homepage__title::before {
-  background-image: url('../images/resonance_cat.webp');
+  background-image: url("../images/resonance_cat.webp");
   background-position: center;
   background-size: cover;
   border-radius: 50%;
diff --git a/src/Command/StaticPagesMakeEmbeddings.php b/src/Command/StaticPagesMakeEmbeddings.php
new file mode 100644
index 00000000..2da64878
--- /dev/null
+++ b/src/Command/StaticPagesMakeEmbeddings.php
@@ -0,0 +1,186 @@
+<?php
+
+declare(strict_types=1);
+
+namespace Distantmagic\Resonance\Command;
+
+use Distantmagic\Resonance\Attribute\ConsoleCommand;
+use Distantmagic\Resonance\Command;
+use Distantmagic\Resonance\JsonSerializer;
+use Distantmagic\Resonance\LlamaCppClient;
+use Distantmagic\Resonance\LlamaCppEmbeddingRequest;
+use Distantmagic\Resonance\SQLiteVSSConnectionBuilder;
+use Distantmagic\Resonance\StaticPageAggregate;
+use Distantmagic\Resonance\StaticPageContentType;
+use Distantmagic\Resonance\StaticPageMarkdownParser;
+use Generator;
+use League\CommonMark\Extension\CommonMark\Node\Block\FencedCode;
+use League\CommonMark\Extension\CommonMark\Node\Block\Heading;
+use League\CommonMark\Extension\CommonMark\Node\Block\ListBlock;
+use League\CommonMark\Node\Node;
+use League\CommonMark\Node\StringContainerHelper;
+use Rubix\ML\Classifiers\KNearestNeighbors;
+use Rubix\ML\Datasets\Labeled;
+use SQLite3;
+use Symfony\Component\Console\Input\InputInterface;
+use Symfony\Component\Console\Output\OutputInterface;
+
+/**
+ * Embeds the text chunks of all non-HTML static pages with llama.cpp, stores
+ * the vectors in an in-memory SQLite-VSS table and prints the chunks that a
+ * k-nearest-neighbours classifier relates to a hard-coded sample query.
+ */
+#[ConsoleCommand(
+    name: 'static-pages:make-embeddings',
+    description: 'Create embeddings from static pages contents (requires llama.cpp)'
+)]
+final class StaticPagesMakeEmbeddings extends Command
+{
+    private SQLite3 $embeddingsDatabase;
+
+    public function __construct(
+        private JsonSerializer $jsonSerializer,
+        private LlamaCppClient $llamaCppClient,
+        private StaticPageAggregate $staticPageAggregate,
+        private SQLiteVSSConnectionBuilder $sqliteVSSConnectionBuilder,
+        private StaticPageMarkdownParser $staticPageMarkdownParser,
+    ) {
+        parent::__construct();
+
+        $this->embeddingsDatabase = $sqliteVSSConnectionBuilder->buildConnection(':memory:');
+        $this->embeddingsDatabase->enableExceptions(true);
+        // NOTE(review): 4096 presumably matches the embedding size of the
+        // llama.cpp model in use; confirm before switching models.
+        $this->embeddingsDatabase->exec(<<<'SQL'
+            CREATE VIRTUAL TABLE vss_embeddings USING vss0
+            (
+                embedding(4096),
+            );
+        SQL);
+    }
+
+    protected function execute(InputInterface $input, OutputInterface $output): int
+    {
+        $kNearestNeighbors = new KNearestNeighbors(k: 10, weighted: true);
+        $kNearestNeighbors->train($this->generateDataset());
+
+        $probas = $kNearestNeighbors->probaSample($this->getEmbedding('how to add a controller'));
+
+        foreach ($probas as $label => $proba) {
+            if ($proba > 0) {
+                // Report through the console output (raw, so the chunk text is
+                // never interpreted as formatting tags) instead of var_dump().
+                $output->writeln((string) $label, OutputInterface::OUTPUT_RAW);
+            }
+        }
+
+        return Command::SUCCESS;
+    }
+
+    /**
+     * Returns the node's child text with tags stripped and whitespace trimmed.
+     */
+    private function extractNodeTextContent(Node $node): string
+    {
+        return trim(strip_tags(StringContainerHelper::getChildText($node)));
+    }
+
+    /**
+     * Yields text chunks from every static page whose content type is not HTML.
+     *
+     * @return Generator<non-empty-string>
+     */
+    private function generateChunks(): Generator
+    {
+        foreach ($this->staticPageAggregate->staticPages as $staticPage) {
+            if (StaticPageContentType::Html === $staticPage->frontMatter->contentType) {
+                continue;
+            }
+
+            $document = $this->staticPageMarkdownParser->converter->convert($staticPage->content)->getDocument();
+
+            yield from $this->generateChunksFromNodeChildren($document);
+        }
+    }
+
+    /**
+     * Yields the text of each child node, skipping headings and fenced code
+     * blocks and descending into lists.
+     *
+     * @return Generator<non-empty-string>
+     */
+    private function generateChunksFromNodeChildren(Node $node): Generator
+    {
+        foreach ($node->children() as $child) {
+            if ($child instanceof Heading || $child instanceof FencedCode) {
+                continue;
+            }
+
+            if ($child instanceof ListBlock) {
+                yield from $this->generateChunksFromNodeChildren($child);
+
+                continue;
+            }
+
+            $textContent = $this->extractNodeTextContent($child);
+
+            // Strict comparison: empty('0') is true and would drop a "0" chunk.
+            if ('' !== $textContent) {
+                yield $textContent;
+            }
+        }
+    }
+
+    /**
+     * Stores every chunk embedding in the VSS table and returns all of them
+     * as a dataset labeled with the originating chunk text.
+     */
+    private function generateDataset(): Labeled
+    {
+        $samples = [];
+        $labels = [];
+
+        $insertEmbedding = $this->embeddingsDatabase->prepare(<<<'SQL'
+            INSERT INTO vss_embeddings (rowid, embedding)
+            VALUES (:rowid, :embedding)
+        SQL);
+
+        try {
+            foreach ($this->generateEmbeddings() as $chunk => $embedding) {
+                // Rows get sequential ids 1, 2, 3...; lastInsertRowID() is the id
+                // of the previous insert, so reusing it would repeat a rowid.
+                $insertEmbedding->bindValue(':rowid', count($samples) + 1, SQLITE3_INTEGER);
+                $insertEmbedding->bindValue(':embedding', $this->jsonSerializer->serialize($embedding));
+                $insertEmbedding->execute();
+                $insertEmbedding->reset();
+
+                $samples[] = $embedding;
+                $labels[] = $chunk;
+            }
+        } finally {
+            $insertEmbedding->close();
+        }
+
+        return new Labeled($samples, $labels);
+    }
+
+    /**
+     * @return Generator<non-empty-string,list<float>>
+     */
+    private function generateEmbeddings(): Generator
+    {
+        foreach ($this->generateChunks() as $chunk) {
+            yield $chunk => $this->getEmbedding($chunk);
+        }
+    }
+
+    /**
+     * @param non-empty-string $label
+     *
+     * @return list<float>
+     */
+    private function getEmbedding(string $label): array
+    {
+        return $this->llamaCppClient->generateEmbedding(new LlamaCppEmbeddingRequest($label))->embedding;
+    }
+}
diff --git a/src/LlamaCppEmbeddingRequest.php b/src/LlamaCppEmbeddingRequest.php
index 5fc8d8c9..1ea1874a 100644
--- a/src/LlamaCppEmbeddingRequest.php
+++ b/src/LlamaCppEmbeddingRequest.php
@@ -13,8 +13,7 @@ readonly class LlamaCppEmbeddingRequest implements JsonSerializable
      */
     public function __construct(
         public string $content,
-    ) {
-    }
+    ) {}
 
     public function jsonSerialize(): array
     {
diff --git a/src/LlmPromptTemplate.php b/src/LlmPromptTemplate.php
new file mode 100644
index 00000000..4a24bb7d
--- /dev/null
+++ b/src/LlmPromptTemplate.php
@@ -0,0 +1,16 @@
+<?php
+
+declare(strict_types=1);
+
+namespace Distantmagic\Resonance;
+
+use JsonSerializable;
+use Stringable;
+
+abstract readonly class LlmPromptTemplate implements JsonSerializable, Stringable
+{
+    public function jsonSerialize(): string
+    {
+        return $this->__toString(); // JSON form is the rendered prompt text
+    }
+}
diff --git a/src/LlmPromptTemplate/MistralInstructChat.php b/src/LlmPromptTemplate/MistralInstructChat.php
new file mode 100644
index 00000000..708cd97e
--- /dev/null
+++ b/src/LlmPromptTemplate/MistralInstructChat.php
@@ -0,0 +1,23 @@
+<?php
+
+declare(strict_types=1);
+
+namespace Distantmagic\Resonance\LlmPromptTemplate;
+
+use Distantmagic\Resonance\LlmPromptTemplate;
+
+/**
+ * Prompt template that wraps the prompt in `[INST]` and `[/INST]` markers.
+ */
+readonly class MistralInstructChat extends LlmPromptTemplate
+{
+    /**
+     * @param non-empty-string $prompt text placed between the markers
+     */
+    public function __construct(private string $prompt) {}
+
+    public function __toString(): string
+    {
+        return sprintf('[INST]%s[/INST]', $this->prompt);
+    }
+}
diff --git a/src/LlmPromptTemplate/Phi2Question.php b/src/LlmPromptTemplate/Phi2Question.php
new file mode 100644
index 00000000..b4c75401
--- /dev/null
+++ b/src/LlmPromptTemplate/Phi2Question.php
@@ -0,0 +1,23 @@
+<?php
+
+declare(strict_types=1);
+
+namespace Distantmagic\Resonance\LlmPromptTemplate;
+
+use Distantmagic\Resonance\LlmPromptTemplate;
+
+/**
+ * Prompt template that renders a `Question:` line and an open `Answer: ` line.
+ */
+readonly class Phi2Question extends LlmPromptTemplate
+{
+    /**
+     * @param non-empty-string $prompt text inserted after `Question: `
+     */
+    public function __construct(private string $prompt) {}
+
+    public function __toString(): string
+    {
+        return sprintf("Question: %s\nAnswer: ", $this->prompt);
+    }
+}
diff --git a/src/LlmPromptTemplate/Plain.php b/src/LlmPromptTemplate/Plain.php
new file mode 100644
index 00000000..e2074570
--- /dev/null
+++ b/src/LlmPromptTemplate/Plain.php
@@ -0,0 +1,20 @@
+<?php
+
+declare(strict_types=1);
+
+namespace Distantmagic\Resonance\LlmPromptTemplate;
+
+use Distantmagic\Resonance\LlmPromptTemplate;
+
+/** Prompt template that passes the prompt through unchanged. */
+readonly class Plain extends LlmPromptTemplate
+{
+    /** @param non-empty-string $prompt text returned verbatim */
+    public function __construct(private string $prompt) {}
+
+    public function __toString(): string
+    {
+        // No markers or prefixes are added around the prompt.
+        return $this->prompt;
+    }
+}
diff --git a/src/SQLiteVSSConfiguration.php b/src/SQLiteVSSConfiguration.php
new file mode 100644
index 00000000..8117fb76
--- /dev/null
+++ b/src/SQLiteVSSConfiguration.php
@@ -0,0 +1,17 @@
+<?php
+
+declare(strict_types=1);
+
+namespace Distantmagic\Resonance;
+
+use SensitiveParameter;
+
+/** Holds the filenames of the `vector0` and `vss0` SQLite extensions. */
+readonly class SQLiteVSSConfiguration
+{
+    public function __construct(
+        // Both names are handed to SQLite3::loadExtension() by the connection builder.
+        #[SensitiveParameter] public string $extensionVector0,
+        #[SensitiveParameter] public string $extensionVss0,
+    ) {}
+}
diff --git a/src/SQLiteVSSConnectionBuilder.php b/src/SQLiteVSSConnectionBuilder.php
new file mode 100644
index 00000000..be27327a
--- /dev/null
+++ b/src/SQLiteVSSConnectionBuilder.php
@@ -0,0 +1,58 @@
+<?php
+
+declare(strict_types=1);
+
+namespace Distantmagic\Resonance;
+
+use Distantmagic\Resonance\Attribute\Singleton;
+use RuntimeException;
+use SensitiveParameter;
+use SQLite3;
+
+/**
+ * Opens SQLite3 connections that have the SQLite-VSS extensions loaded.
+ */
+#[Singleton]
+readonly class SQLiteVSSConnectionBuilder
+{
+    public function __construct(private SQLiteVSSConfiguration $sqliteVSSConfiguration) {}
+
+    /**
+     * Creates a connection from the given arguments (passed unchanged to the
+     * SQLite3 constructor) and loads the configured `vector0` and `vss0`
+     * extensions into it, in that order.
+     *
+     * @throws RuntimeException when either extension cannot be loaded
+     */
+    public function buildConnection(
+        string $filename,
+        int $flags = SQLITE3_OPEN_READWRITE | SQLITE3_OPEN_CREATE,
+        #[SensitiveParameter]
+        string $encryptionKey = '',
+    ): SQLite3 {
+        $sqlite = new SQLite3($filename, $flags, $encryptionKey);
+
+        $this->loadExtension($sqlite, 'vector0', $this->sqliteVSSConfiguration->extensionVector0);
+        $this->loadExtension($sqlite, 'vss0', $this->sqliteVSSConfiguration->extensionVss0);
+
+        return $sqlite;
+    }
+
+    /**
+     * Loads a single extension. SQLite3::loadExtension() reports failure by
+     * returning false, which would otherwise go unnoticed and hand the caller
+     * a connection without vector search support.
+     *
+     * @throws RuntimeException when the extension cannot be loaded
+     */
+    private function loadExtension(
+        SQLite3 $sqlite,
+        string $label,
+        #[SensitiveParameter]
+        string $extensionFilename,
+    ): void {
+        if (!$sqlite->loadExtension($extensionFilename)) {
+            throw new RuntimeException(sprintf('Unable to load SQLite extension "%s"', $label));
+        }
+    }
+}
diff --git a/src/SingletonProvider/ConfigurationProvider/SQLiteVSSConfigurationProvider.php b/src/SingletonProvider/ConfigurationProvider/SQLiteVSSConfigurationProvider.php
new file mode 100644
index 00000000..65d02893
--- /dev/null
+++ b/src/SingletonProvider/ConfigurationProvider/SQLiteVSSConfigurationProvider.php
@@ -0,0 +1,51 @@
+<?php
+
+declare(strict_types=1);
+
+namespace Distantmagic\Resonance\SingletonProvider\ConfigurationProvider;
+
+use Distantmagic\Resonance\Attribute\Singleton;
+use Distantmagic\Resonance\JsonSchema;
+use Distantmagic\Resonance\SingletonProvider\ConfigurationProvider;
+use Distantmagic\Resonance\SQLiteVSSConfiguration;
+
+/**
+ * Builds SQLiteVSSConfiguration from the `sqlite-vss` configuration section.
+ *
+ * @template-extends ConfigurationProvider<SQLiteVSSConfiguration, object{
+ *     extension_vector0: string,
+ *     extension_vss0: string,
+ * }>
+ */
+#[Singleton(provides: SQLiteVSSConfiguration::class)]
+final readonly class SQLiteVSSConfigurationProvider extends ConfigurationProvider
+{
+    public function getSchema(): JsonSchema
+    {
+        // Both extension filenames are mandatory, non-empty strings.
+        $nonEmptyString = ['type' => 'string', 'minLength' => 1];
+
+        return new JsonSchema([
+            'type' => 'object',
+            'properties' => [
+                'extension_vector0' => $nonEmptyString,
+                'extension_vss0' => $nonEmptyString,
+            ],
+            'required' => ['extension_vector0', 'extension_vss0'],
+        ]);
+    }
+
+    protected function getConfigurationKey(): string
+    {
+        return 'sqlite-vss';
+    }
+
+    protected function provideConfiguration($validatedData): SQLiteVSSConfiguration
+    {
+        // Map the snake_case configuration keys onto the constructor arguments.
+        return new SQLiteVSSConfiguration(
+            extensionVector0: $validatedData->extension_vector0,
+            extensionVss0: $validatedData->extension_vss0,
+        );
+    }
+}
-- 
GitLab