From e70da8ce29f6551349c320d96fbd55cb4d0dfd33 Mon Sep 17 00:00:00 2001
From: Mateusz Charytoniuk <mateusz.charytoniuk@protonmail.com>
Date: Thu, 25 Jan 2024 07:35:44 +0100
Subject: [PATCH] chore: sqlite-vss draft

---
 .../pages/docs/features/vector-store/index.md |  17 ++
 .../features/vector-store/sqlite-vss/index.md |  74 +++++++
 resources/css/docs-page-homepage.css          |   2 +-
 src/Command/StaticPagesMakeEmbeddings.php     | 224 ++++++++++++++++++
 src/LlamaCppEmbeddingRequest.php              |   3 +-
 src/LlmPromptTemplate.php                     |  16 ++
 src/LlmPromptTemplate/MistralInstructChat.php |  23 +++
 src/LlmPromptTemplate/Phi2Question.php        |  23 +++
 src/LlmPromptTemplate/Plain.php               |  20 ++
 src/SQLiteVSSConfiguration.php                |  17 ++
 src/SQLiteVSSConnectionBuilder.php            |  26 +++
 .../SQLiteVSSConfigurationProvider.php        |  51 +++++
 12 files changed, 493 insertions(+), 3 deletions(-)
 create mode 100644 docs/pages/docs/features/vector-store/index.md
 create mode 100644 docs/pages/docs/features/vector-store/sqlite-vss/index.md
 create mode 100644 src/Command/StaticPagesMakeEmbeddings.php
 create mode 100644 src/LlmPromptTemplate.php
 create mode 100644 src/LlmPromptTemplate/MistralInstructChat.php
 create mode 100644 src/LlmPromptTemplate/Phi2Question.php
 create mode 100644 src/LlmPromptTemplate/Plain.php
 create mode 100644 src/SQLiteVSSConfiguration.php
 create mode 100644 src/SQLiteVSSConnectionBuilder.php
 create mode 100644 src/SingletonProvider/ConfigurationProvider/SQLiteVSSConfigurationProvider.php

diff --git a/docs/pages/docs/features/vector-store/index.md b/docs/pages/docs/features/vector-store/index.md
new file mode 100644
index 00000000..c8cae071
--- /dev/null
+++ b/docs/pages/docs/features/vector-store/index.md
@@ -0,0 +1,17 @@
+---
+collections:
+    - documents
+draft: true
+layout: dm:document
+parent: docs/features/index
+title: Vector Store
+description: >
+    Vector stores can be used to store and search large vectors, such as LLM
+    embeddings.
+---
+
+# Vector Store
+
+Explore Resonance's integrations with vector stores.
+
+{{docs/features/vector-store/*/index}}
diff --git a/docs/pages/docs/features/vector-store/sqlite-vss/index.md b/docs/pages/docs/features/vector-store/sqlite-vss/index.md
new file mode 100644
index 00000000..ec5380e4
--- /dev/null
+++ b/docs/pages/docs/features/vector-store/sqlite-vss/index.md
@@ -0,0 +1,74 @@
+---
+collections:
+    - documents
+draft: true
+layout: dm:document
+parent: docs/features/vector-store/index
+title: SQLite-VSS
+description: >
+    SQLite Extension for vector search
+---
+
+# SQLite-VSS
+
+[SQLite-VSS](https://github.com/asg017/sqlite-vss) is a SQLite extension for
+vector search. It supports similarity search and uses
+[FAISS](https://faiss.ai/) under the hood.
+
+# Usage
+
+## Installation
+
+You need the PHP `sqlite3` extension.
+
+To use SQLite with SQLite-VSS you need to load two additional extensions:
+`vector0.so` and `vss0.so`. They are both available on the
+[SQLite-VSS](https://github.com/asg017/sqlite-vss/releases) releases page.
+
+After downloading them, you need to enable SQLite extensions in your `php.ini`
+file. Look for a similar section:
+
+```ini
+[sqlite3]
+; Directory pointing to SQLite3 extensions
+; https://php.net/sqlite3.extension-dir
+;sqlite3.extension_dir =
+```
+
+You must configure a directory in `sqlite3.extension_dir`, and then put those
+extensions into that directory.
+
+## Configuration
+
+You can configure extension filenames. Both are relative to the
+`sqlite3.extension_dir` directory and both must be inside that directory:
+
+```ini file:config.ini
+[sqlite-vss]
+extension_vector0 = vector0.so
+extension_vss0 = vss0.so
+```
+
+## Creating Database Object
+
+In your class you can then use `SQLiteVSSConnectionBuilder` to create an
+`SQLite3` instance with VSS extensions loaded:
+
+```php
+use Distantmagic\Resonance\SQLiteVSSConnectionBuilder;
+use SQLite3;
+
+class MyClass
+{
+    private SQLite3 $sqlite;
+
+    public function __construct(SQLiteVSSConnectionBuilder $builder)
+    {
+        $this->sqlite = $builder->buildConnection(':memory:');
+
+        // This should select the currently installed SQLite-VSS version if
+        // everything is installed correctly.
+        $this->sqlite->query('SELECT vss_version()')->fetchArray();
+    }
+}
+```
diff --git a/resources/css/docs-page-homepage.css b/resources/css/docs-page-homepage.css
index 670b0145..2f2a0d6a 100644
--- a/resources/css/docs-page-homepage.css
+++ b/resources/css/docs-page-homepage.css
@@ -91,7 +91,7 @@ h2.homepage__example__title {
 }
 
 .homepage__title::before {
-  background-image: url('../images/resonance_cat.webp');
+  background-image: url("../images/resonance_cat.webp");
   background-position: center;
   background-size: cover;
   border-radius: 50%;
diff --git a/src/Command/StaticPagesMakeEmbeddings.php b/src/Command/StaticPagesMakeEmbeddings.php
new file mode 100644
index 00000000..2da64878
--- /dev/null
+++ b/src/Command/StaticPagesMakeEmbeddings.php
@@ -0,0 +1,224 @@
+<?php
+
+declare(strict_types=1);
+
+namespace Distantmagic\Resonance\Command;
+
+use Distantmagic\Resonance\Attribute\ConsoleCommand;
+use Distantmagic\Resonance\Command;
+use Distantmagic\Resonance\JsonSerializer;
+use Distantmagic\Resonance\LlamaCppClient;
+use Distantmagic\Resonance\LlamaCppEmbeddingRequest;
+use Distantmagic\Resonance\SQLiteVSSConnectionBuilder;
+use Distantmagic\Resonance\StaticPageAggregate;
+use Distantmagic\Resonance\StaticPageContentType;
+use Distantmagic\Resonance\StaticPageMarkdownParser;
+use Generator;
+use League\CommonMark\Extension\CommonMark\Node\Block\FencedCode;
+use League\CommonMark\Extension\CommonMark\Node\Block\Heading;
+use League\CommonMark\Extension\CommonMark\Node\Block\ListBlock;
+use League\CommonMark\Node\Node;
+use League\CommonMark\Node\StringContainerHelper;
+use Rubix\ML\Classifiers\KNearestNeighbors;
+use Rubix\ML\Datasets\Labeled;
+use SQLite3;
+use Symfony\Component\Console\Input\InputInterface;
+use Symfony\Component\Console\Output\OutputInterface;
+
+/**
+ * Draft command: embeds the text of every non-HTML static page with
+ * llama.cpp, stores the vectors in an in-memory SQLite-VSS table and prints
+ * the chunks a k-nearest-neighbors classifier matches to a sample query.
+ */
+#[ConsoleCommand(
+    name: 'static-pages:make-embeddings',
+    description: 'Create embeddings from static pages contents (requires llama.cpp)'
+)]
+final class StaticPagesMakeEmbeddings extends Command
+{
+    private SQLite3 $embeddingsDatabase;
+
+    public function __construct(
+        private JsonSerializer $jsonSerializer,
+        private LlamaCppClient $llamaCppClient,
+        private StaticPageAggregate $staticPageAggregate,
+        private SQLiteVSSConnectionBuilder $sqliteVSSConnectionBuilder,
+        private StaticPageMarkdownParser $staticPageMarkdownParser,
+    ) {
+        parent::__construct();
+
+        // The vector table lives in memory only, so it is rebuilt on every run.
+        $this->embeddingsDatabase = $sqliteVSSConnectionBuilder->buildConnection(':memory:');
+        $this->embeddingsDatabase->enableExceptions(true);
+        $this->embeddingsDatabase->exec(<<<'SQL'
+            CREATE VIRTUAL TABLE vss_embeddings USING vss0
+            (
+                embedding(4096),
+            );
+        SQL);
+    }
+
+    /**
+     * Trains a k-nearest-neighbors classifier on the page embeddings and
+     * prints every chunk with a non-zero probability for the sample query.
+     */
+    protected function execute(InputInterface $input, OutputInterface $output): int
+    {
+        $kNearestNeighbors = new KNearestNeighbors(
+            k: 10,
+            weighted: true,
+        );
+        $kNearestNeighbors->train($this->generateDataset());
+
+        $probas = $kNearestNeighbors->probaSample($this->getEmbedding('how to add a controller'));
+
+        foreach ($probas as $label => $proba) {
+            if ($proba > 0) {
+                // Labels that look numeric come back as integer array keys,
+                // hence the cast. Raw output keeps angle brackets in a chunk
+                // from being parsed as console formatting tags.
+                $output->writeln((string) $label, OutputInterface::OUTPUT_RAW);
+            }
+        }
+
+        return Command::SUCCESS;
+    }
+
+    /**
+     * Returns the node's child text with tags stripped and whitespace trimmed.
+     */
+    private function extractNodeTextContent(Node $node): string
+    {
+        $childTextContent = StringContainerHelper::getChildText($node);
+
+        return trim(strip_tags($childTextContent));
+    }
+
+    /**
+     * Yields text chunks from every static page whose content is not HTML.
+     *
+     * @return Generator<non-empty-string>
+     */
+    private function generateChunks(): Generator
+    {
+        foreach ($this->staticPageAggregate->staticPages as $staticPage) {
+            if (StaticPageContentType::Html === $staticPage->frontMatter->contentType) {
+                continue;
+            }
+
+            $document = $this
+                ->staticPageMarkdownParser
+                ->converter
+                ->convert($staticPage->content)
+                ->getDocument()
+            ;
+
+            yield from $this->generateChunksFromNodeChildren($document);
+        }
+    }
+
+    /**
+     * Yields the text of each child node, skipping headings and fenced code
+     * and descending into lists so that each list child is its own chunk.
+     *
+     * @return Generator<non-empty-string>
+     */
+    private function generateChunksFromNodeChildren(Node $node): Generator
+    {
+        foreach ($node->children() as $child) {
+            if ($child instanceof Heading) {
+                continue;
+            }
+
+            if ($child instanceof FencedCode) {
+                continue;
+            }
+
+            if ($child instanceof ListBlock) {
+                yield from $this->generateChunksFromNodeChildren($child);
+
+                continue;
+            }
+
+            $textContent = $this->extractNodeTextContent($child);
+
+            // Strict comparison: `empty()` would also discard the chunk '0'.
+            if ('' !== $textContent) {
+                yield $textContent;
+            }
+        }
+    }
+
+    /**
+     * Stores every chunk embedding in the vector table and returns the same
+     * embeddings as a dataset labeled with the chunks they were made from.
+     */
+    private function generateDataset(): Labeled
+    {
+        $samples = [];
+        $labels = [];
+
+        // Prepared once and reused for every chunk.
+        $insertEmbedding = $this->embeddingsDatabase->prepare(<<<'SQL'
+            INSERT INTO vss_embeddings
+            (
+                rowid,
+                embedding
+            )
+            VALUES
+            (
+                :rowid,
+                :embedding
+            )
+        SQL);
+
+        // Rowids are counted explicitly: `lastInsertRowID()` reports the id
+        // of the previous insert, so it never yields the next free one.
+        $rowId = 0;
+
+        foreach ($this->generateEmbeddings() as $chunk => $embedding) {
+            ++$rowId;
+
+            $insertEmbedding->bindValue(':rowid', $rowId, SQLITE3_INTEGER);
+            $insertEmbedding->bindValue(
+                ':embedding',
+                $this->jsonSerializer->serialize($embedding),
+            );
+            $insertEmbedding->execute();
+            $insertEmbedding->reset();
+
+            $samples[] = $embedding;
+            $labels[] = $chunk;
+        }
+
+        $insertEmbedding->close();
+
+        return new Labeled($samples, $labels);
+    }
+
+    /**
+     * Yields each text chunk together with its embedding.
+     *
+     * @return Generator<non-empty-string,list<float>>
+     */
+    private function generateEmbeddings(): Generator
+    {
+        foreach ($this->generateChunks() as $chunk) {
+            yield $chunk => $this->getEmbedding($chunk);
+        }
+    }
+
+    /**
+     * Requests the embedding of the given text from llama.cpp.
+     *
+     * @param non-empty-string $label
+     *
+     * @return list<float>
+     */
+    private function getEmbedding(string $label): array
+    {
+        $request = new LlamaCppEmbeddingRequest($label);
+
+        return $this->llamaCppClient->generateEmbedding($request)->embedding;
+    }
+}
diff --git a/src/LlamaCppEmbeddingRequest.php b/src/LlamaCppEmbeddingRequest.php
index 5fc8d8c9..1ea1874a 100644
--- a/src/LlamaCppEmbeddingRequest.php
+++ b/src/LlamaCppEmbeddingRequest.php
@@ -13,8 +13,7 @@ readonly class LlamaCppEmbeddingRequest implements JsonSerializable
      */
     public function __construct(
         public string $content,
-    ) {
-    }
+    ) {}
 
     public function jsonSerialize(): array
     {
diff --git a/src/LlmPromptTemplate.php b/src/LlmPromptTemplate.php
new file mode 100644
index 00000000..4a24bb7d
--- /dev/null
+++ b/src/LlmPromptTemplate.php
@@ -0,0 +1,16 @@
+<?php
+
+declare(strict_types=1);
+
+namespace Distantmagic\Resonance;
+
+use JsonSerializable;
+use Stringable;
+
+abstract readonly class LlmPromptTemplate implements JsonSerializable, Stringable
+{
+    public function jsonSerialize(): string
+    {
+        return (string) $this;
+    }
+}
diff --git a/src/LlmPromptTemplate/MistralInstructChat.php b/src/LlmPromptTemplate/MistralInstructChat.php
new file mode 100644
index 00000000..708cd97e
--- /dev/null
+++ b/src/LlmPromptTemplate/MistralInstructChat.php
@@ -0,0 +1,23 @@
+<?php
+
+declare(strict_types=1);
+
+namespace Distantmagic\Resonance\LlmPromptTemplate;
+
+use Distantmagic\Resonance\LlmPromptTemplate;
+
+readonly class MistralInstructChat extends LlmPromptTemplate
+{
+    /**
+     * @param non-empty-string $prompt
+     */
+    public function __construct(private string $prompt) {}
+
+    public function __toString(): string
+    {
+        return sprintf(
+            '[INST]%s[/INST]',
+            $this->prompt,
+        );
+    }
+}
diff --git a/src/LlmPromptTemplate/Phi2Question.php b/src/LlmPromptTemplate/Phi2Question.php
new file mode 100644
index 00000000..b4c75401
--- /dev/null
+++ b/src/LlmPromptTemplate/Phi2Question.php
@@ -0,0 +1,23 @@
+<?php
+
+declare(strict_types=1);
+
+namespace Distantmagic\Resonance\LlmPromptTemplate;
+
+use Distantmagic\Resonance\LlmPromptTemplate;
+
+readonly class Phi2Question extends LlmPromptTemplate
+{
+    /**
+     * @param non-empty-string $prompt
+     */
+    public function __construct(private string $prompt) {}
+
+    public function __toString(): string
+    {
+        return sprintf(
+            "Question: %s\nAnswer: ",
+            $this->prompt,
+        );
+    }
+}
diff --git a/src/LlmPromptTemplate/Plain.php b/src/LlmPromptTemplate/Plain.php
new file mode 100644
index 00000000..e2074570
--- /dev/null
+++ b/src/LlmPromptTemplate/Plain.php
@@ -0,0 +1,20 @@
+<?php
+
+declare(strict_types=1);
+
+namespace Distantmagic\Resonance\LlmPromptTemplate;
+
+use Distantmagic\Resonance\LlmPromptTemplate;
+
+readonly class Plain extends LlmPromptTemplate
+{
+    /**
+     * @param non-empty-string $prompt
+     */
+    public function __construct(private string $prompt) {}
+
+    public function __toString(): string
+    {
+        return $this->prompt;
+    }
+}
diff --git a/src/SQLiteVSSConfiguration.php b/src/SQLiteVSSConfiguration.php
new file mode 100644
index 00000000..8117fb76
--- /dev/null
+++ b/src/SQLiteVSSConfiguration.php
@@ -0,0 +1,17 @@
+<?php
+
+declare(strict_types=1);
+
+namespace Distantmagic\Resonance;
+
+use SensitiveParameter;
+
+readonly class SQLiteVSSConfiguration
+{
+    public function __construct(
+        #[SensitiveParameter]
+        public string $extensionVector0,
+        #[SensitiveParameter]
+        public string $extensionVss0,
+    ) {}
+}
diff --git a/src/SQLiteVSSConnectionBuilder.php b/src/SQLiteVSSConnectionBuilder.php
new file mode 100644
index 00000000..be27327a
--- /dev/null
+++ b/src/SQLiteVSSConnectionBuilder.php
@@ -0,0 +1,26 @@
+<?php
+
+declare(strict_types=1);
+
+namespace Distantmagic\Resonance;
+
+use Distantmagic\Resonance\Attribute\Singleton;
+use SQLite3;
+
+#[Singleton]
+readonly class SQLiteVSSConnectionBuilder
+{
+    public function __construct(private SQLiteVSSConfiguration $sqliteVSSConfiguration) {}
+
+    public function buildConnection(
+        string $filename,
+        int $flags = SQLITE3_OPEN_READWRITE | SQLITE3_OPEN_CREATE,
+        string $encryptionKey = '',
+    ): SQLite3 {
+        $sqlite = new SQLite3($filename, $flags, $encryptionKey);
+        $sqlite->loadExtension($this->sqliteVSSConfiguration->extensionVector0);
+        $sqlite->loadExtension($this->sqliteVSSConfiguration->extensionVss0);
+
+        return $sqlite;
+    }
+}
diff --git a/src/SingletonProvider/ConfigurationProvider/SQLiteVSSConfigurationProvider.php b/src/SingletonProvider/ConfigurationProvider/SQLiteVSSConfigurationProvider.php
new file mode 100644
index 00000000..65d02893
--- /dev/null
+++ b/src/SingletonProvider/ConfigurationProvider/SQLiteVSSConfigurationProvider.php
@@ -0,0 +1,51 @@
+<?php
+
+declare(strict_types=1);
+
+namespace Distantmagic\Resonance\SingletonProvider\ConfigurationProvider;
+
+use Distantmagic\Resonance\Attribute\Singleton;
+use Distantmagic\Resonance\JsonSchema;
+use Distantmagic\Resonance\SingletonProvider\ConfigurationProvider;
+use Distantmagic\Resonance\SQLiteVSSConfiguration;
+
+/**
+ * @template-extends ConfigurationProvider<SQLiteVSSConfiguration, object{
+ *     extension_vector0: string,
+ *     extension_vss0: string,
+ * }>
+ */
+#[Singleton(provides: SQLiteVSSConfiguration::class)]
+final readonly class SQLiteVSSConfigurationProvider extends ConfigurationProvider
+{
+    public function getSchema(): JsonSchema
+    {
+        return new JsonSchema([
+            'type' => 'object',
+            'properties' => [
+                'extension_vector0' => [
+                    'type' => 'string',
+                    'minLength' => 1,
+                ],
+                'extension_vss0' => [
+                    'type' => 'string',
+                    'minLength' => 1,
+                ],
+            ],
+            'required' => ['extension_vector0', 'extension_vss0'],
+        ]);
+    }
+
+    protected function getConfigurationKey(): string
+    {
+        return 'sqlite-vss';
+    }
+
+    protected function provideConfiguration($validatedData): SQLiteVSSConfiguration
+    {
+        return new SQLiteVSSConfiguration(
+            extensionVector0: $validatedData->extension_vector0,
+            extensionVss0: $validatedData->extension_vss0,
+        );
+    }
+}
-- 
GitLab