diff --git a/docs/examples/hybrid-layer.ipynb b/docs/examples/hybrid-layer.ipynb
index 8b1da5ae75f0a8a9572996b8e416a282d2c48f1b..89965b4e238bbd92215164429e178057fbf29844 100644
--- a/docs/examples/hybrid-layer.ipynb
+++ b/docs/examples/hybrid-layer.ipynb
@@ -30,11 +30,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
-    "!pip install -qU semantic-router==0.0.6"
+    "# !pip install -qU semantic-router==0.0.6"
    ]
   },
   {
@@ -46,18 +46,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
-     "ename": "ImportError",
-     "evalue": "cannot import name 'Route' from 'semantic_router.schema' (/Users/jakit/customers/aurelio/semantic-router/.venv/lib/python3.11/site-packages/semantic_router/schema.py)",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)",
-      "\u001b[1;32m/Users/jakit/customers/aurelio/semantic-router/docs/examples/hybrid-layer.ipynb Cell 7\u001b[0m line \u001b[0;36m1\n\u001b[0;32m----> <a href='vscode-notebook-cell:/Users/jakit/customers/aurelio/semantic-router/docs/examples/hybrid-layer.ipynb#X10sZmlsZQ%3D%3D?line=0'>1</a>\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39msemantic_router\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mschema\u001b[39;00m \u001b[39mimport\u001b[39;00m Route\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/jakit/customers/aurelio/semantic-router/docs/examples/hybrid-layer.ipynb#X10sZmlsZQ%3D%3D?line=2'>3</a>\u001b[0m politics \u001b[39m=\u001b[39m Route(\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/jakit/customers/aurelio/semantic-router/docs/examples/hybrid-layer.ipynb#X10sZmlsZQ%3D%3D?line=3'>4</a>\u001b[0m name\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mpolitics\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/jakit/customers/aurelio/semantic-router/docs/examples/hybrid-layer.ipynb#X10sZmlsZQ%3D%3D?line=4'>5</a>\u001b[0m utterances\u001b[39m=\u001b[39m[\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/jakit/customers/aurelio/semantic-router/docs/examples/hybrid-layer.ipynb#X10sZmlsZQ%3D%3D?line=11'>12</a>\u001b[0m ],\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/jakit/customers/aurelio/semantic-router/docs/examples/hybrid-layer.ipynb#X10sZmlsZQ%3D%3D?line=12'>13</a>\u001b[0m )\n",
-      "\u001b[0;31mImportError\u001b[0m: cannot import name 'Route' from 'semantic_router.schema' (/Users/jakit/customers/aurelio/semantic-router/.venv/lib/python3.11/site-packages/semantic_router/schema.py)"
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/danielgriffiths/Coding_files/Aurelio_local/semantic-router/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      " from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
@@ -86,21 +83,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
-    "chitchat = Route(\n",
-    "    name=\"chitchat\",\n",
-    "    utterances=[\n",
-    "        \"how's the weather today?\",\n",
-    "        \"how are things going?\",\n",
-    "        \"lovely weather today\",\n",
-    "        \"the weather is horrendous\",\n",
-    "        \"let's go to the chippy\",\n",
-    "    ],\n",
-    ")\n",
-    "\n",
     "chitchat = Route(\n",
     "    name=\"chitchat\",\n",
     "    utterances=[\n",
@@ -124,19 +110,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
     "import os\n",
-    "from semantic_router.encoders import CohereEncoder\n",
+    "from semantic_router.encoders import CohereEncoder, BM25Encoder, TfidfEncoder\n",
     "from getpass import getpass\n",
     "\n",
     "os.environ[\"COHERE_API_KEY\"] = os.environ[\"COHERE_API_KEY\"] or getpass(\n",
     "    \"Enter Cohere API Key: \"\n",
     ")\n",
     "\n",
-    "encoder = CohereEncoder()"
+    "dense_encoder = CohereEncoder()\n",
+    "# sparse_encoder = BM25Encoder()\n",
+    "sparse_encoder = TfidfEncoder()"
    ]
   },
   {
@@ -148,33 +136,110 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|██████████| 2/2 [00:00<00:00, 2.58it/s]\n"
+     ]
+    }
+   ],
    "source": [
     "from semantic_router.hybrid_layer import HybridRouteLayer\n",
     "\n",
-    "dl = HybridRouteLayer(encoder=encoder, routes=routes)"
+    "dl = HybridRouteLayer(dense_encoder=dense_encoder, sparse_encoder=sparse_encoder, routes=routes)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'politics'"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "dl(\"don't you love politics?\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'chitchat'"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "dl(\"how's the weather today?\")"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "religion = Route(\n",
+    "    name=\"religion\",\n",
+    "    utterances=[\n",
+    "        \"what do you know about Buddhism?\",\n",
+    "        \"tell me about Christianity\",\n",
+    "        \"explain the principles of Hinduism\",\n",
+    "        \"describe the teachings of Islam\",\n",
+    "        \"what are the main beliefs of Judaism?\",\n",
+    "    ],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dl.add(religion)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'religion'"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "dl(\"what do you think of Hinduism?\")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -199,7 +264,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.3"
+   "version": "3.11.5"
   }
  },
 "nbformat": 4,
diff --git a/poetry.lock b/poetry.lock
index f5d58647509f00de61c09fd94a65fefc965b4453..b6fff4703b0e67f0bbb4d2b130fdeef485f18fce 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1916,6 +1916,95 @@ files = [
     {file = "ruff-0.1.8.tar.gz", hash = "sha256:f7ee467677467526cfe135eab86a40a0e8db43117936ac4f9b469ce9cdb3fb62"},
 ]
 
+[[package]]
+name = "scikit-learn"
+version = "1.3.2"
+description = "A set of python modules for machine learning and data mining"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "scikit-learn-1.3.2.tar.gz", hash = "sha256:a2f54c76accc15a34bfb9066e6c7a56c1e7235dda5762b990792330b52ccfb05"},
+    {file = "scikit_learn-1.3.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e326c0eb5cf4d6ba40f93776a20e9a7a69524c4db0757e7ce24ba222471ee8a1"},
+    {file = "scikit_learn-1.3.2-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:535805c2a01ccb40ca4ab7d081d771aea67e535153e35a1fd99418fcedd1648a"},
+    {file = "scikit_learn-1.3.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1215e5e58e9880b554b01187b8c9390bf4dc4692eedeaf542d3273f4785e342c"},
+    {file = "scikit_learn-1.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0ee107923a623b9f517754ea2f69ea3b62fc898a3641766cb7deb2f2ce450161"},
+    {file = "scikit_learn-1.3.2-cp310-cp310-win_amd64.whl", hash = "sha256:35a22e8015048c628ad099da9df5ab3004cdbf81edc75b396fd0cff8699ac58c"},
+    {file = "scikit_learn-1.3.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6fb6bc98f234fda43163ddbe36df8bcde1d13ee176c6dc9b92bb7d3fc842eb66"},
+    {file = "scikit_learn-1.3.2-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:18424efee518a1cde7b0b53a422cde2f6625197de6af36da0b57ec502f126157"},
+    {file = "scikit_learn-1.3.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3271552a5eb16f208a6f7f617b8cc6d1f137b52c8a1ef8edf547db0259b2c9fb"},
+    {file = "scikit_learn-1.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc4144a5004a676d5022b798d9e573b05139e77f271253a4703eed295bde0433"},
+    {file = "scikit_learn-1.3.2-cp311-cp311-win_amd64.whl", hash = "sha256:67f37d708f042a9b8d59551cf94d30431e01374e00dc2645fa186059c6c5d78b"},
+    {file = "scikit_learn-1.3.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:8db94cd8a2e038b37a80a04df8783e09caac77cbe052146432e67800e430c028"},
+    {file = "scikit_learn-1.3.2-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:61a6efd384258789aa89415a410dcdb39a50e19d3d8410bd29be365bcdd512d5"},
+    {file = "scikit_learn-1.3.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cb06f8dce3f5ddc5dee1715a9b9f19f20d295bed8e3cd4fa51e1d050347de525"},
+    {file = "scikit_learn-1.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5b2de18d86f630d68fe1f87af690d451388bb186480afc719e5f770590c2ef6c"},
+    {file = "scikit_learn-1.3.2-cp312-cp312-win_amd64.whl", hash = "sha256:0402638c9a7c219ee52c94cbebc8fcb5eb9fe9c773717965c1f4185588ad3107"},
+    {file = "scikit_learn-1.3.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:a19f90f95ba93c1a7f7924906d0576a84da7f3b2282ac3bfb7a08a32801add93"},
+    {file = "scikit_learn-1.3.2-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:b8692e395a03a60cd927125eef3a8e3424d86dde9b2370d544f0ea35f78a8073"},
+    {file = "scikit_learn-1.3.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:15e1e94cc23d04d39da797ee34236ce2375ddea158b10bee3c343647d615581d"},
+    {file = "scikit_learn-1.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:785a2213086b7b1abf037aeadbbd6d67159feb3e30263434139c98425e3dcfcf"},
+    {file = "scikit_learn-1.3.2-cp38-cp38-win_amd64.whl", hash = "sha256:64381066f8aa63c2710e6b56edc9f0894cc7bf59bd71b8ce5613a4559b6145e0"},
+    {file = "scikit_learn-1.3.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6c43290337f7a4b969d207e620658372ba3c1ffb611f8bc2b6f031dc5c6d1d03"},
+    {file = "scikit_learn-1.3.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:dc9002fc200bed597d5d34e90c752b74df516d592db162f756cc52836b38fe0e"},
+    {file = "scikit_learn-1.3.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1d08ada33e955c54355d909b9c06a4789a729977f165b8bae6f225ff0a60ec4a"},
+    {file = "scikit_learn-1.3.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:763f0ae4b79b0ff9cca0bf3716bcc9915bdacff3cebea15ec79652d1cc4fa5c9"},
+    {file = "scikit_learn-1.3.2-cp39-cp39-win_amd64.whl", hash = "sha256:ed932ea780517b00dae7431e031faae6b49b20eb6950918eb83bd043237950e0"},
+]
+
+[package.dependencies]
+joblib = ">=1.1.1"
+numpy = ">=1.17.3,<2.0"
+scipy = ">=1.5.0"
+threadpoolctl = ">=2.0.0"
+
+[package.extras]
+benchmark = ["matplotlib (>=3.1.3)", "memory-profiler (>=0.57.0)", "pandas (>=1.0.5)"]
+docs = ["Pillow (>=7.1.2)", "matplotlib (>=3.1.3)", "memory-profiler (>=0.57.0)", "numpydoc (>=1.2.0)", "pandas (>=1.0.5)", "plotly (>=5.14.0)", "pooch (>=1.6.0)", "scikit-image (>=0.16.2)", "seaborn (>=0.9.0)", "sphinx (>=6.0.0)", "sphinx-copybutton (>=0.5.2)", "sphinx-gallery (>=0.10.1)", "sphinx-prompt (>=1.3.0)", "sphinxext-opengraph (>=0.4.2)"]
+examples = ["matplotlib (>=3.1.3)", "pandas (>=1.0.5)", "plotly (>=5.14.0)", "pooch (>=1.6.0)", "scikit-image (>=0.16.2)", "seaborn (>=0.9.0)"]
+tests = ["black (>=23.3.0)", "matplotlib (>=3.1.3)", "mypy (>=1.3)", "numpydoc (>=1.2.0)", "pandas (>=1.0.5)", "pooch (>=1.6.0)", "pyamg (>=4.0.0)", "pytest (>=7.1.2)", "pytest-cov (>=2.9.0)", "ruff (>=0.0.272)", "scikit-image (>=0.16.2)"]
+
+[[package]]
+name = "scipy"
+version = "1.11.4"
+description = "Fundamental algorithms for scientific computing in Python"
+optional = false
+python-versions = ">=3.9"
+files = [
+    {file = "scipy-1.11.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:bc9a714581f561af0848e6b69947fda0614915f072dfd14142ed1bfe1b806710"},
+    {file = "scipy-1.11.4-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:cf00bd2b1b0211888d4dc75656c0412213a8b25e80d73898083f402b50f47e41"},
+    {file = "scipy-1.11.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b9999c008ccf00e8fbcce1236f85ade5c569d13144f77a1946bef8863e8f6eb4"},
+    {file = "scipy-1.11.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:933baf588daa8dc9a92c20a0be32f56d43faf3d1a60ab11b3f08c356430f6e56"},
+    {file = "scipy-1.11.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8fce70f39076a5aa62e92e69a7f62349f9574d8405c0a5de6ed3ef72de07f446"},
+    {file = "scipy-1.11.4-cp310-cp310-win_amd64.whl", hash = "sha256:6550466fbeec7453d7465e74d4f4b19f905642c89a7525571ee91dd7adabb5a3"},
+    {file = "scipy-1.11.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f313b39a7e94f296025e3cffc2c567618174c0b1dde173960cf23808f9fae4be"},
+    {file = "scipy-1.11.4-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:1b7c3dca977f30a739e0409fb001056484661cb2541a01aba0bb0029f7b68db8"},
+    {file = "scipy-1.11.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:00150c5eae7b610c32589dda259eacc7c4f1665aedf25d921907f4d08a951b1c"},
+    {file = "scipy-1.11.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:530f9ad26440e85766509dbf78edcfe13ffd0ab7fec2560ee5c36ff74d6269ff"},
+    {file = "scipy-1.11.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:5e347b14fe01003d3b78e196e84bd3f48ffe4c8a7b8a1afbcb8f5505cb710993"},
+    {file = "scipy-1.11.4-cp311-cp311-win_amd64.whl", hash = "sha256:acf8ed278cc03f5aff035e69cb511741e0418681d25fbbb86ca65429c4f4d9cd"},
+    {file = "scipy-1.11.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:028eccd22e654b3ea01ee63705681ee79933652b2d8f873e7949898dda6d11b6"},
+    {file = "scipy-1.11.4-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:2c6ff6ef9cc27f9b3db93a6f8b38f97387e6e0591600369a297a50a8e96e835d"},
+    {file = "scipy-1.11.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b030c6674b9230d37c5c60ab456e2cf12f6784596d15ce8da9365e70896effc4"},
+    {file = "scipy-1.11.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad669df80528aeca5f557712102538f4f37e503f0c5b9541655016dd0932ca79"},
+    {file = "scipy-1.11.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:ce7fff2e23ab2cc81ff452a9444c215c28e6305f396b2ba88343a567feec9660"},
+    {file = "scipy-1.11.4-cp312-cp312-win_amd64.whl", hash = "sha256:36750b7733d960d7994888f0d148d31ea3017ac15eef664194b4ef68d36a4a97"},
+    {file = "scipy-1.11.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6e619aba2df228a9b34718efb023966da781e89dd3d21637b27f2e54db0410d7"},
+    {file = "scipy-1.11.4-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:f3cd9e7b3c2c1ec26364856f9fbe78695fe631150f94cd1c22228456404cf1ec"},
+    {file = "scipy-1.11.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d10e45a6c50211fe256da61a11c34927c68f277e03138777bdebedd933712fea"},
+    {file = "scipy-1.11.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:91af76a68eeae0064887a48e25c4e616fa519fa0d38602eda7e0f97d65d57937"},
+    {file = "scipy-1.11.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:6df1468153a31cf55ed5ed39647279beb9cfb5d3f84369453b49e4b8502394fd"},
+    {file = "scipy-1.11.4-cp39-cp39-win_amd64.whl", hash = "sha256:ee410e6de8f88fd5cf6eadd73c135020bfbbbdfcd0f6162c36a7638a1ea8cc65"},
+    {file = "scipy-1.11.4.tar.gz", hash = "sha256:90a2b78e7f5733b9de748f589f09225013685f9b218275257f8a8168ededaeaa"},
+]
+
+[package.dependencies]
+numpy = ">=1.21.6,<1.28.0"
+
+[package.extras]
+dev = ["click", "cython-lint (>=0.12.2)", "doit (>=0.36.0)", "mypy", "pycodestyle", "pydevtool", "rich-click", "ruff", "types-psutil", "typing_extensions"]
+doc = ["jupytext", "matplotlib (>2)", "myst-nb", "numpydoc", "pooch", "pydata-sphinx-theme (==0.9.0)", "sphinx (!=4.1.0)", "sphinx-design (>=0.2.0)"]
+test = ["asv", "gmpy2", "mpmath", "pooch", "pytest", "pytest-cov", "pytest-timeout", "pytest-xdist", "scikit-umfpack", "threadpoolctl"]
+
 [[package]]
 name = "six"
 version = "1.16.0"
@@ -1957,6 +2046,17 @@ pure-eval = "*"
 [package.extras]
 tests = ["cython", "littleutils", "pygments", "pytest", "typeguard"]
 
+[[package]]
+name = "threadpoolctl"
+version = "3.2.0"
+description = "threadpoolctl"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "threadpoolctl-3.2.0-py3-none-any.whl", hash = "sha256:2b7818516e423bdaebb97c723f86a7c6b0a83d3f3b0970328d66f4d9104dc032"},
+    {file = "threadpoolctl-3.2.0.tar.gz", hash = "sha256:c96a0ba3bdddeaca37dc4cc7344aafad41cdb8c313f74fdfe387a867bba93355"},
+]
+
 [[package]]
 name = "tokenize-rt"
 version = "5.2.0"
@@ -2203,4 +2303,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "3300c77d6b6fab3faca403e2f3064a23e9f5ddcd34d63cad42d39b94b1ae5c2b"
+content-hash = "7e705f5c5f2a8bba630031c0ff6752972e7cddc8ec95f3fb05b5be2ad7962268"
diff --git a/pyproject.toml b/pyproject.toml
index 030a8f72d6f05151ddd88df4fb4bb4427d28950d..9f42964b5c6eda3aa5b2965358b0c895ad29f87d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,6 +19,7 @@ cohere = "^4.32"
 numpy = "^1.25.2"
 pinecone-text = "^0.7.0"
 colorlog = "^6.8.0"
+scikit-learn = "^1.3.2"
 
 
 [tool.poetry.group.dev.dependencies]
diff --git a/semantic_router/encoders/__init__.py b/semantic_router/encoders/__init__.py
index 30ad624a2104ec3c48b8684615b89e3a35d43be2..2769b31daf1485a2066797e65f60dde47ba6daee 100644
--- a/semantic_router/encoders/__init__.py
+++ b/semantic_router/encoders/__init__.py
@@ -2,5 +2,6 @@ from .base import BaseEncoder
 from .bm25 import BM25Encoder
 from .cohere import CohereEncoder
 from .openai import OpenAIEncoder
+from .tfidf import TfidfEncoder
 
-__all__ = ["BaseEncoder", "CohereEncoder", "OpenAIEncoder", "BM25Encoder"]
+__all__ = ["BaseEncoder", "CohereEncoder", "OpenAIEncoder", "BM25Encoder", "TfidfEncoder"]
diff --git a/semantic_router/encoders/tfidf.py b/semantic_router/encoders/tfidf.py
new file mode 100644
index 0000000000000000000000000000000000000000..41487a0621dc9c9441d82632487a1c5a0b99458a
--- /dev/null
+++ b/semantic_router/encoders/tfidf.py
@@ -0,0 +1,33 @@
+from typing import Any
+from sklearn.feature_extraction.text import TfidfVectorizer
+from semantic_router.encoders import BaseEncoder
+from semantic_router.schema import Route
+
+class TfidfEncoder(BaseEncoder):
+    vectorizer: TfidfVectorizer | None = None
+
+    def __init__(self, name: str = "tfidf"):
+        super().__init__(name=name)
+        self.vectorizer = TfidfVectorizer()
+
+    def __call__(self, docs: list[str]) -> list[list[float]]:
+        if self.vectorizer is None:
+            raise ValueError("Vectorizer is not initialized.")
+        if len(docs) == 0:
+            raise ValueError("No documents to encode.")
+
+        embeds = self.vectorizer.transform(docs).toarray()
+        return embeds.tolist()
+
+    def fit(self, routes: list[Route]):
+        if self.vectorizer is None:
+            raise ValueError("Vectorizer is not initialized.")
+        docs = self._get_all_utterances(routes)
+        self.vectorizer.fit(docs)
+
+    def _get_all_utterances(self, routes: list[Route]) -> list[str]:
+        utterances = []
+        for route in routes:
+            for utterance in route.utterances:
+                utterances.append(utterance)
+        return utterances
\ No newline at end of file
diff --git a/semantic_router/hybrid_layer.py b/semantic_router/hybrid_layer.py
index dec6336e917d1a21ead95720d50f5f2e582aaa81..a68472d31806bb784b8255fff234a9f9111de9c0 100644
--- a/semantic_router/hybrid_layer.py
+++ b/semantic_router/hybrid_layer.py
@@ -19,19 +19,22 @@ class HybridRouteLayer:
     score_threshold = 0.82
 
     def __init__(
-        self, encoder: BaseEncoder, routes: list[Route] = [], alpha: float = 0.3
+        self, dense_encoder: BaseEncoder, sparse_encoder: BaseEncoder, routes: list[Route] = [], alpha: float = 0.3
     ):
-        self.encoder = encoder
-        self.sparse_encoder = BM25Encoder()
+        self.dense_encoder = dense_encoder
+        self.sparse_encoder = sparse_encoder
         self.alpha = alpha
+        self.routes = routes
         # decide on default threshold based on encoder
-        if isinstance(encoder, OpenAIEncoder):
+        if isinstance(dense_encoder, OpenAIEncoder):
             self.score_threshold = 0.82
-        elif isinstance(encoder, CohereEncoder):
+        elif isinstance(dense_encoder, CohereEncoder):
             self.score_threshold = 0.3
         else:
             self.score_threshold = 0.82
         # if routes list has been passed, we initialize index now
+        if self.sparse_encoder.name == 'tfidf':
+            self.sparse_encoder.fit(routes)
         if routes:
             # initialize index now
             for route in tqdm(routes):
@@ -47,15 +50,18 @@
             return None
 
     def add(self, route: Route):
+        if self.sparse_encoder.name == 'tfidf':
+            self.sparse_encoder.fit(self.routes + [route])
+            self.sparse_index = None
+            for r in self.routes:
+                self.compute_and_store_sparse_embeddings(r)
+        self.routes.append(route)
         self._add_route(route=route)
 
     def _add_route(self, route: Route):
         # create embeddings
-        dense_embeds = np.array(self.encoder(route.utterances))  # * self.alpha
-        sparse_embeds = np.array(
-            self.sparse_encoder(route.utterances)
-        )  # * (1 - self.alpha)
-
+        dense_embeds = np.array(self.dense_encoder(route.utterances))  # * self.alpha
+        self.compute_and_store_sparse_embeddings(route)
         # create route array
         if self.categories is None:
             self.categories = np.array([route.name] * len(route.utterances))
@@ -71,6 +77,11 @@
             self.index = dense_embeds
         else:
             self.index = np.concatenate([self.index, dense_embeds])
+
+    def compute_and_store_sparse_embeddings(self, route: Route):
+        sparse_embeds = np.array(
+            self.sparse_encoder(route.utterances)
+        )  # * (1 - self.alpha)
         # create sparse utterance array
         if self.sparse_index is None:
             self.sparse_index = sparse_embeds
@@ -82,7 +93,7 @@
         retrieve the top_k most similar records.
         """
         # create dense query vector
-        xq_d = np.array(self.encoder([text]))
+        xq_d = np.array(self.dense_encoder([text]))
         xq_d = np.squeeze(xq_d)  # Reduce to 1d array.
         # create sparse query vector
         xq_s = np.array(self.sparse_encoder([text]))
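For reviewers, a minimal end-to-end sketch of the API this diff introduces (a dense and a sparse encoder passed to `HybridRouteLayer`), mirroring the updated notebook. The route utterances below are illustrative rather than copied from the notebook, the `Route` import path follows the one used in `tfidf.py`, and a real run needs a valid `COHERE_API_KEY`.

```python
import os
from getpass import getpass

from semantic_router.schema import Route
from semantic_router.encoders import CohereEncoder, TfidfEncoder
from semantic_router.hybrid_layer import HybridRouteLayer

# Cohere provides the dense embeddings, so an API key is required.
os.environ["COHERE_API_KEY"] = os.environ.get("COHERE_API_KEY") or getpass(
    "Enter Cohere API Key: "
)

# Illustrative routes; the notebook defines longer utterance lists.
politics = Route(
    name="politics",
    utterances=["don't you love politics?", "what do you think about the election?"],
)
chitchat = Route(
    name="chitchat",
    utterances=["how's the weather today?", "lovely weather today"],
)

# HybridRouteLayer fits the TF-IDF sparse encoder on the routes itself
# (see the `sparse_encoder.name == 'tfidf'` branch in hybrid_layer.py).
dl = HybridRouteLayer(
    dense_encoder=CohereEncoder(),
    sparse_encoder=TfidfEncoder(),
    routes=[politics, chitchat],
)

print(dl("don't you love politics?"))  # expected: "politics"
```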