diff --git a/docs/examples/hybrid-layer.ipynb b/docs/examples/hybrid-layer.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..589e13a1e4d3fc1d924c7bb851c19c56f9f30dd5 --- /dev/null +++ b/docs/examples/hybrid-layer.ipynb @@ -0,0 +1,420 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Semantic Router: Hybrid Layer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The Hybrid Layer in the Semantic Router library can improve decision-making performance, particularly for niche use cases that rely on specific terminology, such as finance or medicine. It does this by giving extra weight to the keywords contained in our utterances and user queries." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Getting Started" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We start by installing the library:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install -qU semantic-router==0.0.5" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next we define our `Decision` objects, each containing a set of example phrases (utterances) that should trigger that decision." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "os.environ[\"COHERE_API_KEY\"] = \"<<APIKEY>>\"" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/jamesbriggs/opt/anaconda3/envs/decision-layer/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "None of PyTorch, TensorFlow >= 2.0, or Flax have been found. 
Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n" + ] + } + ], + "source": [ + "from semantic_router.schema import Decision\n", + "\n", + "politics = Decision(\n", + " name=\"politics\",\n", + " utterances=[\n", + " \"isn't politics the best thing ever\",\n", + " \"why don't you tell me about your political opinions\",\n", + " \"don't you just love the president\", \"don't you just hate the president\",\n", + " \"they're going to destroy this country!\",\n", + " \"they will save the country!\",\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's define another for good measure:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "chitchat = Decision(\n", + " name=\"chitchat\",\n", + " utterances=[\n", + " \"how's the weather today?\",\n", + " \"how are things going?\",\n", + " \"lovely weather today\",\n", + " \"the weather is horrendous\",\n", + " \"let's go to the chippy\",\n", + " ],\n", + ")\n", + "\n", + "decisions = [politics, chitchat]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we initialize our embedding model:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from semantic_router.encoders import CohereEncoder\n", + "from getpass import getpass\n", + "\n", + "os.environ[\"COHERE_API_KEY\"] = os.environ[\"COHERE_API_KEY\"] or getpass(\n", + " \"Enter Cohere API Key: \"\n", + ")\n", + "\n", + "encoder = CohereEncoder()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we define the `HybridDecisionLayer`. When called, the hybrid decision layer will consume text (a query) and output the `Decision` category it belongs to. To initialize a `HybridDecisionLayer` we need our `encoder` model and a list of `decisions`."
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2067848296 1405\n", + "2212344012 2520\n", + "3313717465 206\n", + "3076736765 769\n", + "1778150425 4131\n", + "2067848296 1405\n", + "202708381 770\n", + "2212344012 2520\n", + "3374841595 2375\n", + "2067848296 1405\n", + "3508911095 2067\n", + "3454774732 not in encoder.idx_mapping\n", + "2379717389 3565\n", + "298452803 4356\n", + "1063320047 3369\n", + "4186256544 713\n", + "1846246980 858\n", + "3897916792 643\n", + "575623047 1476\n", + "3897916792 643\n" + ] + }, + { + "ename": "ValueError", + "evalue": "all the input arrays must have same number of dimensions, but the array at index 0 has 2 dimension(s) and the array at index 1 has 1 dimension(s)", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m/Users/jamesbriggs/Documents/projects/aurelio-labs/semantic-router/docs/examples/hybrid-layer.ipynb Cell 14\u001b[0m line \u001b[0;36m3\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/jamesbriggs/Documents/projects/aurelio-labs/semantic-router/docs/examples/hybrid-layer.ipynb#X16sZmlsZQ%3D%3D?line=0'>1</a>\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39msemantic_router\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mlayer\u001b[39;00m \u001b[39mimport\u001b[39;00m HybridDecisionLayer\n\u001b[0;32m----> <a href='vscode-notebook-cell:/Users/jamesbriggs/Documents/projects/aurelio-labs/semantic-router/docs/examples/hybrid-layer.ipynb#X16sZmlsZQ%3D%3D?line=2'>3</a>\u001b[0m dl \u001b[39m=\u001b[39m HybridDecisionLayer(encoder\u001b[39m=\u001b[39;49mencoder, decisions\u001b[39m=\u001b[39;49mdecisions)\n", + "File \u001b[0;32m~/Documents/projects/aurelio-labs/semantic-router/semantic_router/layer.py:137\u001b[0m, in \u001b[0;36mHybridDecisionLayer.__init__\u001b[0;34m(self, encoder, decisions, alpha)\u001b[0m\n\u001b[1;32m 134\u001b[0m \u001b[39mif\u001b[39;00m decisions:\n\u001b[1;32m 135\u001b[0m \u001b[39m# initialize index now\u001b[39;00m\n\u001b[1;32m 136\u001b[0m \u001b[39mfor\u001b[39;00m decision \u001b[39min\u001b[39;00m decisions:\n\u001b[0;32m--> 137\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_add_decision(decision\u001b[39m=\u001b[39;49mdecision)\n", + "File \u001b[0;32m~/Documents/projects/aurelio-labs/semantic-router/semantic_router/layer.py:156\u001b[0m, in \u001b[0;36mHybridDecisionLayer._add_decision\u001b[0;34m(self, decision)\u001b[0m\n\u001b[1;32m 154\u001b[0m sparse_embeds \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39msparse_encoder(decision\u001b[39m.\u001b[39mutterances)\n\u001b[1;32m 155\u001b[0m \u001b[39m# concatenate vectors to create hybrid vecs\u001b[39;00m\n\u001b[0;32m--> 156\u001b[0m embeds \u001b[39m=\u001b[39m np\u001b[39m.\u001b[39;49mconcatenate([\n\u001b[1;32m 157\u001b[0m dense_embeds, sparse_embeds\n\u001b[1;32m 158\u001b[0m ], axis\u001b[39m=\u001b[39;49m\u001b[39m1\u001b[39;49m)\n\u001b[1;32m 160\u001b[0m \u001b[39m# create decision array\u001b[39;00m\n\u001b[1;32m 161\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mcategories \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n", + "\u001b[0;31mValueError\u001b[0m: all the input arrays must have same number of dimensions, but the array at index 0 has 2 dimension(s) and the array at index 1 has 1 
dimension(s)" + ] + } + ], + "source": [ + "from semantic_router.layer import HybridDecisionLayer\n", + "\n", + "dl = HybridDecisionLayer(encoder=encoder, decisions=decisions)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dl(\"don't you love politics?\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if 3454774732 in encoder.idx_mapping:\n", + " print(\"yes\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from semantic_router.encoders import BM25Encoder\n", + "\n", + "encoder = BM25Encoder()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tests = [\"hello this is some text\", \"and more stuff\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "idx_list = encoder.model.get_params()['doc_freq']['indices']\n", + "idx_list" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sparse_dicts = encoder.model.encode_documents(tests)\n", + "sparse_dicts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "embeds = [0.0] * len(encoder.idx_mapping)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for output in sparse_dicts:\n", + " indices = output[\"indices\"]\n", + " values = output[\"values\"]\n", + " for idx, val in zip(indices, values):\n", + " position = encoder.idx_mapping[idx]\n", + " embeds[position] = val" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "encoder.idx_mapping" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "encoded_output = encoder(tests)\n", + "encoded_output" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "\n", + "sparse_vec = np.zeros(len(idx_list))\n", + "idx_position_dict = {idx: i for i, idx in enumerate(idx_list)}\n", + "\n", + "for output in encoded_output:\n", + " indices = output['indices']\n", + " values = output['values']\n", + " for idx, value in zip(indices, values):\n", + " if idx in idx_position_dict:\n", + " position = idx_position_dict[idx]\n", + " sparse_vec[position] = value" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sparse_vec" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sparse_vec.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we can test it:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dl(\"don't you love politics?\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dl(\"how's the weather today?\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ 
+ "Both are classified accurately, what if we send a query that is unrelated to our existing `Decision` objects?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dl(\"I'm interested in learning about llama 2\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this case, we return `None` because no matches were identified." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "decision-layer", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/poetry.lock b/poetry.lock index a2617c1c1dd5aaacf48d1efc23c5412f7af5f947..f47ac40ca97c59a3e5e3d18ea89f13bd816f3729 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. [[package]] name = "aiohttp" @@ -763,6 +763,17 @@ docs = ["Jinja2 (==2.11.3)", "MarkupSafe (==1.1.1)", "Pygments (==2.8.1)", "alab qa = ["flake8 (==5.0.4)", "mypy (==0.971)", "types-setuptools (==67.2.0.1)"] testing = ["Django", "attrs", "colorama", "docopt", "pytest (<7.0.0)"] +[[package]] +name = "joblib" +version = "1.3.2" +description = "Lightweight pipelining with Python functions" +optional = false +python-versions = ">=3.7" +files = [ + {file = "joblib-1.3.2-py3-none-any.whl", hash = "sha256:ef4331c65f239985f3f2220ecc87db222f08fd22097a3dd5698f693875f8cbb9"}, + {file = "joblib-1.3.2.tar.gz", hash = "sha256:92f865e621e17784e7955080b6d042489e3b8e294949cc44c6eac304f59772b1"}, +] + [[package]] name = "jupyter-client" version = "8.6.0" @@ -819,6 +830,50 @@ files = [ [package.dependencies] traitlets = "*" +[[package]] +name = "mmh3" +version = "3.1.0" +description = "Python wrapper for MurmurHash (MurmurHash3), a set of fast and robust hash functions." 
+optional = false +python-versions = "*" +files = [ + {file = "mmh3-3.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:16ee043b1bac040b4324b8baee39df9fdca480a560a6d74f2eef66a5009a234e"}, + {file = "mmh3-3.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:04ac865319e5b36148a4b6cdf27f8bda091c47c4ab7b355d7f353dfc2b8a3cce"}, + {file = "mmh3-3.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9e751f5433417a21c2060b0efa1afc67cfbe29977c867336148c8edb086fae70"}, + {file = "mmh3-3.1.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bdb863b89c1b34e3681d4a3b15d424734940eb8036f3457cb35ef34fb87a503c"}, + {file = "mmh3-3.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1230930fbf2faec4ddf5b76d0768ae73c102de173c301962bdd468177275adf9"}, + {file = "mmh3-3.1.0-cp310-cp310-win32.whl", hash = "sha256:b8ed7a2361718795a1b519a08d05f44947a20b27e202b53946561a00dde669c1"}, + {file = "mmh3-3.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:29e878e7467a000f34ab68c218ad7ad81312c0a94bc10df3c50a48bcad39dd83"}, + {file = "mmh3-3.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c271472325b70d64a4fbb1f2e964ca5b093ac10258e1390f8408890b065868fe"}, + {file = "mmh3-3.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:0109320f7e0e262123ff4f1acd06acfbc8b3bf19cc13d98c0bc369264430aaeb"}, + {file = "mmh3-3.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:524e29dfe66499695f9496edcfc96782d130aabd6ba12c50c72372163cc6f3ea"}, + {file = "mmh3-3.1.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:66bdb06a03074e65e614da1aa199b1d16c90608bec9d8fc3faa81d887ffe93cc"}, + {file = "mmh3-3.1.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2a4d471eb75df8320061ab3b8cbe11c970be9f116b01bc2222ebda9c0a777520"}, + {file = "mmh3-3.1.0-cp311-cp311-win32.whl", hash = "sha256:a886d9ce995a4bdfd7a600ddf61b9015cccbc73c50b898f8ff3c78af24384710"}, + {file = "mmh3-3.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:5edb5ac882c04aff8a2a18ae8b74a0c339ac9b83db9820d8456f518bb558e0d8"}, + {file = "mmh3-3.1.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:190fd10981fbd6c67e10ce3b56bcc021562c0df0fee2e2864347d64e65b1783a"}, + {file = "mmh3-3.1.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cd781b115cf649811cfde76368c33d2e553b6f88bb41131c314f30d8e65e9d24"}, + {file = "mmh3-3.1.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f48bb0a867077acc1f548591ad49506389f36d18f36dccd10becf071e5cbdda4"}, + {file = "mmh3-3.1.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7d0936a82438e340636a11b9a938378870fc1c7a139632dac09a9a9277351704"}, + {file = "mmh3-3.1.0-cp37-cp37m-win32.whl", hash = "sha256:d196cc035c2238493248522ae4e54c3cb790549b1564f6dea4d88dfe4b326313"}, + {file = "mmh3-3.1.0-cp37-cp37m-win_amd64.whl", hash = "sha256:731d37f089b6c212fab1beea24e673161146eb6c76baf9ac074a3424d1172d41"}, + {file = "mmh3-3.1.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9977fb81f8c66f4eee8439734a18dba7826fe78723d15ab53f42db977005be0f"}, + {file = "mmh3-3.1.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:bf4f3f20a8b8405c08b13bc9e4ac33bf55129b50b535cd07ce1891b7f96326ac"}, + {file = 
"mmh3-3.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:87cdbc6e70099ad92f17a28b4054ffb1938657e8fb7c1e4e03b194a1b4683fd6"}, + {file = "mmh3-3.1.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6dd81321d14f62aa3711f30533c85a74dc7596e0fee63c8eddd375bc92ab846c"}, + {file = "mmh3-3.1.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e6eba88e5c1a2778f3de00a9502e3c214ebb757337ece2a7d71e060d188ddfa"}, + {file = "mmh3-3.1.0-cp38-cp38-win32.whl", hash = "sha256:d91e696925f208d28f3bb7bdf29815524ce955248276af256519bd3538c411ce"}, + {file = "mmh3-3.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:cbc2917df568aeb86ec5aa863bfb20fa14e01039cbdce7650efbabc30960df49"}, + {file = "mmh3-3.1.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3b22832d565128be83d69f5d49243bb567840a954df377c9f5b26646a6eec39b"}, + {file = "mmh3-3.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ced92a0e285a9111413541c197b0c17d280cee96f7c564b258caf5de5ab8ee01"}, + {file = "mmh3-3.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f906833753b4ddcb690c2c1b74e77725868bc3a8b762b7a77737d08be89ae41d"}, + {file = "mmh3-3.1.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:72b5685832a7a87a55ebff481794bc410484d7bd4c5e80dae4d8ac50739138ef"}, + {file = "mmh3-3.1.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8d2aa4d422c7c088bbc5d367b45431268ebe6742a0a64eade93fab708e25757c"}, + {file = "mmh3-3.1.0-cp39-cp39-win32.whl", hash = "sha256:4459bec818f534dc8378568ad89ab310ff47cda3e00ab322edce48dd899bba32"}, + {file = "mmh3-3.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:03e04b3480e71828f48d17653451a3286555f0534942cb6ba93065b10ad5f9dc"}, + {file = "mmh3-3.1.0.tar.gz", hash = "sha256:9b0f2b2ab4a915333c9d1089572e290a021ebb5b900bb7f7114dccc03995d732"}, +] + [[package]] name = "multidict" version = "6.0.4" @@ -924,6 +979,65 @@ files = [ {file = "nest_asyncio-1.5.8.tar.gz", hash = "sha256:25aa2ca0d2a5b5531956b9e273b45cf664cae2b145101d73b86b199978d48fdb"}, ] +[[package]] +name = "nltk" +version = "3.8.1" +description = "Natural Language Toolkit" +optional = false +python-versions = ">=3.7" +files = [ + {file = "nltk-3.8.1-py3-none-any.whl", hash = "sha256:fd5c9109f976fa86bcadba8f91e47f5e9293bd034474752e92a520f81c93dda5"}, + {file = "nltk-3.8.1.zip", hash = "sha256:1834da3d0682cba4f2cede2f9aad6b0fafb6461ba451db0efb6f9c39798d64d3"}, +] + +[package.dependencies] +click = "*" +joblib = "*" +regex = ">=2021.8.3" +tqdm = "*" + +[package.extras] +all = ["matplotlib", "numpy", "pyparsing", "python-crfsuite", "requests", "scikit-learn", "scipy", "twython"] +corenlp = ["requests"] +machine-learning = ["numpy", "python-crfsuite", "scikit-learn", "scipy"] +plot = ["matplotlib"] +tgrep = ["pyparsing"] +twitter = ["twython"] + +[[package]] +name = "numpy" +version = "1.25.2" +description = "Fundamental package for array computing in Python" +optional = false +python-versions = ">=3.9" +files = [ + {file = "numpy-1.25.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:db3ccc4e37a6873045580d413fe79b68e47a681af8db2e046f1dacfa11f86eb3"}, + {file = "numpy-1.25.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:90319e4f002795ccfc9050110bbbaa16c944b1c37c0baeea43c5fb881693ae1f"}, + {file = "numpy-1.25.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash 
= "sha256:dfe4a913e29b418d096e696ddd422d8a5d13ffba4ea91f9f60440a3b759b0187"}, + {file = "numpy-1.25.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f08f2e037bba04e707eebf4bc934f1972a315c883a9e0ebfa8a7756eabf9e357"}, + {file = "numpy-1.25.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:bec1e7213c7cb00d67093247f8c4db156fd03075f49876957dca4711306d39c9"}, + {file = "numpy-1.25.2-cp310-cp310-win32.whl", hash = "sha256:7dc869c0c75988e1c693d0e2d5b26034644399dd929bc049db55395b1379e044"}, + {file = "numpy-1.25.2-cp310-cp310-win_amd64.whl", hash = "sha256:834b386f2b8210dca38c71a6e0f4fd6922f7d3fcff935dbe3a570945acb1b545"}, + {file = "numpy-1.25.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c5462d19336db4560041517dbb7759c21d181a67cb01b36ca109b2ae37d32418"}, + {file = "numpy-1.25.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c5652ea24d33585ea39eb6a6a15dac87a1206a692719ff45d53c5282e66d4a8f"}, + {file = "numpy-1.25.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0d60fbae8e0019865fc4784745814cff1c421df5afee233db6d88ab4f14655a2"}, + {file = "numpy-1.25.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:60e7f0f7f6d0eee8364b9a6304c2845b9c491ac706048c7e8cf47b83123b8dbf"}, + {file = "numpy-1.25.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:bb33d5a1cf360304754913a350edda36d5b8c5331a8237268c48f91253c3a364"}, + {file = "numpy-1.25.2-cp311-cp311-win32.whl", hash = "sha256:5883c06bb92f2e6c8181df7b39971a5fb436288db58b5a1c3967702d4278691d"}, + {file = "numpy-1.25.2-cp311-cp311-win_amd64.whl", hash = "sha256:5c97325a0ba6f9d041feb9390924614b60b99209a71a69c876f71052521d42a4"}, + {file = "numpy-1.25.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b79e513d7aac42ae918db3ad1341a015488530d0bb2a6abcbdd10a3a829ccfd3"}, + {file = "numpy-1.25.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:eb942bfb6f84df5ce05dbf4b46673ffed0d3da59f13635ea9b926af3deb76926"}, + {file = "numpy-1.25.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e0746410e73384e70d286f93abf2520035250aad8c5714240b0492a7302fdca"}, + {file = "numpy-1.25.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d7806500e4f5bdd04095e849265e55de20d8cc4b661b038957354327f6d9b295"}, + {file = "numpy-1.25.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8b77775f4b7df768967a7c8b3567e309f617dd5e99aeb886fa14dc1a0791141f"}, + {file = "numpy-1.25.2-cp39-cp39-win32.whl", hash = "sha256:2792d23d62ec51e50ce4d4b7d73de8f67a2fd3ea710dcbc8563a51a03fb07b01"}, + {file = "numpy-1.25.2-cp39-cp39-win_amd64.whl", hash = "sha256:76b4115d42a7dfc5d485d358728cdd8719be33cc5ec6ec08632a5d6fca2ed380"}, + {file = "numpy-1.25.2-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:1a1329e26f46230bf77b02cc19e900db9b52f398d6722ca853349a782d4cff55"}, + {file = "numpy-1.25.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4c3abc71e8b6edba80a01a52e66d83c5d14433cbcd26a40c329ec7ed09f37901"}, + {file = "numpy-1.25.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:1b9735c27cea5d995496f46a8b1cd7b408b3f34b6d50459d9ac8fe3a20cc17bf"}, + {file = "numpy-1.25.2.tar.gz", hash = "sha256:fd608e19c8d7c55021dffd43bfe5492fab8cc105cc8986f813f8c3c048b38760"}, +] + [[package]] name = "openai" version = "0.28.1" @@ -997,6 +1111,28 @@ files = [ [package.dependencies] ptyprocess = ">=0.5" +[[package]] +name = "pinecone-text" +version = "0.7.0" +description = "Text utilities library by Pinecone.io" +optional = false 
+python-versions = ">=3.8,<4.0" +files = [ + {file = "pinecone_text-0.7.0-py3-none-any.whl", hash = "sha256:d20c7adc2259965a30fcbcf93a5eeb3f8d12babc9ea65ba858f1a6a5973d0737"}, + {file = "pinecone_text-0.7.0.tar.gz", hash = "sha256:8bda3c7337511dfb61da541299024ee73dbbed5d94e2af558a12357591b46174"}, +] + +[package.dependencies] +mmh3 = ">=3.1.0,<4.0.0" +nltk = ">=3.6.5,<4.0.0" +numpy = ">=1.21.5,<=1.25.2" +wget = ">=3.2,<4.0" + +[package.extras] +dense = ["openai (>=1.2.3,<2.0.0)", "sentence-transformers (>=2.0.0)", "torch (>=1.13.1)", "transformers (>=4.26.1)"] +openai = ["openai (>=1.2.3,<2.0.0)"] +splade = ["sentence-transformers (>=2.0.0)", "torch (>=1.13.1)", "transformers (>=4.26.1)"] + [[package]] name = "platformdirs" version = "4.0.0" @@ -1298,6 +1434,103 @@ files = [ [package.dependencies] cffi = {version = "*", markers = "implementation_name == \"pypy\""} +[[package]] +name = "regex" +version = "2023.10.3" +description = "Alternative regular expression module, to replace re." +optional = false +python-versions = ">=3.7" +files = [ + {file = "regex-2023.10.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:4c34d4f73ea738223a094d8e0ffd6d2c1a1b4c175da34d6b0de3d8d69bee6bcc"}, + {file = "regex-2023.10.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a8f4e49fc3ce020f65411432183e6775f24e02dff617281094ba6ab079ef0915"}, + {file = "regex-2023.10.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4cd1bccf99d3ef1ab6ba835308ad85be040e6a11b0977ef7ea8c8005f01a3c29"}, + {file = "regex-2023.10.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:81dce2ddc9f6e8f543d94b05d56e70d03a0774d32f6cca53e978dc01e4fc75b8"}, + {file = "regex-2023.10.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9c6b4d23c04831e3ab61717a707a5d763b300213db49ca680edf8bf13ab5d91b"}, + {file = "regex-2023.10.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c15ad0aee158a15e17e0495e1e18741573d04eb6da06d8b84af726cfc1ed02ee"}, + {file = "regex-2023.10.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6239d4e2e0b52c8bd38c51b760cd870069f0bdf99700a62cd509d7a031749a55"}, + {file = "regex-2023.10.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:4a8bf76e3182797c6b1afa5b822d1d5802ff30284abe4599e1247be4fd6b03be"}, + {file = "regex-2023.10.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:d9c727bbcf0065cbb20f39d2b4f932f8fa1631c3e01fcedc979bd4f51fe051c5"}, + {file = "regex-2023.10.3-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:3ccf2716add72f80714b9a63899b67fa711b654be3fcdd34fa391d2d274ce767"}, + {file = "regex-2023.10.3-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:107ac60d1bfdc3edb53be75e2a52aff7481b92817cfdddd9b4519ccf0e54a6ff"}, + {file = "regex-2023.10.3-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:00ba3c9818e33f1fa974693fb55d24cdc8ebafcb2e4207680669d8f8d7cca79a"}, + {file = "regex-2023.10.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f0a47efb1dbef13af9c9a54a94a0b814902e547b7f21acb29434504d18f36e3a"}, + {file = "regex-2023.10.3-cp310-cp310-win32.whl", hash = "sha256:36362386b813fa6c9146da6149a001b7bd063dabc4d49522a1f7aa65b725c7ec"}, + {file = "regex-2023.10.3-cp310-cp310-win_amd64.whl", hash = "sha256:c65a3b5330b54103e7d21cac3f6bf3900d46f6d50138d73343d9e5b2900b2353"}, + {file = "regex-2023.10.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:90a79bce019c442604662d17bf69df99090e24cdc6ad95b18b6725c2988a490e"}, + {file = "regex-2023.10.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c7964c2183c3e6cce3f497e3a9f49d182e969f2dc3aeeadfa18945ff7bdd7051"}, + {file = "regex-2023.10.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4ef80829117a8061f974b2fda8ec799717242353bff55f8a29411794d635d964"}, + {file = "regex-2023.10.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5addc9d0209a9afca5fc070f93b726bf7003bd63a427f65ef797a931782e7edc"}, + {file = "regex-2023.10.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c148bec483cc4b421562b4bcedb8e28a3b84fcc8f0aa4418e10898f3c2c0eb9b"}, + {file = "regex-2023.10.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8d1f21af4c1539051049796a0f50aa342f9a27cde57318f2fc41ed50b0dbc4ac"}, + {file = "regex-2023.10.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0b9ac09853b2a3e0d0082104036579809679e7715671cfbf89d83c1cb2a30f58"}, + {file = "regex-2023.10.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ebedc192abbc7fd13c5ee800e83a6df252bec691eb2c4bedc9f8b2e2903f5e2a"}, + {file = "regex-2023.10.3-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:d8a993c0a0ffd5f2d3bda23d0cd75e7086736f8f8268de8a82fbc4bd0ac6791e"}, + {file = "regex-2023.10.3-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:be6b7b8d42d3090b6c80793524fa66c57ad7ee3fe9722b258aec6d0672543fd0"}, + {file = "regex-2023.10.3-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:4023e2efc35a30e66e938de5aef42b520c20e7eda7bb5fb12c35e5d09a4c43f6"}, + {file = "regex-2023.10.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:0d47840dc05e0ba04fe2e26f15126de7c755496d5a8aae4a08bda4dd8d646c54"}, + {file = "regex-2023.10.3-cp311-cp311-win32.whl", hash = "sha256:9145f092b5d1977ec8c0ab46e7b3381b2fd069957b9862a43bd383e5c01d18c2"}, + {file = "regex-2023.10.3-cp311-cp311-win_amd64.whl", hash = "sha256:b6104f9a46bd8743e4f738afef69b153c4b8b592d35ae46db07fc28ae3d5fb7c"}, + {file = "regex-2023.10.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:bff507ae210371d4b1fe316d03433ac099f184d570a1a611e541923f78f05037"}, + {file = "regex-2023.10.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:be5e22bbb67924dea15039c3282fa4cc6cdfbe0cbbd1c0515f9223186fc2ec5f"}, + {file = "regex-2023.10.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4a992f702c9be9c72fa46f01ca6e18d131906a7180950958f766c2aa294d4b41"}, + {file = "regex-2023.10.3-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7434a61b158be563c1362d9071358f8ab91b8d928728cd2882af060481244c9e"}, + {file = "regex-2023.10.3-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c2169b2dcabf4e608416f7f9468737583ce5f0a6e8677c4efbf795ce81109d7c"}, + {file = "regex-2023.10.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9e908ef5889cda4de038892b9accc36d33d72fb3e12c747e2799a0e806ec841"}, + {file = "regex-2023.10.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:12bd4bc2c632742c7ce20db48e0d99afdc05e03f0b4c1af90542e05b809a03d9"}, + {file = "regex-2023.10.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:bc72c231f5449d86d6c7d9cc7cd819b6eb30134bb770b8cfdc0765e48ef9c420"}, + {file = "regex-2023.10.3-cp312-cp312-musllinux_1_1_i686.whl", hash = 
"sha256:bce8814b076f0ce5766dc87d5a056b0e9437b8e0cd351b9a6c4e1134a7dfbda9"}, + {file = "regex-2023.10.3-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:ba7cd6dc4d585ea544c1412019921570ebd8a597fabf475acc4528210d7c4a6f"}, + {file = "regex-2023.10.3-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:b0c7d2f698e83f15228ba41c135501cfe7d5740181d5903e250e47f617eb4292"}, + {file = "regex-2023.10.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:5a8f91c64f390ecee09ff793319f30a0f32492e99f5dc1c72bc361f23ccd0a9a"}, + {file = "regex-2023.10.3-cp312-cp312-win32.whl", hash = "sha256:ad08a69728ff3c79866d729b095872afe1e0557251da4abb2c5faff15a91d19a"}, + {file = "regex-2023.10.3-cp312-cp312-win_amd64.whl", hash = "sha256:39cdf8d141d6d44e8d5a12a8569d5a227f645c87df4f92179bd06e2e2705e76b"}, + {file = "regex-2023.10.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:4a3ee019a9befe84fa3e917a2dd378807e423d013377a884c1970a3c2792d293"}, + {file = "regex-2023.10.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:76066d7ff61ba6bf3cb5efe2428fc82aac91802844c022d849a1f0f53820502d"}, + {file = "regex-2023.10.3-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bfe50b61bab1b1ec260fa7cd91106fa9fece57e6beba05630afe27c71259c59b"}, + {file = "regex-2023.10.3-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9fd88f373cb71e6b59b7fa597e47e518282455c2734fd4306a05ca219a1991b0"}, + {file = "regex-2023.10.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b3ab05a182c7937fb374f7e946f04fb23a0c0699c0450e9fb02ef567412d2fa3"}, + {file = "regex-2023.10.3-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dac37cf08fcf2094159922edc7a2784cfcc5c70f8354469f79ed085f0328ebdf"}, + {file = "regex-2023.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:e54ddd0bb8fb626aa1f9ba7b36629564544954fff9669b15da3610c22b9a0991"}, + {file = "regex-2023.10.3-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:3367007ad1951fde612bf65b0dffc8fd681a4ab98ac86957d16491400d661302"}, + {file = "regex-2023.10.3-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:16f8740eb6dbacc7113e3097b0a36065a02e37b47c936b551805d40340fb9971"}, + {file = "regex-2023.10.3-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:f4f2ca6df64cbdd27f27b34f35adb640b5d2d77264228554e68deda54456eb11"}, + {file = "regex-2023.10.3-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:39807cbcbe406efca2a233884e169d056c35aa7e9f343d4e78665246a332f597"}, + {file = "regex-2023.10.3-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:7eece6fbd3eae4a92d7c748ae825cbc1ee41a89bb1c3db05b5578ed3cfcfd7cb"}, + {file = "regex-2023.10.3-cp37-cp37m-win32.whl", hash = "sha256:ce615c92d90df8373d9e13acddd154152645c0dc060871abf6bd43809673d20a"}, + {file = "regex-2023.10.3-cp37-cp37m-win_amd64.whl", hash = "sha256:0f649fa32fe734c4abdfd4edbb8381c74abf5f34bc0b3271ce687b23729299ed"}, + {file = "regex-2023.10.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9b98b7681a9437262947f41c7fac567c7e1f6eddd94b0483596d320092004533"}, + {file = "regex-2023.10.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:91dc1d531f80c862441d7b66c4505cd6ea9d312f01fb2f4654f40c6fdf5cc37a"}, + {file = "regex-2023.10.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:82fcc1f1cc3ff1ab8a57ba619b149b907072e750815c5ba63e7aa2e1163384a4"}, + {file = 
"regex-2023.10.3-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7979b834ec7a33aafae34a90aad9f914c41fd6eaa8474e66953f3f6f7cbd4368"}, + {file = "regex-2023.10.3-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ef71561f82a89af6cfcbee47f0fabfdb6e63788a9258e913955d89fdd96902ab"}, + {file = "regex-2023.10.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd829712de97753367153ed84f2de752b86cd1f7a88b55a3a775eb52eafe8a94"}, + {file = "regex-2023.10.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:00e871d83a45eee2f8688d7e6849609c2ca2a04a6d48fba3dff4deef35d14f07"}, + {file = "regex-2023.10.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:706e7b739fdd17cb89e1fbf712d9dc21311fc2333f6d435eac2d4ee81985098c"}, + {file = "regex-2023.10.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:cc3f1c053b73f20c7ad88b0d1d23be7e7b3901229ce89f5000a8399746a6e039"}, + {file = "regex-2023.10.3-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:6f85739e80d13644b981a88f529d79c5bdf646b460ba190bffcaf6d57b2a9863"}, + {file = "regex-2023.10.3-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:741ba2f511cc9626b7561a440f87d658aabb3d6b744a86a3c025f866b4d19e7f"}, + {file = "regex-2023.10.3-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:e77c90ab5997e85901da85131fd36acd0ed2221368199b65f0d11bca44549711"}, + {file = "regex-2023.10.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:979c24cbefaf2420c4e377ecd1f165ea08cc3d1fbb44bdc51bccbbf7c66a2cb4"}, + {file = "regex-2023.10.3-cp38-cp38-win32.whl", hash = "sha256:58837f9d221744d4c92d2cf7201c6acd19623b50c643b56992cbd2b745485d3d"}, + {file = "regex-2023.10.3-cp38-cp38-win_amd64.whl", hash = "sha256:c55853684fe08d4897c37dfc5faeff70607a5f1806c8be148f1695be4a63414b"}, + {file = "regex-2023.10.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2c54e23836650bdf2c18222c87f6f840d4943944146ca479858404fedeb9f9af"}, + {file = "regex-2023.10.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:69c0771ca5653c7d4b65203cbfc5e66db9375f1078689459fe196fe08b7b4930"}, + {file = "regex-2023.10.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6ac965a998e1388e6ff2e9781f499ad1eaa41e962a40d11c7823c9952c77123e"}, + {file = "regex-2023.10.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1c0e8fae5b27caa34177bdfa5a960c46ff2f78ee2d45c6db15ae3f64ecadde14"}, + {file = "regex-2023.10.3-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6c56c3d47da04f921b73ff9415fbaa939f684d47293f071aa9cbb13c94afc17d"}, + {file = "regex-2023.10.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ef1e014eed78ab650bef9a6a9cbe50b052c0aebe553fb2881e0453717573f52"}, + {file = "regex-2023.10.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d29338556a59423d9ff7b6eb0cb89ead2b0875e08fe522f3e068b955c3e7b59b"}, + {file = "regex-2023.10.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:9c6d0ced3c06d0f183b73d3c5920727268d2201aa0fe6d55c60d68c792ff3588"}, + {file = "regex-2023.10.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:994645a46c6a740ee8ce8df7911d4aee458d9b1bc5639bc968226763d07f00fa"}, + {file = "regex-2023.10.3-cp39-cp39-musllinux_1_1_i686.whl", hash = 
"sha256:66e2fe786ef28da2b28e222c89502b2af984858091675044d93cb50e6f46d7af"}, + {file = "regex-2023.10.3-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:11175910f62b2b8c055f2b089e0fedd694fe2be3941b3e2633653bc51064c528"}, + {file = "regex-2023.10.3-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:06e9abc0e4c9ab4779c74ad99c3fc10d3967d03114449acc2c2762ad4472b8ca"}, + {file = "regex-2023.10.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:fb02e4257376ae25c6dd95a5aec377f9b18c09be6ebdefa7ad209b9137b73d48"}, + {file = "regex-2023.10.3-cp39-cp39-win32.whl", hash = "sha256:3b2c3502603fab52d7619b882c25a6850b766ebd1b18de3df23b2f939360e1bd"}, + {file = "regex-2023.10.3-cp39-cp39-win_amd64.whl", hash = "sha256:adbccd17dcaff65704c856bd29951c58a1bd4b2b0f8ad6b826dbd543fe740988"}, + {file = "regex-2023.10.3.tar.gz", hash = "sha256:3fef4f844d2290ee0ba57addcec17eec9e3df73f10a2748485dfd6a3a188cc0f"}, +] + [[package]] name = "requests" version = "2.31.0" @@ -1479,6 +1712,16 @@ files = [ {file = "wcwidth-0.2.10.tar.gz", hash = "sha256:390c7454101092a6a5e43baad8f83de615463af459201709556b6e4b1c861f97"}, ] +[[package]] +name = "wget" +version = "3.2" +description = "pure python download utility" +optional = false +python-versions = "*" +files = [ + {file = "wget-3.2.zip", hash = "sha256:35e630eca2aa50ce998b9b1a127bb26b30dfee573702782aa982f875e3f16061"}, +] + [[package]] name = "yarl" version = "1.9.2" @@ -1584,4 +1827,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "7955e07ea098c2e8b29421733eb5ec6c06cbbc5bf64bd88451baa1a42c71e6b2" +content-hash = "4953e126f4e42186a812ca444ae887f723a5b76943234b7897df3bd8563944a3" diff --git a/pyproject.toml b/pyproject.toml index f45792aa7ef63d8fb74d1b32ce0c9b0aeb7da0c8..f2b4ba2e2e621f42e4d82909c623a86d0013024e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,7 @@ python = "^3.10" pydantic = "^1.8.2" openai = "^0.28.1" cohere = "^4.32" +pinecone-text = "^0.7.0" [tool.poetry.group.dev.dependencies] diff --git a/semantic_router/__init__.py b/semantic_router/__init__.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..ac1e314ef1d854ad9e240c22354bf7b980428d01 100644 --- a/semantic_router/__init__.py +++ b/semantic_router/__init__.py @@ -0,0 +1,3 @@ +from .layer import DecisionLayer, HybridDecisionLayer + +__all__ = ["DecisionLayer", "HybridDecisionLayer"] \ No newline at end of file diff --git a/semantic_router/encoders/__init__.py b/semantic_router/encoders/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0c86ce7c47290d1490565837bb8a98a5631941f2 --- /dev/null +++ b/semantic_router/encoders/__init__.py @@ -0,0 +1,6 @@ +from .base import BaseEncoder +from .cohere import CohereEncoder +from .openai import OpenAIEncoder +from .bm25 import BM25Encoder + +__all__ = ["BaseEncoder", "CohereEncoder", "OpenAIEncoder", "BM25Encoder"] diff --git a/semantic_router/retrievers/base.py b/semantic_router/encoders/base.py similarity index 88% rename from semantic_router/retrievers/base.py rename to semantic_router/encoders/base.py index 4274e074cd26c52318ba287d12e4ca19fe36aac1..b6de1f89b80f42ea42d3abe76bbf61441936aa6e 100644 --- a/semantic_router/retrievers/base.py +++ b/semantic_router/encoders/base.py @@ -1,7 +1,7 @@ from pydantic import BaseModel -class BaseRetriever(BaseModel): +class BaseEncoder(BaseModel): name: str class Config: diff --git a/semantic_router/encoders/bm25.py b/semantic_router/encoders/bm25.py new file mode 100644 
index 0000000000000000000000000000000000000000..344f0820ae354645331dc6512456ffe65c60113b --- /dev/null +++ b/semantic_router/encoders/bm25.py @@ -0,0 +1,40 @@ +from pinecone_text.sparse import BM25Encoder as encoder + +from semantic_router.encoders import BaseEncoder + + +class BM25Encoder(BaseEncoder): + model: encoder | None = None + idx_mapping: dict[int, int] | None = None + + def __init__(self, name: str = "bm25"): + super().__init__(name=name) + # initialize BM25 encoder with default params (trained on MSMarco) + self.model = encoder.default() + self.idx_mapping = { + idx: i for i, idx in enumerate(self.model.get_params()["doc_freq"]["indices"]) + } + + def __call__(self, docs: list[str]) -> list[list[float]]: + if len(docs) == 1: + sparse_dicts = self.model.encode_query(docs[0]) + elif len(docs) > 1: + sparse_dicts = self.model.encode_documents(docs) + else: + raise ValueError("No documents to encode.") + # convert sparse dict to sparse vector + embeds = [0.0] * len(self.idx_mapping) + for output in sparse_dicts: + indices = output["indices"] + values = output["values"] + for idx, val in zip(indices, values): + if idx in self.idx_mapping: + print(idx, self.idx_mapping[idx]) + position = self.idx_mapping[idx] + embeds[position] = val + else: + print(idx, "not in encoder.idx_mapping") + return embeds + + def fit(self, docs: list[str]): + self.model.fit(docs) diff --git a/semantic_router/retrievers/cohere.py b/semantic_router/encoders/cohere.py similarity index 88% rename from semantic_router/retrievers/cohere.py rename to semantic_router/encoders/cohere.py index 187cb7442df0a14db2e64e087016614c9439cf00..fd20fa7586da1a4370d541e8659b826b93db9fac 100644 --- a/semantic_router/retrievers/cohere.py +++ b/semantic_router/encoders/cohere.py @@ -2,10 +2,10 @@ import os import cohere -from semantic_router.retrievers import BaseRetriever +from semantic_router.encoders import BaseEncoder -class CohereRetriever(BaseRetriever): +class CohereEncoder(BaseEncoder): client: cohere.Client | None def __init__( diff --git a/semantic_router/retrievers/huggingface.py b/semantic_router/encoders/huggingface.py similarity index 61% rename from semantic_router/retrievers/huggingface.py rename to semantic_router/encoders/huggingface.py index 9c8f2f05b403c4023446dc520c4816333edc59cf..52ddecd2ed02ea5d6cd187aa6f52bf1277785585 100644 --- a/semantic_router/retrievers/huggingface.py +++ b/semantic_router/encoders/huggingface.py @@ -1,7 +1,7 @@ -from semantic_router.retrievers import BaseRetriever +from semantic_router.encoders import BaseEncoder -class HuggingFaceRetriever(BaseRetriever): +class HuggingFaceEncoder(BaseEncoder): def __init__(self, name: str): self.name = name diff --git a/semantic_router/retrievers/openai.py b/semantic_router/encoders/openai.py similarity index 92% rename from semantic_router/retrievers/openai.py rename to semantic_router/encoders/openai.py index 2dbfd880904a1b196c2ab6e45fd44fdfd405feb3..5700c8003b05bcb7a3be237b7ac655d9b13d56d4 100644 --- a/semantic_router/retrievers/openai.py +++ b/semantic_router/encoders/openai.py @@ -4,10 +4,10 @@ from time import sleep import openai from openai.error import RateLimitError -from semantic_router.retrievers import BaseRetriever +from semantic_router.encoders import BaseEncoder -class OpenAIRetriever(BaseRetriever): +class OpenAIEncoder(BaseEncoder): def __init__(self, name: str, openai_api_key: str | None = None): super().__init__(name=name) openai.api_key = openai_api_key or os.getenv("OPENAI_API_KEY") diff --git a/semantic_router/encoders/tfidf.py 
b/semantic_router/encoders/tfidf.py new file mode 100644 index 0000000000000000000000000000000000000000..5dc7f34d95b0a7a7f418801000330b3159e0692e --- /dev/null +++ b/semantic_router/encoders/tfidf.py @@ -0,0 +1,37 @@ +from functools import partial + +from sklearn.feature_extraction.text import TfidfVectorizer + +from semantic_router.encoders import BaseEncoder + + +class TfidfEncoder(BaseEncoder): + model: encoder | None = None + + def __init__(self, name: str = "bm25"): + super().__init__(name=name) + # initialize BM25 encoder with default params (trained on MSMarco) + self.model = encoder.default() + self.idx_mapping = { + idx: i for i, idx in enumerate(self.model.get_params()["doc_freq"]["indices"]) + } + + def __call__(self, docs: list[str]) -> list[list[float]]: + if len(docs) == 1: + sparse_dicts = self.model.encode_query(docs[0]) + elif len(docs) > 1: + sparse_dicts = self.model.encode_documents(docs) + else: + raise ValueError("No documents to encode.") + # convert sparse dict to sparse vector + embeds = [0.0] * len(self.idx_mapping) + for output in sparse_dicts: + indices = output["indices"] + values = output["values"] + for idx, val in zip(indices, values): + position = self.idx_mapping[idx] + embeds[position] = val + return embeds + + def fit(self, docs: list[str]): + self.model.fit(docs) diff --git a/semantic_router/layer.py b/semantic_router/layer.py index ad27a4c168f5dc4fe8869343b6707b8eaa3cdfcd..e8b71576c4accd5936621f6da2889411caf2fe97 100644 --- a/semantic_router/layer.py +++ b/semantic_router/layer.py @@ -1,11 +1,11 @@ import numpy as np from numpy.linalg import norm -from semantic_router.retrievers import ( - BaseRetriever, - CohereRetriever, - OpenAIRetriever, - BM25Retriever +from semantic_router.encoders import ( + BaseEncoder, + CohereEncoder, + OpenAIEncoder, + BM25Encoder ) from semantic_router.schema import Decision @@ -15,12 +15,12 @@ class DecisionLayer: categories = None score_threshold = 0.82 - def __init__(self, encoder: BaseRetriever, decisions: list[Decision] = []): + def __init__(self, encoder: BaseEncoder, decisions: list[Decision] = []): self.encoder = encoder # decide on default threshold based on encoder - if isinstance(encoder, OpenAIRetriever): + if isinstance(encoder, OpenAIEncoder): self.score_threshold = 0.82 - elif isinstance(encoder, CohereRetriever): + elif isinstance(encoder, CohereEncoder): self.score_threshold = 0.3 else: self.score_threshold = 0.82 @@ -116,17 +116,17 @@ class HybridDecisionLayer: def __init__( self, - encoder: BaseRetriever, + encoder: BaseEncoder, decisions: list[Decision] = [], alpha: float = 0.3 ): self.encoder = encoder - self.sparse_encoder = BM25Retriever() + self.sparse_encoder = BM25Encoder() self.alpha = alpha # decide on default threshold based on encoder - if isinstance(encoder, OpenAIRetriever): + if isinstance(encoder, OpenAIEncoder): self.score_threshold = 0.82 - elif isinstance(encoder, CohereRetriever): + elif isinstance(encoder, CohereEncoder): self.score_threshold = 0.3 else: self.score_threshold = 0.82 @@ -150,8 +150,8 @@ class HybridDecisionLayer: def _add_decision(self, decision: Decision): # create embeddings - dense_embeds = self.encoder(decision.utterances) - sparse_embeds = self.sparse_encoder(decision.utterances) + dense_embeds = self.encoder(decision.utterances) * self.alpha + sparse_embeds = self.sparse_encoder(decision.utterances) * (1 - self.alpha) # concatenate vectors to create hybrid vecs embeds = np.concatenate([ dense_embeds, sparse_embeds @@ -168,12 +168,20 @@ class HybridDecisionLayer: 
self.utterances, np.array(decision.utterances) ]) - # create utterance array (the index) + # create utterance array (the dense index) if self.index is None: - self.index = np.array(embeds) + self.index = np.array(dense_embeds) else: - embed_arr = np.array(embeds) + embed_arr = np.array(dense_embeds) self.index = np.concatenate([self.index, embed_arr]) + # create sparse utterance array + if self.sparse_index is None: + self.sparse_index = np.array(sparse_embeds) + else: + sparse_embed_arr = np.array(sparse_embeds) + self.sparse_index = np.concatenate([ + self.sparse_index, sparse_embed_arr + ]) def _query(self, text: str, top_k: int = 5): """Given some text, encodes and searches the index vector space to diff --git a/semantic_router/rankers/cohere.py b/semantic_router/rankers/cohere.py index b703a960509208fd65df62d21867b5b3b1ee0ece..e79608b817708a324fe925fed4b3bbb5972e93cb 100644 --- a/semantic_router/rankers/cohere.py +++ b/semantic_router/rankers/cohere.py @@ -2,10 +2,10 @@ import os import cohere -from semantic_router.rankers import BaseReranker +from semantic_router.rankers import BaseRanker -class CohereRanker(BaseReranker): +class CohereRanker(BaseRanker): client: cohere.Client | None def __init__( diff --git a/semantic_router/retrievers/__init__.py b/semantic_router/retrievers/__init__.py deleted file mode 100644 index 0fcaa6d2e568ebbc5413726b3a638a75462b781d..0000000000000000000000000000000000000000 --- a/semantic_router/retrievers/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from .base import BaseRetriever -from .cohere import CohereRetriever -from .openai import OpenAIRetriever - -__all__ = ["BaseRetriever", "CohereRetriever", "OpenAIRetriever"] diff --git a/semantic_router/retrievers/bm25.py b/semantic_router/retrievers/bm25.py deleted file mode 100644 index 2a68a3ff79430cd7f570b17463bfc103a6dad26e..0000000000000000000000000000000000000000 --- a/semantic_router/retrievers/bm25.py +++ /dev/null @@ -1,21 +0,0 @@ -import os - -from pinecone_text import BM25Encoder - -from semantic_router.retrievers import BaseRetriever - - -class BM25Retriever(BaseRetriever): - def __init__(self, name: str = "bm25"): - super().__init__(name=name) - self.model = BM25Encoder() - - def __call__(self, docs: list[str]) -> list[list[float]]: - if self.params is None: - raise ValueError("BM25 model not trained, must call `.fit` first.") - embeds = self.model.encode_doocuments(docs) - return embeds.embeddings - - def fit(self, docs: list[str]): - params = self.model.fit(docs) - self.model.set_params(**params) \ No newline at end of file diff --git a/semantic_router/schema.py b/semantic_router/schema.py index cb1288fb1171501ea475f39dd6c812101532b736..37a43dd41007634706caedc1b5c5dadd6d734bba 100644 --- a/semantic_router/schema.py +++ b/semantic_router/schema.py @@ -3,10 +3,10 @@ from enum import Enum from pydantic import BaseModel from pydantic.dataclasses import dataclass -from semantic_router.retrievers import ( - BaseRetriever, - CohereRetriever, - OpenAIRetriever, +from semantic_router.encoders import ( + BaseEncoder, + CohereEncoder, + OpenAIEncoder, ) @@ -16,27 +16,27 @@ class Decision(BaseModel): description: str | None = None -class RetrieverType(Enum): +class EncoderType(Enum): HUGGINGFACE = "huggingface" OPENAI = "openai" COHERE = "cohere" @dataclass -class Retriever: - type: RetrieverType +class Encoder: + type: EncoderType name: str - model: BaseRetriever + model: BaseEncoder def __init__(self, type: str, name: str): - self.type = RetrieverType(type) + self.type = EncoderType(type) self.name = name 
- if self.type == RetrieverType.HUGGINGFACE: + if self.type == EncoderType.HUGGINGFACE: raise NotImplementedError - elif self.type == RetrieverType.OPENAI: - self.model = OpenAIRetriever(name) - elif self.type == RetrieverType.COHERE: - self.model = CohereRetriever(name) + elif self.type == EncoderType.OPENAI: + self.model = OpenAIEncoder(name) + elif self.type == EncoderType.COHERE: + self.model = CohereEncoder(name) def __call__(self, texts: list[str]) -> list[float]: return self.model(texts)
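A note on the `ValueError` captured in the notebook above: `HybridDecisionLayer._add_decision` concatenates dense and sparse embeddings with `np.concatenate([...], axis=1)`, but the new `BM25Encoder.__call__` fills a single flat vector for the whole batch of utterances, so the sparse side arrives as a 1-D array while the dense side is 2-D. Below is a minimal sketch of a per-utterance encoding that would line the two up; it reuses the `pinecone_text` BM25 model and the `idx_mapping` already introduced in this diff, and the helper name `encode_batch` is purely illustrative.

```python
import numpy as np
from pinecone_text.sparse import BM25Encoder as PineconeBM25


def encode_batch(
    model: PineconeBM25, idx_mapping: dict[int, int], docs: list[str]
) -> np.ndarray:
    """Return one fixed-length sparse vector per document: shape (len(docs), vocab_size)."""
    sparse_dicts = model.encode_documents(docs)
    vecs = np.zeros((len(docs), len(idx_mapping)))
    for row, output in enumerate(sparse_dicts):
        for idx, val in zip(output["indices"], output["values"]):
            position = idx_mapping.get(idx)
            if position is not None:  # skip tokens outside the BM25 vocabulary
                vecs[row, position] = val
    return vecs
```

With one row per utterance, the sparse array has the same number of rows as the dense embeddings, and the `axis=1` concatenation no longer raises a dimension mismatch.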
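The `alpha` parameter added to `HybridDecisionLayer` (default `0.3`) controls how much the dense and sparse signals each contribute: this diff scales the dense embeddings by `alpha` and the sparse embeddings by `1 - alpha` as they are indexed. A rough way to picture the effect, sketched below, is as a convex combination of two similarity scores; this is an illustration of the intent under that assumption, not the library's exact scoring code.

```python
import numpy as np


def hybrid_similarity(
    dense_q: np.ndarray,       # dense query embedding, shape (dense_dim,)
    sparse_q: np.ndarray,      # sparse query vector, shape (vocab_size,)
    dense_index: np.ndarray,   # shape (num_utterances, dense_dim)
    sparse_index: np.ndarray,  # shape (num_utterances, vocab_size)
    alpha: float = 0.3,
) -> np.ndarray:
    """Weight dense similarity by alpha and sparse similarity by (1 - alpha)."""

    def cosine(query: np.ndarray, index: np.ndarray) -> np.ndarray:
        return (index @ query) / (
            np.linalg.norm(index, axis=1) * np.linalg.norm(query) + 1e-9
        )

    return alpha * cosine(dense_q, dense_index) + (1 - alpha) * cosine(sparse_q, sparse_index)
```

Setting `alpha=1.0` reduces this to a purely dense (semantic) comparison, while `alpha=0.0` relies only on the keyword-driven sparse vectors.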
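The new `semantic_router/encoders/tfidf.py` is still a copy of the BM25 encoder: it is named `"bm25"`, references an `encoder` name that is never imported, and calls `encoder.default()`, so it cannot be imported as written. One possible shape for a scikit-learn based implementation is sketched below; it assumes the `BaseEncoder` config permits arbitrary field types (as the Cohere encoder's `client` field suggests) and that, unlike BM25 with its MS MARCO defaults, the vectorizer must be fitted on the decision utterances before it can encode anything.

```python
from sklearn.feature_extraction.text import TfidfVectorizer

from semantic_router.encoders import BaseEncoder


class TfidfEncoder(BaseEncoder):
    """Sketch only: `vectorizer` is a hypothetical field, not part of the current diff."""

    vectorizer: TfidfVectorizer | None = None

    def __init__(self, name: str = "tfidf"):
        super().__init__(name=name)
        # TF-IDF has no pretrained default weights, so fit() must be called
        # (e.g. on all decision utterances) before __call__ can encode
        self.vectorizer = TfidfVectorizer()

    def fit(self, docs: list[str]):
        self.vectorizer.fit(docs)

    def __call__(self, docs: list[str]) -> list[list[float]]:
        if not docs:
            raise ValueError("No documents to encode.")
        # one row per document, aligned with the fitted vocabulary
        return self.vectorizer.transform(docs).toarray().tolist()
```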