From 35b0c8efa3f087ef00d06e151701283306b48773 Mon Sep 17 00:00:00 2001 From: James Briggs <35938317+jamescalam@users.noreply.github.com> Date: Sat, 2 Dec 2023 11:40:23 -0800 Subject: [PATCH] example notebook for hybrid --- docs/examples/hybrid-layer.ipynb | 248 +++++-------------------------- 1 file changed, 37 insertions(+), 211 deletions(-) diff --git a/docs/examples/hybrid-layer.ipynb b/docs/examples/hybrid-layer.ipynb index d3fb58c5..fa6b78c7 100644 --- a/docs/examples/hybrid-layer.ipynb +++ b/docs/examples/hybrid-layer.ipynb @@ -52,7 +52,7 @@ "source": [ "import os\n", "\n", - "os.environ[\"COHERE_API_KEY\"] = \"<<APIKEY>>\"" + "os.environ[\"COHERE_API_KEY\"] = \"<<COHERE_API_KEY>>\"" ] }, { @@ -78,7 +78,8 @@ " utterances=[\n", " \"isn't politics the best thing ever\",\n", " \"why don't you tell me about your political opinions\",\n", - " \"don't you just love the president\" \"don't you just hate the president\",\n", + " \"don't you just love the president\",\n", + " \"don't you just hate the president\",\n", " \"they're going to destroy this country!\",\n", " \"they will save the country!\",\n", " ],\n", @@ -98,6 +99,17 @@ "metadata": {}, "outputs": [], "source": [ + "chitchat = Decision(\n", + " name=\"chitchat\",\n", + " utterances=[\n", + " \"how's the weather today?\",\n", + " \"how are things going?\",\n", + " \"lovely weather today\",\n", + " \"the weather is horrendous\",\n", + " \"let's go to the chippy\",\n", + " ],\n", + ")\n", + "\n", "chitchat = Decision(\n", " name=\"chitchat\",\n", " utterances=[\n", @@ -146,15 +158,7 @@ "cell_type": "code", "execution_count": 5, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "3454774732 not in encoder.idx_mapping\n" - ] - } - ], + "outputs": [], "source": [ "from semantic_router.layer import HybridDecisionLayer\n", "\n", @@ -167,18 +171,14 @@ "metadata": {}, "outputs": [ { - "ename": "AxisError", - "evalue": "axis 1 is out of bounds for array of dimension 1", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAxisError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m/Users/jamesbriggs/Documents/projects/aurelio-labs/semantic-router/docs/examples/hybrid-layer.ipynb Cell 15\u001b[0m line \u001b[0;36m1\n\u001b[0;32m----> <a href='vscode-notebook-cell:/Users/jamesbriggs/Documents/projects/aurelio-labs/semantic-router/docs/examples/hybrid-layer.ipynb#X20sZmlsZQ%3D%3D?line=0'>1</a>\u001b[0m dl(\u001b[39m\"\u001b[39;49m\u001b[39mdon\u001b[39;49m\u001b[39m'\u001b[39;49m\u001b[39mt you love politics?\u001b[39;49m\u001b[39m\"\u001b[39;49m)\n", - "File \u001b[0;32m~/Documents/projects/aurelio-labs/semantic-router/semantic_router/layer.py:141\u001b[0m, in \u001b[0;36mHybridDecisionLayer.__call__\u001b[0;34m(self, text)\u001b[0m\n\u001b[1;32m 140\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m__call__\u001b[39m(\u001b[39mself\u001b[39m, text: \u001b[39mstr\u001b[39m) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m \u001b[39mstr\u001b[39m \u001b[39m|\u001b[39m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m--> 141\u001b[0m results \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_query(text)\n\u001b[1;32m 142\u001b[0m top_class, top_class_scores \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_semantic_classify(results)\n\u001b[1;32m 143\u001b[0m passed \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_pass_threshold(top_class_scores, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mscore_threshold)\n", - "File \u001b[0;32m~/Documents/projects/aurelio-labs/semantic-router/semantic_router/layer.py:204\u001b[0m, in \u001b[0;36mHybridDecisionLayer._query\u001b[0;34m(self, text, top_k)\u001b[0m\n\u001b[1;32m 202\u001b[0m sim_d \u001b[39m=\u001b[39m np\u001b[39m.\u001b[39mdot(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mindex, xq_d\u001b[39m.\u001b[39mT) \u001b[39m/\u001b[39m (index_norm \u001b[39m*\u001b[39m xq_d_norm)\n\u001b[1;32m 203\u001b[0m \u001b[39m# calculate sparse vec similarity\u001b[39;00m\n\u001b[0;32m--> 204\u001b[0m sparse_norm \u001b[39m=\u001b[39m norm(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49msparse_index, axis\u001b[39m=\u001b[39;49m\u001b[39m1\u001b[39;49m)\n\u001b[1;32m 205\u001b[0m xq_s_norm \u001b[39m=\u001b[39m norm(xq_s\u001b[39m.\u001b[39mT)\n\u001b[1;32m 206\u001b[0m sim_s \u001b[39m=\u001b[39m np\u001b[39m.\u001b[39mdot(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39msparse_index, xq_s\u001b[39m.\u001b[39mT) \u001b[39m/\u001b[39m (sparse_norm \u001b[39m*\u001b[39m xq_s_norm)\n", - "File \u001b[0;32m~/opt/anaconda3/envs/decision-layer/lib/python3.11/site-packages/numpy/linalg/linalg.py:2583\u001b[0m, in \u001b[0;36mnorm\u001b[0;34m(x, ord, axis, keepdims)\u001b[0m\n\u001b[1;32m 2580\u001b[0m \u001b[39melif\u001b[39;00m \u001b[39mord\u001b[39m \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39mor\u001b[39;00m \u001b[39mord\u001b[39m \u001b[39m==\u001b[39m \u001b[39m2\u001b[39m:\n\u001b[1;32m 2581\u001b[0m \u001b[39m# special case for speedup\u001b[39;00m\n\u001b[1;32m 2582\u001b[0m s \u001b[39m=\u001b[39m (x\u001b[39m.\u001b[39mconj() \u001b[39m*\u001b[39m x)\u001b[39m.\u001b[39mreal\n\u001b[0;32m-> 2583\u001b[0m \u001b[39mreturn\u001b[39;00m sqrt(add\u001b[39m.\u001b[39;49mreduce(s, axis\u001b[39m=\u001b[39;49maxis, keepdims\u001b[39m=\u001b[39;49mkeepdims))\n\u001b[1;32m 2584\u001b[0m \u001b[39m# None of the str-type keywords for ord ('fro', 'nuc')\u001b[39;00m\n\u001b[1;32m 2585\u001b[0m \u001b[39m# are valid for vectors\u001b[39;00m\n\u001b[1;32m 2586\u001b[0m \u001b[39melif\u001b[39;00m \u001b[39misinstance\u001b[39m(\u001b[39mord\u001b[39m, \u001b[39mstr\u001b[39m):\n", - "\u001b[0;31mAxisError\u001b[0m: axis 1 is out of bounds for array of dimension 1" - ] + "data": { + "text/plain": [ + "'politics'" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -187,178 +187,20 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---\n", - "\n", - "#### Testing" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from semantic_router.encoders import BM25Encoder\n", - "\n", - "encoder = BM25Encoder()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tests = [\"hello this is some text\", \"and more stuff\"]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "idx_list = encoder.model.get_params()['doc_freq']['indices']\n", - "idx_list" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sparse_dicts = encoder.model.encode_documents(tests)\n", - "sparse_dicts" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "embeds = [0.0] * len(encoder.idx_mapping)" - ] - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], - "source": [ - "for output in sparse_dicts:\n", - " indices = output[\"indices\"]\n", - " values = output[\"values\"]\n", - " for idx, val in zip(indices, values):\n", - " position = encoder.idx_mapping[idx]\n", - " embeds[position] = val" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "encoder.idx_mapping" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "encoded_output = encoder(tests)\n", - "encoded_output" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "\n", - "\n", - "sparse_vec = np.zeros(len(idx_list))\n", - "idx_position_dict = {idx: i for i, idx in enumerate(idx_list)}\n", - "\n", - "for output in encoded_output:\n", - " indices = output['indices']\n", - " values = output['values']\n", - " for idx, value in zip(indices, values):\n", - " if idx in idx_position_dict:\n", - " position = idx_position_dict[idx]\n", - " sparse_vec[position] = value" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sparse_vec" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sparse_vec.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Finish Testing\n", - "\n", - "---" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we can test it:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dl(\"don't you love politics?\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'chitchat'" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "dl(\"how's the weather today?\")" ] @@ -367,23 +209,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Both are classified accurately, what if we send a query that is unrelated to our existing `Decision` objects?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dl(\"I'm interested in learning about llama 2\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this case, we return `None` because no matches were identified." + "---" ] } ], -- GitLab