From 35b0c8efa3f087ef00d06e151701283306b48773 Mon Sep 17 00:00:00 2001
From: James Briggs <35938317+jamescalam@users.noreply.github.com>
Date: Sat, 2 Dec 2023 11:40:23 -0800
Subject: [PATCH] Clean up hybrid layer example notebook

---
 docs/examples/hybrid-layer.ipynb | 248 +++++--------------------------
 1 file changed, 37 insertions(+), 211 deletions(-)

diff --git a/docs/examples/hybrid-layer.ipynb b/docs/examples/hybrid-layer.ipynb
index d3fb58c5..fa6b78c7 100644
--- a/docs/examples/hybrid-layer.ipynb
+++ b/docs/examples/hybrid-layer.ipynb
@@ -52,7 +52,7 @@
    "source": [
     "import os\n",
     "\n",
-    "os.environ[\"COHERE_API_KEY\"] = \"<<APIKEY>>\""
+    "os.environ[\"COHERE_API_KEY\"] = \"<<COHERE_API_KEY>>\""
    ]
   },
   {
@@ -78,7 +78,8 @@
     "    utterances=[\n",
     "        \"isn't politics the best thing ever\",\n",
     "        \"why don't you tell me about your political opinions\",\n",
-    "        \"don't you just love the president\" \"don't you just hate the president\",\n",
+    "        \"don't you just love the president\",\n",
+    "        \"don't you just hate the president\",\n",
     "        \"they're going to destroy this country!\",\n",
     "        \"they will save the country!\",\n",
     "    ],\n",
@@ -98,6 +99,17 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "chitchat = Decision(\n",
+    "    name=\"chitchat\",\n",
+    "    utterances=[\n",
+    "        \"how's the weather today?\",\n",
+    "        \"how are things going?\",\n",
+    "        \"lovely weather today\",\n",
+    "        \"the weather is horrendous\",\n",
+    "        \"let's go to the chippy\",\n",
+    "    ],\n",
+    ")\n",
+    "\n",
     "chitchat = Decision(\n",
     "    name=\"chitchat\",\n",
     "    utterances=[\n",
@@ -146,15 +158,7 @@
    "cell_type": "code",
    "execution_count": 5,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "3454774732 not in encoder.idx_mapping\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "from semantic_router.layer import HybridDecisionLayer\n",
     "\n",
@@ -167,18 +171,14 @@
    "metadata": {},
    "outputs": [
     {
-     "ename": "AxisError",
-     "evalue": "axis 1 is out of bounds for array of dimension 1",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mAxisError\u001b[0m                                 Traceback (most recent call last)",
-      "\u001b[1;32m/Users/jamesbriggs/Documents/projects/aurelio-labs/semantic-router/docs/examples/hybrid-layer.ipynb Cell 15\u001b[0m line \u001b[0;36m1\n\u001b[0;32m----> <a href='vscode-notebook-cell:/Users/jamesbriggs/Documents/projects/aurelio-labs/semantic-router/docs/examples/hybrid-layer.ipynb#X20sZmlsZQ%3D%3D?line=0'>1</a>\u001b[0m dl(\u001b[39m\"\u001b[39;49m\u001b[39mdon\u001b[39;49m\u001b[39m'\u001b[39;49m\u001b[39mt you love politics?\u001b[39;49m\u001b[39m\"\u001b[39;49m)\n",
-      "File \u001b[0;32m~/Documents/projects/aurelio-labs/semantic-router/semantic_router/layer.py:141\u001b[0m, in \u001b[0;36mHybridDecisionLayer.__call__\u001b[0;34m(self, text)\u001b[0m\n\u001b[1;32m    140\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m__call__\u001b[39m(\u001b[39mself\u001b[39m, text: \u001b[39mstr\u001b[39m) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m \u001b[39mstr\u001b[39m \u001b[39m|\u001b[39m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m--> 141\u001b[0m     results \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_query(text)\n\u001b[1;32m    142\u001b[0m     top_class, top_class_scores \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_semantic_classify(results)\n\u001b[1;32m    143\u001b[0m     passed \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_pass_threshold(top_class_scores, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mscore_threshold)\n",
-      "File \u001b[0;32m~/Documents/projects/aurelio-labs/semantic-router/semantic_router/layer.py:204\u001b[0m, in \u001b[0;36mHybridDecisionLayer._query\u001b[0;34m(self, text, top_k)\u001b[0m\n\u001b[1;32m    202\u001b[0m sim_d \u001b[39m=\u001b[39m np\u001b[39m.\u001b[39mdot(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mindex, xq_d\u001b[39m.\u001b[39mT) \u001b[39m/\u001b[39m (index_norm \u001b[39m*\u001b[39m xq_d_norm)\n\u001b[1;32m    203\u001b[0m \u001b[39m# calculate sparse vec similarity\u001b[39;00m\n\u001b[0;32m--> 204\u001b[0m sparse_norm \u001b[39m=\u001b[39m norm(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49msparse_index, axis\u001b[39m=\u001b[39;49m\u001b[39m1\u001b[39;49m)\n\u001b[1;32m    205\u001b[0m xq_s_norm \u001b[39m=\u001b[39m norm(xq_s\u001b[39m.\u001b[39mT)\n\u001b[1;32m    206\u001b[0m sim_s \u001b[39m=\u001b[39m np\u001b[39m.\u001b[39mdot(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39msparse_index, xq_s\u001b[39m.\u001b[39mT) \u001b[39m/\u001b[39m (sparse_norm \u001b[39m*\u001b[39m xq_s_norm)\n",
-      "File \u001b[0;32m~/opt/anaconda3/envs/decision-layer/lib/python3.11/site-packages/numpy/linalg/linalg.py:2583\u001b[0m, in \u001b[0;36mnorm\u001b[0;34m(x, ord, axis, keepdims)\u001b[0m\n\u001b[1;32m   2580\u001b[0m \u001b[39melif\u001b[39;00m \u001b[39mord\u001b[39m \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39mor\u001b[39;00m \u001b[39mord\u001b[39m \u001b[39m==\u001b[39m \u001b[39m2\u001b[39m:\n\u001b[1;32m   2581\u001b[0m     \u001b[39m# special case for speedup\u001b[39;00m\n\u001b[1;32m   2582\u001b[0m     s \u001b[39m=\u001b[39m (x\u001b[39m.\u001b[39mconj() \u001b[39m*\u001b[39m x)\u001b[39m.\u001b[39mreal\n\u001b[0;32m-> 2583\u001b[0m     \u001b[39mreturn\u001b[39;00m sqrt(add\u001b[39m.\u001b[39;49mreduce(s, axis\u001b[39m=\u001b[39;49maxis, keepdims\u001b[39m=\u001b[39;49mkeepdims))\n\u001b[1;32m   2584\u001b[0m \u001b[39m# None of the str-type keywords for ord ('fro', 'nuc')\u001b[39;00m\n\u001b[1;32m   2585\u001b[0m \u001b[39m# are valid for vectors\u001b[39;00m\n\u001b[1;32m   2586\u001b[0m \u001b[39melif\u001b[39;00m \u001b[39misinstance\u001b[39m(\u001b[39mord\u001b[39m, \u001b[39mstr\u001b[39m):\n",
-      "\u001b[0;31mAxisError\u001b[0m: axis 1 is out of bounds for array of dimension 1"
-     ]
+     "data": {
+      "text/plain": [
+       "'politics'"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
     }
    ],
    "source": [
@@ -187,178 +187,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "---\n",
-    "\n",
-    "#### Testing"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from semantic_router.encoders import BM25Encoder\n",
-    "\n",
-    "encoder = BM25Encoder()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "tests = [\"hello this is some text\", \"and more stuff\"]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "idx_list = encoder.model.get_params()['doc_freq']['indices']\n",
-    "idx_list"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "sparse_dicts = encoder.model.encode_documents(tests)\n",
-    "sparse_dicts"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "embeds = [0.0] * len(encoder.idx_mapping)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
    "metadata": {},
-   "outputs": [],
-   "source": [
-    "for output in sparse_dicts:\n",
-    "    indices = output[\"indices\"]\n",
-    "    values = output[\"values\"]\n",
-    "    for idx, val in zip(indices, values):\n",
-    "        position = encoder.idx_mapping[idx]\n",
-    "        embeds[position] = val"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "encoder.idx_mapping"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "encoded_output = encoder(tests)\n",
-    "encoded_output"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import numpy as np\n",
-    "\n",
-    "\n",
-    "sparse_vec = np.zeros(len(idx_list))\n",
-    "idx_position_dict = {idx: i for i, idx in enumerate(idx_list)}\n",
-    "\n",
-    "for output in encoded_output:\n",
-    "    indices = output['indices']\n",
-    "    values = output['values']\n",
-    "    for idx, value in zip(indices, values):\n",
-    "        if idx in idx_position_dict:\n",
-    "            position = idx_position_dict[idx]\n",
-    "            sparse_vec[position] = value"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "sparse_vec"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "sparse_vec.shape"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "#### Finish Testing\n",
-    "\n",
-    "---"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Now we can test it:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "dl(\"don't you love politics?\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'chitchat'"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "dl(\"how's the weather today?\")"
    ]
@@ -367,23 +209,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Both are classified accurately, what if we send a query that is unrelated to our existing `Decision` objects?"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "dl(\"I'm interested in learning about llama 2\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "In this case, we return `None` because no matches were identified."
+    "---"
    ]
   }
  ],
-- 
GitLab