From d568399065fafbc07c001c626a66daa608f7e110 Mon Sep 17 00:00:00 2001
From: Siraj R Aizlewood <siraj@aurelio.ai>
Date: Tue, 7 Nov 2023 11:36:53 +0400
Subject: [PATCH] Testing Alternative Cosine Similarity Function

---
 00_walkthrough.ipynb             | 420 +++++++++++++++++++++++++++++--
 decision_layer/decision_layer.py |  26 +-
 2 files changed, 422 insertions(+), 24 deletions(-)

diff --git a/00_walkthrough.ipynb b/00_walkthrough.ipynb
index 23d8f633..6633875b 100644
--- a/00_walkthrough.ipynb
+++ b/00_walkthrough.ipynb
@@ -76,7 +76,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -95,7 +95,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -124,7 +124,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -185,7 +185,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -203,7 +203,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -220,7 +220,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -237,7 +237,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -263,7 +263,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -284,7 +284,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -298,9 +298,113 @@
     "dl = DecisionLayer(encoder=encoder, decisions=decisions)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Test `politics` decision:"
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "##################################################\n",
+      "sim 1\n",
+      "[[0.24654371]\n",
+      " [0.24179116]\n",
+      " [0.24323266]\n",
+      " [0.21900559]\n",
+      " [0.22244086]\n",
+      " [0.2156429 ]\n",
+      " [0.18936619]\n",
+      " [0.19757812]\n",
+      " [0.18816959]\n",
+      " [0.19574877]\n",
+      " [0.19575958]\n",
+      " [0.20340967]\n",
+      " [0.19478593]]\n",
+      "##################################################\n",
+      "##################################################\n",
+      "sim 2\n",
+      "[[0.888926  ]\n",
+      " [0.87179043]\n",
+      " [0.87698776]\n",
+      " [0.78963588]\n",
+      " [0.80202191]\n",
+      " [0.77751152]\n",
+      " [0.68276953]\n",
+      " [0.71237805]\n",
+      " [0.67845506]\n",
+      " [0.70578225]\n",
+      " [0.70582121]\n",
+      " [0.73340407]\n",
+      " [0.70231063]]\n",
+      "##################################################\n"
+     ]
+    },
+    {
+     "ename": "TypeError",
+     "evalue": "DecisionLayer._semantic_classify() got an unexpected keyword argument 'apply_tan'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
+      "\u001b[1;32mc:\\Users\\Siraj\\Documents\\Personal\\Work\\Aurelio\\20231106 Semantic Layer\\Repo\\semantic-layer\\00_walkthrough.ipynb Cell 19\u001b[0m line \u001b[0;36m1\n\u001b[1;32m----> <a href='vscode-notebook-cell:/c%3A/Users/Siraj/Documents/Personal/Work/Aurelio/20231106%20Semantic%20Layer/Repo/semantic-layer/00_walkthrough.ipynb#X24sZmlsZQ%3D%3D?line=0'>1</a>\u001b[0m out \u001b[39m=\u001b[39m dl(\u001b[39m\"\u001b[39;49m\u001b[39mdon\u001b[39;49m\u001b[39m'\u001b[39;49m\u001b[39mt you love politics?\u001b[39;49m\u001b[39m\"\u001b[39;49m, _tan\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m, _threshold\u001b[39m=\u001b[39;49m\u001b[39m0.75\u001b[39;49m)\n\u001b[0;32m      <a href='vscode-notebook-cell:/c%3A/Users/Siraj/Documents/Personal/Work/Aurelio/20231106%20Semantic%20Layer/Repo/semantic-layer/00_walkthrough.ipynb#X24sZmlsZQ%3D%3D?line=1'>2</a>\u001b[0m \u001b[39mprint\u001b[39m(out)\n",
+      "File \u001b[1;32mc:\\Users\\Siraj\\Documents\\Personal\\Work\\Aurelio\\20231106 Semantic Layer\\Repo\\semantic-layer\\decision_layer\\decision_layer.py:21\u001b[0m, in \u001b[0;36mDecisionLayer.__call__\u001b[1;34m(self, text, _tan, _threshold)\u001b[0m\n\u001b[0;32m     19\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m__call__\u001b[39m(\u001b[39mself\u001b[39m, text: \u001b[39mstr\u001b[39m, _tan: \u001b[39mbool\u001b[39m\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m, _threshold: \u001b[39mfloat\u001b[39m\u001b[39m=\u001b[39m\u001b[39m0.5\u001b[39m):\n\u001b[0;32m     20\u001b[0m     results \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_query(text)\n\u001b[1;32m---> 21\u001b[0m     decision \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_semantic_classify(results, apply_tan\u001b[39m=\u001b[39;49m_tan, threshold\u001b[39m=\u001b[39;49m_threshold)\n\u001b[0;32m     22\u001b[0m     \u001b[39m# return decision\u001b[39;00m\n\u001b[0;32m     23\u001b[0m     \u001b[39mreturn\u001b[39;00m decision\n",
+      "\u001b[1;31mTypeError\u001b[0m: DecisionLayer._semantic_classify() got an unexpected keyword argument 'apply_tan'"
+     ]
+    }
+   ],
+   "source": [
+    "out = dl(\"don't you love politics?\", _tan=True, _threshold=0.75)\n",
+    "print(out)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "politics\n"
+     ]
+    }
+   ],
+   "source": [
+    "out = dl(\"don't you love politics?\", _tan=False, _threshold=0.75)\n",
+    "print(out)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "politics\n"
+     ]
+    }
+   ],
+   "source": [
+    "out = dl(\"Tell me your thoughts on the president of the united states of america.\", _tan=True, _threshold=0.75)\n",
+    "print(out)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -312,13 +416,277 @@
     }
    ],
    "source": [
-    "out = dl(\"don't you love politics?\", _tan=True, _threshold=0.5)\n",
+    "out = dl(\"Tell me your thoughts on the president of the united states of america.\", _tan=False, _threshold=0.75)\n",
+    "print(out)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Test `other_brands` decision:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "None\n"
+     ]
+    }
+   ],
+   "source": [
+    "out = dl(\"What is Binance?\", _tan=True, _threshold=0.5)\n",
     "print(out)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "None\n"
+     ]
+    }
+   ],
+   "source": [
+    "out = dl(\"What is Binance?\", _tan=False, _threshold=0.5)\n",
+    "print(out)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "None\n"
+     ]
+    }
+   ],
+   "source": [
+    "out = dl(\"Tell me about Binance.\", _tan=True, _threshold=0.5)\n",
+    "print(out)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "None\n"
+     ]
+    }
+   ],
+   "source": [
+    "out = dl(\"Tell me about Binance.\", _tan=False, _threshold=0.5)\n",
+    "print(out)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "None\n"
+     ]
+    }
+   ],
+   "source": [
+    "out = dl(\"How can I use Binance?\", _tan=True, _threshold=0.5)\n",
+    "print(out)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "other_brands\n"
+     ]
+    }
+   ],
+   "source": [
+    "out = dl(\"How can I use Binance?\", _tan=False, _threshold=0.5)\n",
+    "print(out)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Test `discount` decision:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "politics\n"
+     ]
+    }
+   ],
+   "source": [
+    "out = dl(\"discount please.\", _tan=True, _threshold=0.5)\n",
+    "print(out)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "politics\n"
+     ]
+    }
+   ],
+   "source": [
+    "out = dl(\"discount please.\", _tan=False, _threshold=0.5)\n",
+    "print(out)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "politics\n"
+     ]
+    }
+   ],
+   "source": [
+    "out = dl(\"can i get a freebie?\", _tan=True, _threshold=0.5)\n",
+    "print(out)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "None\n"
+     ]
+    }
+   ],
+   "source": [
+    "out = dl(\"can i get a freebie?\", _tan=False, _threshold=0.5)\n",
+    "print(out)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Test `bot_functionality` decision:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "politics\n"
+     ]
+    }
+   ],
+   "source": [
+    "out = dl(\"Are you and AI?\", _tan=True, _threshold=0.5)\n",
+    "print(out)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "politics\n"
+     ]
+    }
+   ],
+   "source": [
+    "out = dl(\"Are you and AI?\", _tan=False, _threshold=0.5)\n",
+    "print(out)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "out = dl(\"\", _tan=True, _threshold=0.5)\n",
+    "print(out)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "out = dl(\"\", _tan=False, _threshold=0.5)\n",
+    "print(out)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Test `other` (unclassified) decision."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -336,18 +704,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
-     "ename": "NameError",
-     "evalue": "name 'dl' is not defined",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[1;31mNameError\u001b[0m                                 Traceback (most recent call last)",
-      "\u001b[1;32mc:\\Users\\Siraj\\Documents\\Personal\\Work\\Aurelio\\20231106 Semantic Layer\\Repo\\semantic-layer\\00_walkthrough.ipynb Cell 20\u001b[0m line \u001b[0;36m1\n\u001b[1;32m----> <a href='vscode-notebook-cell:/c%3A/Users/Siraj/Documents/Personal/Work/Aurelio/20231106%20Semantic%20Layer/Repo/semantic-layer/00_walkthrough.ipynb#X26sZmlsZQ%3D%3D?line=0'>1</a>\u001b[0m out \u001b[39m=\u001b[39m dl(\u001b[39m\"\u001b[39m\u001b[39mHow do I bake a cake?\u001b[39m\u001b[39m\"\u001b[39m, _tan\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m, _threshold\u001b[39m=\u001b[39m\u001b[39m0.5\u001b[39m)\n\u001b[0;32m      <a href='vscode-notebook-cell:/c%3A/Users/Siraj/Documents/Personal/Work/Aurelio/20231106%20Semantic%20Layer/Repo/semantic-layer/00_walkthrough.ipynb#X26sZmlsZQ%3D%3D?line=1'>2</a>\u001b[0m \u001b[39mprint\u001b[39m(out)\n",
-      "\u001b[1;31mNameError\u001b[0m: name 'dl' is not defined"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "None\n"
      ]
     }
    ],
@@ -355,6 +719,20 @@
     "out = dl(\"How do I bake a cake?\", _tan=True, _threshold=0.5)\n",
     "print(out)"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
diff --git a/decision_layer/decision_layer.py b/decision_layer/decision_layer.py
index 307783ad..5e1c66b6 100644
--- a/decision_layer/decision_layer.py
+++ b/decision_layer/decision_layer.py
@@ -18,7 +18,7 @@ class DecisionLayer:
 
     def __call__(self, text: str, _tan: bool=True, _threshold: float=0.5):
         results = self._query(text)
-        decision = self._semantic_classify(results, apply_tan=_tan, threshold=_threshold)
+        decision = self._semantic_classify(results, _tan=_tan, _threshold=_threshold)
         # return decision
         return decision
 
@@ -42,15 +42,35 @@ class DecisionLayer:
             embed_arr = np.array(embeds)
             self.index = np.concatenate([self.index, embed_arr])
 
+    def _cosine_similarity(self, v1, v2):
+        """Compute the cosine similarity between two embeddings using numpy functions."""
+        np_v1 = np.array(v1)
+        np_v2 = np.array(v2)
+        return np.dot(np_v1, np_v2) / (np.linalg.norm(np_v1) * np.linalg.norm(np_v2))
+
     def _query(self, text: str, top_k: int=5):
         """Given some text, encodes and searches the index vector space to
         retrieve the top_k most similar records.
         """
         # create query vector
         xq = np.array(self.encoder([text]))
-        # calculate cosine similarities
+        # calculate cosine similarities
         sim = np.dot(self.index, xq.T) / (norm(self.index)*norm(xq.T))
+        # DEBUGGING: Start.
+        print('#'*50)
+        print('sim 1')
+        print(sim)
+        print('#'*50)
+        # DEBUGGING: End.
+        sim = np.array([self._cosine_similarity(embedding, xq.T) for embedding in self.index])
+        # DEBUGGING: Start.
+        print('#'*50)
+        print('sim 2')
+        print(sim)
+        print('#'*50)
+        # DEBUGGING: End.
         # get indices of top_k records
+        top_k = min(top_k, sim.shape[0])
         idx = np.argpartition(sim.T[0], -top_k)[-top_k:]
         scores = sim[idx]
         # get the utterance categories (decision names)
@@ -60,7 +80,7 @@ class DecisionLayer:
         ]
 
 
-    def _semantic_classify(self, query_results: dict, apply_tan: bool=True, threshold: float=0.5):
+    def _semantic_classify(self, query_results: dict, _tan: bool=True, _threshold: float=0.5):
         """Given some text, categorizes."""
         
         # apply the scoring system to the results and group by category
-- 
GitLab