From 4488744850ce9c93e24b1755be6aa9bccda27c0a Mon Sep 17 00:00:00 2001
From: Sean Hatfield <seanhatfield5@gmail.com>
Date: Wed, 25 Sep 2024 16:12:17 -0700
Subject: [PATCH] Support more Confluence URL formats (#2118)

* support more Confluence URL formats

* use pattern matching for Confluence URLs and manual splitting as fallback

* rework entire Confluence flow to prevent issues with custom, local, and cloud spaces

* remove the url-pattern dependency

---------

Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
---
 collector/extensions/resync/index.js          |   1 +
 .../Confluence/ConfluenceLoader/index.js      |   3 +-
 .../utils/extensions/Confluence/index.js      | 150 ++++++------------
 .../Connectors/Confluence/index.jsx           |  31 +++-
 frontend/src/models/dataConnector.js          |   5 +-
 5 files changed, 77 insertions(+), 113 deletions(-)

diff --git a/collector/extensions/resync/index.js b/collector/extensions/resync/index.js
index 66882ba7a..024935f5c 100644
--- a/collector/extensions/resync/index.js
+++ b/collector/extensions/resync/index.js
@@ -59,6 +59,7 @@ async function resyncConfluence({ chunkSource }, response) {
     const { success, reason, content } = await fetchConfluencePage({
       pageUrl: `https:${source.pathname}`, // need to add back the real protocol
       baseUrl: source.searchParams.get('baseUrl'),
+      spaceKey: source.searchParams.get('spaceKey'),
       accessToken: source.searchParams.get('token'),
       username: source.searchParams.get('username'),
     });
diff --git a/collector/utils/extensions/Confluence/ConfluenceLoader/index.js b/collector/utils/extensions/Confluence/ConfluenceLoader/index.js
index 770185986..2afb95273 100644
--- a/collector/utils/extensions/Confluence/ConfluenceLoader/index.js
+++ b/collector/utils/extensions/Confluence/ConfluenceLoader/index.js
@@ -72,8 +72,9 @@ class ConfluencePagesLoader {
     }
   }
 
+  // https://developer.atlassian.com/cloud/confluence/rest/v2/intro/#auth
   async fetchAllPagesInSpace(start = 0, limit = this.limit) {
-    const url = `${this.baseUrl}/rest/api/content?spaceKey=${this.spaceKey}&limit=${limit}&start=${start}&expand=${this.expand}`;
+    const url = `${this.baseUrl}/wiki/rest/api/content?spaceKey=${this.spaceKey}&limit=${limit}&start=${start}&expand=${this.expand}`;
     const data = await this.fetchConfluenceData(url);
     if (data.size === 0) {
       return [];
diff --git a/collector/utils/extensions/Confluence/index.js b/collector/utils/extensions/Confluence/index.js
index aada1322c..819176712 100644
--- a/collector/utils/extensions/Confluence/index.js
+++ b/collector/utils/extensions/Confluence/index.js
@@ -2,7 +2,6 @@ const fs = require("fs");
 const path = require("path");
 const { default: slugify } = require("slugify");
 const { v4 } = require("uuid");
-const UrlPattern = require("url-pattern");
 const { writeToServerDocuments, sanitizeFileName } = require("../../files");
 const { tokenizeString } = require("../../tokenizer");
 const { ConfluencePagesLoader } = require("./ConfluenceLoader");
@@ -13,8 +12,11 @@ const { ConfluencePagesLoader } = require("./ConfluenceLoader");
  * @param {import("../../../middleware/setDataSigner").ResponseWithSigner} response - Express response object with encryptionWorker
  * @returns
  */
-async function loadConfluence({ pageUrl, username, accessToken }, response) {
-  if (!pageUrl || !username || !accessToken) {
+async function loadConfluence(
+  { baseUrl = null, spaceKey = null, username = null, accessToken = null },
+  response
+) {
+  if (!baseUrl || !spaceKey || !username || !accessToken) {
     return {
       success: false,
       reason:
@@ -22,19 +24,24 @@ async function loadConfluence({ pageUrl, username, accessToken }, response) {
     };
   }
 
-  const { valid, result } = validSpaceUrl(pageUrl);
-  if (!valid) {
+  if (!validBaseUrl(baseUrl)) {
     return {
       success: false,
-      reason:
-        "Confluence space URL is not in the expected format of one of https://domain.atlassian.net/wiki/space/~SPACEID/* or https://customDomain/wiki/space/~SPACEID/* or https://customDomain/display/~SPACEID/*",
+      reason: "Provided base URL is not a valid URL.",
     };
   }
 
-  const { apiBase: baseUrl, spaceKey, subdomain } = result;
-  console.log(`-- Working Confluence ${baseUrl} --`);
+  if (!spaceKey) {
+    return {
+      success: false,
+      reason: "You need to provide a Confluence space key.",
+    };
+  }
+
+  const { origin, hostname } = new URL(baseUrl);
+  console.log(`-- Working Confluence ${origin} --`);
   const loader = new ConfluencePagesLoader({
-    baseUrl,
+    baseUrl: origin, // Use the origin to avoid issues with subdomains, ports, protocols, etc.
     spaceKey,
     username,
     accessToken,
@@ -59,7 +66,7 @@ async function loadConfluence({ pageUrl, username, accessToken }, response) {
     };
   }
   const outFolder = slugify(
-    `${subdomain}-confluence-${v4().slice(0, 4)}`
+    `confluence-${origin}-${v4().slice(0, 4)}`
   ).toLowerCase();
 
   const outFolderPath =
@@ -80,11 +87,11 @@ async function loadConfluence({ pageUrl, username, accessToken }, response) {
       id: v4(),
       url: doc.metadata.url + ".page",
       title: doc.metadata.title || doc.metadata.source,
-      docAuthor: subdomain,
+      docAuthor: origin,
       description: doc.metadata.title,
-      docSource: `${subdomain} Confluence`,
+      docSource: `${origin} Confluence`,
       chunkSource: generateChunkSource(
-        { doc, baseUrl, accessToken, username },
+        { doc, baseUrl: origin, spaceKey, accessToken, username },
         response.locals.encryptionWorker
       ),
       published: new Date().toLocaleString(),
@@ -120,10 +127,11 @@ async function loadConfluence({ pageUrl, username, accessToken }, response) {
 async function fetchConfluencePage({
   pageUrl,
   baseUrl,
+  spaceKey,
   username,
   accessToken,
 }) {
-  if (!pageUrl || !baseUrl || !username || !accessToken) {
+  if (!pageUrl || !baseUrl || !spaceKey || !username || !accessToken) {
     return {
       success: false,
       content: null,
@@ -132,20 +140,25 @@ async function fetchConfluencePage({
     };
   }
 
-  const { valid, result } = validSpaceUrl(pageUrl);
-  if (!valid) {
+  if (!validBaseUrl(baseUrl)) {
     return {
       success: false,
       content: null,
-      reason:
-        "Confluence space URL is not in the expected format of https://domain.atlassian.net/wiki/space/~SPACEID/* or https://customDomain/wiki/space/~SPACEID/*",
+      reason: "Provided base URL is not a valid URL.",
+    };
+  }
+
+  if (!spaceKey) {
+    return {
+      success: false,
+      content: null,
+      reason: "You need to provide a Confluence space key.",
     };
   }
 
   console.log(`-- Working Confluence Page ${pageUrl} --`);
-  const { spaceKey } = result;
   const loader = new ConfluencePagesLoader({
-    baseUrl,
+    baseUrl, // Should be the origin of the baseUrl
     spaceKey,
     username,
     accessToken,
@@ -190,91 +203,17 @@ async function fetchConfluencePage({
 }
 
 /**
- * A match result for a url-pattern of a Confluence URL
- * @typedef {Object} ConfluenceMatchResult
- * @property {string} subdomain - the subdomain of an organization's Confluence space
- * @property {string} spaceKey - the spaceKey of an organization that determines the documents to collect.
- * @property {string} apiBase - the correct REST API url to use for loader.
- */
-
-/**
- * Generates the correct API base URL for interfacing with the Confluence REST API
- * depending on the URL pattern being used since there are various ways to host/access a
- * Confluence space.
- * @param {ConfluenceMatchResult} matchResult - result from `url-pattern`.match
- * @param {boolean} isCustomDomain - determines if we need to coerce the subpath of the provided URL
- * @returns {string} - the resulting REST API URL
- */
-function generateAPIBaseUrl(matchResult = {}, isCustomDomain = false) {
-  const { subdomain } = matchResult;
-  if (isCustomDomain) return `https://${subdomain}`;
-  return `https://${subdomain}.atlassian.net/wiki`;
-}
-
-/**
- * Validates and parses the correct information from a given Confluence URL
- * @param {string} spaceUrl - The organization's Confluence URL to parse
- * @returns {{
- *  valid: boolean,
- *  result: (ConfluenceMatchResult|null),
- * }}
+ * Validates if the provided baseUrl is a valid URL at all.
+ * @param {string} baseUrl
+ * @returns {boolean}
  */
-function validSpaceUrl(spaceUrl = "") {
-  let matchResult;
-  const patterns = {
-    default: new UrlPattern(
-      "https\\://(:subdomain).atlassian.net/wiki/spaces/(:spaceKey)*"
-    ),
-    subdomain: new UrlPattern(
-      "https\\://(:subdomain.):domain.:tld/wiki/spaces/(:spaceKey)*"
-    ),
-    custom: new UrlPattern(
-      "https\\://(:subdomain.):domain.:tld/display/(:spaceKey)*"
-    ),
-  };
-
-  // If using the default Atlassian Confluence URL pattern.
-  // We can proceed because the Library/API can use this base url scheme.
-  matchResult = patterns.default.match(spaceUrl);
-  if (matchResult)
-    return {
-      valid: matchResult.hasOwnProperty("spaceKey"),
-      result: {
-        ...matchResult,
-        apiBase: generateAPIBaseUrl(matchResult),
-      },
-    };
-
-  // If using a custom subdomain Confluence URL pattern.
-  // We need to attach the customDomain as a property to the match result
-  // so we can form the correct REST API base from the subdomain.
-  matchResult = patterns.subdomain.match(spaceUrl);
-  if (matchResult) {
-    return {
-      valid: matchResult.hasOwnProperty("spaceKey"),
-      result: {
-        ...matchResult,
-        apiBase: generateAPIBaseUrl(matchResult),
-      },
-    };
+function validBaseUrl(baseUrl) {
+  try {
+    new URL(baseUrl);
+    return true;
+  } catch (e) {
+    return false;
   }
-
-  // If using a base FQDN Confluence URL pattern.
-  // We need to attach the customDomain as a property to the match result
-  // so we can form the correct REST API base from the root domain since /display/ is basically a URL mask.
-  matchResult = patterns.custom.match(spaceUrl);
-  if (matchResult) {
-    return {
-      valid: matchResult.hasOwnProperty("spaceKey"),
-      result: {
-        ...matchResult,
-        apiBase: generateAPIBaseUrl(matchResult, true),
-      },
-    };
-  }
-
-  // No match
-  return { valid: false, result: null };
 }
 
 /**
@@ -286,11 +225,12 @@ function validSpaceUrl(spaceUrl = "") {
  * @returns {string}
  */
 function generateChunkSource(
-  { doc, baseUrl, accessToken, username },
+  { doc, baseUrl, spaceKey, accessToken, username },
   encryptionWorker
 ) {
   const payload = {
     baseUrl,
+    spaceKey,
     token: accessToken,
     username,
   };
diff --git a/frontend/src/components/Modals/ManageWorkspace/DataConnectors/Connectors/Confluence/index.jsx b/frontend/src/components/Modals/ManageWorkspace/DataConnectors/Connectors/Confluence/index.jsx
index 3cb2c4f82..b9a1c9059 100644
--- a/frontend/src/components/Modals/ManageWorkspace/DataConnectors/Connectors/Confluence/index.jsx
+++ b/frontend/src/components/Modals/ManageWorkspace/DataConnectors/Connectors/Confluence/index.jsx
@@ -22,7 +22,8 @@ export default function ConfluenceOptions() {
         }
       );
       const { data, error } = await System.dataConnectors.confluence.collect({
-        pageUrl: form.get("pageUrl"),
+        baseUrl: form.get("baseUrl"),
+        spaceKey: form.get("spaceKey"),
         username: form.get("username"),
         accessToken: form.get("accessToken"),
       });
@@ -56,17 +57,37 @@ export default function ConfluenceOptions() {
               <div className="flex flex-col pr-10">
                 <div className="flex flex-col gap-y-1 mb-4">
                   <label className="text-white text-sm font-bold flex gap-x-2 items-center">
-                    <p className="font-bold text-white">Confluence Page URL</p>
+                    <p className="font-bold text-white">Confluence base URL</p>
                   </label>
                   <p className="text-xs font-normal text-white/50">
-                    URL of a page in the Confluence space.
+                    This is the base URL of your Confluence space.
                   </p>
                 </div>
                 <input
                   type="url"
-                  name="pageUrl"
+                  name="baseUrl"
                   className="bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:outline-primary-button active:outline-primary-button outline-none block w-full p-2.5"
-                  placeholder="https://example.atlassian.net/wiki/spaces/~7120208c08555d52224113949698b933a3bb56/pages/851969/Test+anythingLLM+page"
+                  placeholder="eg: https://example.atlassian.net, http://localhost:8211, etc..."
+                  required={true}
+                  autoComplete="off"
+                  spellCheck={false}
+                />
+              </div>
+              <div className="flex flex-col pr-10">
+                <div className="flex flex-col gap-y-1 mb-4">
+                  <label className="text-white text-sm font-bold">
+                    Confluence space key
+                  </label>
+                  <p className="text-xs font-normal text-white/50">
+                    This is the spaces key of your confluence instance that will
+                    be used. Usually begins with ~
+                  </p>
+                </div>
+                <input
+                  type="text"
+                  name="spaceKey"
+                  className="bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:outline-primary-button active:outline-primary-button outline-none block w-full p-2.5"
+                  placeholder="eg: ~7120208c08555d52224113949698b933a3bb56"
                   required={true}
                   autoComplete="off"
                   spellCheck={false}
diff --git a/frontend/src/models/dataConnector.js b/frontend/src/models/dataConnector.js
index c363835c8..eb5fe79da 100644
--- a/frontend/src/models/dataConnector.js
+++ b/frontend/src/models/dataConnector.js
@@ -119,12 +119,13 @@ const DataConnector = {
   },
 
   confluence: {
-    collect: async function ({ pageUrl, username, accessToken }) {
+    collect: async function ({ baseUrl, spaceKey, username, accessToken }) {
       return await fetch(`${API_BASE}/ext/confluence`, {
         method: "POST",
         headers: baseHeaders(),
         body: JSON.stringify({
-          pageUrl,
+          baseUrl,
+          spaceKey,
           username,
           accessToken,
         }),
-- 
GitLab