From a598c8e04c87844e72b5f6802984fd6b0498f4d6 Mon Sep 17 00:00:00 2001
From: Timothy Carambat <rambat1010@gmail.com>
Date: Mon, 17 Jun 2024 16:04:20 -0700
Subject: [PATCH] 1347 human readable confluence url (#1706)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* chore: confluence data connector can now handle custom urls, in addition to default {subdomain}.atlassian.net ones

* chore: formatting as per yarn lint

* chore: fixing the human readable confluence url fetch baseUrl

* chore: fixing the human readable confluence url fetch baseUrl

* chore: fixing the human readable confluence url fetch baseUrl

* chore: fixing the human readable confluence url fetch baseUrl

* chore: fixing the human readable confluence url fetch baseUrl

* refactor implementation of various types of Confluence URL patterns

---------

Co-authored-by: Predrag Stojadinovic <predrag@stojadinovic.net>
Co-authored-by: Predrag Stojadinović <cope@users.noreply.github.com>
Co-authored-by: Predrag Stojadinovic <predrags@nvidia.com>
---
 .../utils/extensions/Confluence/index.js      | 133 ++++++++++++------
 1 file changed, 93 insertions(+), 40 deletions(-)

diff --git a/collector/utils/extensions/Confluence/index.js b/collector/utils/extensions/Confluence/index.js
index 1732d0726..0bee15614 100644
--- a/collector/utils/extensions/Confluence/index.js
+++ b/collector/utils/extensions/Confluence/index.js
@@ -9,37 +9,6 @@ const {
   ConfluencePagesLoader,
 } = require("langchain/document_loaders/web/confluence");
 
-function validSpaceUrl(spaceUrl = "") {
-  // Atlassian default URL match
-  const atlassianPattern = new UrlPattern(
-    "https\\://(:subdomain).atlassian.net/wiki/spaces/(:spaceKey)*"
-  );
-  const atlassianMatch = atlassianPattern.match(spaceUrl);
-  if (atlassianMatch) {
-    return { valid: true, result: atlassianMatch };
-  }
-
-  let customMatch = null;
-  [
-    "https\\://(:subdomain.):domain.:tld/wiki/spaces/(:spaceKey)*", // Custom Confluence space
-    "https\\://(:subdomain.):domain.:tld/display/(:spaceKey)*", // Custom Confluence space + Human-readable space tag.
-  ].forEach((matchPattern) => {
-    if (!!customMatch) return;
-    const pattern = new UrlPattern(matchPattern);
-    customMatch = pattern.match(spaceUrl);
-  });
-
-  if (customMatch) {
-    customMatch.customDomain =
-      (customMatch.subdomain ? `${customMatch.subdomain}.` : "") + //
-      `${customMatch.domain}.${customMatch.tld}`;
-    return { valid: true, result: customMatch, custom: true };
-  }
-
-  // No match
-  return { valid: false, result: null };
-}
-
 async function loadConfluence({ pageUrl, username, accessToken }) {
   if (!pageUrl || !username || !accessToken) {
     return {
@@ -49,21 +18,16 @@ async function loadConfluence({ pageUrl, username, accessToken }) {
     };
   }
 
-  const validSpace = validSpaceUrl(pageUrl);
-  if (!validSpace.result) {
+  const { valid, result } = validSpaceUrl(pageUrl);
+  if (!valid) {
     return {
       success: false,
       reason:
-        "Confluence space URL is not in the expected format of https://domain.atlassian.net/wiki/space/~SPACEID/* or https://customDomain/wiki/space/~SPACEID/*",
+        "Confluence space URL is not in the expected format of one of https://domain.atlassian.net/wiki/space/~SPACEID/* or https://customDomain/wiki/space/~SPACEID/* or https://customDomain/display/~SPACEID/*",
     };
   }
 
-  const { subdomain, customDomain, spaceKey } = validSpace.result;
-  let baseUrl = `https://${subdomain}.atlassian.net/wiki`;
-  if (customDomain) {
-    baseUrl = `https://${customDomain}/wiki`;
-  }
-
+  const { apiBase: baseUrl, spaceKey, subdomain } = result;
   console.log(`-- Working Confluence ${baseUrl} --`);
   const loader = new ConfluencePagesLoader({
     baseUrl,
@@ -142,4 +106,93 @@ async function loadConfluence({ pageUrl, username, accessToken }) {
   };
 }
 
+/**
+ * A match result for a url-pattern of a Confluence URL
+ * @typedef {Object} ConfluenceMatchResult
+ * @property {string} subdomain - the subdomain of an organization's Confluence space
+ * @property {string} spaceKey - the spaceKey of an organization that determines the documents to collect.
+ * @property {string} apiBase - the correct REST API url to use for loader.
+ */
+
+/**
+ * Generates the correct API base URL for interfacing with the Confluence REST API
+ * depending on the URL pattern being used since there are various ways to host/access a
+ * Confluence space. Custom domains are rebuilt from the matched host parts and get no
+ * `/wiki` subpath; everything else maps to `https://{subdomain}.atlassian.net/wiki`.
+ * @param {ConfluenceMatchResult} matchResult - result from `url-pattern`.match
+ * @param {boolean} isCustomDomain - use the matched host itself instead of atlassian.net
+ * @returns {string} - the resulting REST API URL
+ */
+function generateAPIBaseUrl(matchResult = {}, isCustomDomain = false) {
+  const { subdomain, domain, tld } = matchResult;
+  if (!isCustomDomain) return `https://${subdomain}.atlassian.net/wiki`;
+  return `https://${subdomain ? `${subdomain}.` : ""}${domain}.${tld}`;
+}
+
+/**
+ * Validates and parses the correct information from a given Confluence URL.
+ * Patterns are tried in order: default Atlassian, then custom hosts using
+ * `/wiki/spaces/`, then custom hosts using `/display/`. Custom-host results also
+ * carry `customDomain` (the matched host) and an `apiBase` on that same host.
+ * @param {string} spaceUrl - The organization's Confluence URL to parse
+ * @returns {{
+ *  valid: boolean,
+ *  result: (ConfluenceMatchResult|null),
+ * }}
+ */
+function validSpaceUrl(spaceUrl = "") {
+  let matchResult;
+  const patterns = {
+    default: new UrlPattern(
+      "https\\://(:subdomain).atlassian.net/wiki/spaces/(:spaceKey)*"
+    ),
+    subdomain: new UrlPattern(
+      "https\\://(:subdomain.):domain.:tld/wiki/spaces/(:spaceKey)*"
+    ),
+    custom: new UrlPattern(
+      "https\\://(:subdomain.):domain.:tld/display/(:spaceKey)*"
+    ),
+  };
+
+  // If using the default Atlassian Confluence URL pattern.
+  // We can proceed because the Library/API can use this base url scheme.
+  matchResult = patterns.default.match(spaceUrl);
+  if (matchResult)
+    return {
+      valid: matchResult.hasOwnProperty("spaceKey"),
+      result: {
+        ...matchResult,
+        apiBase: generateAPIBaseUrl(matchResult),
+      },
+    };
+
+  // Custom hosts: try each pattern in order, paired with the REST subpath it needs.
+  // `/wiki/spaces/` URLs keep the API under `/wiki` on the same host, while
+  // `/display/` is basically a URL mask, so the API base is the root domain.
+  // Neither may be rewritten to {subdomain}.atlassian.net.
+  const customPatterns = [
+    [patterns.subdomain, "/wiki"],
+    [patterns.custom, ""],
+  ];
+  for (const [pattern, subpath] of customPatterns) {
+    matchResult = pattern.match(spaceUrl);
+    if (!matchResult) continue;
+
+    // Attach the customDomain so the API base is formed from the matched host.
+    const { subdomain, domain, tld } = matchResult;
+    const customDomain = `${subdomain ? `${subdomain}.` : ""}${domain}.${tld}`;
+    return {
+      valid: matchResult.hasOwnProperty("spaceKey"),
+      result: {
+        ...matchResult,
+        customDomain,
+        apiBase: `https://${customDomain}${subpath}`,
+      },
+    };
+  }
+
+  // No match
+  return { valid: false, result: null };
+}
+
 module.exports = loadConfluence;
-- 
GitLab