From a598c8e04c87844e72b5f6802984fd6b0498f4d6 Mon Sep 17 00:00:00 2001 From: Timothy Carambat <rambat1010@gmail.com> Date: Mon, 17 Jun 2024 16:04:20 -0700 Subject: [PATCH] 1347 human readable confluence url (#1706) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * chore: confluence data connector can now handle custom urls, in addition to default {subdomain}.atlassian.net ones * chore: formatting as per yarn lint * chore: fixing the human readable confluence url fetch baseUrl * chore: fixing the human readable confluence url fetch baseUrl * chore: fixing the human readable confluence url fetch baseUrl * chore: fixing the human readable confluence url fetch baseUrl * chore: fixing the human readable confluence url fetch baseUrl * refactor implementation of various types of Confluence URL patterns --------- Co-authored-by: Predrag Stojadinovic <predrag@stojadinovic.net> Co-authored-by: Predrag Stojadinović <cope@users.noreply.github.com> Co-authored-by: Predrag Stojadinovic <predrags@nvidia.com> --- .../utils/extensions/Confluence/index.js | 133 ++++++++++++------ 1 file changed, 93 insertions(+), 40 deletions(-) diff --git a/collector/utils/extensions/Confluence/index.js b/collector/utils/extensions/Confluence/index.js index 1732d0726..0bee15614 100644 --- a/collector/utils/extensions/Confluence/index.js +++ b/collector/utils/extensions/Confluence/index.js @@ -9,37 +9,6 @@ const { ConfluencePagesLoader, } = require("langchain/document_loaders/web/confluence"); -function validSpaceUrl(spaceUrl = "") { - // Atlassian default URL match - const atlassianPattern = new UrlPattern( - "https\\://(:subdomain).atlassian.net/wiki/spaces/(:spaceKey)*" - ); - const atlassianMatch = atlassianPattern.match(spaceUrl); - if (atlassianMatch) { - return { valid: true, result: atlassianMatch }; - } - - let customMatch = null; - [ - "https\\://(:subdomain.):domain.:tld/wiki/spaces/(:spaceKey)*", // Custom Confluence space - "https\\://(:subdomain.):domain.:tld/display/(:spaceKey)*", // Custom Confluence space + Human-readable space tag. - ].forEach((matchPattern) => { - if (!!customMatch) return; - const pattern = new UrlPattern(matchPattern); - customMatch = pattern.match(spaceUrl); - }); - - if (customMatch) { - customMatch.customDomain = - (customMatch.subdomain ? `${customMatch.subdomain}.` : "") + // - `${customMatch.domain}.${customMatch.tld}`; - return { valid: true, result: customMatch, custom: true }; - } - - // No match - return { valid: false, result: null }; -} - async function loadConfluence({ pageUrl, username, accessToken }) { if (!pageUrl || !username || !accessToken) { return { @@ -49,21 +18,16 @@ async function loadConfluence({ pageUrl, username, accessToken }) { }; } - const validSpace = validSpaceUrl(pageUrl); - if (!validSpace.result) { + const { valid, result } = validSpaceUrl(pageUrl); + if (!valid) { return { success: false, reason: - "Confluence space URL is not in the expected format of https://domain.atlassian.net/wiki/space/~SPACEID/* or https://customDomain/wiki/space/~SPACEID/*", + "Confluence space URL is not in the expected format of one of https://domain.atlassian.net/wiki/space/~SPACEID/* or https://customDomain/wiki/space/~SPACEID/* or https://customDomain/display/~SPACEID/*", }; } - const { subdomain, customDomain, spaceKey } = validSpace.result; - let baseUrl = `https://${subdomain}.atlassian.net/wiki`; - if (customDomain) { - baseUrl = `https://${customDomain}/wiki`; - } - + const { apiBase: baseUrl, spaceKey, subdomain } = result; console.log(`-- Working Confluence ${baseUrl} --`); const loader = new ConfluencePagesLoader({ baseUrl, @@ -142,4 +106,93 @@ async function loadConfluence({ pageUrl, username, accessToken }) { }; } +/** + * A match result for a url-pattern of a Confluence URL + * @typedef {Object} ConfluenceMatchResult + * @property {string} subdomain - the subdomain of an organization's Confluence space + * @property {string} spaceKey - the spaceKey of an organization that determines the documents to collect. + * @property {string} apiBase - the correct REST API url to use for loader. + */ + +/** + * Generates the correct API base URL for interfacing with the Confluence REST API + * depending on the URL pattern being used since there are various ways to host/access a + * Confluence space. + * @param {ConfluenceMatchResult} matchResult - result from `url-pattern`.match + * @param {boolean} isCustomDomain - determines if we need to coerce the subpath of the provided URL + * @returns {string} - the resulting REST API URL + */ +function generateAPIBaseUrl(matchResult = {}, isCustomDomain = false) { + const { subdomain } = matchResult; + let subpath = isCustomDomain ? `` : `/wiki`; + if (isCustomDomain) return `https://${customDomain}${subpath}`; + return `https://${subdomain}.atlassian.net${subpath}`; +} + +/** + * Validates and parses the correct information from a given Confluence URL + * @param {string} spaceUrl - The organization's Confluence URL to parse + * @returns {{ + * valid: boolean, + * result: (ConfluenceMatchResult|null), + * }} + */ +function validSpaceUrl(spaceUrl = "") { + let matchResult; + const patterns = { + default: new UrlPattern( + "https\\://(:subdomain).atlassian.net/wiki/spaces/(:spaceKey)*" + ), + subdomain: new UrlPattern( + "https\\://(:subdomain.):domain.:tld/wiki/spaces/(:spaceKey)*" + ), + custom: new UrlPattern( + "https\\://(:subdomain.):domain.:tld/display/(:spaceKey)*" + ), + }; + + // If using the default Atlassian Confluence URL pattern. + // We can proceed because the Library/API can use this base url scheme. + matchResult = patterns.default.match(spaceUrl); + if (matchResult) + return { + valid: matchResult.hasOwnProperty("spaceKey"), + result: { + ...matchResult, + apiBase: generateAPIBaseUrl(matchResult), + }, + }; + + // If using a custom subdomain Confluence URL pattern. + // We need to attach the customDomain as a property to the match result + // so we can form the correct REST API base from the subdomain. + matchResult = patterns.subdomain.match(spaceUrl); + if (matchResult) { + return { + valid: matchResult.hasOwnProperty("spaceKey"), + result: { + ...matchResult, + apiBase: generateAPIBaseUrl(matchResult), + }, + }; + } + + // If using a base FQDN Confluence URL pattern. + // We need to attach the customDomain as a property to the match result + // so we can form the correct REST API base from the root domain since /display/ is basically a URL mask. + matchResult = patterns.custom.match(spaceUrl); + if (matchResult) { + return { + valid: matchResult.hasOwnProperty("spaceKey"), + result: { + ...matchResult, + apiBase: generateAPIBaseUrl(matchResult, true), + }, + }; + } + + // No match + return { valid: false, result: null }; +} + module.exports = loadConfluence; -- GitLab