diff --git a/collector/extensions/resync/index.js b/collector/extensions/resync/index.js index 66882ba7a68f4c0a59f61dd8436c62ba674251cd..024935f5cfd11bc9803eaeeb6f54fb8b2d33e43b 100644 --- a/collector/extensions/resync/index.js +++ b/collector/extensions/resync/index.js @@ -59,6 +59,7 @@ async function resyncConfluence({ chunkSource }, response) { const { success, reason, content } = await fetchConfluencePage({ pageUrl: `https:${source.pathname}`, // need to add back the real protocol baseUrl: source.searchParams.get('baseUrl'), + spaceKey: source.searchParams.get('spaceKey'), accessToken: source.searchParams.get('token'), username: source.searchParams.get('username'), }); diff --git a/collector/utils/extensions/Confluence/ConfluenceLoader/index.js b/collector/utils/extensions/Confluence/ConfluenceLoader/index.js index 77018598680ec60cd54f90877bff68dadcfbef03..2afb9527354a3ecf8bfd38dc956337f6b8b9c754 100644 --- a/collector/utils/extensions/Confluence/ConfluenceLoader/index.js +++ b/collector/utils/extensions/Confluence/ConfluenceLoader/index.js @@ -72,8 +72,9 @@ class ConfluencePagesLoader { } } + // https://developer.atlassian.com/cloud/confluence/rest/v2/intro/#auth async fetchAllPagesInSpace(start = 0, limit = this.limit) { - const url = `${this.baseUrl}/rest/api/content?spaceKey=${this.spaceKey}&limit=${limit}&start=${start}&expand=${this.expand}`; + const url = `${this.baseUrl}/wiki/rest/api/content?spaceKey=${this.spaceKey}&limit=${limit}&start=${start}&expand=${this.expand}`; const data = await this.fetchConfluenceData(url); if (data.size === 0) { return []; diff --git a/collector/utils/extensions/Confluence/index.js b/collector/utils/extensions/Confluence/index.js index aada1322cc18957074e1daef19dea51d27e36f56..819176712b75bab9bc407b9ea080392cb1e3ee2e 100644 --- a/collector/utils/extensions/Confluence/index.js +++ b/collector/utils/extensions/Confluence/index.js @@ -2,7 +2,6 @@ const fs = require("fs"); const path = require("path"); const { default: slugify } = require("slugify"); const { v4 } = require("uuid"); -const UrlPattern = require("url-pattern"); const { writeToServerDocuments, sanitizeFileName } = require("../../files"); const { tokenizeString } = require("../../tokenizer"); const { ConfluencePagesLoader } = require("./ConfluenceLoader"); @@ -13,8 +12,11 @@ const { ConfluencePagesLoader } = require("./ConfluenceLoader"); * @param {import("../../../middleware/setDataSigner").ResponseWithSigner} response - Express response object with encryptionWorker * @returns */ -async function loadConfluence({ pageUrl, username, accessToken }, response) { - if (!pageUrl || !username || !accessToken) { +async function loadConfluence( + { baseUrl = null, spaceKey = null, username = null, accessToken = null }, + response +) { + if (!baseUrl || !spaceKey || !username || !accessToken) { return { success: false, reason: @@ -22,19 +24,24 @@ async function loadConfluence({ pageUrl, username, accessToken }, response) { }; } - const { valid, result } = validSpaceUrl(pageUrl); - if (!valid) { + if (!validBaseUrl(baseUrl)) { return { success: false, - reason: - "Confluence space URL is not in the expected format of one of https://domain.atlassian.net/wiki/space/~SPACEID/* or https://customDomain/wiki/space/~SPACEID/* or https://customDomain/display/~SPACEID/*", + reason: "Provided base URL is not a valid URL.", }; } - const { apiBase: baseUrl, spaceKey, subdomain } = result; - console.log(`-- Working Confluence ${baseUrl} --`); + if (!spaceKey) { + return { + success: false, + reason: "You need to provide a Confluence space key.", + }; + } + + const { origin, hostname } = new URL(baseUrl); + console.log(`-- Working Confluence ${origin} --`); const loader = new ConfluencePagesLoader({ - baseUrl, + baseUrl: origin, // Use the origin to avoid issues with subdomains, ports, protocols, etc. spaceKey, username, accessToken, @@ -59,7 +66,7 @@ async function loadConfluence({ pageUrl, username, accessToken }, response) { }; } const outFolder = slugify( - `${subdomain}-confluence-${v4().slice(0, 4)}` + `confluence-${origin}-${v4().slice(0, 4)}` ).toLowerCase(); const outFolderPath = @@ -80,11 +87,11 @@ async function loadConfluence({ pageUrl, username, accessToken }, response) { id: v4(), url: doc.metadata.url + ".page", title: doc.metadata.title || doc.metadata.source, - docAuthor: subdomain, + docAuthor: origin, description: doc.metadata.title, - docSource: `${subdomain} Confluence`, + docSource: `${origin} Confluence`, chunkSource: generateChunkSource( - { doc, baseUrl, accessToken, username }, + { doc, baseUrl: origin, spaceKey, accessToken, username }, response.locals.encryptionWorker ), published: new Date().toLocaleString(), @@ -120,10 +127,11 @@ async function loadConfluence({ pageUrl, username, accessToken }, response) { async function fetchConfluencePage({ pageUrl, baseUrl, + spaceKey, username, accessToken, }) { - if (!pageUrl || !baseUrl || !username || !accessToken) { + if (!pageUrl || !baseUrl || !spaceKey || !username || !accessToken) { return { success: false, content: null, @@ -132,20 +140,25 @@ async function fetchConfluencePage({ }; } - const { valid, result } = validSpaceUrl(pageUrl); - if (!valid) { + if (!validBaseUrl(baseUrl)) { return { success: false, content: null, - reason: - "Confluence space URL is not in the expected format of https://domain.atlassian.net/wiki/space/~SPACEID/* or https://customDomain/wiki/space/~SPACEID/*", + reason: "Provided base URL is not a valid URL.", + }; + } + + if (!spaceKey) { + return { + success: false, + content: null, + reason: "You need to provide a Confluence space key.", }; } console.log(`-- Working Confluence Page ${pageUrl} --`); - const { spaceKey } = result; const loader = new ConfluencePagesLoader({ - baseUrl, + baseUrl, // Should be the origin of the baseUrl spaceKey, username, accessToken, @@ -190,91 +203,17 @@ async function fetchConfluencePage({ } /** - * A match result for a url-pattern of a Confluence URL - * @typedef {Object} ConfluenceMatchResult - * @property {string} subdomain - the subdomain of an organization's Confluence space - * @property {string} spaceKey - the spaceKey of an organization that determines the documents to collect. - * @property {string} apiBase - the correct REST API url to use for loader. - */ - -/** - * Generates the correct API base URL for interfacing with the Confluence REST API - * depending on the URL pattern being used since there are various ways to host/access a - * Confluence space. - * @param {ConfluenceMatchResult} matchResult - result from `url-pattern`.match - * @param {boolean} isCustomDomain - determines if we need to coerce the subpath of the provided URL - * @returns {string} - the resulting REST API URL - */ -function generateAPIBaseUrl(matchResult = {}, isCustomDomain = false) { - const { subdomain } = matchResult; - if (isCustomDomain) return `https://${subdomain}`; - return `https://${subdomain}.atlassian.net/wiki`; -} - -/** - * Validates and parses the correct information from a given Confluence URL - * @param {string} spaceUrl - The organization's Confluence URL to parse - * @returns {{ - * valid: boolean, - * result: (ConfluenceMatchResult|null), - * }} + * Validates if the provided baseUrl is a valid URL at all. + * @param {string} baseUrl + * @returns {boolean} */ -function validSpaceUrl(spaceUrl = "") { - let matchResult; - const patterns = { - default: new UrlPattern( - "https\\://(:subdomain).atlassian.net/wiki/spaces/(:spaceKey)*" - ), - subdomain: new UrlPattern( - "https\\://(:subdomain.):domain.:tld/wiki/spaces/(:spaceKey)*" - ), - custom: new UrlPattern( - "https\\://(:subdomain.):domain.:tld/display/(:spaceKey)*" - ), - }; - - // If using the default Atlassian Confluence URL pattern. - // We can proceed because the Library/API can use this base url scheme. - matchResult = patterns.default.match(spaceUrl); - if (matchResult) - return { - valid: matchResult.hasOwnProperty("spaceKey"), - result: { - ...matchResult, - apiBase: generateAPIBaseUrl(matchResult), - }, - }; - - // If using a custom subdomain Confluence URL pattern. - // We need to attach the customDomain as a property to the match result - // so we can form the correct REST API base from the subdomain. - matchResult = patterns.subdomain.match(spaceUrl); - if (matchResult) { - return { - valid: matchResult.hasOwnProperty("spaceKey"), - result: { - ...matchResult, - apiBase: generateAPIBaseUrl(matchResult), - }, - }; +function validBaseUrl(baseUrl) { + try { + new URL(baseUrl); + return true; + } catch (e) { + return false; } - - // If using a base FQDN Confluence URL pattern. - // We need to attach the customDomain as a property to the match result - // so we can form the correct REST API base from the root domain since /display/ is basically a URL mask. - matchResult = patterns.custom.match(spaceUrl); - if (matchResult) { - return { - valid: matchResult.hasOwnProperty("spaceKey"), - result: { - ...matchResult, - apiBase: generateAPIBaseUrl(matchResult, true), - }, - }; - } - - // No match - return { valid: false, result: null }; } /** @@ -286,11 +225,12 @@ function validSpaceUrl(spaceUrl = "") { * @returns {string} */ function generateChunkSource( - { doc, baseUrl, accessToken, username }, + { doc, baseUrl, spaceKey, accessToken, username }, encryptionWorker ) { const payload = { baseUrl, + spaceKey, token: accessToken, username, }; diff --git a/frontend/src/components/Modals/ManageWorkspace/DataConnectors/Connectors/Confluence/index.jsx b/frontend/src/components/Modals/ManageWorkspace/DataConnectors/Connectors/Confluence/index.jsx index 3cb2c4f82af63f21f3dd1c703c71caa1c8b4dcb0..b9a1c90599e08fa5a391a1e1c0104b05cf53cb5d 100644 --- a/frontend/src/components/Modals/ManageWorkspace/DataConnectors/Connectors/Confluence/index.jsx +++ b/frontend/src/components/Modals/ManageWorkspace/DataConnectors/Connectors/Confluence/index.jsx @@ -22,7 +22,8 @@ export default function ConfluenceOptions() { } ); const { data, error } = await System.dataConnectors.confluence.collect({ - pageUrl: form.get("pageUrl"), + baseUrl: form.get("baseUrl"), + spaceKey: form.get("spaceKey"), username: form.get("username"), accessToken: form.get("accessToken"), }); @@ -56,17 +57,37 @@ export default function ConfluenceOptions() { <div className="flex flex-col pr-10"> <div className="flex flex-col gap-y-1 mb-4"> <label className="text-white text-sm font-bold flex gap-x-2 items-center"> - <p className="font-bold text-white">Confluence Page URL</p> + <p className="font-bold text-white">Confluence base URL</p> </label> <p className="text-xs font-normal text-white/50"> - URL of a page in the Confluence space. + This is the base URL of your Confluence space. </p> </div> <input type="url" - name="pageUrl" + name="baseUrl" className="bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:outline-primary-button active:outline-primary-button outline-none block w-full p-2.5" - placeholder="https://example.atlassian.net/wiki/spaces/~7120208c08555d52224113949698b933a3bb56/pages/851969/Test+anythingLLM+page" + placeholder="eg: https://example.atlassian.net, http://localhost:8211, etc..." + required={true} + autoComplete="off" + spellCheck={false} + /> + </div> + <div className="flex flex-col pr-10"> + <div className="flex flex-col gap-y-1 mb-4"> + <label className="text-white text-sm font-bold"> + Confluence space key + </label> + <p className="text-xs font-normal text-white/50"> + This is the spaces key of your confluence instance that will + be used. Usually begins with ~ + </p> + </div> + <input + type="text" + name="spaceKey" + className="bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:outline-primary-button active:outline-primary-button outline-none block w-full p-2.5" + placeholder="eg: ~7120208c08555d52224113949698b933a3bb56" required={true} autoComplete="off" spellCheck={false} diff --git a/frontend/src/models/dataConnector.js b/frontend/src/models/dataConnector.js index c363835c8354ec5f59c307c96b2fcd29e60e1573..eb5fe79da1ed4abf4dab4151a5bbed4f321bfc72 100644 --- a/frontend/src/models/dataConnector.js +++ b/frontend/src/models/dataConnector.js @@ -119,12 +119,13 @@ const DataConnector = { }, confluence: { - collect: async function ({ pageUrl, username, accessToken }) { + collect: async function ({ baseUrl, spaceKey, username, accessToken }) { return await fetch(`${API_BASE}/ext/confluence`, { method: "POST", headers: baseHeaders(), body: JSON.stringify({ - pageUrl, + baseUrl, + spaceKey, username, accessToken, }),