Skip to content
Snippets Groups Projects
Unverified Commit 44887448 authored by Sean Hatfield, committed by GitHub
Browse files

Support more Confluence URL formats (#2118)


* support more confluence url formats

* use pattern matching for confluence urls and manual splitting as fallback

* rework entire Confluence flow to prevent issues with custom, local, and cloud spaces

* remove dep

---------

Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
parent 44dddcd4
No related branches found
No related tags found
No related merge requests found
...@@ -59,6 +59,7 @@ async function resyncConfluence({ chunkSource }, response) { ...@@ -59,6 +59,7 @@ async function resyncConfluence({ chunkSource }, response) {
const { success, reason, content } = await fetchConfluencePage({ const { success, reason, content } = await fetchConfluencePage({
pageUrl: `https:${source.pathname}`, // need to add back the real protocol pageUrl: `https:${source.pathname}`, // need to add back the real protocol
baseUrl: source.searchParams.get('baseUrl'), baseUrl: source.searchParams.get('baseUrl'),
spaceKey: source.searchParams.get('spaceKey'),
accessToken: source.searchParams.get('token'), accessToken: source.searchParams.get('token'),
username: source.searchParams.get('username'), username: source.searchParams.get('username'),
}); });
......
...@@ -72,8 +72,9 @@ class ConfluencePagesLoader { ...@@ -72,8 +72,9 @@ class ConfluencePagesLoader {
} }
} }
// https://developer.atlassian.com/cloud/confluence/rest/v2/intro/#auth
async fetchAllPagesInSpace(start = 0, limit = this.limit) { async fetchAllPagesInSpace(start = 0, limit = this.limit) {
const url = `${this.baseUrl}/rest/api/content?spaceKey=${this.spaceKey}&limit=${limit}&start=${start}&expand=${this.expand}`; const url = `${this.baseUrl}/wiki/rest/api/content?spaceKey=${this.spaceKey}&limit=${limit}&start=${start}&expand=${this.expand}`;
const data = await this.fetchConfluenceData(url); const data = await this.fetchConfluenceData(url);
if (data.size === 0) { if (data.size === 0) {
return []; return [];
......
...@@ -2,7 +2,6 @@ const fs = require("fs"); ...@@ -2,7 +2,6 @@ const fs = require("fs");
const path = require("path"); const path = require("path");
const { default: slugify } = require("slugify"); const { default: slugify } = require("slugify");
const { v4 } = require("uuid"); const { v4 } = require("uuid");
const UrlPattern = require("url-pattern");
const { writeToServerDocuments, sanitizeFileName } = require("../../files"); const { writeToServerDocuments, sanitizeFileName } = require("../../files");
const { tokenizeString } = require("../../tokenizer"); const { tokenizeString } = require("../../tokenizer");
const { ConfluencePagesLoader } = require("./ConfluenceLoader"); const { ConfluencePagesLoader } = require("./ConfluenceLoader");
...@@ -13,8 +12,11 @@ const { ConfluencePagesLoader } = require("./ConfluenceLoader"); ...@@ -13,8 +12,11 @@ const { ConfluencePagesLoader } = require("./ConfluenceLoader");
* @param {import("../../../middleware/setDataSigner").ResponseWithSigner} response - Express response object with encryptionWorker * @param {import("../../../middleware/setDataSigner").ResponseWithSigner} response - Express response object with encryptionWorker
* @returns * @returns
*/ */
async function loadConfluence({ pageUrl, username, accessToken }, response) { async function loadConfluence(
if (!pageUrl || !username || !accessToken) { { baseUrl = null, spaceKey = null, username = null, accessToken = null },
response
) {
if (!baseUrl || !spaceKey || !username || !accessToken) {
return { return {
success: false, success: false,
reason: reason:
...@@ -22,19 +24,24 @@ async function loadConfluence({ pageUrl, username, accessToken }, response) { ...@@ -22,19 +24,24 @@ async function loadConfluence({ pageUrl, username, accessToken }, response) {
}; };
} }
const { valid, result } = validSpaceUrl(pageUrl); if (!validBaseUrl(baseUrl)) {
if (!valid) {
return { return {
success: false, success: false,
reason: reason: "Provided base URL is not a valid URL.",
"Confluence space URL is not in the expected format of one of https://domain.atlassian.net/wiki/space/~SPACEID/* or https://customDomain/wiki/space/~SPACEID/* or https://customDomain/display/~SPACEID/*",
}; };
} }
const { apiBase: baseUrl, spaceKey, subdomain } = result; if (!spaceKey) {
console.log(`-- Working Confluence ${baseUrl} --`); return {
success: false,
reason: "You need to provide a Confluence space key.",
};
}
const { origin, hostname } = new URL(baseUrl);
console.log(`-- Working Confluence ${origin} --`);
const loader = new ConfluencePagesLoader({ const loader = new ConfluencePagesLoader({
baseUrl, baseUrl: origin, // Use the origin to avoid issues with subdomains, ports, protocols, etc.
spaceKey, spaceKey,
username, username,
accessToken, accessToken,
...@@ -59,7 +66,7 @@ async function loadConfluence({ pageUrl, username, accessToken }, response) { ...@@ -59,7 +66,7 @@ async function loadConfluence({ pageUrl, username, accessToken }, response) {
}; };
} }
const outFolder = slugify( const outFolder = slugify(
`${subdomain}-confluence-${v4().slice(0, 4)}` `confluence-${origin}-${v4().slice(0, 4)}`
).toLowerCase(); ).toLowerCase();
const outFolderPath = const outFolderPath =
...@@ -80,11 +87,11 @@ async function loadConfluence({ pageUrl, username, accessToken }, response) { ...@@ -80,11 +87,11 @@ async function loadConfluence({ pageUrl, username, accessToken }, response) {
id: v4(), id: v4(),
url: doc.metadata.url + ".page", url: doc.metadata.url + ".page",
title: doc.metadata.title || doc.metadata.source, title: doc.metadata.title || doc.metadata.source,
docAuthor: subdomain, docAuthor: origin,
description: doc.metadata.title, description: doc.metadata.title,
docSource: `${subdomain} Confluence`, docSource: `${origin} Confluence`,
chunkSource: generateChunkSource( chunkSource: generateChunkSource(
{ doc, baseUrl, accessToken, username }, { doc, baseUrl: origin, spaceKey, accessToken, username },
response.locals.encryptionWorker response.locals.encryptionWorker
), ),
published: new Date().toLocaleString(), published: new Date().toLocaleString(),
...@@ -120,10 +127,11 @@ async function loadConfluence({ pageUrl, username, accessToken }, response) { ...@@ -120,10 +127,11 @@ async function loadConfluence({ pageUrl, username, accessToken }, response) {
async function fetchConfluencePage({ async function fetchConfluencePage({
pageUrl, pageUrl,
baseUrl, baseUrl,
spaceKey,
username, username,
accessToken, accessToken,
}) { }) {
if (!pageUrl || !baseUrl || !username || !accessToken) { if (!pageUrl || !baseUrl || !spaceKey || !username || !accessToken) {
return { return {
success: false, success: false,
content: null, content: null,
...@@ -132,20 +140,25 @@ async function fetchConfluencePage({ ...@@ -132,20 +140,25 @@ async function fetchConfluencePage({
}; };
} }
const { valid, result } = validSpaceUrl(pageUrl); if (!validBaseUrl(baseUrl)) {
if (!valid) {
return { return {
success: false, success: false,
content: null, content: null,
reason: reason: "Provided base URL is not a valid URL.",
"Confluence space URL is not in the expected format of https://domain.atlassian.net/wiki/space/~SPACEID/* or https://customDomain/wiki/space/~SPACEID/*", };
}
if (!spaceKey) {
return {
success: false,
content: null,
reason: "You need to provide a Confluence space key.",
}; };
} }
console.log(`-- Working Confluence Page ${pageUrl} --`); console.log(`-- Working Confluence Page ${pageUrl} --`);
const { spaceKey } = result;
const loader = new ConfluencePagesLoader({ const loader = new ConfluencePagesLoader({
baseUrl, baseUrl, // Should be the origin of the baseUrl
spaceKey, spaceKey,
username, username,
accessToken, accessToken,
...@@ -190,91 +203,17 @@ async function fetchConfluencePage({ ...@@ -190,91 +203,17 @@ async function fetchConfluencePage({
} }
/** /**
* A match result for a url-pattern of a Confluence URL * Validates if the provided baseUrl is a valid URL at all.
* @typedef {Object} ConfluenceMatchResult * @param {string} baseUrl
* @property {string} subdomain - the subdomain of an organization's Confluence space * @returns {boolean}
* @property {string} spaceKey - the spaceKey of an organization that determines the documents to collect.
* @property {string} apiBase - the correct REST API url to use for loader.
*/
/**
* Generates the correct API base URL for interfacing with the Confluence REST API
* depending on the URL pattern being used since there are various ways to host/access a
* Confluence space.
* @param {ConfluenceMatchResult} matchResult - result from `url-pattern`.match
* @param {boolean} isCustomDomain - determines if we need to coerce the subpath of the provided URL
* @returns {string} - the resulting REST API URL
*/
function generateAPIBaseUrl(matchResult = {}, isCustomDomain = false) {
const { subdomain } = matchResult;
if (isCustomDomain) return `https://${subdomain}`;
return `https://${subdomain}.atlassian.net/wiki`;
}
/**
* Validates and parses the correct information from a given Confluence URL
* @param {string} spaceUrl - The organization's Confluence URL to parse
* @returns {{
* valid: boolean,
* result: (ConfluenceMatchResult|null),
* }}
*/ */
function validSpaceUrl(spaceUrl = "") { function validBaseUrl(baseUrl) {
let matchResult; try {
const patterns = { new URL(baseUrl);
default: new UrlPattern( return true;
"https\\://(:subdomain).atlassian.net/wiki/spaces/(:spaceKey)*" } catch (e) {
), return false;
subdomain: new UrlPattern(
"https\\://(:subdomain.):domain.:tld/wiki/spaces/(:spaceKey)*"
),
custom: new UrlPattern(
"https\\://(:subdomain.):domain.:tld/display/(:spaceKey)*"
),
};
// If using the default Atlassian Confluence URL pattern.
// We can proceed because the Library/API can use this base url scheme.
matchResult = patterns.default.match(spaceUrl);
if (matchResult)
return {
valid: matchResult.hasOwnProperty("spaceKey"),
result: {
...matchResult,
apiBase: generateAPIBaseUrl(matchResult),
},
};
// If using a custom subdomain Confluence URL pattern.
// We need to attach the customDomain as a property to the match result
// so we can form the correct REST API base from the subdomain.
matchResult = patterns.subdomain.match(spaceUrl);
if (matchResult) {
return {
valid: matchResult.hasOwnProperty("spaceKey"),
result: {
...matchResult,
apiBase: generateAPIBaseUrl(matchResult),
},
};
} }
// If using a base FQDN Confluence URL pattern.
// We need to attach the customDomain as a property to the match result
// so we can form the correct REST API base from the root domain since /display/ is basically a URL mask.
matchResult = patterns.custom.match(spaceUrl);
if (matchResult) {
return {
valid: matchResult.hasOwnProperty("spaceKey"),
result: {
...matchResult,
apiBase: generateAPIBaseUrl(matchResult, true),
},
};
}
// No match
return { valid: false, result: null };
} }
/** /**
...@@ -286,11 +225,12 @@ function validSpaceUrl(spaceUrl = "") { ...@@ -286,11 +225,12 @@ function validSpaceUrl(spaceUrl = "") {
* @returns {string} * @returns {string}
*/ */
function generateChunkSource( function generateChunkSource(
{ doc, baseUrl, accessToken, username }, { doc, baseUrl, spaceKey, accessToken, username },
encryptionWorker encryptionWorker
) { ) {
const payload = { const payload = {
baseUrl, baseUrl,
spaceKey,
token: accessToken, token: accessToken,
username, username,
}; };
......
...@@ -22,7 +22,8 @@ export default function ConfluenceOptions() { ...@@ -22,7 +22,8 @@ export default function ConfluenceOptions() {
} }
); );
const { data, error } = await System.dataConnectors.confluence.collect({ const { data, error } = await System.dataConnectors.confluence.collect({
pageUrl: form.get("pageUrl"), baseUrl: form.get("baseUrl"),
spaceKey: form.get("spaceKey"),
username: form.get("username"), username: form.get("username"),
accessToken: form.get("accessToken"), accessToken: form.get("accessToken"),
}); });
...@@ -56,17 +57,37 @@ export default function ConfluenceOptions() { ...@@ -56,17 +57,37 @@ export default function ConfluenceOptions() {
<div className="flex flex-col pr-10"> <div className="flex flex-col pr-10">
<div className="flex flex-col gap-y-1 mb-4"> <div className="flex flex-col gap-y-1 mb-4">
<label className="text-white text-sm font-bold flex gap-x-2 items-center"> <label className="text-white text-sm font-bold flex gap-x-2 items-center">
<p className="font-bold text-white">Confluence Page URL</p> <p className="font-bold text-white">Confluence base URL</p>
</label> </label>
<p className="text-xs font-normal text-white/50"> <p className="text-xs font-normal text-white/50">
URL of a page in the Confluence space. This is the base URL of your Confluence space.
</p> </p>
</div> </div>
<input <input
type="url" type="url"
name="pageUrl" name="baseUrl"
className="bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:outline-primary-button active:outline-primary-button outline-none block w-full p-2.5" className="bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:outline-primary-button active:outline-primary-button outline-none block w-full p-2.5"
placeholder="https://example.atlassian.net/wiki/spaces/~7120208c08555d52224113949698b933a3bb56/pages/851969/Test+anythingLLM+page" placeholder="eg: https://example.atlassian.net, http://localhost:8211, etc..."
required={true}
autoComplete="off"
spellCheck={false}
/>
</div>
<div className="flex flex-col pr-10">
<div className="flex flex-col gap-y-1 mb-4">
<label className="text-white text-sm font-bold">
Confluence space key
</label>
<p className="text-xs font-normal text-white/50">
This is the spaces key of your confluence instance that will
be used. Usually begins with ~
</p>
</div>
<input
type="text"
name="spaceKey"
className="bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:outline-primary-button active:outline-primary-button outline-none block w-full p-2.5"
placeholder="eg: ~7120208c08555d52224113949698b933a3bb56"
required={true} required={true}
autoComplete="off" autoComplete="off"
spellCheck={false} spellCheck={false}
......
...@@ -119,12 +119,13 @@ const DataConnector = { ...@@ -119,12 +119,13 @@ const DataConnector = {
}, },
confluence: { confluence: {
collect: async function ({ pageUrl, username, accessToken }) { collect: async function ({ baseUrl, spaceKey, username, accessToken }) {
return await fetch(`${API_BASE}/ext/confluence`, { return await fetch(`${API_BASE}/ext/confluence`, {
method: "POST", method: "POST",
headers: baseHeaders(), headers: baseHeaders(),
body: JSON.stringify({ body: JSON.stringify({
pageUrl, baseUrl,
spaceKey,
username, username,
accessToken, accessToken,
}), }),
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment