Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
Anything Llm
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Iterations
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
mirrored_repos
MachineLearning
Mintplex Labs
Anything Llm
Commits
3fa00ad7
Commit
3fa00ad7
authored
8 months ago
by
timothycarambat
Browse files
Options
Downloads
Plain Diff
Merge branch 'master' of github.com:Mintplex-Labs/anything-llm
parents
89538c36
f205d51f
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
collector/utils/extensions/Confluence/ConfluenceLoader/index.js
+134
-0
134 additions, 0 deletions
...tor/utils/extensions/Confluence/ConfluenceLoader/index.js
collector/utils/extensions/Confluence/index.js
+1
-3
1 addition, 3 deletions
collector/utils/extensions/Confluence/index.js
with
135 additions
and
3 deletions
collector/utils/extensions/Confluence/ConfluenceLoader/index.js
0 → 100644
+
134
−
0
View file @
3fa00ad7
/*
* This is a custom implementation of the Confluence langchain loader. There was an issue where
* code blocks were not being extracted. This is a temporary fix until this issue is resolved.*/
const
{
htmlToText
}
=
require
(
"
html-to-text
"
);
/**
 * Loads pages of a Confluence space through the REST content endpoint and
 * turns each page into a `{ pageContent, metadata }` document. Code macros in
 * the page's storage-format body are rewritten as fenced code blocks before the
 * markup is flattened to plain text with `htmlToText`.
 */
class ConfluencePagesLoader {
  /**
   * @param {Object} config
   * @param {string} config.baseUrl - Prefix that `/rest/api/content` and `/spaces/...` are appended to.
   * @param {string} config.spaceKey - Sent as the `spaceKey` query parameter and used in page URLs.
   * @param {string} [config.username] - Used together with `accessToken` for Basic auth.
   * @param {string} [config.accessToken] - Used together with `username` for Basic auth.
   * @param {number} [config.limit=25] - Default `limit` query parameter for each request.
   * @param {string} [config.expand="body.storage,version"] - Sent as the `expand` query parameter.
   * @param {string} [config.personalAccessToken] - When set, takes precedence and is sent as a Bearer token.
   */
  constructor({
    baseUrl,
    spaceKey,
    username,
    accessToken,
    limit = 25,
    expand = "body.storage,version",
    personalAccessToken,
  }) {
    this.baseUrl = baseUrl;
    this.spaceKey = spaceKey;
    this.username = username;
    this.accessToken = accessToken;
    this.limit = limit;
    this.expand = expand;
    this.personalAccessToken = personalAccessToken;
  }

  /**
   * Value for the `Authorization` request header.
   *
   * @returns {string|undefined} `Bearer <token>` when a personal access token is
   *   set, `Basic <base64(username:accessToken)>` when both username and access
   *   token are set, otherwise `undefined`.
   */
  get authorizationHeader() {
    if (this.personalAccessToken) {
      return `Bearer ${this.personalAccessToken}`;
    }
    if (this.username && this.accessToken) {
      const authToken = Buffer.from(
        `${this.username}:${this.accessToken}`
      ).toString("base64");
      return `Basic ${authToken}`;
    }
    return undefined;
  }

  /**
   * Fetches every page of the space and converts each one to a document.
   * Failures are logged and reported as an empty list (best-effort load).
   *
   * @param {{start?: number, limit?: number}} [options] - Forwarded to `fetchAllPagesInSpace`.
   * @returns {Promise<Array<{pageContent: string, metadata: Object}>>}
   */
  async load(options) {
    try {
      const pages = await this.fetchAllPagesInSpace(
        options?.start,
        options?.limit
      );
      return pages.map((page) => this.createDocumentFromPage(page));
    } catch (error) {
      console.error("Error:", error);
      return [];
    }
  }

  /**
   * Performs a GET against `url` with JSON headers (plus `Authorization` when
   * credentials are configured) and returns the parsed JSON body.
   *
   * @param {string} url
   * @returns {Promise<any>} Parsed JSON response.
   * @throws {Error} When the request fails, the response status is not OK, or
   *   the body cannot be parsed. Transport/parse failures keep the underlying
   *   error as `cause`.
   */
  async fetchConfluenceData(url) {
    const headers = {
      "Content-Type": "application/json",
      Accept: "application/json",
    };
    const authHeader = this.authorizationHeader;
    if (authHeader) {
      headers.Authorization = authHeader;
    }

    let response;
    try {
      response = await fetch(url, { headers });
    } catch (error) {
      throw new Error(`Failed to fetch ${url} from Confluence: ${error}`, {
        cause: error,
      });
    }

    // Thrown outside any try/catch so the status error is not wrapped a second
    // time in another "Failed to fetch ..." message.
    if (!response.ok) {
      throw new Error(
        `Failed to fetch ${url} from Confluence: ${response.status}`
      );
    }

    try {
      return await response.json();
    } catch (error) {
      throw new Error(`Failed to fetch ${url} from Confluence: ${error}`, {
        cause: error,
      });
    }
  }

  /**
   * Requests consecutive result windows from `/rest/api/content` until a
   * response reports no results, and returns all results in order.
   *
   * @param {number} [start=0] - Offset of the first result to request.
   * @param {number} [limit=this.limit] - `limit` query parameter for every request.
   * @returns {Promise<Array<Object>>} Concatenated `results` of every response.
   * @throws {Error} Whatever `fetchConfluenceData` throws.
   */
  async fetchAllPagesInSpace(start = 0, limit = this.limit) {
    const pages = [];
    let cursor = start;

    for (;;) {
      // URLSearchParams percent-encodes the values, so keys or expand lists
      // containing reserved characters cannot corrupt the query string.
      const query = new URLSearchParams({
        spaceKey: this.spaceKey,
        limit,
        start: cursor,
        expand: this.expand,
      });
      const url = `${this.baseUrl}/rest/api/content?${query}`;
      const data = await this.fetchConfluenceData(url);

      const results = Array.isArray(data?.results) ? data.results : [];
      // Fall back to the result count when `size` is absent so the offset
      // never becomes NaN.
      const size = Number.isInteger(data?.size) ? data.size : results.length;
      if (size <= 0) {
        break;
      }

      pages.push(...results);
      cursor += size;
    }

    return pages;
  }

  /**
   * Converts one page object from the content API into a document.
   *
   * @param {Object} page - Page object; `body.storage.value` is read as the
   *   markup, and `id`, `status`, `title`, `type` and `version` feed the metadata.
   * @returns {{pageContent: string, metadata: Object}}
   */
  createDocumentFromPage(page) {
    const codeBlockRegex =
      /<ac:structured-macro ac:name="code"[^>]*>[\s\S]*?<ac:plain-text-body><!\[CDATA\[([\s\S]*?)\]\]><\/ac:plain-text-body>[\s\S]*?<\/ac:structured-macro>/g;
    const languageRegex =
      /<ac:parameter ac:name="language">(.*?)<\/ac:parameter>/;

    // Replace each code macro with a fenced block; the CDATA payload is the
    // first capture group of codeBlockRegex.
    const extractCodeBlocks = (content) =>
      content.replace(codeBlockRegex, (macro, cdata) => {
        const language = macro.match(languageRegex)?.[1] || "";
        const code = cdata || "";
        return `\n\`\`\`${language}\n${code.trim()}\n\`\`\`\n`;
      });

    // A page may come back without a storage body (e.g. a custom `expand`);
    // treat that as empty content rather than failing the whole load.
    const storageValue = page.body?.storage?.value ?? "";
    const contentWithCodeBlocks = extractCodeBlocks(String(storageValue));
    const plainTextContent = htmlToText(contentWithCodeBlocks, {
      wordwrap: false,
      preserveNewlines: true,
    });
    // Collapse runs of three or more newlines down to a single blank line.
    const textWithPreservedStructure = plainTextContent.replace(
      /\n{3,}/g,
      "\n\n"
    );
    const pageUrl = `${this.baseUrl}/spaces/${this.spaceKey}/pages/${page.id}`;

    return {
      pageContent: textWithPreservedStructure,
      metadata: {
        id: page.id,
        status: page.status,
        title: page.title,
        type: page.type,
        url: pageUrl,
        version: page.version?.number,
        updated_by: page.version?.by?.displayName,
        updated_at: page.version?.when,
      },
    };
  }
}
module
.
exports
=
{
ConfluencePagesLoader
};
This diff is collapsed.
Click to expand it.
collector/utils/extensions/Confluence/index.js
+
1
−
3
View file @
3fa00ad7
...
@@ -5,9 +5,7 @@ const { v4 } = require("uuid");
...
@@ -5,9 +5,7 @@ const { v4 } = require("uuid");
const
UrlPattern
=
require
(
"
url-pattern
"
);
const
UrlPattern
=
require
(
"
url-pattern
"
);
const
{
writeToServerDocuments
,
sanitizeFileName
}
=
require
(
"
../../files
"
);
const
{
writeToServerDocuments
,
sanitizeFileName
}
=
require
(
"
../../files
"
);
const
{
tokenizeString
}
=
require
(
"
../../tokenizer
"
);
const
{
tokenizeString
}
=
require
(
"
../../tokenizer
"
);
const
{
const
{
ConfluencePagesLoader
}
=
require
(
"
./ConfluenceLoader
"
);
ConfluencePagesLoader
,
}
=
require
(
"
langchain/document_loaders/web/confluence
"
);
/**
/**
* Load Confluence documents from a spaceID and Confluence credentials
* Load Confluence documents from a spaceID and Confluence credentials
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment