Skip to content
Snippets Groups Projects
Unverified Commit 612a7e16 authored by Sean Hatfield's avatar Sean Hatfield Committed by GitHub
Browse files

[FEAT] Website depth scraping data connector (#1191)


* WIP website depth scraping, (sort of works)

* website depth data connector stable + add maxLinks option

* linting + loading small ui tweak

* refactor website depth data connector for stability, speed, & readability

* patch: remove console log
Guard clause on URL validity check
reasonable overrides

---------

Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
parent b6be43be
No related branches found
No related tags found
No related merge requests found
const { verifyPayloadIntegrity } = require("../middleware/verifyIntegrity");
const { reqBody } = require("../utils/http");
const { validURL } = require("../utils/url");
function extensions(app) {
if (!app) return;
......@@ -86,6 +87,25 @@ function extensions(app) {
}
);
// POST /ext/website-depth
// Reads `url`, `depth` (default 1) and `maxLinks` (default 20) from the request
// body, hands them to the WebsiteDepth extension and replies with whatever it
// returns under `data`. Any failure is reported as { success: false, reason }.
app.post(
  "/ext/website-depth",
  [verifyPayloadIntegrity],
  async function (request, response) {
    try {
      const websiteDepth = require("../utils/extensions/WebsiteDepth");
      const { url, depth = 1, maxLinks = 20 } = reqBody(request);
      if (!validURL(url)) {
        // Returning a plain object from an Express handler sends nothing and
        // leaves the client waiting forever, so answer explicitly. The status
        // matches the one used by the catch branch below.
        response
          .status(400)
          .json({ success: false, reason: "Not a valid URL." });
        return;
      }
      const scrapedData = await websiteDepth(url, depth, maxLinks);
      response.status(200).json({ success: true, data: scrapedData });
    } catch (e) {
      console.error(e);
      response.status(400).json({ success: false, reason: e.message });
    }
    return;
  }
);
app.post(
"/ext/confluence",
[verifyPayloadIntegrity],
......
const { v4 } = require("uuid");
const {
PuppeteerWebBaseLoader,
} = require("langchain/document_loaders/web/puppeteer");
const { default: slugify } = require("slugify");
const { parse } = require("node-html-parser");
const { writeToServerDocuments } = require("../../files");
const { tokenizeString } = require("../../tokenizer");
const path = require("path");
const fs = require("fs");
/**
 * Collect links reachable from `startUrl`, walking the site level by level.
 *
 * Level 1 is the set of links found on `startUrl` itself, level 2 the links
 * found on those pages, and so on. Crawling stops after `depth` levels, when
 * there are no more pages to visit, or as soon as `maxLinks` links are known.
 *
 * @param {string} startUrl - Page the crawl starts from; its origin is passed
 *   to `getPageLinks` as the base URL.
 * @param {number} [depth=1] - Number of levels to follow; values below 1 are
 *   treated as 1.
 * @param {number} [maxLinks=20] - Upper bound on returned links; values below
 *   1 are treated as 1.
 * @returns {Promise<string[]>} Discovered links in discovery order. The start
 *   URL is only included when some crawled page links back to it.
 * @throws {TypeError} If `startUrl` cannot be parsed by `URL`.
 */
async function discoverLinks(startUrl, depth = 1, maxLinks = 20) {
  const baseUrl = new URL(startUrl).origin;
  const discoveredLinks = new Set();
  // Pages already queued or fetched, so nothing is loaded twice (the start
  // page is very commonly linked from its own children).
  const visited = new Set([startUrl]);
  const maxDepth = depth < 1 ? 1 : depth;
  const linkLimit = maxLinks < 1 ? 1 : maxLinks;

  let frontier = [startUrl];
  // One iteration per level: every page of the current level is expanded
  // before moving on, which is what makes `depth` an actual limit.
  for (let level = 0; level < maxDepth && frontier.length > 0; level++) {
    const nextFrontier = [];
    for (const pageUrl of frontier) {
      const newLinks = await getPageLinks(pageUrl, baseUrl);
      for (const link of newLinks) {
        if (!discoveredLinks.has(link)) {
          discoveredLinks.add(link);
          if (!visited.has(link)) {
            visited.add(link);
            nextFrontier.push(link);
          }
        }
        // Exit out if we reach maxLinks
        if (discoveredLinks.size >= linkLimit) {
          return Array.from(discoveredLinks).slice(0, linkLimit);
        }
      }
    }
    frontier = nextFrontier;
  }
  return Array.from(discoveredLinks);
}
/**
 * Load `url` with the Puppeteer loader and return the links that
 * `extractLinks` finds in the first loaded document.
 *
 * @param {string} url - Page to load.
 * @param {string} baseUrl - Forwarded unchanged to `extractLinks`.
 * @returns {Promise<string[]>} Links from the page, or an empty array when
 *   loading or extraction fails (the failure is logged, never rethrown).
 */
async function getPageLinks(url, baseUrl) {
  try {
    const loaderOptions = {
      launchOptions: { headless: "new" },
      gotoOptions: { waitUntil: "domcontentloaded" },
    };
    const loadedDocs = await new PuppeteerWebBaseLoader(
      url,
      loaderOptions
    ).load();
    const pageHtml = loadedDocs[0].pageContent;
    return extractLinks(pageHtml, baseUrl);
  } catch (error) {
    console.error(`Failed to get page links from ${url}.`, error);
    return [];
  }
}
/**
 * Extract the unique same-site links from an HTML string.
 *
 * Every `<a href>` is resolved against `baseUrl`. A link is kept only when it
 * has the same origin as `baseUrl` and its absolute form starts with
 * `baseUrl`. Anchors without an `href`, or with one that cannot be parsed,
 * are skipped.
 *
 * @param {string} html - Markup to search for anchors.
 * @param {string} baseUrl - Absolute URL used both to resolve relative hrefs
 *   and as the filter for which links are kept.
 * @returns {string[]} Absolute URLs in document order, without duplicates.
 * @throws {TypeError} If `baseUrl` itself is not a valid absolute URL.
 */
function extractLinks(html, baseUrl) {
  const root = parse(html);
  const anchors = root.querySelectorAll("a");
  // Comparing origins (not just a string prefix) keeps look-alike hosts such
  // as "https://example.com.evil.org" out when baseUrl is "https://example.com".
  const allowedOrigin = new URL(baseUrl).origin;
  const extractedLinks = new Set();

  for (const anchor of anchors) {
    const href = anchor.getAttribute("href");
    if (!href) continue;

    let resolved;
    try {
      resolved = new URL(href, baseUrl);
    } catch {
      // A single malformed href must not cost us the rest of the page.
      continue;
    }

    if (
      resolved.origin === allowedOrigin &&
      resolved.href.startsWith(baseUrl)
    ) {
      extractedLinks.add(resolved.href);
    }
  }
  return Array.from(extractedLinks);
}
/**
 * Scrape the visible text of each link, one after another, and store every
 * non-empty page as a document via `writeToServerDocuments`.
 *
 * A link that yields empty content is skipped with a warning, and a link
 * whose scrape throws is logged and skipped; neither aborts the batch.
 *
 * @param {string[]} links - URLs to scrape, processed in order.
 * @param {string} outputFolder - Passed through to `writeToServerDocuments`.
 * @returns {Promise<object[]>} The document records that were written.
 */
async function bulkScrapePages(links, outputFolder) {
  const collected = [];

  for (const [index, link] of links.entries()) {
    console.log(`Scraping ${index + 1}/${links.length}: ${link}`);

    try {
      const pageLoader = new PuppeteerWebBaseLoader(link, {
        launchOptions: { headless: "new" },
        gotoOptions: { waitUntil: "domcontentloaded" },
        // Pull the rendered text rather than raw HTML, then shut the browser.
        async evaluate(page, browser) {
          const visibleText = await page.evaluate(
            () => document.body.innerText
          );
          await browser.close();
          return visibleText;
        },
      });
      const loadedDocs = await pageLoader.load();
      const content = loadedDocs[0].pageContent;

      if (!content.length) {
        console.warn(`Empty content for ${link}. Skipping.`);
        continue;
      }

      const { host, pathname } = new URL(link);
      const filename = `${host}-${pathname}`.replace(".", "_");
      const documentName = slugify(filename) + ".html";
      const data = {
        id: v4(),
        url: "file://" + documentName,
        title: documentName,
        docAuthor: "no author found",
        description: "No description found.",
        docSource: "URL link uploaded by the user.",
        chunkSource: `link://${link}`,
        published: new Date().toLocaleString(),
        wordCount: content.split(" ").length,
        pageContent: content,
        token_count_estimate: tokenizeString(content).length,
      };

      writeToServerDocuments(data, data.title, outputFolder);
      collected.push(data);
      console.log(`Successfully scraped ${link}.`);
    } catch (error) {
      console.error(`Failed to scrape ${link}.`, error);
    }
  }

  return collected;
}
/**
 * Entry point of the website-depth connector: discover links starting at
 * `startUrl`, scrape each of them, and return the scraped documents.
 *
 * Before crawling, an output folder named after the slugified hostname is
 * created (recursively) under the server's `storage/documents` directory.
 *
 * @param {string} startUrl - URL the crawl starts from.
 * @param {number} [depth=1] - Forwarded to `discoverLinks`.
 * @param {number} [maxLinks=20] - Forwarded to `discoverLinks`.
 * @returns {Promise<object[]>} Whatever `bulkScrapePages` returns.
 */
async function websiteScraper(startUrl, depth = 1, maxLinks = 20) {
  const { hostname } = new URL(startUrl);
  const documentsSubfolder = `../../../../server/storage/documents/${slugify(
    hostname
  )}`;
  const outputFolder = path.resolve(__dirname, documentsSubfolder);
  fs.mkdirSync(outputFolder, { recursive: true });

  console.log("Discovering links...");
  const discovered = await discoverLinks(startUrl, depth, maxLinks);
  console.log(`Found ${discovered.length} links to scrape.`);

  console.log("Starting bulk scraping...");
  const pages = await bulkScrapePages(discovered, outputFolder);
  console.log(`Scraped ${pages.length} pages.`);

  return pages;
}
module.exports = websiteScraper;
import Github from "./github.svg";
import YouTube from "./youtube.svg";
import Link from "./link.svg";
import Confluence from "./confluence.jpeg";
// Lookup table from a data-connector key to the image asset imported above.
// `websiteDepth` uses the generic link icon.
const ConnectorImages = {
github: Github,
youtube: YouTube,
websiteDepth: Link,
confluence: Confluence,
};
......
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="330" zoomAndPan="magnify" viewBox="0 0 247.5 247.500006" height="330" preserveAspectRatio="xMidYMid meet" version="1.0"><defs><filter x="0%" y="0%" width="100%" height="100%" id="9045983972"><feColorMatrix values="0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0" color-interpolation-filters="sRGB"/></filter><filter x="0%" y="0%" width="100%" height="100%" id="111345b854"><feColorMatrix values="0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0.2126 0.7152 0.0722 0 0" color-interpolation-filters="sRGB"/></filter><clipPath id="41acadd750"><path d="M 23 23 L 224.648438 23 L 224.648438 224.648438 L 23 224.648438 Z M 23 23 " clip-rule="nonzero"/></clipPath><image x="0" y="0" width="200" xlink:href="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAMgAAADICAAAAACIM/FCAAAAAmJLR0QA/4ePzL8AAAqWSURBVHic7Z17cFTVHcd/m10CyWZLEjBgQtKQkBAiD215tby0gkinVaqMRCw0gHQsrW2trWjLWCmIjjPFtpap2ihO7VCnlDFWaG0QJoDUjgxaaBNLlkceggmQLIYNm81jf/0jm2Qfv3Pu69x77mby/Sewe8/v/D57zz33nnN+53cBhjWsYQ0rkeSQ7UC0HGPGpI1MCgV8rde0ljTFHx1KKp01fWrhhBH9//c3eGs+Ot4o0yUdyt1YeRUpNby6wiPbOdUa8/C/SIiwOt9aMULZiHzNej3Iw0BExJbtObLdVNLCg4oUiIjYVTFRtqs8TXtHHQYiYtcLY2S7y1Lar3vUcyBi20NJsl0mtbRREwYi4tFJsp2O18jfaMZARH+5bL9jlXdcDwcivjZKtutRmn9JJwfiBzfKdj5C93Xq5kBsKJHt/oDW9xrgQLzyRdkAYW0IGeJA9N0iGwEABHAgXposGwKEcCDWy7/ihXAgfiC7FxbEgbhriHAglg8RDrxWEG3byjH7hpeUagvVnTrbci3kSs8pmjFB4dgjt6IoxzRK6Xy0vbo8PeLwvPX7u7jHf9ueHB+uHhlXJOtnn3JKtGZIoFDiqPsGXSrlMXp2BRERn7eWoE9cju6nkpkFx+9lFgvmWQgQFpejfja37PoAq+BLFnk/KC7HsRsUSs9uYZTsHG+J94PichxIVSw/iTW832KB8xHiclSlqLBQzDgnzZbOQRrnAJjDuE6Wm+x7pERwAGygy+8x1fVoD4RwALxJGriufIEJkigOuLGdNGFV2xLGAfAEaeNFszyPlkAOSCV7rjMmeR4tkRwAm0kzVozexXLA+G7Kzt1meB4twRwA5GrKz8U7HiPhHPAQZcn0O4l4DiigTH0k2vEYmcABQI0X28T6HStTOKCKsmbqVJ05HLCTMpcr0vEYmcQBP6XsTRHoeIzM4oDvUgbDawwuMb5Hqm8eLhQIOVzx7ffA3QH9loPUh+F5C/EgpaXfb7jY0hoAAEjyZI7PzS8qndYfGWOIg3a214BBzXIUf6ui3mC7AoBNVNOaJshH9Sr+0cvGOGAHBZIvxDlrVUmBJE4816DOERxaQwbtoCzqhNSEv7RnwA2tr1Afngv/TSSQu6gPP7baC+NK81NNa1X42wQ6I
/e7qU+Ph/+asIbonFRckDM2MyU5FOzwNV/wnr4gxGxSDRVNczkr/A/BjyiZty2cOyN2Dc134v3DxzqNmi4jo4IOGjVLKffR95hRih2Vaz5nyHhKPWm3XIzrEUq+/6BC9FJg9yID9p8jbfZmKZfUpPQneOuvAzq5Wm9TXkT/SoeEUoD7Sc7aa7S8q3R1L9kXaXMPisRwrGbUQuvEl7VX4fmQtuUX+cRYckQLBiKGKtKVrUbJzapC4LKu4wfMxWO2mm7XxnGYZWiqMI7Mt7VjIGJou1MExz5hHNPP6+JAxCrVgSRsDpwliuPOa3o5EOsKDXPsFcWxhlywUKsWVUGvHI6gyp9CUUbj4XwzDXHgdptwILbdbITDa3BKRhwHYrPCfhDm/QMRexfahwPxdKZeDnzWThyIBzj3Ey7HUTEDKXFxr0/r47ggZlVaYPxuLznHo8TRoaLDE8UR+OTM2RYV29oa6YEjl6NrmTUcvj0Pz+sL73PmLd18QGH7DhlNwuXoLbOCI/inO2OuQ88Dh3gFQnPi60jjcqyxgCOwg7wKZ/yFU+Z43JhRPsfez7PKfenf7FIr7cbhu49T0rWNOclSF30zkc5xqoBfeEkbo+DLUW1LOschxYmAKXT4rs04qlSEUkykSOzGoeqhuiS+dcVwHE0IDoAlsVf87xOTA+DpROJIHse2M+Jk4nAsPvskx9KCiIIVduZwPBNC7urL4Kq/rTmcf0TEIG9NZ2bicCBy6z3WV/CVBODAIzx76xKHA/ELHIPp3Wjz/mqQA9/kmazG3s1RH9iXA3E+x+aWmuhvbfZ8FcWBpzjbuG6IHgvbmgNxm9o6+PMM0jmw946hwYHYpiqymDffbg8OxEYVub0SgQOxUTE5TuY/E4EDsU3hOik5bSsOcHmZzmzjbaZdx1tLlTIeXMU+9iTzzlj0D04Vksa1jhOcwyvJJYCJL3IzU8gan8/lTqS+tzYm+GTU8kr+goMgjjLt8wwVXL+wq/qprxe4AACScpds2keGi4rnWMJLZcmYLxmtIsVfqLWp6bKajIaCOGby+hLmvM9igatxYjhyWZkiuBxxE1bSOZy8MQ5vHs6pMiupRRywVScHQCbvNm05x22ckFeledHCy/bhSOV0Psrzu7f47MLBu2LVzFPPVh01S6vnm4I4CtlL4urm229uNsLReY8gDmBHWqpdN5hUp5/DJyhwCWCeYQ6AdO4zLU+1xaI44K/GOQCcv9CXqfQNcUHVpaynDI37OW/VnloZ29cLwwDYxajkoNbgQc9OrQ9efxOZuyyD8dB7Qsc5n/O+FoxzwnorAAB4kK6libMgyJZjZa1ajJZH2AnodImORepZoNNc0kpVaa/P/1B0PqZsurPZasDk3NfoDEwD6t53l4Zgf5XaSP9gxnJcpJa9wXz+ClZtVMqip0t/Jmu717Bd17zH98WN1dqrn12WJsDpCA2sgDVTF3XtVDGJT8dNLswdl+FOdnR3XL3UVH/axPejTCZPyFrzKjRL6yiOdkFx9Faof9NxKfVlpZGUGRarH6SI+pK7NmtTUTfi0FjZXmlXEjU2/J9sr7Qo3LQy4tMfJ1gWgjAIeXfyGjPt3vUT619MUUr1vj82ZNJdjYgNu9aVWpONIRyEQJ4RvxHD7v2LACCvvByu13x85vwnzW2fBQHA5cnMyskv+G2tEdscLaDOSLkBg+5qohcMdvbtXBQ2Dxeh8Bm5Tjqj32zf+YiRIzyCCq39g37LLIUbMNmK9D+fkhz9MoWjXzlU09qp1xrVrgZkRrsalIeqskqnMYkcANT086f6TEnlAHKmgHySVJJcDthNVavnfRKSOWALVe+72u3I5oB7yIo1bySVzgFjyaq1pvyXzwFQQ9Xdqu3mbgcO+B1Z+2blgoOyBQfcS1bfoWG+3x4ckELP0+5XbcAmHACv0y48orK4bThgGe1DUN1aq304wMVYI/epSd9qIw5Wyn/Ei8okGRbsm1CvDFbE2VWl1jXhv3biAHie5
UvXo9x8a0t5sWoSOCCXHcz493xmKc8O3mK0DA6AZ9gOBbbQq9TONdwEdHI4wN3E8cm3Pf7Ffp7vnOVhyOIAKOO6FTr8vZsiLpYJD+y5zj3eUo6Yq/hdpTR3V2q8jf6OUZ5xRVOYiSr6Zeq8j4JyrvB/Yy2S1q4AAOBrQ4QD4FdDhAOStSbCtCkHQPp/hgYHQE7D0OAAKDEaUt1jDw6AEmPnJGDlWwr5yj5lgKONt3PaaqXzdmzy1XSTbOejNOKXOrfovCM63bZhfVXPJd/1mJVv5VapbPL9V1zVEfnk7KAV2nov/+OCA1/FKXWrhozXu3Nku8tT9nOfqaLo2a2Y81a2Rm+6oNyoXsiX7aYaORe/wts9FXyrzLK327KlsrccecftC2dQYT7nDle/7RPqkU5p6PZHz59eVFw0cL9r93q9tUfFvCRFgDTfv5I9aZ7UTr/fT8bhDGtYwxrWsNTp/+rU2Ks7TEVtAAAAAElFTkSuQmCC" id="95b5eeec8e" height="200" preserveAspectRatio="xMidYMid meet"/><mask id="bd5e341d33"><g filter="url(#9045983972)"><g filter="url(#111345b854)" transform="matrix(1.010459, 0, 0, 1.010459, 22.55585, 22.555871)"><image x="0" y="0" width="200" xlink:href="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAMgAAADICAAAAACIM/FCAAAAAmJLR0QA/4ePzL8AAAqWSURBVHic7Z17cFTVHcd/m10CyWZLEjBgQtKQkBAiD215tby0gkinVaqMRCw0gHQsrW2trWjLWCmIjjPFtpap2ihO7VCnlDFWaG0QJoDUjgxaaBNLlkceggmQLIYNm81jf/0jm2Qfv3Pu69x77mby/Sewe8/v/D57zz33nnN+53cBhjWsYQ0rkeSQ7UC0HGPGpI1MCgV8rde0ljTFHx1KKp01fWrhhBH9//c3eGs+Ot4o0yUdyt1YeRUpNby6wiPbOdUa8/C/SIiwOt9aMULZiHzNej3Iw0BExJbtObLdVNLCg4oUiIjYVTFRtqs8TXtHHQYiYtcLY2S7y1Lar3vUcyBi20NJsl0mtbRREwYi4tFJsp2O18jfaMZARH+5bL9jlXdcDwcivjZKtutRmn9JJwfiBzfKdj5C93Xq5kBsKJHt/oDW9xrgQLzyRdkAYW0IGeJA9N0iGwEABHAgXposGwKEcCDWy7/ihXAgfiC7FxbEgbhriHAglg8RDrxWEG3byjH7hpeUagvVnTrbci3kSs8pmjFB4dgjt6IoxzRK6Xy0vbo8PeLwvPX7u7jHf9ueHB+uHhlXJOtnn3JKtGZIoFDiqPsGXSrlMXp2BRERn7eWoE9cju6nkpkFx+9lFgvmWQgQFpejfja37PoAq+BLFnk/KC7HsRsUSs9uYZTsHG+J94PichxIVSw/iTW832KB8xHiclSlqLBQzDgnzZbOQRrnAJjDuE6Wm+x7pERwAGygy+8x1fVoD4RwALxJGriufIEJkigOuLGdNGFV2xLGAfAEaeNFszyPlkAOSCV7rjMmeR4tkRwAm0kzVozexXLA+G7Kzt1meB4twRwA5GrKz8U7HiPhHPAQZcn0O4l4DiigTH0k2vEYmcABQI0X28T6HStTOKCKsmbqVJ05HLCTMpcr0vEYmcQBP6XsTRHoeIzM4oDvUgbDawwuMb5Hqm8eLhQIOVzx7ffA3QH9loPUh+F5C/EgpaXfb7jY0hoAAEjyZI7PzS8qndYfGWOIg3a214BBzXIUf6ui3mC7AoBNVNOaJshH9Sr+0cvGOGAHBZIvxDlrVUmBJE4816DOERxaQwbtoCzqhNSEv7RnwA2tr1Afngv/TSSQu6gPP7baC+NK81NNa1X42wQ6I/e7qU+Ph/+asI
bonFRckDM2MyU5FOzwNV/wnr4gxGxSDRVNczkr/A/BjyiZty2cOyN2Dc134v3DxzqNmi4jo4IOGjVLKffR95hRih2Vaz5nyHhKPWm3XIzrEUq+/6BC9FJg9yID9p8jbfZmKZfUpPQneOuvAzq5Wm9TXkT/SoeEUoD7Sc7aa7S8q3R1L9kXaXMPisRwrGbUQuvEl7VX4fmQtuUX+cRYckQLBiKGKtKVrUbJzapC4LKu4wfMxWO2mm7XxnGYZWiqMI7Mt7VjIGJou1MExz5hHNPP6+JAxCrVgSRsDpwliuPOa3o5EOsKDXPsFcWxhlywUKsWVUGvHI6gyp9CUUbj4XwzDXHgdptwILbdbITDa3BKRhwHYrPCfhDm/QMRexfahwPxdKZeDnzWThyIBzj3Ey7HUTEDKXFxr0/r47ggZlVaYPxuLznHo8TRoaLDE8UR+OTM2RYV29oa6YEjl6NrmTUcvj0Pz+sL73PmLd18QGH7DhlNwuXoLbOCI/inO2OuQ88Dh3gFQnPi60jjcqyxgCOwg7wKZ/yFU+Z43JhRPsfez7PKfenf7FIr7cbhu49T0rWNOclSF30zkc5xqoBfeEkbo+DLUW1LOschxYmAKXT4rs04qlSEUkykSOzGoeqhuiS+dcVwHE0IDoAlsVf87xOTA+DpROJIHse2M+Jk4nAsPvskx9KCiIIVduZwPBNC7urL4Kq/rTmcf0TEIG9NZ2bicCBy6z3WV/CVBODAIzx76xKHA/ELHIPp3Wjz/mqQA9/kmazG3s1RH9iXA3E+x+aWmuhvbfZ8FcWBpzjbuG6IHgvbmgNxm9o6+PMM0jmw946hwYHYpiqymDffbg8OxEYVub0SgQOxUTE5TuY/E4EDsU3hOik5bSsOcHmZzmzjbaZdx1tLlTIeXMU+9iTzzlj0D04Vksa1jhOcwyvJJYCJL3IzU8gan8/lTqS+tzYm+GTU8kr+goMgjjLt8wwVXL+wq/qprxe4AACScpds2keGi4rnWMJLZcmYLxmtIsVfqLWp6bKajIaCOGby+hLmvM9igatxYjhyWZkiuBxxE1bSOZy8MQ5vHs6pMiupRRywVScHQCbvNm05x22ckFeledHCy/bhSOV0Psrzu7f47MLBu2LVzFPPVh01S6vnm4I4CtlL4urm229uNsLReY8gDmBHWqpdN5hUp5/DJyhwCWCeYQ6AdO4zLU+1xaI44K/GOQCcv9CXqfQNcUHVpaynDI37OW/VnloZ29cLwwDYxajkoNbgQc9OrQ9efxOZuyyD8dB7Qsc5n/O+FoxzwnorAAB4kK6libMgyJZjZa1ajJZH2AnodImORepZoNNc0kpVaa/P/1B0PqZsurPZasDk3NfoDEwD6t53l4Zgf5XaSP9gxnJcpJa9wXz+ClZtVMqip0t/Jmu717Bd17zH98WN1dqrn12WJsDpCA2sgDVTF3XtVDGJT8dNLswdl+FOdnR3XL3UVH/axPejTCZPyFrzKjRL6yiOdkFx9Faof9NxKfVlpZGUGRarH6SI+pK7NmtTUTfi0FjZXmlXEjU2/J9sr7Qo3LQy4tMfJ1gWgjAIeXfyGjPt3vUT619MUUr1vj82ZNJdjYgNu9aVWpONIRyEQJ4RvxHD7v2LACCvvByu13x85vwnzW2fBQHA5cnMyskv+G2tEdscLaDOSLkBg+5qohcMdvbtXBQ2Dxeh8Bm5Tjqj32zf+YiRIzyCCq39g37LLIUbMNmK9D+fkhz9MoWjXzlU09qp1xrVrgZkRrsalIeqskqnMYkcANT086f6TEnlAHKmgHySVJJcDthNVavnfRKSOWALVe+72u3I5oB7yIo1bySVzgFjyaq1pvyXzwFQQ9Xdqu3mbgcO+B1Z+2blgoOyBQfcS1bfoWG+3x4ckELP0+5XbcAmHACv0y48orK4bThgGe1DUN1aq304wMVYI/epSd9qIw5Wyn/Ei8okGRbsm1CvDFbE2VWl1jXhv3biAHie5UvXo9x8a0t5sW
oSOCCXHcz493xmKc8O3mK0DA6AZ9gOBbbQq9TONdwEdHI4wN3E8cm3Pf7Ffp7vnOVhyOIAKOO6FTr8vZsiLpYJD+y5zj3eUo6Yq/hdpTR3V2q8jf6OUZ5xRVOYiSr6Zeq8j4JyrvB/Yy2S1q4AAOBrQ4QD4FdDhAOStSbCtCkHQPp/hgYHQE7D0OAAKDEaUt1jDw6AEmPnJGDlWwr5yj5lgKONt3PaaqXzdmzy1XSTbOejNOKXOrfovCM63bZhfVXPJd/1mJVv5VapbPL9V1zVEfnk7KAV2nov/+OCA1/FKXWrhozXu3Nku8tT9nOfqaLo2a2Y81a2Rm+6oNyoXsiX7aYaORe/wts9FXyrzLK327KlsrccecftC2dQYT7nDle/7RPqkU5p6PZHz59eVFw0cL9r93q9tUfFvCRFgDTfv5I9aZ7UTr/fT8bhDGtYwxrWsNTp/+rU2Ks7TEVtAAAAAElFTkSuQmCC" height="200" preserveAspectRatio="xMidYMid meet"/></g></g></mask><image x="0" y="0" width="200" xlink:href="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAMgAAADICAIAAAAiOjnJAAAABmJLR0QA/wD/AP+gvaeTAAAAiklEQVR4nO3BAQEAAACCIP+vbkhAAQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADwYNWXAAG9rB+hAAAAAElFTkSuQmCC" id="f7fc71df02" height="200" preserveAspectRatio="xMidYMid meet"/></defs><path fill="#ffffff" d="M 0 0 L 247 0 L 247 247 L 0 247 Z M 0 0 " fill-opacity="1" fill-rule="nonzero"/><path fill="#ffffff" d="M 0 0 L 247 0 L 247 247 L 0 247 Z M 0 0 " fill-opacity="1" fill-rule="nonzero"/><g clip-path="url(#41acadd750)"><g mask="url(#bd5e341d33)"><g transform="matrix(1.010459, 0, 0, 1.010459, 22.55585, 22.555871)"><image x="0" y="0" width="200" xlink:href="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAMgAAADICAIAAAAiOjnJAAAABmJLR0QA/wD/AP+gvaeTAAAAiklEQVR4nO3BAQEAAACCIP+vbkhAAQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADwYNWXAAG9rB+hAAAAAElFTkSuQmCC" height="200" preserveAspectRatio="xMidYMid meet"/></g></g></g></svg>
\ No newline at end of file
import React, { useState } from "react";
import System from "@/models/system";
import showToast from "@/utils/toast";
import pluralize from "pluralize";
export default function WebsiteDepthOptions() {
const [loading, setLoading] = useState(false);
const handleSubmit = async (e) => {
e.preventDefault();
const form = new FormData(e.target);
try {
setLoading(true);
showToast("Scraping website - this may take a while.", "info", {
clear: true,
autoClose: false,
});
const { data, error } = await System.dataConnectors.websiteDepth.scrape({
url: form.get("url"),
depth: parseInt(form.get("depth")),
maxLinks: parseInt(form.get("maxLinks")),
});
if (!!error) {
showToast(error, "error", { clear: true });
setLoading(false);
return;
}
showToast(
`Successfully scraped ${data.length} ${pluralize(
"page",
data.length
)}!`,
"success",
{ clear: true }
);
e.target.reset();
setLoading(false);
} catch (e) {
console.error(e);
showToast(e.message, "error", { clear: true });
setLoading(false);
}
};
return (
<div className="flex w-full">
<div className="flex flex-col w-full px-1 md:pb-6 pb-16">
<form className="w-full" onSubmit={handleSubmit}>
<div className="w-full flex flex-col py-2">
<div className="w-full flex flex-col gap-4">
<div className="flex flex-col pr-10">
<div className="flex flex-col gap-y-1 mb-4">
<label className="text-white text-sm font-bold">
Website URL
</label>
<p className="text-xs font-normal text-white/50">
URL of the website you want to scrape.
</p>
</div>
<input
type="url"
name="url"
className="border-none bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:border-white block w-full p-2.5"
placeholder="https://example.com"
required={true}
autoComplete="off"
spellCheck={false}
/>
</div>
<div className="flex flex-col pr-10">
<div className="flex flex-col gap-y-1 mb-4">
<label className="text-white text-sm font-bold">Depth</label>
<p className="text-xs font-normal text-white/50">
This is the number of child-links that the worker should
follow from the origin URL.
</p>
</div>
<input
type="number"
name="depth"
min="1"
max="5"
className="border-none bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:border-white block w-full p-2.5"
required={true}
defaultValue="1"
/>
</div>
<div className="flex flex-col pr-10">
<div className="flex flex-col gap-y-1 mb-4">
<label className="text-white text-sm font-bold">
Max Links
</label>
<p className="text-xs font-normal text-white/50">
Maximum number of links to scrape.
</p>
</div>
<input
type="number"
name="maxLinks"
min="1"
className="border-none bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:border-white block w-full p-2.5"
required={true}
defaultValue="20"
/>
</div>
</div>
</div>
<div className="flex flex-col gap-y-2 w-full pr-10">
<button
type="submit"
disabled={loading}
className={`mt-2 w-full ${
loading ? "cursor-not-allowed animate-pulse" : ""
} justify-center border border-slate-200 px-4 py-2 rounded-lg text-[#222628] text-sm font-bold items-center flex gap-x-2 bg-slate-200 hover:bg-slate-300 hover:text-slate-800 disabled:bg-slate-300 disabled:cursor-not-allowed`}
>
{loading ? "Scraping website..." : "Submit"}
</button>
{loading && (
<p className="text-xs text-white/50">
Once complete, all scraped pages will be available for embedding
into workspaces in the document picker.
</p>
)}
</div>
</form>
</div>
</div>
);
}
......@@ -5,6 +5,7 @@ import YoutubeOptions from "./Connectors/Youtube";
import ConfluenceOptions from "./Connectors/Confluence";
import { useState } from "react";
import ConnectorOption from "./ConnectorOption";
import WebsiteDepthOptions from "./Connectors/WebsiteDepth";
export const DATA_CONNECTORS = {
github: {
......@@ -21,6 +22,12 @@ export const DATA_CONNECTORS = {
"Import the transcription of an entire YouTube video from a link.",
options: <YoutubeOptions />,
},
"website-depth": {
name: "Bulk Link Scraper",
image: ConnectorImages.websiteDepth,
description: "Scrape a website and its sub-links up to a certain depth.",
options: <WebsiteDepthOptions />,
},
confluence: {
name: "Confluence",
image: ConnectorImages.confluence,
......
......@@ -60,6 +60,24 @@ const DataConnector = {
});
},
},
websiteDepth: {
scrape: async ({ url, depth, maxLinks }) => {
return await fetch(`${API_BASE}/ext/website-depth`, {
method: "POST",
headers: baseHeaders(),
body: JSON.stringify({ url, depth, maxLinks }),
})
.then((res) => res.json())
.then((res) => {
if (!res.success) throw new Error(res.reason);
return { data: res.data, error: null };
})
.catch((e) => {
console.error(e);
return { data: null, error: e.message };
});
},
},
confluence: {
collect: async function ({ pageUrl, username, accessToken }) {
......
......@@ -93,6 +93,27 @@ function extensionEndpoints(app) {
}
}
);
// POST /ext/website-depth (admin and manager roles only)
// Forwards the request body to the collector's endpoint of the same name,
// records an "extension_invoked" telemetry event, and relays the collector's
// reply with status 200. Any thrown error results in a bare 500.
app.post(
  "/ext/website-depth",
  [validatedRequest, flexUserRoleValid([ROLES.admin, ROLES.manager])],
  async (request, response) => {
    try {
      const collector = new CollectorApi();
      const processorReply = await collector.forwardExtensionRequest({
        endpoint: "/ext/website-depth",
        method: "POST",
        body: request.body,
      });

      const telemetryPayload = { type: "website_depth" };
      await Telemetry.sendTelemetry("extension_invoked", telemetryPayload);

      response.status(200).json(processorReply);
    } catch (error) {
      console.error(error);
      response.sendStatus(500).end();
    }
  }
);
}
module.exports = { extensionEndpoints };
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment