diff --git a/collector/extensions/index.js b/collector/extensions/index.js
index 6a3f3393e131b09e479cc76a0bde8b8be4fc2e22..0772646461d8d26c8eed2a668f0779c88260ef03 100644
--- a/collector/extensions/index.js
+++ b/collector/extensions/index.js
@@ -1,5 +1,6 @@
 const { verifyPayloadIntegrity } = require("../middleware/verifyIntegrity");
 const { reqBody } = require("../utils/http");
+const { validURL } = require("../utils/url");
 
 function extensions(app) {
   if (!app) return;
@@ -86,6 +87,34 @@ function extensions(app) {
     }
   );
 
+  // Crawl a site starting at `url`, following same-origin links for `depth`
+  // levels, and scrape at most `maxLinks` of the pages that were found.
+  app.post(
+    "/ext/website-depth",
+    [verifyPayloadIntegrity],
+    async function (request, response) {
+      try {
+        const websiteDepth = require("../utils/extensions/WebsiteDepth");
+        const { url, depth = 1, maxLinks = 20 } = reqBody(request);
+        if (!validURL(url)) {
+          // Always answer through `response`; a value returned from the
+          // handler is ignored and would leave the request hanging.
+          response
+            .status(400)
+            .json({ success: false, reason: "Not a valid URL." });
+          return;
+        }
+
+        const scrapedData = await websiteDepth(url, depth, maxLinks);
+        response.status(200).json({ success: true, data: scrapedData });
+      } catch (e) {
+        console.error(e);
+        response.status(400).json({ success: false, reason: e.message });
+      }
+      return;
+    }
+  );
+
   app.post(
     "/ext/confluence",
     [verifyPayloadIntegrity],
diff --git a/collector/utils/extensions/WebsiteDepth/index.js b/collector/utils/extensions/WebsiteDepth/index.js
new file mode 100644
index 0000000000000000000000000000000000000000..6e561ef74d504ac9c3cfb041de85bef1b144fea3
--- /dev/null
+++ b/collector/utils/extensions/WebsiteDepth/index.js
@@ -0,0 +1,225 @@
+const { v4 } = require("uuid");
+const {
+  PuppeteerWebBaseLoader,
+} = require("langchain/document_loaders/web/puppeteer");
+const { default: slugify } = require("slugify");
+const { parse } = require("node-html-parser");
+const { writeToServerDocuments } = require("../../files");
+const { tokenizeString } = require("../../tokenizer");
+const path = require("path");
+const fs = require("fs");
+
+/**
+ * Coerce a caller-supplied limit into a whole number that is at least 1.
+ * Anything that is not a finite number >= 1 (NaN, null, 0, ...) becomes 1.
+ * @param {unknown} value - Raw limit, e.g. taken from a request body.
+ * @returns {number} An integer >= 1.
+ */
+function toPositiveInt(value) {
+  const parsed = Math.floor(Number(value));
+  return Number.isFinite(parsed) && parsed >= 1 ? parsed : 1;
+}
+
+/**
+ * Breadth-first discovery of same-origin links reachable from `startUrl`.
+ *
+ * The start page is the first level; every further level crawls the links
+ * found on the previous one. Discovery stops once `depth` levels were crawled,
+ * no uncrawled links remain, or `maxLinks` unique links were collected.
+ * @param {string} startUrl - Absolute URL to start crawling from.
+ * @param {number} [depth=1] - Number of link levels to crawl (minimum 1).
+ * @param {number} [maxLinks=20] - Cap on the returned links (minimum 1).
+ * @returns {Promise<string[]>} Unique absolute URLs in discovery order.
+ * @throws {TypeError} If `startUrl` cannot be parsed as an absolute URL.
+ */
+async function discoverLinks(startUrl, depth = 1, maxLinks = 20) {
+  const baseUrl = new URL(startUrl).origin;
+  const maxDepth = toPositiveInt(depth);
+  const linkLimit = toPositiveInt(maxLinks);
+  const discoveredLinks = new Set();
+  const crawledPages = new Set();
+  let pendingLinks = [startUrl];
+
+  // Crawl one level per iteration so that `depth` is actually enforced.
+  for (let level = 0; level < maxDepth && pendingLinks.length > 0; level++) {
+    const nextLevelLinks = [];
+
+    for (const pageUrl of pendingLinks) {
+      // Pages may link back to one we already crawled; never fetch it twice.
+      if (crawledPages.has(pageUrl)) continue;
+      crawledPages.add(pageUrl);
+
+      const newLinks = await getPageLinks(pageUrl, baseUrl);
+      for (const link of newLinks) {
+        if (discoveredLinks.has(link)) continue;
+        discoveredLinks.add(link);
+        nextLevelLinks.push(link);
+
+        // Exit out if we reach maxLinks
+        if (discoveredLinks.size >= linkLimit) {
+          return Array.from(discoveredLinks);
+        }
+      }
+    }
+
+    pendingLinks = nextLevelLinks;
+  }
+
+  return Array.from(discoveredLinks);
+}
+
+/**
+ * Load `url` with Puppeteer and return the same-origin links found on it.
+ * Failures are logged and reported as an empty list so that one broken page
+ * does not abort the whole crawl.
+ * @param {string} url - Page to load.
+ * @param {string} baseUrl - Origin that returned links must belong to.
+ * @returns {Promise<string[]>} Unique absolute links, or [] on failure.
+ */
+async function getPageLinks(url, baseUrl) {
+  try {
+    const loader = new PuppeteerWebBaseLoader(url, {
+      launchOptions: { headless: "new" },
+      gotoOptions: { waitUntil: "domcontentloaded" },
+    });
+    const docs = await loader.load();
+    const html = docs[0].pageContent;
+    return extractLinks(html, baseUrl, url);
+  } catch (error) {
+    console.error(`Failed to get page links from ${url}.`, error);
+    return [];
+  }
+}
+
+/**
+ * Collect every anchor `href` in `html` that resolves to the `baseUrl` origin.
+ * @param {string} html - HTML markup of a page.
+ * @param {string} baseUrl - Origin that returned links must belong to.
+ * @param {string} [pageUrl=baseUrl] - URL the markup was loaded from; relative
+ *   hrefs are resolved against it.
+ * @returns {string[]} Unique absolute URLs.
+ */
+function extractLinks(html, baseUrl, pageUrl = baseUrl) {
+  const root = parse(html);
+  const links = root.querySelectorAll("a");
+  const extractedLinks = new Set();
+
+  for (const link of links) {
+    const href = link.getAttribute("href");
+    if (!href) continue;
+
+    let resolved;
+    try {
+      resolved = new URL(href, pageUrl);
+    } catch {
+      // A single malformed href must not discard the page's other links.
+      continue;
+    }
+
+    // Compare origins instead of string prefixes: a prefix test would accept
+    // "https://site.com.evil.io" for the origin "https://site.com".
+    if (resolved.origin === baseUrl) {
+      extractedLinks.add(resolved.href);
+    }
+  }
+
+  return Array.from(extractedLinks);
+}
+
+/**
+ * Scrape `document.body.innerText` of every link and hand each non-empty page
+ * to `writeToServerDocuments`. Pages that fail or are empty are logged and
+ * skipped.
+ * @param {string[]} links - Absolute URLs to scrape, processed one by one.
+ * @param {string} outputFolder - Folder passed to `writeToServerDocuments`.
+ * @returns {Promise<object[]>} Document records of the pages that were kept.
+ */
+async function bulkScrapePages(links, outputFolder) {
+  const scrapedData = [];
+
+  for (let i = 0; i < links.length; i++) {
+    const link = links[i];
+    console.log(`Scraping ${i + 1}/${links.length}: ${link}`);
+
+    try {
+      const loader = new PuppeteerWebBaseLoader(link, {
+        launchOptions: { headless: "new" },
+        gotoOptions: { waitUntil: "domcontentloaded" },
+        async evaluate(page, browser) {
+          try {
+            return await page.evaluate(() => document.body.innerText);
+          } finally {
+            // Close the browser even when the evaluation throws.
+            await browser.close();
+          }
+        },
+      });
+      const docs = await loader.load();
+      const content = docs[0].pageContent;
+
+      if (!content.length) {
+        console.warn(`Empty content for ${link}. Skipping.`);
+        continue;
+      }
+
+      const url = new URL(link);
+      // NOTE(review): a string pattern replaces only the first "." - confirm
+      // whether all dots were meant before changing stored file names.
+      const filename = (url.host + "-" + url.pathname).replace(".", "_");
+
+      const data = {
+        id: v4(),
+        url: "file://" + slugify(filename) + ".html",
+        title: slugify(filename) + ".html",
+        docAuthor: "no author found",
+        description: "No description found.",
+        docSource: "URL link uploaded by the user.",
+        chunkSource: `link://${link}`,
+        published: new Date().toLocaleString(),
+        wordCount: content.split(" ").length,
+        pageContent: content,
+        token_count_estimate: tokenizeString(content).length,
+      };
+
+      writeToServerDocuments(data, data.title, outputFolder);
+      scrapedData.push(data);
+
+      console.log(`Successfully scraped ${link}.`);
+    } catch (error) {
+      console.error(`Failed to scrape ${link}.`, error);
+    }
+  }
+
+  return scrapedData;
+}
+
+/**
+ * Crawl a website starting at `startUrl` and store the scraped pages in a
+ * documents folder named after the site's hostname.
+ * @param {string} startUrl - Absolute URL to start from.
+ * @param {number} [depth=1] - Number of link levels to crawl.
+ * @param {number} [maxLinks=20] - Maximum number of links to scrape.
+ * @returns {Promise<object[]>} Document records of the pages that were kept.
+ * @throws {TypeError} If `startUrl` cannot be parsed as an absolute URL.
+ */
+async function websiteScraper(startUrl, depth = 1, maxLinks = 20) {
+  const websiteName = new URL(startUrl).hostname;
+  const outputFolder = path.resolve(
+    __dirname,
+    `../../../../server/storage/documents/${slugify(websiteName)}`
+  );
+
+  fs.mkdirSync(outputFolder, { recursive: true });
+
+  console.log("Discovering links...");
+  const linksToScrape = await discoverLinks(startUrl, depth, maxLinks);
+  console.log(`Found ${linksToScrape.length} links to scrape.`);
+
+  console.log("Starting bulk scraping...");
+  const scrapedData = await bulkScrapePages(linksToScrape, outputFolder);
+  console.log(`Scraped ${scrapedData.length} pages.`);
+
+  return scrapedData;
+}
+
+module.exports = websiteScraper;
diff --git a/frontend/src/components/DataConnectorOption/media/index.js b/frontend/src/components/DataConnectorOption/media/index.js
index 
ac8105975ed4dcdf71ba87f9052017a6a5942dbd..dee46a12b5e7d4efe8efbf3df64846f7611b7498 100644
--- a/frontend/src/components/DataConnectorOption/media/index.js
+++ b/frontend/src/components/DataConnectorOption/media/index.js
@@ -1,10 +1,12 @@
 import Github from "./github.svg";
 import YouTube from "./youtube.svg";
+import Link from "./link.svg";
 import Confluence from "./confluence.jpeg";
 
 const ConnectorImages = {
   github: Github,
   youtube: YouTube,
+  websiteDepth: Link,
   confluence: Confluence,
 };
 
diff --git a/frontend/src/components/DataConnectorOption/media/link.svg b/frontend/src/components/DataConnectorOption/media/link.svg
new file mode 100644
index 0000000000000000000000000000000000000000..c957e542ebd6acec879c0c4f1a392e94528996e1
--- /dev/null
+++ b/frontend/src/components/DataConnectorOption/media/link.svg
@@ -0,0 +1 @@
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="330" zoomAndPan="magnify" viewBox="0 0 247.5 247.500006" height="330" preserveAspectRatio="xMidYMid meet" version="1.0"><defs><filter x="0%" y="0%" width="100%" height="100%" id="9045983972"><feColorMatrix values="0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0" color-interpolation-filters="sRGB"/></filter><filter x="0%" y="0%" width="100%" height="100%" id="111345b854"><feColorMatrix values="0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0.2126 0.7152 0.0722 0 0" color-interpolation-filters="sRGB"/></filter><clipPath id="41acadd750"><path d="M 23 23 L 224.648438 23 L 224.648438 224.648438 L 23 224.648438 Z M 23 23 " clip-rule="nonzero"/></clipPath><image x="0" y="0" width="200" xlink:href="" id="95b5eeec8e" height="200" preserveAspectRatio="xMidYMid meet"/><mask id="bd5e341d33"><g filter="url(#9045983972)"><g filter="url(#111345b854)" transform="matrix(1.010459, 0, 0, 1.010459, 22.55585, 22.555871)"><image x="0" y="0" width="200" xlink:href="" height="200" preserveAspectRatio="xMidYMid meet"/></g></g></mask><image x="0" y="0" width="200" xlink:href="" id="f7fc71df02" height="200" preserveAspectRatio="xMidYMid meet"/></defs><path fill="#ffffff" d="M 0 0 L 247 0 L 247 247 L 0 247 Z M 0 0 " fill-opacity="1" fill-rule="nonzero"/><path fill="#ffffff" d="M 0 0 L 247 0 L 247 247 L 0 247 Z M 0 0 " fill-opacity="1" fill-rule="nonzero"/><g clip-path="url(#41acadd750)"><g mask="url(#bd5e341d33)"><g transform="matrix(1.010459, 0, 0, 1.010459, 22.55585, 22.555871)"><image x="0" y="0" width="200" xlink:href="" height="200" preserveAspectRatio="xMidYMid meet"/></g></g></g></svg>
\ No newline at end of file
diff --git a/frontend/src/components/Modals/MangeWorkspace/DataConnectors/Connectors/WebsiteDepth/index.jsx b/frontend/src/components/Modals/MangeWorkspace/DataConnectors/Connectors/WebsiteDepth/index.jsx
new file mode 100644
index 0000000000000000000000000000000000000000..b3fc454530698ca9f8e35076332c26fe3a8d5ee2
--- /dev/null
+++ b/frontend/src/components/Modals/MangeWorkspace/DataConnectors/Connectors/WebsiteDepth/index.jsx
@@ -0,0 +1,139 @@
+import React, { useState } from "react";
+import System from "@/models/system";
+import showToast from "@/utils/toast";
+import pluralize from "pluralize";
+
+/**
+ * Form for the "website-depth" data connector: collects a start URL, a crawl
+ * depth and a link limit, then asks the backend to scrape the site.
+ */
+export default function WebsiteDepthOptions() {
+  const [loading, setLoading] = useState(false);
+
+  const handleSubmit = async (e) => {
+    e.preventDefault();
+    const form = new FormData(e.target);
+
+    try {
+      setLoading(true);
+      showToast("Scraping website - this may take a while.", "info", {
+        clear: true,
+        autoClose: false,
+      });
+
+      const { data, error } = await System.dataConnectors.websiteDepth.scrape({
+        url: form.get("url"),
+        // Always pass the radix so the inputs are read as base-10 integers.
+        depth: Number.parseInt(form.get("depth"), 10),
+        maxLinks: Number.parseInt(form.get("maxLinks"), 10),
+      });
+
+      if (!!error) {
+        showToast(error, "error", { clear: true });
+        return;
+      }
+
+      showToast(
+        `Successfully scraped ${data.length} ${pluralize(
+          "page",
+          data.length
+        )}!`,
+        "success",
+        { clear: true }
+      );
+      e.target.reset();
+    } catch (err) {
+      console.error(err);
+      showToast(err.message, "error", { clear: true });
+    } finally {
+      // Single exit point: re-enable the submit button on every outcome.
+      setLoading(false);
+    }
+  };
+
+  return (
+    <div className="flex w-full">
+      <div className="flex flex-col w-full px-1 md:pb-6 pb-16">
+        <form className="w-full" onSubmit={handleSubmit}>
+          <div className="w-full flex flex-col py-2">
+            <div className="w-full flex flex-col gap-4">
+              <div className="flex flex-col pr-10">
+                <div className="flex flex-col gap-y-1 mb-4">
+                  <label className="text-white text-sm font-bold">
+                    Website URL
+                  </label>
+                  <p className="text-xs font-normal text-white/50">
+                    URL of the website you want to scrape.
+                  </p>
+                </div>
+                <input
+                  type="url"
+                  name="url"
+                  className="border-none bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:border-white block w-full p-2.5"
+                  placeholder="https://example.com"
+                  required={true}
+                  autoComplete="off"
+                  spellCheck={false}
+                />
+              </div>
+              <div className="flex flex-col pr-10">
+                <div className="flex flex-col gap-y-1 mb-4">
+                  <label className="text-white text-sm font-bold">Depth</label>
+                  <p className="text-xs font-normal text-white/50">
+                    This is the number of child-links that the worker should
+                    follow from the origin URL.
+                  </p>
+                </div>
+                <input
+                  type="number"
+                  name="depth"
+                  min="1"
+                  max="5"
+                  className="border-none bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:border-white block w-full p-2.5"
+                  required={true}
+                  defaultValue="1"
+                />
+              </div>
+              <div className="flex flex-col pr-10">
+                <div className="flex flex-col gap-y-1 mb-4">
+                  <label className="text-white text-sm font-bold">
+                    Max Links
+                  </label>
+                  <p className="text-xs font-normal text-white/50">
+                    Maximum number of links to scrape.
+                  </p>
+                </div>
+                <input
+                  type="number"
+                  name="maxLinks"
+                  min="1"
+                  className="border-none bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:border-white block w-full p-2.5"
+                  required={true}
+                  defaultValue="20"
+                />
+              </div>
+            </div>
+          </div>
+
+          <div className="flex flex-col gap-y-2 w-full pr-10">
+            <button
+              type="submit"
+              disabled={loading}
+              className={`mt-2 w-full ${
+                loading ? "cursor-not-allowed animate-pulse" : ""
+              } justify-center border border-slate-200 px-4 py-2 rounded-lg text-[#222628] text-sm font-bold items-center flex gap-x-2 bg-slate-200 hover:bg-slate-300 hover:text-slate-800 disabled:bg-slate-300 disabled:cursor-not-allowed`}
+            >
+              {loading ? "Scraping website..." : "Submit"}
+            </button>
+            {loading && (
+              <p className="text-xs text-white/50">
+                Once complete, all scraped pages will be available for embedding
+                into workspaces in the document picker.
+              </p>
+            )}
+          </div>
+        </form>
+      </div>
+    </div>
+  );
+}
diff --git a/frontend/src/components/Modals/MangeWorkspace/DataConnectors/index.jsx b/frontend/src/components/Modals/MangeWorkspace/DataConnectors/index.jsx
index 69d30e281992f1aac77fe877959db853c3e9fcd3..5a03e19c1899acbbfd45cd994ab8a06dab2b7fa2 100644
--- a/frontend/src/components/Modals/MangeWorkspace/DataConnectors/index.jsx
+++ b/frontend/src/components/Modals/MangeWorkspace/DataConnectors/index.jsx
@@ -5,6 +5,7 @@ import YoutubeOptions from "./Connectors/Youtube";
 import ConfluenceOptions from "./Connectors/Confluence";
 import { useState } from "react";
 import ConnectorOption from "./ConnectorOption";
+import WebsiteDepthOptions from "./Connectors/WebsiteDepth";
 
 export const DATA_CONNECTORS = {
   github: {
@@ -21,6 +22,12 @@ export const DATA_CONNECTORS = {
       "Import the transcription of an entire YouTube video from a link.",
     options: <YoutubeOptions />,
   },
+  "website-depth": {
+    name: "Bulk Link Scraper",
+    image: ConnectorImages.websiteDepth,
+    description: "Scrape a website and its sub-links up to a certain depth.",
+    options: <WebsiteDepthOptions />,
+  },
   confluence: {
     name: "Confluence",
     image: ConnectorImages.confluence,
diff --git a/frontend/src/models/dataConnector.js b/frontend/src/models/dataConnector.js
index 19fa5f9124fbe9135e9571e88aee9752016d140b..d01c3c8b8074a82bc62d3de5e1287933ab245cdb 100644
--- a/frontend/src/models/dataConnector.js
+++ b/frontend/src/models/dataConnector.js
@@ -60,6 +60,30 @@ const DataConnector = {
       });
     },
   },
+  websiteDepth: {
+    /**
+     * POST a scrape request for `url` to the `/ext/website-depth` endpoint.
+     * Failed requests and unsuccessful responses are logged and reported
+     * through the `error` field instead of being thrown.
+     * @param {{url: string, depth: number, maxLinks: number}} options
+     * @returns {Promise<{data: object[]|null, error: string|null}>}
+     */
+    scrape: async ({ url, depth, maxLinks }) => {
+      return await fetch(`${API_BASE}/ext/website-depth`, {
+        method: "POST",
+        headers: baseHeaders(),
+        body: JSON.stringify({ url, depth, maxLinks }),
+      })
+        .then((res) => res.json())
+        .then((res) => {
+          if (!res.success) throw new Error(res.reason);
+          return { data: res.data, error: null };
+        })
+        .catch((e) => {
+          console.error(e);
+          return { data: null, error: e.message };
+        });
+    },
+  },
 
   confluence: {
     collect: async function ({ pageUrl, username, accessToken }) {
diff --git a/server/endpoints/extensions/index.js b/server/endpoints/extensions/index.js
index 07eb7130db6e6d0abd449951e4ed44c423f7fc6e..cf8e1191c229c2232abcc2c2e718690da858c40d 100644
--- a/server/endpoints/extensions/index.js
+++ b/server/endpoints/extensions/index.js
@@ -93,6 +93,29 @@ function extensionEndpoints(app) {
       }
     }
   );
+  // Forward website-depth scrape requests through `CollectorApi`; the route is
+  // guarded by `flexUserRoleValid` for the admin and manager roles.
+  app.post(
+    "/ext/website-depth",
+    [validatedRequest, flexUserRoleValid([ROLES.admin, ROLES.manager])],
+    async (request, response) => {
+      try {
+        const responseFromProcessor =
+          await new CollectorApi().forwardExtensionRequest({
+            endpoint: "/ext/website-depth",
+            method: "POST",
+            body: request.body,
+          });
+        await Telemetry.sendTelemetry("extension_invoked", {
+          type: "website_depth",
+        });
+        response.status(200).json(responseFromProcessor);
+      } catch (e) {
+        console.error(e);
+        response.sendStatus(500).end();
+      }
+    }
+  );
 }
 
 module.exports = { extensionEndpoints };