From 9bc01afa7db56d3adf0ac5dfd8a7285a4a979a51 Mon Sep 17 00:00:00 2001
From: Sean Hatfield <seanhatfield5@gmail.com>
Date: Thu, 12 Dec 2024 06:01:52 +0800
Subject: [PATCH] Fix scraping failed bug in link/bulk link scrapers (#2807)

* fix scraping failed bug in link/bulk link scrapers

* reset submodule

* swap to networkidle2 as a safe mix for SPA and API-loaded pages, but also not hang on request heavy pages

* lint

---------

Co-authored-by: timothycarambat <rambat1010@gmail.com>
---
 collector/processLink/convert/generic.js         | 2 +-
 collector/utils/extensions/WebsiteDepth/index.js | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/collector/processLink/convert/generic.js b/collector/processLink/convert/generic.js
index c12d79ade..a5eb20ca9 100644
--- a/collector/processLink/convert/generic.js
+++ b/collector/processLink/convert/generic.js
@@ -61,7 +61,7 @@ async function getPageContent(link) {
         ignoreHTTPSErrors: true,
       },
       gotoOptions: {
-        waitUntil: "domcontentloaded",
+        waitUntil: "networkidle2",
       },
       async evaluate(page, browser) {
         const result = await page.evaluate(() => document.body.innerText);
diff --git a/collector/utils/extensions/WebsiteDepth/index.js b/collector/utils/extensions/WebsiteDepth/index.js
index d8b23144d..e680c0233 100644
--- a/collector/utils/extensions/WebsiteDepth/index.js
+++ b/collector/utils/extensions/WebsiteDepth/index.js
@@ -48,7 +48,7 @@ async function getPageLinks(url, baseUrl) {
   try {
     const loader = new PuppeteerWebBaseLoader(url, {
       launchOptions: { headless: "new" },
-      gotoOptions: { waitUntil: "domcontentloaded" },
+      gotoOptions: { waitUntil: "networkidle2" },
     });
     const docs = await loader.load();
     const html = docs[0].pageContent;
@@ -92,7 +92,7 @@ async function bulkScrapePages(links, outFolderPath) {
     try {
       const loader = new PuppeteerWebBaseLoader(link, {
         launchOptions: { headless: "new" },
-        gotoOptions: { waitUntil: "domcontentloaded" },
+        gotoOptions: { waitUntil: "networkidle2" },
         async evaluate(page, browser) {
           const result = await page.evaluate(() => document.body.innerText);
           await browser.close();
-- 
GitLab