From 9bc01afa7db56d3adf0ac5dfd8a7285a4a979a51 Mon Sep 17 00:00:00 2001
From: Sean Hatfield <seanhatfield5@gmail.com>
Date: Thu, 12 Dec 2024 06:01:52 +0800
Subject: [PATCH] Fix scraping failed bug in link/bulk link scrapers (#2807)

* fix scraping failed bug in link/bulk link scrapers

* reset submodule

* swap to networkidle2 as a safe middle ground: it waits for SPA and API-loaded pages to render, but does not hang on request-heavy pages

* lint

---------

Co-authored-by: timothycarambat <rambat1010@gmail.com>
---
 collector/processLink/convert/generic.js         | 2 +-
 collector/utils/extensions/WebsiteDepth/index.js | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/collector/processLink/convert/generic.js b/collector/processLink/convert/generic.js
index c12d79ade..a5eb20ca9 100644
--- a/collector/processLink/convert/generic.js
+++ b/collector/processLink/convert/generic.js
@@ -61,7 +61,7 @@ async function getPageContent(link) {
         ignoreHTTPSErrors: true,
       },
       gotoOptions: {
-        waitUntil: "domcontentloaded",
+        waitUntil: "networkidle2",
       },
       async evaluate(page, browser) {
         const result = await page.evaluate(() => document.body.innerText);
diff --git a/collector/utils/extensions/WebsiteDepth/index.js b/collector/utils/extensions/WebsiteDepth/index.js
index d8b23144d..e680c0233 100644
--- a/collector/utils/extensions/WebsiteDepth/index.js
+++ b/collector/utils/extensions/WebsiteDepth/index.js
@@ -48,7 +48,7 @@ async function getPageLinks(url, baseUrl) {
   try {
     const loader = new PuppeteerWebBaseLoader(url, {
       launchOptions: { headless: "new" },
-      gotoOptions: { waitUntil: "domcontentloaded" },
+      gotoOptions: { waitUntil: "networkidle2" },
     });
     const docs = await loader.load();
     const html = docs[0].pageContent;
@@ -92,7 +92,7 @@ async function bulkScrapePages(links, outFolderPath) {
     try {
       const loader = new PuppeteerWebBaseLoader(link, {
         launchOptions: { headless: "new" },
-        gotoOptions: { waitUntil: "domcontentloaded" },
+        gotoOptions: { waitUntil: "networkidle2" },
         async evaluate(page, browser) {
           const result = await page.evaluate(() => document.body.innerText);
           await browser.close();
-- 
GitLab