From 41522cdfb450fdef503a94351303da3d6d20f917 Mon Sep 17 00:00:00 2001 From: Sean Hatfield <seanhatfield5@gmail.com> Date: Thu, 17 Oct 2024 17:04:00 -0700 Subject: [PATCH] Handle non-ascii characters in single and bulk link scraper URLs (#2495) handle non-ascii characters in urls --- collector/processLink/convert/generic.js | 3 ++- collector/utils/extensions/WebsiteDepth/index.js | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/collector/processLink/convert/generic.js b/collector/processLink/convert/generic.js index c24e9dd3b..64fc0a0b7 100644 --- a/collector/processLink/convert/generic.js +++ b/collector/processLink/convert/generic.js @@ -27,7 +27,8 @@ async function scrapeGenericUrl(link, textOnly = false) { } const url = new URL(link); - const filename = (url.host + "-" + url.pathname).replace(".", "_"); + const decodedPathname = decodeURIComponent(url.pathname); + const filename = `${url.hostname}${decodedPathname.replace(/\//g, '_')}`; const data = { id: v4(), diff --git a/collector/utils/extensions/WebsiteDepth/index.js b/collector/utils/extensions/WebsiteDepth/index.js index d00718129..e7d26d99a 100644 --- a/collector/utils/extensions/WebsiteDepth/index.js +++ b/collector/utils/extensions/WebsiteDepth/index.js @@ -108,7 +108,8 @@ async function bulkScrapePages(links, outFolderPath) { } const url = new URL(link); - const filename = (url.host + "-" + url.pathname).replace(".", "_"); + const decodedPathname = decodeURIComponent(url.pathname); + const filename = `${url.hostname}${decodedPathname.replace(/\//g, '_')}`; const data = { id: v4(), -- GitLab