From 41522cdfb450fdef503a94351303da3d6d20f917 Mon Sep 17 00:00:00 2001
From: Sean Hatfield <seanhatfield5@gmail.com>
Date: Thu, 17 Oct 2024 17:04:00 -0700
Subject: [PATCH] Handle non-ascii characters in single and bulk link scraper
 URLs (#2495)

Decode the percent-encoded URL pathname before building the output filename so that non-ASCII characters in URLs are handled.
---
 collector/processLink/convert/generic.js         | 3 ++-
 collector/utils/extensions/WebsiteDepth/index.js | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/collector/processLink/convert/generic.js b/collector/processLink/convert/generic.js
index c24e9dd3b..64fc0a0b7 100644
--- a/collector/processLink/convert/generic.js
+++ b/collector/processLink/convert/generic.js
@@ -27,7 +27,8 @@ async function scrapeGenericUrl(link, textOnly = false) {
   }
 
   const url = new URL(link);
-  const filename = (url.host + "-" + url.pathname).replace(".", "_");
+  const decodedPathname = decodeURIComponent(url.pathname);
+  const filename = `${url.hostname}${decodedPathname.replace(/\//g, '_')}`;
 
   const data = {
     id: v4(),
diff --git a/collector/utils/extensions/WebsiteDepth/index.js b/collector/utils/extensions/WebsiteDepth/index.js
index d00718129..e7d26d99a 100644
--- a/collector/utils/extensions/WebsiteDepth/index.js
+++ b/collector/utils/extensions/WebsiteDepth/index.js
@@ -108,7 +108,8 @@ async function bulkScrapePages(links, outFolderPath) {
       }
 
       const url = new URL(link);
-      const filename = (url.host + "-" + url.pathname).replace(".", "_");
+      const decodedPathname = decodeURIComponent(url.pathname);
+      const filename = `${url.hostname}${decodedPathname.replace(/\//g, '_')}`;
 
       const data = {
         id: v4(),
-- 
GitLab