From a52b0ae655389270662318a85c36d4e0aa7ae320 Mon Sep 17 00:00:00 2001
From: AntonioCiolino <antonio.ciolino@gmail.com>
Date: Mon, 19 Jun 2023 15:07:26 -0400
Subject: [PATCH] Updated Link scraper to avoid NoneType error. (#90)

* Enable web scraping based on a URL and a simple filter.

* ignore yarn

* Updated Link scraper to avoid NoneType error.
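
Background: anchor tags without an href attribute (e.g. <a name="top">)
make BeautifulSoup's Tag.get('href') return None, so the old chained
.strip() call raised "AttributeError: 'NoneType' object has no attribute
'strip'". A minimal standalone sketch of the failure mode and the guard
(illustration only; the sample HTML is hypothetical, not from this repo):

    from bs4 import BeautifulSoup

    html = '<a href=" /docs ">Docs</a><a name="top">no href</a>'
    soup = BeautifulSoup(html, "html.parser")

    for link in soup.find_all("a"):
        data = link.get("href")   # None for the second tag
        if data is not None:      # guard before calling str methods
            print(data.strip())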
---
 .gitignore                |  2 +-
 collector/scripts/link.py | 14 ++++++++------
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/.gitignore b/.gitignore
index 0725f47c0..a1d96b6e2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,4 +7,4 @@ __pycache__
 v-env
 .DS_Store
 aws_cf_deploy_anything_llm.json
-
+yarn.lock
diff --git a/collector/scripts/link.py b/collector/scripts/link.py
index 17a532cb0..2bc604e99 100644
--- a/collector/scripts/link.py
+++ b/collector/scripts/link.py
@@ -80,12 +80,14 @@ def crawler():
 
   # traverse paragraphs from soup
   for link in soup.find_all("a"):
-    data = link.get('href').strip()
-    if filter_value in data:
-      print (data)
-      links.append(root_site + data)
-    else:
-       print (data + " does not apply for linking...")
+    data = link.get('href')
+    if data is not None:
+      data = data.strip()
+      if filter_value in data:
+        print(data)
+        links.append(root_site + data)
+      else:
+        print(data + " does not apply for linking...")
   #parse the links found  
   parse_links(links)
 
-- 
GitLab