diff --git a/.gitignore b/.gitignore index 0725f47c0dabe2afe5f1c63b0f4424a4c4433100..a1d96b6e2b17ee58b1027fb564d5529f4e06f10e 100644 --- a/.gitignore +++ b/.gitignore @@ -7,4 +7,4 @@ __pycache__ v-env .DS_Store aws_cf_deploy_anything_llm.json - +yarn.lock diff --git a/collector/scripts/link.py b/collector/scripts/link.py index 17a532cb0cc5cc549058949bf938a4602395ca5c..2bc604e99c12af4826517422338f959b8945f20d 100644 --- a/collector/scripts/link.py +++ b/collector/scripts/link.py @@ -80,12 +80,14 @@ def crawler(): # traverse paragraphs from soup for link in soup.find_all("a"): - data = link.get('href').strip() - if filter_value in data: - print (data) - links.append(root_site + data) - else: - print (data + " does not apply for linking...") + data = link.get('href') + if (data is not None): + if filter_value in data: + data = data.strip() + print (data) + links.append(root_site + data) + else: + print (data + " does not apply for linking...") #parse the links found parse_links(links)