Skip to content
Snippets Groups Projects
Unverified Commit 4118c9dc authored by Skid Vis's avatar Skid Vis Committed by GitHub
Browse files

Blocks images in sitemaps from being parsed. (#56)

* Adds ability to import sitemaps to include a website

* adds example sitemap url

* adds filter to bypass common image formats

* moves filetype ignoring to sitemap script
parent 24038069
No related branches found
No related tags found
No related merge requests found
......@@ -4,7 +4,6 @@ from requests_html import HTMLSession
from langchain.document_loaders import UnstructuredHTMLLoader
from .link_utils import append_meta
from .utils import tokenize, ada_v2_cost
from requests.exceptions import ReadTimeout
# Example Channel URL https://tim.blog/2022/08/09/nft-insider-trading-policy/
def link():
......@@ -91,11 +90,7 @@ def links():
# parse links from array
def parse_links(links):
totalTokens = 0
for link in links:
if link.endswith(".pdf"):
print(f"Skipping PDF file: {link}")
continue
for link in links:
print(f"Working on {link}...")
session = HTMLSession()
......
import requests
import xml.etree.ElementTree as ET
from scripts.link import parse_links
import re
def parse_sitemap(url):
response = requests.get(url)
......@@ -9,7 +10,10 @@ def parse_sitemap(url):
urls = []
for element in root.iter('{http://www.sitemaps.org/schemas/sitemap/0.9}url'):
for loc in element.iter('{http://www.sitemaps.org/schemas/sitemap/0.9}loc'):
urls.append(loc.text)
if not has_extension_to_ignore(loc.text):
urls.append(loc.text)
else:
print(f"Skipping filetype: {loc.text}")
return urls
......@@ -25,3 +29,11 @@ def sitemap():
#parse links from array
parse_links(url_array)
def has_extension_to_ignore(string):
    """Return True when a URL points at a file type that should be skipped.

    The check looks only at the *path* component of the URL, so a query
    string or fragment (``photo.jpg?w=200``, ``doc.pdf#page=2``) does not
    hide the extension, and an extension-like token elsewhere in the URL
    (``/how-to.pdf-guide/``, ``files.png.example.com``) does not cause a
    regular page to be skipped.

    Args:
        string: The URL (or plain file name) to inspect. May be ``None`` or
            empty, e.g. when a sitemap ``<loc>`` element has no text.

    Returns:
        bool: True if the path ends with one of the ignored extensions
        (case-insensitive), otherwise False.
    """
    # Imported locally so the function does not depend on a module-level
    # import that may not be present in the file.
    from urllib.parse import urlsplit

    # Images plus PDF: formats the HTML link parser cannot process.
    ignored_extensions = ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.pdf')

    # An empty <loc/> yields None; there is nothing to ignore in that case.
    if not string:
        return False

    # Sitemap text nodes frequently carry surrounding whitespace/newlines.
    candidate = string.strip()
    try:
        path = urlsplit(candidate).path
    except ValueError:
        # Malformed URL (e.g. a broken IPv6 literal): fall back to the raw text.
        path = candidate

    # str.endswith accepts a tuple, so one call covers every extension.
    return path.lower().endswith(ignored_extensions)
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment