diff --git a/collector/main.py b/collector/main.py index efb05db12e3ace1acd42809ae9a0d0b48c931fb7..cd800eb9e306df9e94675b5034273442ee45502c 100644 --- a/collector/main.py +++ b/collector/main.py @@ -5,6 +5,7 @@ from scripts.link import link, links from scripts.substack import substack from scripts.medium import medium from scripts.gitbook import gitbook +from scripts.sitemap import sitemap def main(): if os.name == 'nt': @@ -13,7 +14,8 @@ def main(): '2': 'Article or Blog Link', '3': 'Substack', '4': 'Medium', - '5': 'Gitbook' + '5': 'Gitbook', + '6': 'Sitemap', } print("There are options for data collection to make this easier for you.\nType the number of the method you wish to execute.") print("1. YouTube Channel\n2. Article or Blog Link (Single)\n3. Substack\n4. Medium\n\n[In development]:\nTwitter\n\n") @@ -29,6 +31,7 @@ def main(): {"name": "Article or Blog Link(s)", "value": "Article or Blog Link(s)"}, {"name": "Gitbook", "value": "Gitbook"}, {"name": "Twitter", "value": "Twitter", "disabled": "Needs PR"}, + {"name": "Sitemap", "value": "Sitemap"}, {"name": "Abort", "value": "Abort"}, ], ).execute() @@ -62,6 +65,9 @@ def main(): if method == 'Gitbook': gitbook() exit(0) + if method == 'Sitemap': + sitemap() + exit(0) print("Selection was not valid.") exit(1) diff --git a/collector/scripts/link.py b/collector/scripts/link.py index 1f03a8f33c77ce24aaff074166d89d54c3f27de2..0dad18c68182295fbcf45f05200198ac623428e7 100644 --- a/collector/scripts/link.py +++ b/collector/scripts/link.py @@ -4,6 +4,7 @@ from requests_html import HTMLSession from langchain.document_loaders import UnstructuredHTMLLoader from .link_utils import append_meta from .utils import tokenize, ada_v2_cost +from requests.exceptions import ReadTimeout # Example Channel URL https://tim.blog/2022/08/09/nft-insider-trading-policy/ def link(): @@ -83,57 +84,71 @@ def links(): print("No valid links provided!") exit(1) - totalTokens = 0 - for link in links: - print(f"Working on {link}...") - session = 
HTMLSession() - req = session.get(link) - if(req.ok == False): - print(f"Could not reach {link} - skipping!") - continue - - req.html.render() - full_text = None - with tempfile.NamedTemporaryFile(mode = "w") as tmp: - tmp.write(req.html.html) - tmp.seek(0) - loader = UnstructuredHTMLLoader(tmp.name) - data = loader.load()[0] - full_text = data.page_content - tmp.close() - - link = append_meta(req, full_text, True) - if(len(full_text) > 0): - source = urlparse(req.url) - output_filename = f"website-{source.netloc}-{source.path.replace('/','_')}.json" - output_path = f"./outputs/website-logs" - - transaction_output_filename = f"article-{source.path.replace('/','_')}.json" - transaction_output_dir = f"../server/storage/documents/website-{source.netloc}" - - if os.path.isdir(output_path) == False: - os.makedirs(output_path) - - if os.path.isdir(transaction_output_dir) == False: - os.makedirs(transaction_output_dir) - - full_text = append_meta(req, full_text) - tokenCount = len(tokenize(full_text)) - link['pageContent'] = full_text - link['token_count_estimate'] = tokenCount - totalTokens += tokenCount - - with open(f"{output_path}/{output_filename}", 'w', encoding='utf-8') as file: - json.dump(link, file, ensure_ascii=True, indent=4) - - with open(f"{transaction_output_dir}/{transaction_output_filename}", 'w', encoding='utf-8') as file: - json.dump(link, file, ensure_ascii=True, indent=4) - else: - print(f"Could not parse any meaningful data from {link}.") - continue - - print(f"\n\n[Success]: {len(links)} article or link contents fetched!") - print(f"////////////////////////////") - print(f"Your estimated cost to embed this data using OpenAI's text-embedding-ada-002 model at $0.0004 / 1K tokens will cost {ada_v2_cost(totalTokens)} using {totalTokens} tokens.") - print(f"////////////////////////////") - exit(0) \ No newline at end of file + parse_links(links) + + + +# parse links from array +def parse_links(links): + totalTokens = 0 + for link in links: + if 
link.endswith(".pdf"): + print(f"Skipping PDF file: {link}") + continue + + print(f"Working on {link}...") + session = HTMLSession() + + req = session.get(link, timeout=20) + + if not req.ok: + print(f"Could not reach {link} - skipping!") + continue + + req.html.render(timeout=10) + + full_text = None + with tempfile.NamedTemporaryFile(mode="w") as tmp: + tmp.write(req.html.html) + tmp.seek(0) + loader = UnstructuredHTMLLoader(tmp.name) + data = loader.load()[0] + full_text = data.page_content + tmp.close() + + link = append_meta(req, full_text, True) + if len(full_text) > 0: + source = urlparse(req.url) + output_filename = f"website-{source.netloc}-{source.path.replace('/','_')}.json" + output_path = f"./outputs/website-logs" + + transaction_output_filename = f"article-{source.path.replace('/','_')}.json" + transaction_output_dir = f"../server/storage/documents/website-{source.netloc}" + + if not os.path.isdir(output_path): + os.makedirs(output_path) + + if not os.path.isdir(transaction_output_dir): + os.makedirs(transaction_output_dir) + + full_text = append_meta(req, full_text) + tokenCount = len(tokenize(full_text)) + link['pageContent'] = full_text + link['token_count_estimate'] = tokenCount + totalTokens += tokenCount + + with open(f"{output_path}/{output_filename}", 'w', encoding='utf-8') as file: + json.dump(link, file, ensure_ascii=True, indent=4) + + with open(f"{transaction_output_dir}/{transaction_output_filename}", 'w', encoding='utf-8') as file: + json.dump(link, file, ensure_ascii=True, indent=4) + + req.session.close() + else: + print(f"Could not parse any meaningful data from {link}.") + continue + + print(f"\n\n[Success]: {len(links)} article or link contents fetched!") + print(f"////////////////////////////") + print(f"Your estimated cost to embed this data using OpenAI's text-embedding-ada-002 model at $0.0004 / 1K tokens will cost {ada_v2_cost(totalTokens)} using {totalTokens} tokens.") + print(f"////////////////////////////") \ No newline at 
end of file diff --git a/collector/scripts/sitemap.py b/collector/scripts/sitemap.py new file mode 100644 index 0000000000000000000000000000000000000000..3895bcefb81a3e306a0e4e9f8e59040d0a3480ca --- /dev/null +++ b/collector/scripts/sitemap.py @@ -0,0 +1,27 @@ +import requests +import xml.etree.ElementTree as ET +from scripts.link import parse_links + +def parse_sitemap(url): + response = requests.get(url, timeout=20) + root = ET.fromstring(response.content) + + urls = [] + for element in root.iter('{http://www.sitemaps.org/schemas/sitemap/0.9}url'): + for loc in element.iter('{http://www.sitemaps.org/schemas/sitemap/0.9}loc'): + urls.append(loc.text) + + return urls + +# Example sitemap URL https://www.nerdwallet.com/blog/wp-sitemap-news-articles-1.xml +def sitemap(): + sitemap_url = input("Enter the URL of the sitemap: ") + + if(len(sitemap_url) == 0): + print("No valid sitemap provided!") + exit(1) + + url_array = parse_sitemap(sitemap_url) + + #parse links from array + parse_links(url_array)