Skip to content
Snippets Groups Projects
Unverified Commit e7ba0284 authored by AntonioCiolino's avatar AntonioCiolino Committed by GitHub
Browse files

Enable web scraping based on a URL and a simple filter. (#73)

parent 81b21593
No related branches found
No related tags found
No related merge requests found
import os import os
from InquirerPy import inquirer from InquirerPy import inquirer
from scripts.youtube import youtube from scripts.youtube import youtube
from scripts.link import link, links from scripts.link import link, links, crawler
from scripts.substack import substack from scripts.substack import substack
from scripts.medium import medium from scripts.medium import medium
from scripts.gitbook import gitbook from scripts.gitbook import gitbook
...@@ -42,6 +42,7 @@ def main(): ...@@ -42,6 +42,7 @@ def main():
choices=[ choices=[
{"name": "Single URL", "value": "Single URL"}, {"name": "Single URL", "value": "Single URL"},
{"name": "Multiple URLs", "value": "Multiple URLs"}, {"name": "Multiple URLs", "value": "Multiple URLs"},
{"name": "URL Crawler", "value": "URL Crawler"},
{"name": "Abort", "value": "Abort"}, {"name": "Abort", "value": "Abort"},
], ],
).execute() ).execute()
...@@ -51,6 +52,9 @@ def main(): ...@@ -51,6 +52,9 @@ def main():
if method == 'Multiple URLs': if method == 'Multiple URLs':
links() links()
exit(0) exit(0)
if method == 'URL Crawler':
crawler()
exit(0)
if method == 'Abort': exit(0) if method == 'Abort': exit(0)
if method == 'YouTube Channel': if method == 'YouTube Channel':
......
...@@ -4,6 +4,8 @@ from requests_html import HTMLSession ...@@ -4,6 +4,8 @@ from requests_html import HTMLSession
from langchain.document_loaders import UnstructuredHTMLLoader from langchain.document_loaders import UnstructuredHTMLLoader
from .link_utils import append_meta from .link_utils import append_meta
from .utils import tokenize, ada_v2_cost from .utils import tokenize, ada_v2_cost
import requests
from bs4 import BeautifulSoup
# Example Channel URL https://tim.blog/2022/08/09/nft-insider-trading-policy/ # Example Channel URL https://tim.blog/2022/08/09/nft-insider-trading-policy/
def link(): def link():
...@@ -64,6 +66,29 @@ def link(): ...@@ -64,6 +66,29 @@ def link():
print(f"////////////////////////////") print(f"////////////////////////////")
exit(0) exit(0)
def crawler():
    """Crawl one root page for links matching a filter and parse them.

    Prompts the user for a root URL and a substring filter, fetches the
    page, collects every anchor whose href contains the filter, and hands
    the collected URLs (root page included) to parse_links().
    """
    # Local import: urljoin correctly resolves relative, absolute and
    # scheme-relative hrefs against the page URL.
    from urllib.parse import urljoin

    prompt = "Paste in root URI of the pages of interest: "
    new_link = input(prompt)
    filter_value = input("Add a filter value for the url to ensure links don't wander too far: ")

    # Always include the root page itself. Named to avoid shadowing the
    # sibling links() / link() functions in this module.
    found_links = [new_link]

    grab = requests.get(new_link)
    soup = BeautifulSoup(grab.text, 'html.parser')

    # Traverse anchors from the fetched page.
    for anchor in soup.find_all("a"):
        href = anchor.get('href')
        if href is None:
            # <a> tags without an href (named anchors, JS-only links)
            # would have crashed the original .strip() call.
            continue
        data = href.strip()
        if filter_value in data:
            print(data)
            # urljoin fixes the original root_site + data concatenation,
            # which doubled the scheme/host for absolute hrefs and broke
            # relative hrefs lacking a leading slash.
            found_links.append(urljoin(new_link, data))
        else:
            print(data + " does not apply for linking...")

    # Parse the links found.
    parse_links(found_links)
def links(): def links():
links = [] links = []
prompt = "Paste in the URL of an online article or blog: " prompt = "Paste in the URL of an online article or blog: "
...@@ -86,7 +111,6 @@ def links(): ...@@ -86,7 +111,6 @@ def links():
parse_links(links) parse_links(links)
# parse links from array # parse links from array
def parse_links(links): def parse_links(links):
totalTokens = 0 totalTokens = 0
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment