link.py
    import os, json, tempfile
    from urllib.parse import urlparse
    from requests_html import HTMLSession
    from langchain.document_loaders import UnstructuredHTMLLoader
    from .link_utils import append_meta
    from .utils import tokenize, ada_v2_cost
    
    import requests
    from bs4 import BeautifulSoup
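    # Collectors for online articles: link() fetches a single URL, crawler() scans a root
    # page for matching links, and links() accepts a pasted list. Each page is rendered,
    # parsed to plain text, and saved as JSON with metadata and a token-count estimate.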
    
    # Example article URL https://tim.blog/2022/08/09/nft-insider-trading-policy/
    def link():
      print("[NOTICE]: The first time running this process it will download supporting libraries.\n\n")
      fqdn_link = input("Paste in the URL of an online article or blog: ")
      if(len(fqdn_link) == 0):
        print("Invalid URL!")
        exit(1)
    
      session = HTMLSession()
      req = session.get(fqdn_link)
      if not req.ok:
        print("Could not reach this url!")
        exit(1)
      
      req.html.render()  # execute the page's JavaScript; a headless browser is downloaded on first run
      full_text = None
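      # write the rendered HTML to a temp file so UnstructuredHTMLLoader can parse it into plain text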
      with tempfile.NamedTemporaryFile(mode = "w") as tmp:
        tmp.write(req.html.html)
        tmp.seek(0)
        loader = UnstructuredHTMLLoader(tmp.name)
        data = loader.load()[0]
        full_text = data.page_content
        tmp.close()
      
      link = append_meta(req, full_text, True)
      if(len(full_text) > 0):
        source = urlparse(req.url)
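        # two copies are written: a log under ./outputs and a document in the server's storage folder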
        output_filename = f"website-{source.netloc}-{source.path.replace('/','_')}.json"
        output_path = f"./outputs/website-logs"
    
        transaction_output_filename = f"article-{source.path.replace('/','_')}.json"
    
        transaction_output_dir = f"../server/storage/documents/website-{source.netloc}"
    
        if not os.path.isdir(output_path):
          os.makedirs(output_path)
    
        if not os.path.isdir(transaction_output_dir):
          os.makedirs(transaction_output_dir)
    
        full_text = append_meta(req, full_text)
        tokenCount = len(tokenize(full_text))
        link['pageContent'] = full_text
        link['token_count_estimate'] = tokenCount
    
        with open(f"{output_path}/{output_filename}", 'w', encoding='utf-8') as file:
          json.dump(link, file, ensure_ascii=True, indent=4)
    
        with open(f"{transaction_output_dir}/{transaction_output_filename}", 'w', encoding='utf-8') as file:
          json.dump(link, file, ensure_ascii=True, indent=4)
      else:
        print("Could not parse any meaningful data from this link or url.")
        exit(1)
    
      print(f"\n\n[Success]: article or link content fetched!")
      print(f"////////////////////////////")
      print(f"Embedding this data with OpenAI's text-embedding-ada-002 model ($0.0004 / 1K tokens) will cost an estimated {ada_v2_cost(tokenCount)} for {tokenCount} tokens.")
      print(f"////////////////////////////")
      exit(0)
    
    
    def crawler():
      prompt = "Paste in the root URI of the pages of interest: "
      new_link = input(prompt)
    
      filter_value = input("Add a filter value for the URL to ensure links don't wander too far, e.g. 'my-domain.com': ")
    
      # extract the root site (scheme + host) from the URI provided
      root_site = urlparse(new_link).scheme + "://" + urlparse(new_link).hostname
      links = []
      urls = new_link
      links.append(new_link)
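      # fetch the root page; every anchor matching the filter is queued for parse_links()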
      grab = requests.get(urls)
      soup = BeautifulSoup(grab.text, 'html.parser')
    
      # traverse anchor tags from soup
      for link in soup.find_all("a"):
    
        data = link.get('href')
        if (data is not None):
    
          fullpath = data if data[0] != '/' else f"{root_site}{data}"  # resolve site-relative hrefs against the root site
          try:
            parsed = urlparse(fullpath)
            destination = parsed.scheme + "://" + parsed.hostname + (parsed.path if parsed.path is not None else '')
            if filter_value in destination:
              data = destination.strip()
              print (data)
              links.append(data)
            else:
              print (data + " does not apply for linking...")
          except:
    
            print (data + " does not apply for linking...")
    
      #parse the links found  
      parse_links(links)
    
    
    def links():
      links = []
      prompt = "Paste in the URL of an online article or blog: "
      done = False
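      # collect URLs until an empty line is submitted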
      
      while not done:
        new_link = input(prompt)
        if(len(new_link) == 0): 
          done = True
          links = [*set(links)]  # de-duplicate the queue
          continue
    
        links.append(new_link)
        prompt = f"\n{len(links)} links in queue. Submit an empty value when done pasting in links to execute collection.\nPaste in the next URL of an online article or blog: "
    
      if(len(links) == 0):
        print("No valid links provided!")
        exit(1)
    
    
      parse_links(links)
    
    
    # parse links from array
    def parse_links(links):
        totalTokens = 0

        for link in links:
            print(f"Working on {link}...")
            session = HTMLSession()
            
            req = session.get(link, timeout=20) 
    
            if not req.ok:
                print(f"Could not reach {link} - skipping!")
                continue
            
            req.html.render(timeout=10)  # execute the page's JavaScript before parsing
    
            full_text = None
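            # as in link(): dump the rendered HTML to a temp file for UnstructuredHTMLLoader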
            with tempfile.NamedTemporaryFile(mode="w") as tmp:
                tmp.write(req.html.html)
                tmp.seek(0)
                loader = UnstructuredHTMLLoader(tmp.name)
                data = loader.load()[0]
                full_text = data.page_content
                tmp.close()
            
            link = append_meta(req, full_text, True)
            if len(full_text) > 0:
                source = urlparse(req.url)
                output_filename = f"website-{source.netloc}-{source.path.replace('/','_')}.json"
                output_path = f"./outputs/website-logs"
    
                transaction_output_filename = f"article-{source.path.replace('/','_')}.json"
                transaction_output_dir = f"../server/storage/documents/website-{source.netloc}"
    
                if not os.path.isdir(output_path):
                    os.makedirs(output_path)
    
                if not os.path.isdir(transaction_output_dir):
                    os.makedirs(transaction_output_dir)
    
                full_text = append_meta(req, full_text)
                tokenCount = len(tokenize(full_text))
                link['pageContent'] = full_text
                link['token_count_estimate'] = tokenCount
                totalTokens += tokenCount
    
                with open(f"{output_path}/{output_filename}", 'w', encoding='utf-8') as file:
                    json.dump(link, file, ensure_ascii=True, indent=4)
    
                with open(f"{transaction_output_dir}/{transaction_output_filename}", 'w', encoding='utf-8') as file:
                    json.dump(link, file, ensure_ascii=True, indent=4)
    
                req.session.close()
            else:
                print(f"Could not parse any meaningful data from {req.url} - skipping!")
                continue    
    
        print(f"\n\n[Success]: {len(links)} article or link contents fetched!")
        print(f"////////////////////////////")
        print(f"Embedding this data with OpenAI's text-embedding-ada-002 model ($0.0004 / 1K tokens) will cost an estimated {ada_v2_cost(totalTokens)} for {totalTokens} tokens.")
        print(f"////////////////////////////")