Skip to content
Snippets Groups Projects
Commit c5dc6863 authored by timothycarambat
Browse files

patch link scrape tool schema

parent 085745c5
No related branches found
No related tags found
No related merge requests found
...@@ -2,7 +2,7 @@ import os, json, tempfile ...@@ -2,7 +2,7 @@ import os, json, tempfile
from urllib.parse import urlparse from urllib.parse import urlparse
from requests_html import HTMLSession from requests_html import HTMLSession
from langchain.document_loaders import UnstructuredHTMLLoader from langchain.document_loaders import UnstructuredHTMLLoader
from .link_utils import append_meta from .link_utils import append_meta
from .utils import tokenize, ada_v2_cost from .utils import tokenize, ada_v2_cost
import requests import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
...@@ -47,10 +47,6 @@ def link(): ...@@ -47,10 +47,6 @@ def link():
os.makedirs(transaction_output_dir) os.makedirs(transaction_output_dir)
full_text = append_meta(req, full_text) full_text = append_meta(req, full_text)
tokenCount = len(tokenize(full_text))
link['pageContent'] = full_text
link['token_count_estimate'] = tokenCount
with open(f"{output_path}/{output_filename}", 'w', encoding='utf-8') as file: with open(f"{output_path}/{output_filename}", 'w', encoding='utf-8') as file:
json.dump(link, file, ensure_ascii=True, indent=4) json.dump(link, file, ensure_ascii=True, indent=4)
...@@ -159,8 +155,6 @@ def parse_links(links): ...@@ -159,8 +155,6 @@ def parse_links(links):
full_text = append_meta(req, full_text) full_text = append_meta(req, full_text)
tokenCount = len(tokenize(full_text)) tokenCount = len(tokenize(full_text))
link['pageContent'] = full_text
link['token_count_estimate'] = tokenCount
totalTokens += tokenCount totalTokens += tokenCount
with open(f"{output_path}/{output_filename}", 'w', encoding='utf-8') as file: with open(f"{output_path}/{output_filename}", 'w', encoding='utf-8') as file:
......
import json import json
from datetime import datetime from datetime import datetime
from dotenv import load_dotenv from dotenv import load_dotenv
from .watch.utils import guid
from .utils import tokenize
load_dotenv() load_dotenv()
def append_meta(request, text, metadata_only = False): def append_meta(request, text, metadata_only = False):
meta = { meta = {
'id': guid(),
'url': request.url, 'url': request.url,
'title': request.html.find('title', first=True).text if len(request.html.find('title')) != 0 else '', 'title': request.html.find('title', first=True).text if len(request.html.find('title')) != 0 else '',
'docAuthor': 'N/A',
'docSource': 'webpage',
'chunkSource': request.url,
'description': request.html.find('meta[name="description"]', first=True).attrs.get('content') if request.html.find('meta[name="description"]', first=True) != None else '', 'description': request.html.find('meta[name="description"]', first=True).attrs.get('content') if request.html.find('meta[name="description"]', first=True) != None else '',
'published':request.html.find('meta[property="article:published_time"]', first=True).attrs.get('content') if request.html.find('meta[property="article:published_time"]', first=True) != None else datetime.today().strftime('%Y-%m-%d %H:%M:%S'), 'published':request.html.find('meta[property="article:published_time"]', first=True).attrs.get('content') if request.html.find('meta[property="article:published_time"]', first=True) != None else datetime.today().strftime('%Y-%m-%d %H:%M:%S'),
'wordCount': len(text.split(' ')), 'wordCount': len(text.split(' ')),
'pageContent': text,
'token_count_estimate':len(tokenize(text)),
} }
return "Article JSON Metadata:\n"+json.dumps(meta)+"\n\n\nText Content:\n" + text if metadata_only == False else meta return "Article JSON Metadata:\n"+json.dumps(meta)+"\n\n\nText Content:\n" + text if metadata_only == False else meta
0% Loading. Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment