diff --git a/collector/scripts/link.py b/collector/scripts/link.py index a8e9db44e73f4f1667ebfbae7127a0886152842e..6ee3f23ea25d58f25ce25984002d7f9f61d2cac3 100644 --- a/collector/scripts/link.py +++ b/collector/scripts/link.py @@ -2,7 +2,7 @@ import os, json, tempfile from urllib.parse import urlparse from requests_html import HTMLSession from langchain.document_loaders import UnstructuredHTMLLoader -from .link_utils import append_meta +from .link_utils import append_meta from .utils import tokenize, ada_v2_cost import requests from bs4 import BeautifulSoup @@ -47,10 +47,6 @@ def link(): os.makedirs(transaction_output_dir) full_text = append_meta(req, full_text) - tokenCount = len(tokenize(full_text)) - link['pageContent'] = full_text - link['token_count_estimate'] = tokenCount - with open(f"{output_path}/{output_filename}", 'w', encoding='utf-8') as file: json.dump(link, file, ensure_ascii=True, indent=4) @@ -159,8 +155,6 @@ def parse_links(links): full_text = append_meta(req, full_text) tokenCount = len(tokenize(full_text)) - link['pageContent'] = full_text - link['token_count_estimate'] = tokenCount totalTokens += tokenCount with open(f"{output_path}/{output_filename}", 'w', encoding='utf-8') as file: diff --git a/collector/scripts/link_utils.py b/collector/scripts/link_utils.py index 913653cc89411578d2cfd8681f176c9a9e20f5c6..934adf0c7ed102379846815f9c33b5acf58766cc 100644 --- a/collector/scripts/link_utils.py +++ b/collector/scripts/link_utils.py @@ -1,14 +1,22 @@ import json from datetime import datetime from dotenv import load_dotenv +from .watch.utils import guid +from .utils import tokenize load_dotenv() def append_meta(request, text, metadata_only = False): meta = { + 'id': guid(), 'url': request.url, 'title': request.html.find('title', first=True).text if len(request.html.find('title')) != 0 else '', + 'docAuthor': 'N/A', + 'docSource': 'webpage', + 'chunkSource': request.url, 'description': request.html.find('meta[name="description"]', 
first=True).attrs.get('content') if request.html.find('meta[name="description"]', first=True) != None else '', 'published':request.html.find('meta[property="article:published_time"]', first=True).attrs.get('content') if request.html.find('meta[property="article:published_time"]', first=True) != None else datetime.today().strftime('%Y-%m-%d %H:%M:%S'), 'wordCount': len(text.split(' ')), + 'pageContent': text, + 'token_count_estimate': len(tokenize(text)), } return "Article JSON Metadata:\n"+json.dumps(meta)+"\n\n\nText Content:\n" + text if metadata_only == False else meta