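patch link scrape tool schema: append_meta now emits id, docAuthor, docSource, chunkSource, pageContent and token_count_estimate in the metadata, and link.py no longer assigns pageContent / token_count_estimate itself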

c5dc6863 · timothycarambat · 085745c5 · c5dc6863 · c5dc6863
Commit c5dc6863 authored 1 year ago by timothycarambat
--- a/collector/scripts/link.py
+++ b/collector/scripts/link.py
@@ -2,7 +2,7 @@ import os, json, tempfile
 from urllib.parse import urlparse
 from requests_html import HTMLSession
 from langchain.document_loaders import UnstructuredHTMLLoader
-from .link_utils import  append_meta
+from .link_utils import append_meta
 from .utils import tokenize, ada_v2_cost
 import requests
 from bs4 import BeautifulSoup
@@ -47,10 +47,6 @@ def link():
      os.makedirs(transaction_output_dir)
    full_text = append_meta(req, full_text)
-    tokenCount = len(tokenize(full_text))
-    link['pageContent'] = full_text
-    link['token_count_estimate'] = tokenCount
    with open(f"{output_path}/{output_filename}", 'w', encoding='utf-8') as file:
      json.dump(link, file, ensure_ascii=True, indent=4)
@@ -159,8 +155,6 @@ def parse_links(links):
            full_text = append_meta(req, full_text)
            tokenCount = len(tokenize(full_text))
-            link['pageContent'] = full_text
-            link['token_count_estimate'] = tokenCount
            totalTokens += tokenCount
            with open(f"{output_path}/{output_filename}", 'w', encoding='utf-8') as file:

--- a/collector/scripts/link_utils.py
+++ b/collector/scripts/link_utils.py
 import json
 from datetime import datetime
 from dotenv import load_dotenv
+from .watch.utils import guid
+from .utils import tokenize
 load_dotenv()
 def append_meta(request, text, metadata_only = False):
  meta = {
+    'id': guid(),
    'url': request.url,
    'title': request.html.find('title', first=True).text if len(request.html.find('title')) != 0 else '',
+    'docAuthor': 'N/A',
+    'docSource': 'webpage',
+    'chunkSource': request.url,
    'description': request.html.find('meta[name="description"]', first=True).attrs.get('content') if  request.html.find('meta[name="description"]', first=True) != None else '',
    'published':request.html.find('meta[property="article:published_time"]', first=True).attrs.get('content') if request.html.find('meta[property="article:published_time"]', first=True) != None else datetime.today().strftime('%Y-%m-%d %H:%M:%S'),
    'wordCount': len(text.split(' ')),
+    'pageContent': text,
+    'token_count_estimate':len(tokenize(text)),
  }
  return "Article JSON Metadata:\n"+json.dumps(meta)+"\n\n\nText Content:\n" + text if metadata_only == False else meta