Unverified commit f40309cf authored by Sean Hatfield, committed by GitHub
Add id to all metadata to prevent errors in frontend document picker (#378)


add id to all metadata to prevent errors in frontend document picker

Co-authored-by: timothycarambat <rambat1010@gmail.com>
parent 73f342eb
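The change applies the same pattern in each collector script (substack, twitter, youtube): every metadata dict handed to the frontend document picker now carries a unique 'id' generated by guid() from .watch.utils. A minimal sketch of that pattern, assuming guid() simply returns a UUID string (the real helper lives in .watch.utils and may differ):

# Illustrative sketch only (not the committed code). Assumes guid()
# returns a random UUID string; the real helper is imported from
# .watch.utils in each script, and real meta dicts carry more fields.
from uuid import uuid4

def guid():
  return str(uuid4())

def append_meta(publication, text):
  # Every metadata object now includes a unique 'id' so the frontend
  # document picker can key each document without collisions.
  return {
    'id': guid(),
    'url': publication.get('canonical_url'),
    'title': publication.get('title'),
    'pageContent': text,
  }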
import os, json, requests, tempfile
from requests_html import HTMLSession
from langchain.document_loaders import UnstructuredHTMLLoader
from .watch.utils import guid

def fetch_all_publications(subdomain):
  file_path = f"./outputs/substack-logs/substack-{subdomain}.json"
  if os.path.isdir("./outputs/substack-logs") == False:
    os.makedirs("./outputs/substack-logs")

  if os.path.exists(file_path):
    with open(file_path, "r") as file:
      print(f"Returning cached data for substack {subdomain}.substack.com. If you do not wish to use stored data then delete the file for this newsletter to allow refetching.")
@@ -24,7 +25,7 @@ def fetch_all_publications(subdomain):
      print("Bad response - exiting collection")
      collecting = False
      continue

    data = response.json()
    if(len(data) == 0):
@@ -34,11 +35,11 @@ def fetch_all_publications(subdomain):
    for publication in data:
      publications.append(publication)
    offset = len(publications)

  with open(file_path, 'w+', encoding='utf-8') as json_file:
    json.dump(publications, json_file, ensure_ascii=True, indent=2)
  print(f"{len(publications)} publications found for author {subdomain}.substack.com. Saved to substack-logs/channel-{subdomain}.json")

  return publications

def only_valid_publications(publications= []):
@@ -60,7 +61,7 @@ def get_content(article_link):
  if(req.ok == False):
    print("Could not reach this url!")
    return None

  req.html.render()

  full_text = None
@@ -75,6 +76,7 @@ def get_content(article_link):

def append_meta(publication, text):
  meta = {
    'id': guid(),
    'url': publication.get('canonical_url'),
    'thumbnail': publication.get('cover_image'),
    'title': publication.get('title'),
......
@@ -7,13 +7,14 @@ import os, time
import pandas as pd
import json
from .utils import tokenize, ada_v2_cost
from .watch.utils import guid

def twitter():
  # get user and number of tweets to read
  username = input("user timeline to read from (blank to ignore): ")
  searchQuery = input("Search term, or leave blank to get user tweets (blank to ignore): ")
  tweetCount = input("Gather the last number of tweets: ")

  # Read your API keys to call the API.
  consumer_key = os.environ.get("TW_CONSUMER_KEY")
  consumer_secret = os.environ.get("TW_CONSUMER_SECRET")
@@ -43,7 +44,7 @@ def twitter():
    [tweet.id, tweet.user.screen_name, tweet.created_at, tweet.favorite_count, tweet.source, tweet.full_text]
    for tweet in tweets
  ]

  # Creation of column list to rename the columns in the dataframe
  columns = ["id", "Screen Name", "Date Created", "Number of Likes", "Source of Tweet", "Tweet"]
@@ -76,7 +77,7 @@ def twitter():
    with open(f"{transaction_output_dir}/{transaction_output_filename}", 'w', encoding='utf-8') as file:
      json.dump(meta_link, file, ensure_ascii=True, indent=4)
    # print(f"{transaction_output_dir}/{transaction_output_filename}")

  print(f"{tokenCount} tokens written over {tweets_df.shape[0]} records.")
@@ -92,6 +93,7 @@ def twitter_meta(row, metadata_only = False):
  url = f"http://twitter.com/anyuser/status/{row['id']}"
  title = f"Tweet {row['id']}"
  meta = {
    'id': guid(),
    'url': url,
    'title': title,
    'description': 'Tweet from ' + row["Screen Name"],
......
import json, requests, os, re
from slugify import slugify
from dotenv import load_dotenv
from .watch.utils import guid
load_dotenv()

def is_yt_short(videoId):
@@ -20,13 +21,13 @@ def get_channel_id(channel_link):
    if(response.ok == False):
      print("Handle => ChannelId mapping endpoint is too slow - use regular youtube.com/channel URL")
      return None

    json_data = response.json()
    return json_data.get('items')[0].get('id')
  else:
    pattern = r"youtube\.com/channel/([\w-]+)"
    match = re.search(pattern, channel_link)
    return match.group(1) if match else None

def clean_text(text):
@@ -34,6 +35,7 @@ def clean_text(text):

def append_meta(video, duration, text):
  meta = {
    'id': guid(),
    'youtubeURL': f"https://youtube.com/watch?v={video.get('id')}",
    'thumbnail': video.get('thumbnail'),
    'description': video.get('description'),
@@ -63,7 +65,7 @@ def fetch_channel_video_information(channel_id, windowSize = 50):
  if(os.getenv('GOOGLE_APIS_KEY') == None):
    print("GOOGLE_APIS_KEY env variable not set!")
    exit(1)

  done = False
  currentPage = None
  pageTokens = []
@@ -93,7 +95,7 @@ def fetch_channel_video_information(channel_id, windowSize = 50):
    for item in response.get('items'):
      if 'id' in item and 'videoId' in item.get('id'):
        if is_yt_short(item.get('id').get('videoId')):
          print(f"Filtering out YT Short {item.get('id').get('videoId')}")
          continue
@@ -109,12 +111,12 @@ def fetch_channel_video_information(channel_id, windowSize = 50):
          'published': item.get('snippet').get('publishTime'),
        }
        items.append(newItem)

    pageTokens.append(currentPage)

  data['items'] = items
  with open(file_path, 'w+', encoding='utf-8') as json_file:
    json.dump(data, json_file, ensure_ascii=True, indent=2)
  print(f"{len(items)} videos found for channel {data.get('channelTitle')}. Saved to channel-logs/channel-{channel_id}.json")

  return data
\ No newline at end of file
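
If you want to confirm that previously collected documents also pick up the new field, a small audit script along these lines works. It is only a sketch: the ./outputs root, the find_missing_ids helper, and the JSON layouts are assumptions drawn from the paths used in these scripts, not part of the commit.

# Hypothetical audit: flag collector output JSON files whose metadata
# lacks the 'id' field added by this commit. Paths and layouts are assumed.
import json, os

def find_missing_ids(root="./outputs"):
  missing = []
  for dirpath, _, filenames in os.walk(root):
    for name in filenames:
      if not name.endswith(".json"):
        continue
      path = os.path.join(dirpath, name)
      with open(path, "r", encoding="utf-8") as f:
        try:
          data = json.load(f)
        except json.JSONDecodeError:
          continue
      # Some logs store a list of records, others a single dict.
      records = data if isinstance(data, list) else [data]
      if any(isinstance(r, dict) and "id" not in r for r in records):
        missing.append(path)
  return missing

if __name__ == "__main__":
  for path in find_missing_ids():
    print(f"Missing 'id': {path}")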