Skip to content
Snippets Groups Projects
Unverified Commit 31e5db74 authored by AntonioCiolino's avatar AntonioCiolino Committed by GitHub
Browse files

Twitter Feature (#134)

* .

* twitter feature update

* Key validation and operation
parent d7315b0e
No related branches found
No related tags found
No related merge requests found
...@@ -48,4 +48,15 @@ Now uploads from the frontend will be processed as if you ran the `watch.py` scr ...@@ -48,4 +48,15 @@ Now uploads from the frontend will be processed as if you ran the `watch.py` scr
- ![GCP Project Bar](../images/gcp-project-bar.png) - ![GCP Project Bar](../images/gcp-project-bar.png)
- [Enable YouTube Data APIV3](https://console.cloud.google.com/apis/library/youtube.googleapis.com) - [Enable YouTube Data APIV3](https://console.cloud.google.com/apis/library/youtube.googleapis.com)
- Once enabled generate a Credential key for this API - Once enabled generate a Credential key for this API
- Paste your key after `GOOGLE_APIS_KEY=` in your `collector/.env` file. - Paste your key after `GOOGLE_APIS_KEY=` in your `collector/.env` file.
\ No newline at end of file
### Using the Twitter API
**Required to get data from Twitter with tweepy**
- Go to https://developer.twitter.com/en/portal/dashboard with your twitter account
- Create a new Project App
- Get your 4 keys and place them in your `collector/.env` file
* TW_CONSUMER_KEY
* TW_CONSUMER_SECRET
* TW_ACCESS_TOKEN
* TW_ACCESS_TOKEN_SECRET
populate the .env with the values
...@@ -6,6 +6,7 @@ from scripts.substack import substack ...@@ -6,6 +6,7 @@ from scripts.substack import substack
from scripts.medium import medium from scripts.medium import medium
from scripts.gitbook import gitbook from scripts.gitbook import gitbook
from scripts.sitemap import sitemap from scripts.sitemap import sitemap
from scripts.twitter import twitter
def main(): def main():
if os.name == 'nt': if os.name == 'nt':
...@@ -15,7 +16,8 @@ def main(): ...@@ -15,7 +16,8 @@ def main():
'3': 'Substack', '3': 'Substack',
'4': 'Medium', '4': 'Medium',
'5': 'Gitbook', '5': 'Gitbook',
'6': 'Sitemap', '6': 'Twitter',
'7': 'Sitemap',
} }
print("There are options for data collection to make this easier for you.\nType the number of the method you wish to execute.") print("There are options for data collection to make this easier for you.\nType the number of the method you wish to execute.")
print("1. YouTube Channel\n2. Article or Blog Link (Single)\n3. Substack\n4. Medium\n\n[In development]:\nTwitter\n\n") print("1. YouTube Channel\n2. Article or Blog Link (Single)\n3. Substack\n4. Medium\n\n[In development]:\nTwitter\n\n")
...@@ -30,7 +32,7 @@ def main(): ...@@ -30,7 +32,7 @@ def main():
{"name": "Medium", "value": "Medium"}, {"name": "Medium", "value": "Medium"},
{"name": "Article or Blog Link(s)", "value": "Article or Blog Link(s)"}, {"name": "Article or Blog Link(s)", "value": "Article or Blog Link(s)"},
{"name": "Gitbook", "value": "Gitbook"}, {"name": "Gitbook", "value": "Gitbook"},
{"name": "Twitter", "value": "Twitter", "disabled": "Needs PR"}, {"name": "Twitter", "value": "Twitter"},
{"name": "Sitemap", "value": "Sitemap"}, {"name": "Sitemap", "value": "Sitemap"},
{"name": "Abort", "value": "Abort"}, {"name": "Abort", "value": "Abort"},
], ],
...@@ -71,8 +73,10 @@ def main(): ...@@ -71,8 +73,10 @@ def main():
exit(0) exit(0)
if method == 'Sitemap': if method == 'Sitemap':
sitemap() sitemap()
exit(0)
if method == 'Twitter':
twitter()
exit(0) exit(0)
print("Selection was not valid.") print("Selection was not valid.")
exit(1) exit(1)
......
...@@ -109,4 +109,5 @@ xlrd==2.0.1 ...@@ -109,4 +109,5 @@ xlrd==2.0.1
XlsxWriter==3.1.2 XlsxWriter==3.1.2
yarl==1.9.2 yarl==1.9.2
youtube-transcript-api==0.6.0 youtube-transcript-api==0.6.0
zipp==3.15.0 zipp==3.15.0
\ No newline at end of file tweepy==4.14.0
"""
Tweepy implementation of twitter reader. Requires the 4 twitter keys to operate.
"""
import tweepy
import os, time
import pandas as pd
import json
from .utils import tokenize, ada_v2_cost
def twitter():
    """Interactively download tweets and write one JSON document per tweet.

    Prompts for a user handle, an optional search term and a tweet count.
    When the search term is blank the user's timeline is read; otherwise a
    search is run. Each tweet is written twice: once under
    ``./outputs/twitter-logs`` and once under
    ``../server/storage/documents/twitter-<username>``.

    API failures and file-system errors are reported on stdout (followed by
    a short pause) instead of being propagated.

    Raises:
        EnvironmentError: if any of TW_CONSUMER_KEY, TW_CONSUMER_SECRET,
            TW_ACCESS_TOKEN or TW_ACCESS_TOKEN_SECRET is unset or empty.
    """
    # Get user and number of tweets to read.
    username = input("user timeline to read from (blank to ignore): ")
    searchQuery = input("Search term, or leave blank to get user tweets (blank to ignore): ")
    tweetCount = input("Gather the last number of tweets: ")

    # Read the API keys needed to call the API.
    consumer_key = os.environ.get("TW_CONSUMER_KEY")
    consumer_secret = os.environ.get("TW_CONSUMER_SECRET")
    access_token = os.environ.get("TW_ACCESS_TOKEN")
    access_token_secret = os.environ.get("TW_ACCESS_TOKEN_SECRET")

    # Check if any of the required environment variables is missing.
    if not all((consumer_key, consumer_secret, access_token, access_token_secret)):
        raise EnvironmentError("One of the twitter API environment variables are missing.")

    # Pass in our twitter API authentication keys.
    auth = tweepy.OAuth1UserHandler(
        consumer_key, consumer_secret, access_token, access_token_secret
    )

    # Instantiate the tweepy API.
    api = tweepy.API(auth, wait_on_rate_limit=True)

    try:
        if searchQuery == '':
            tweets = api.user_timeline(screen_name=username, tweet_mode='extended', count=tweetCount)
        else:
            tweets = api.search_tweets(q=searchQuery, tweet_mode='extended', count=tweetCount)

        # Pull the attributes we keep from each tweet.
        attributes_container = [
            [tweet.id, tweet.user.screen_name, tweet.created_at, tweet.favorite_count, tweet.source, tweet.full_text]
            for tweet in tweets
        ]

        # Column names for the dataframe; twitter_meta() looks rows up by these.
        columns = ["id", "Screen Name", "Date Created", "Number of Likes", "Source of Tweet", "Tweet"]
        tweets_df = pd.DataFrame(attributes_container, columns=columns)

        # Both directories are the same for every tweet, so create them once.
        output_path = "./outputs/twitter-logs"
        transaction_output_dir = f"../server/storage/documents/twitter-{username}"
        os.makedirs(output_path, exist_ok=True)
        os.makedirs(transaction_output_dir, exist_ok=True)

        totalTokens = 0
        for _, row in tweets_df.iterrows():
            meta_link = twitter_meta(row, True)
            full_text = twitter_meta(row)
            tokenCount = len(tokenize(full_text))
            meta_link['pageContent'] = full_text
            meta_link['token_count_estimate'] = tokenCount
            totalTokens += tokenCount

            # ':' from the timestamp is not a legal filename character on
            # Windows, so it is replaced before the name is used.
            created = str(row['Date Created']).replace(':', '-')
            output_filename = f"twitter-{username}-{created}.json"
            transaction_output_filename = f"tweet-{username}-{row['id']}.json"

            for target in (
                f"{output_path}/{output_filename}",
                f"{transaction_output_dir}/{transaction_output_filename}",
            ):
                with open(target, 'w', encoding='utf-8') as file:
                    json.dump(meta_link, file, ensure_ascii=True, indent=4)

        # Report the accumulated total, not the count of the last tweet only.
        print(f"{totalTokens} tokens written over {tweets_df.shape[0]} records.")

    except Exception as e:
        # Exception (not BaseException) so Ctrl-C / SystemExit still propagate.
        print("Status Failed: ", str(e))
        time.sleep(3)
def twitter_meta(row, metadata_only = False):
    """Build the metadata for one tweet row, or the full text document.

    Args:
        row: mapping-like object providing "id", "Screen Name",
            "Date Created" (must support ``strftime``) and "Tweet".
        metadata_only: when it compares equal to ``False`` (the default) a
            single string is returned holding the JSON-encoded metadata
            followed by the tweet text; otherwise the metadata dict itself
            is returned.

    Returns:
        Either the metadata ``dict`` or the combined ``str`` described above.
    """
    tweet_id = row['id']
    body = row["Tweet"]

    # Note that /anyuser is a known twitter hack for not knowing the user's handle
    # https://stackoverflow.com/questions/897107/can-i-fetch-the-tweet-from-twitter-if-i-know-the-tweets-id
    meta = {
        'url': f"http://twitter.com/anyuser/status/{tweet_id}",
        'title': f"Tweet {tweet_id}",
        'description': 'Tweet from ' + row["Screen Name"],
        'published': row["Date Created"].strftime('%Y-%m-%d %H:%M:%S'),
        # len() of the tweet text, i.e. its length in characters.
        'wordCount': len(body),
    }

    # Deliberately an equality test (not `not metadata_only`) so that
    # non-bool arguments are treated exactly as before.
    if metadata_only == False:
        return "Tweet JSON Metadata:\n" + json.dumps(meta) + "\n\n\nText Content:\n" + body
    return meta
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment