You must have cloned and installed Twitter Bot Monitor on your computer. Please go to the GitHub repo for installation instructions. The repo is private, so please email me your GitHub handle and I will grant you access.
Zhouhan Chen zc1245@nyu.edu
import json
import util
import streamer
import datetime
import numpy as np
import pandas as pd
import detect
from twitter_credential import token_dict
from collections import defaultdict
# --- Collection parameters for this monitoring run ---
prefix = 'trump_test2'  # run identifier; all output files are named after it
keyword = ['trump']  # streaming filter keywords passed to the Twitter API
num_tweets = 20000  # stop after this many tweets -- TODO confirm in streamer.collect
duration = 3600  # or after this many seconds -- presumably whichever comes first; verify
auth_key = 'streaming_1'  # key into twitter_credential.token_dict

# Absolute path where the raw tweet JSON will be written.
src_path = util.get_full_src_path(prefix)
print("The absolute path of raw data file is")
print(src_path)
print()

# Common filename prefix used for every derived artifact (user info, gexf, plots, ...).
full_prefix = util.get_full_prefix(prefix)
print("The prefix for all subsequent files is")
print(full_prefix)
print()

# Stream matching tweets into src_path (blocking call until num_tweets/duration is hit).
tweetStreamer = streamer.Streamer(auth_key=token_dict[auth_key])
tweetStreamer.collect(keyword=keyword, filename=src_path,
num_tweets=num_tweets, duration=duration, whitelist = [],
save_file = True, print_info = "info")

# Spam detector over the raw capture; url_based=False -- NOTE(review): presumably
# selects a non-URL-based detection mode, confirm in detect.SpamDetector.
detector = detect.SpamDetector(prefix=full_prefix, url_based = False,
sourcefile=src_path)
# generate user info dictionary (written under full_prefix, read again below)
detector.save_user_info()
# EDA: plot the distribution of followers count
import matplotlib.pyplot as plt
plt.style.use('classic')
%matplotlib inline
import seaborn as sns
sns.set()
followers_count = []
for tweet in util.loadjson(src_path):
followers_count.append(tweet['user']['followers_count'])
print("followers count mean is ", np.mean(followers_count))
print("followers count std is ", np.std(followers_count))
followers_count = [num for num in followers_count if num < 10000]
plt.hist(followers_count, alpha=0.5, bins=20)
plt.xlabel('Followers count')
plt.ylabel('Number of accounts')
plt.title('Histogram of followers count')
# Word cloud over the text of every captured tweet.
from utility.wordcloud_maker import generate_cloud
from IPython.display import Image
from IPython.core.display import HTML

text = [tweet['text'] for tweet in util.loadjson(src_path)]
# Pass full_prefix + 'wordcloud' as a second argument to also save the image.
generate_cloud(' '.join(text))
# Image(filename = full_prefix + 'wordcloud.png')
# Tweet volume over time: parse each created_at and bucket by minute.
# Twitter's created_at format, e.g. "Wed Oct 10 20:19:24 +0000 2018".
TWITTER_TIME_FORMAT = '%a %b %d %H:%M:%S +0000 %Y'
dates = [{'timestamp': pd.Timestamp(
              datetime.datetime.strptime(tweet['created_at'], TWITTER_TIME_FORMAT))}
         for tweet in util.loadjson(src_path)]
df = pd.DataFrame(dates)
times = pd.to_datetime(df.timestamp)
# NOTE(review): grouping by minute-of-hour only -- collections longer than an
# hour would fold distinct hours into the same bucket.
df.groupby([times.dt.minute]).count().rename(columns={"timestamp": "count"}).plot()
plt.xlabel('minute')
plt.ylabel('Number of tweets')
plt.title('Tweet time series')
# Plot geotagged tweets on an interactive map with mapboxgl.
from mapboxgl import utils
from mapboxgl import viz
# NOTE(review): hard-coded Mapbox access token committed to source -- it should
# be rotated and loaded from an environment variable or config file instead.
token = "pk.eyJ1IjoieWFucGVuZ3BlcnJ5IiwiYSI6ImNqZjkzcXU0ODBvaHMyeW9iNjVvcDVvazcifQ.MoKCHwmangU5Re0okuPB_g"
# Each entry: [longitude, latitude, author's statuses_count].
coordinates = []
for tweet in util.loadjson(src_path):
    if "coordinates" in tweet and tweet["coordinates"] is not None:
        print("find a coordinate")
        # GeoJSON point order is [lon, lat]; statuses_count is carried as a property.
        coordinates.append(tweet["coordinates"]["coordinates"] + [tweet["user"]["statuses_count"]])
df_coordinates = pd.DataFrame(coordinates, columns=['lon', 'lat', 'statuses_count'])
# Create a geojson file export from a Pandas dataframe
viz_coordinates = utils.df_to_geojson(df_coordinates, properties=['statuses_count'],
lat='lat', lon='lon', precision=3)
# Create the viz from the dataframe
tweet_on_map = viz.CircleViz(viz_coordinates,
access_token=token,
radius = 0,
center = (46.710368,23.626842),  # presumably just the initial map view -- confirm (lon, lat) order
zoom = 2,
stroke_width = 3,
# color = 'red',
stroke_color = 'red',
)
tweet_on_map.show()
# Retweet adjacency counts:
# c[original_author][retweeter] = number of times retweeter retweeted original_author.
# (Removed the unused `num_tweet` counter from the original.)
c = defaultdict(lambda: defaultdict(int))
for tweet in util.loadjson(src_path):
    if 'retweeted_status' in tweet:
        c[tweet['retweeted_status']['user']['screen_name']][tweet['user']['screen_name']] += 1
    # Ensure every tweeting user appears as a node, even with no retweet edges.
    if tweet['user']['screen_name'] not in c:
        c[tweet['user']['screen_name']] = defaultdict(int)
# Build a weighted, directed retweet network from the adjacency counts above.
import social_network
TYPE = "directed"
g = social_network.Graph(c, TYPE, weighted = True)
g.build_graph()
# Add a node attribute marking whether each account is Twitter-verified,
# using the user_info.json written by detector.save_user_info().
user_info = json.load(open(full_prefix + "user_info.json", "r"))
node_attr = {}
# NOTE(review): `.node` is the pre-2.4 networkx attribute; newer networkx
# exposes `.nodes` only -- confirm the pinned networkx version.
for node in g.graph.node:
    if node in user_info and user_info[node]['verified']:
        print(node)
        node_attr[node] = 'verified'
    else:
        node_attr[node] = 'not_verified'
g.set_node_attributes("user_status", node_attr)
# Notebook-style sanity checks: bare expressions, no effect when run as a script.
len(g.graph.nodes())
g.graph.nodes()[:10]
list(g.get_node_attributes("user_status").items())[:10]
len(g.graph.edges())
g.graph.edges()[:10]
# Export the graph for Gephi and rewrite the default node labels.
g.generategexffile(full_prefix)
# NOTE(review): the 60 presumably limits label length/count -- confirm in social_network.
g.overwrite_default_label(full_prefix, 60)
# Display the community picture exported from Gephi (expects <full_prefix>community.png).
Image(filename = full_prefix + "community.png")
# Topic modeling setup: split users by the Gephi community they belong to.
from nmf_topic_classify import runNMF

# identify class numbers from Gephi interface, and update those two variables
pro_trump_class = 1
anti_trump_class = 0

# Load the Gephi community export and name its columns.
df = pd.read_csv(full_prefix + 'community.csv')
df.columns = ['Id', 'Label', 'interval', 'userID', 'user_status', 'componentID', 'modularity_class'] # one more column 'user_status'
print(df.head())

# Screen names belonging to each modularity class.
user_pro_trump = set(df[df.modularity_class == pro_trump_class].Id)
user_anti_trump = set(df[df.modularity_class == anti_trump_class].Id)

# community name -> [member screen names, tweet texts collected below]
communities = {
    "user_pro_trump": [user_pro_trump, []],
    "user_anti_trump": [user_anti_trump, []],
}
# Route each tweet's lower-cased text to the community its author belongs to.
for tweet in util.loadjson(src_path):
    author = tweet['user']['screen_name']
    for name, (members, member_tweets) in communities.items():
        if author in members:
            member_tweets.append(tweet['text'].lower())
# generate NMF topics, one model per community
for community, value in communities.items():
    print("generating topic for community %s" %(community))
    docs = value[1]
    # Shrink the vocabulary when the corpus is small.
    n_features = 100 if len(docs) < 1000 else 1000
    runNMF(dataset = docs, n_features = n_features)