import plotly
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.offline as offline
import plotly_express as px
import cufflinks as cf
import colorlover as cl
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Notebook-wide setup: seaborn default theme, Plotly credentials, offline
# notebook rendering, and silencing of UserWarning noise.
sns.set()
# SECURITY: a Plotly username and API key used to be hard-coded on this line.
# A key that has lived in source must be treated as leaked -- rotate it and
# provide the replacement through the environment instead.
import os
_plotly_username = os.environ.get("PLOTLY_USERNAME")
_plotly_api_key = os.environ.get("PLOTLY_API_KEY")
if _plotly_username and _plotly_api_key:
    plotly.tools.set_credentials_file(username=_plotly_username, api_key=_plotly_api_key)
# NOTE(review): when the variables are unset nothing is written here, so the
# later py.iplot calls presumably rely on an already-saved Plotly credentials
# file -- confirm on the machine that runs this notebook.
plotly.offline.init_notebook_mode(connected=True)
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
Just loading the dataset that was roughly cleaned last time.
# Load the tweet-level dataset, using the CSV's "Unnamed: 0" column as the
# DataFrame index.
# The earlier read of "db3.csv" was marked obsolete and its result was
# overwritten immediately by this read, so it has been removed (it also made
# the script fail whenever that old file was not present).
db3 = pd.read_csv("Static Tweets with Norm Loc.csv", index_col="Unnamed: 0")
Then we separate the outliers into another dataframe
# Split the tweets on the `outlier_is` flag.
# db4: rows flagged 1 (the non-outlier dataset).
db4 = db3.loc[db3['outlier_is'].eq(1)]
# db5: rows flagged -1 (the outlier dataset).
db5 = db3.loc[db3['outlier_is'].eq(-1)]
Now we only look at the non-outliers
We fix the mistake from the last EDA: only plot the data that has retweet and favorite counts.
# Distribution plots of retweet and favorite counts for the non-outlier
# tweets; missing counts are dropped before plotting.
dips_data = [db4[column].dropna() for column in ('retweet_count', 'favorite_count')]

# Both counts overlaid on a single figure.
group_labels = ['Rt. Count', 'Fav. Count']
fig = ff.create_distplot(dips_data, group_labels, bin_size=20)
fig.layout.update(title='Rt. Count & Fav. Count Distribution', autosize=True)
py.iplot(fig, filename='Rt. Count & Fav. Count Distribution')

# Retweet counts alone, wrapped in a one-element list as create_distplot
# takes a list of samples.
rt_data = [list(dips_data[0].values)]
fig1 = ff.create_distplot(rt_data, group_labels=["Retweet Count"], bin_size=8)
fig1.layout.update(title="Retweet Count Distribution", autosize=True)
py.iplot(fig1, filename='Retweet Count Distribution')

# Favorite counts alone, with a finer bin size.
fav_data = [list(dips_data[1].values)]
fig2 = ff.create_distplot(fav_data, group_labels=["Favorite Count"], bin_size=1)
fig2.layout.update(title="Favorite Count Distribution", autosize=True)
py.iplot(fig2, filename='Favorite Count Distribution')
Plot the tweets that have both retweet counts and favorite counts
Problem:
# Retweet-vs-favorite scatter plots on three subsets of the non-outlier
# tweets: rows with a favorite count, rows with a retweet count, and rows
# with both. The subset sizes are inspected afterwards.
has_fav = db4['favorite_count'].notna()
has_rt = db4['retweet_count'].notna()
has_both = has_rt & has_fav

# Keyword arguments shared by all three scatter plots.
scatter_common = dict(
    x="favorite_count",
    y="retweet_count",
    marginal_x="histogram",
    marginal_y="histogram",
)

px.scatter(
    db4[has_fav],
    trendline="lowess",
    title="Retweet Counts vs Favorite Counts (Tweets have Fav. Counts)",
    **scatter_common
)
px.scatter(
    db4[has_rt],
    trendline="lowess",
    title="Retweet Counts vs Favorite Counts (Tweets have RT. Counts)",
    **scatter_common
)
# Only this last plot uses an OLS trendline instead of LOWESS.
px.scatter(
    db4[has_both],
    trendline="ols",
    title="Retweet Counts vs Favorite Counts (Only the tweet has both fields)",
    **scatter_common
)

# Row/column counts of each subset.
db4[has_fav].shape
db4[has_rt].shape
db4[has_both].shape
Read User Dataset
# Load the user-level dataset.
# The earlier read of "db4.csv" was marked obsolete and its result was
# overwritten immediately by this read, so it has been removed (it also made
# the script fail whenever that old file was not present).
user_set = pd.read_csv("Users with Nor Loc.csv")
# Pairwise correlation of the numeric user-level columns, rendered as an
# annotated heatmap.
corr = user_set.loc[:, ["user_followers_count", "user_friends_count", "user_statuses_count", "user_favourites_count", 'created_days', 'user_listed_count', "tweets_num"]].corr()
# DataFrame.as_matrix() was removed in pandas 1.0; `.values` returns the same
# ndarray and is available on both old and new pandas versions.
fig6 = ff.create_annotated_heatmap(
    corr.values.round(2), colorscale="Greens", reversescale=True, showscale=True,
    # Display labels, in the same order as the columns selected above.
    x=["Followers", "Friends", "Posts", "Favourites", 'Created Days', 'Lists', "SilentSam Tweets"],
    y=["Followers", "Friends", "Posts", "Favourites", 'Created Days', 'Lists', "SilentSam Tweets"],
)
fig6.layout.title = 'User Characteristics Correlation Matrix'
fig6.layout.update(height=700, width=700)
py.iplot(fig6, filename='annotated_heatmap_color')
# Overview plots of the user-level columns: a scatter matrix, a parallel
# coordinates plot and a parallel categories plot.
user_dims = [
    "user_followers_count",
    "user_friends_count",
    "user_statuses_count",
    "user_favourites_count",
    'created_days',
    'user_listed_count',
    "tweets_num",
]
user_dim_labels = {
    "user_followers_count": "Followers",
    "user_friends_count": "Friends",
    "user_statuses_count": "Posts",
    "user_favourites_count": "Favorites",
    'created_days': "Created Days",
    'user_listed_count': "Lists",
    "tweets_num": "SilentSam Tweets",
}
px.scatter_matrix(
    data_frame=user_set,
    dimensions=user_dims,
    labels=user_dim_labels,
    title="User Chracters Scatter Matrix",
    height=800,
)

# Only users whose `isoutlier_if` flag equals 1 are drawn.
# NOTE(review): presumably 1 marks non-outliers, mirroring `outlier_is` on
# the tweet data -- confirm.
flag_one_users = user_set[user_set.isoutlier_if == 1]
px.parallel_coordinates(
    flag_one_users,
    # `created_days` is used as the colour here, so it is left out of the axes.
    dimensions=[dim for dim in user_dims if dim != 'created_days'],
    color="created_days",
    color_continuous_scale=px.colors.sequential.GnBu,
    title="Parallel Coordinates Plot of User Characters",
)

# Missing values are replaced by the string "False" before plotting.
px.parallel_categories(
    data_frame=user_set.fillna("False"),
    dimensions=['eng_or_not', "user_geo_enabled", "user_verified"],
    color="tweets_num",
    color_continuous_scale=px.colors.sequential.GnBu,
    title="Parallel Categories Plot of User Characters",
)
# Scatter matrix of the user columns, coloured by `user_geo_enabled` and with
# marker symbols by `user_verified`. Missing values are replaced by the
# string "False" before plotting.
fig8 = px.scatter_matrix(
    data_frame=user_set.fillna("False"),
    dimensions=[
        "user_followers_count",
        "user_friends_count",
        "user_statuses_count",
        "user_favourites_count",
        'created_days',
        'user_listed_count',
    ],
    labels={
        "user_followers_count": "Followers",
        "user_friends_count": "Friends",
        "user_statuses_count": "Posts",
        "user_favourites_count": "Favorites",
        'created_days': "Created Days",
        'user_listed_count': "Lists",
    },
    title="User Chracters Scatter Matrix with Category",
    color="user_geo_enabled",
    symbol="user_verified",
    height=1000,
    width=1200,
)
# Set the gap between legend trace groups.
fig8.layout.legend.tracegroupgap = 2
fig8
# Scatter of each user's status (post) count against their language, coloured
# by language. The hover_name literal had been split across two physical
# lines ("te" / "xt"), which is a syntax error; it is rejoined as "text".
fig9 = px.scatter(
    y="lang_trans",
    x="user_statuses_count",
    data_frame=user_set,
    color="lang_trans",
    title="Languge User's Posts Count",
    height=800,
    hover_name="text",
)
# The legend is hidden on this figure.
fig9.layout['showlegend'] = False
fig9
# Build a long-format table holding one row per tweet and count type
# (retweet / favorite), attach the user's `eng_or_not` value, and draw box
# plots of the counts.
tweet_long = db3.stack().reset_index()
new_tw_long = tweet_long[tweet_long.level_1.isin(["retweet_count", "favorite_count"])]

# Attach the tweet columns through the original row index (level_0), then the
# user columns through `from_user`.
new_tw_long = pd.merge(new_tw_long, db3, left_on="level_0", right_index=True)
new_tw_long = pd.merge(new_tw_long, user_set, left_on="from_user", right_on="from_user")

# Keep the three stacked columns plus the last merged column.
# NOTE(review): this relies on the last column of the merge being
# `eng_or_not` -- verify against the user CSV's column order.
new_tw_long = new_tw_long.iloc[:, [0, 1, 2, -1]]
new_tw_long.columns = ["tweet_id", "type", "value", "eng_or_not"]

px.box(
    data_frame=new_tw_long,
    x="type",
    y="value",
    color="eng_or_not",
    title="RT Count and Fav. Count based on User Language Use",
)
# Long-format table of the numeric user columns (one row per user and
# column), joined back to the user table for the categorical columns and
# shown as faceted box plots.
new_user_set = user_set.set_index("user_screen_name")
user_long = new_user_set.stack().reset_index()
numeric_user_fields = [
    "user_followers_count",
    "user_friends_count",
    "user_statuses_count",
    "user_favourites_count",
    'created_days',
    'user_listed_count',
    'tweets_num',
]
new_long = user_long[user_long.level_1.isin(numeric_user_fields)]

# Columns are picked by position after the merge.
# NOTE(review): the positions -1, -3, -11 and -13 are assumed to be
# eng_or_not, the outlier flag, user_verified and user_geo_enabled -- verify
# against the user CSV's column order.
new_long = pd.merge(new_long, user_set, on="user_screen_name")
new_long = new_long.iloc[:, [0, 1, 2, -1, -3, -11, -13]]
new_long.columns = ['user_screen_name', "type", "value", "eng_or_not", "outlier", "user_verified", "user_geo_enabled"]

# Rows with outlier == 1 are plotted; missing values become the string
# "False" so they form their own facet/colour group.
outlier_is_one = new_long["outlier"] == 1
px.box(
    data_frame=new_long.fillna("False")[outlier_is_one],
    x="type",
    y="value",
    color="eng_or_not",
    facet_col="user_verified",
    facet_row="user_geo_enabled",
    title="User Characters Box Plots",
    notched=True,
    labels={"type": "User Characters"},
    height=800,
)
# Label every tweet as "English" or "Non-English" from its `lang_trans`
# value; anything other than the two English variants (including missing
# values) becomes "Non-English".
is_english = db3.lang_trans.isin(["English", "English UK"])
db3['eng_or_not'] = is_english.map({True: "English", False: "Non-English"})

# Combined tweet + user table. Columns present on both sides receive the
# pandas `_x` (tweet side) / `_y` (user side) suffixes.
compre_db = pd.merge(
    db3,
    user_set,
    left_on="user_screen_name",
    right_on="user_screen_name",
)
# Fill the tweet-side flags with the string "False" where missing.
compre_db['user_geo_enabled'] = compre_db['user_geo_enabled_x'].fillna("False")
compre_db['user_verified'] = compre_db['user_verified_x'].fillna("False").astype("object")
compre_db.info()

# Animated scatter over `yearmon`, restricted to rows that have a
# `tweets_num` value; both axes are log-scaled.
with_tweet_num = compre_db[compre_db.tweets_num.notna()]
px.scatter(
    data_frame=with_tweet_num,
    x="user_statuses_count_y",
    y="user_favourites_count_y",
    size="tweets_num",
    symbol="eng_or_not_x",
    log_x=True,
    size_max=60,
    log_y=True,
    color="user_verified",
    animation_frame="yearmon",
)

# Inspect the schema of the long-format user table merged with the user table.
pd.merge(new_long, user_set, on="user_screen_name").info()
# Tweets per (year, month), pivoted into a year x month grid and drawn as an
# annotated heatmap.
t = db3.groupby(["year", "month"]).count()
# After reset_index the first three columns are year, month and the non-null
# count of the first remaining db3 column, which is used as the tweet count.
tt = t.reset_index().iloc[:, 0:3]
tt.columns = ['year', 'month', 'tweet_count']
# DataFrame.pivot only accepts keyword arguments from pandas 2.0 on; the
# keyword form also works on older versions.
tt = tt.pivot(index='year', columns='month', values='tweet_count')
# Year/month combinations without tweets become 0.
tt = tt.fillna(0)
tt = tt.astype(int)
# Rows and y labels are both reversed, so each row keeps its own year label.
fig20 = ff.create_annotated_heatmap(
    tt.values[::-1],
    x=list(tt.columns),
    y=list(tt.index)[::-1],
    annotation_text=tt.values[::-1],
    colorscale='YlGnBu',
    reversescale=True,
    showscale=True,
)
fig20['layout'].update(title='Number of Tweets about SilentSam in different months', autosize=True)
py.iplot(fig20, filename='Number of Tweets about SilentSam in Different Months')
# Distinct `user_screen_name` values per (year, month), pivoted into a
# year x month grid and drawn as an annotated heatmap.
dd = db3.groupby(['year', 'month']).user_screen_name.nunique().reset_index()
# DataFrame.pivot only accepts keyword arguments from pandas 2.0 on; the
# keyword form also works on older versions.
dd = dd.pivot(index='year', columns='month', values='user_screen_name')
# Year/month combinations without users become 0.
dd = dd.fillna(0)
dd = dd.astype(int)
dd
# Rows and y labels are both reversed, so each row keeps its own year label.
fig21 = ff.create_annotated_heatmap(
    dd.values[::-1],
    x=list(dd.columns),
    y=list(dd.index)[::-1],
    annotation_text=dd.values[::-1],
    colorscale='YlGnBu',
    reversescale=True,
    showscale=True,
)
fig21['layout'].update(title='Number of Unique Users in Different Months', autosize=True)
py.iplot(fig21, filename='Number of Unique Users in Different Months')
# Unpivot the year x month tweet grid into long form and draw it as a polar
# line chart with one line per year.
tt2 = tt.stack().reset_index()
tt2.columns = ['year', "month", "value"]
# Month number times 30 is used as the angular coordinate.
# NOTE(review): presumably degrees, so 12 months cover 360 -- confirm.
tt2 = tt2.assign(mon=tt2['month'] * 30)
px.line_polar(
    tt2,
    r="value",
    theta="mon",
    color="year",
    log_r=True,
    title="Polar Line Plot of Tweets with Log Transformation",
)
# Monthly tweet counts and monthly distinct-user counts, stacked into a
# single long table (yearmon, value, type) and plotted as two lines.
db3.groupby("yearmon").id_str.count().reset_index()

# Tweets per `yearmon`, counted as non-null `id_str` values.
ee1 = (
    db3.groupby("yearmon")["id_str"]
    .count()
    .reset_index(name="value")
    .assign(type="tweets")
)
# Distinct screen names per `yearmon`.
ee2 = (
    db3.groupby("yearmon")["user_screen_name"]
    .nunique()
    .reset_index(name="value")
    .assign(type="users")
)
eef = pd.concat([ee1, ee2])

# The same chart twice: linear y axis first, then logarithmic.
px.line(eef, x="yearmon", y="value", color="type", log_y=False, title="Tweets and Users Over the Months")
px.line(eef, x="yearmon", y="value", color="type", log_y=True, title="Tweets and Users Over the Months with Log Transformation")
# Tweets per (yearmon, language) in long form, plus a copy with
# log-transformed counts.
lang_month_counts = db3.groupby(by=['yearmon', 'lang_trans']).lang_trans.count()
# Round-trip through a yearmon x language grid and back to long form.
dddd = lang_month_counts.unstack().stack().reset_index()
dddd.columns = ['yearmon', 'lang_trans', 'tweets']

# Natural log of the counts; `dddd` itself is left untouched.
ddddlog = dddd.assign(tweets=np.log(dddd.tweets))
ddddlog

px.line(
    dddd,
    x="yearmon",
    y="tweets",
    color="lang_trans",
    log_y=False,
    title="Tweets among Different Language Users",
)
# Schema check, a look at the rows lacking a `time` value, then an animated
# favorite-vs-retweet scatter over `yearmon` for the rows that have one.
db3.info()
time_missing = db3.time.isna()
db3[time_missing]
px.scatter(
    db3[~time_missing],
    x="favorite_count",
    y="retweet_count",
    color="lang_trans",
    animation_frame="yearmon",
    animation_group="id_str",
    title="Fav. and Rt. Counts Over time",
)
# For every `yearmon`, keep the three places (`no`) with the highest
# non-null `user_screen_name` count.
top3places = (
    db3.groupby(["yearmon", 'no']).count()
    # Ascending sort, so tail(3) within each yearmon group is the top three.
    .sort_values(by='user_screen_name')
    .groupby(level=0).tail(3)
    .user_screen_name.reset_index()
)
top3places = top3places.sort_values('yearmon')
top3places
# The title used to say "Top 2" although three places per month are kept.
px.scatter(
    top3places,
    x="yearmon",
    y="user_screen_name",
    color="no",
    size="user_screen_name",
    title="Top 3 User Places Over Time",
)
# Month x place tweet counts, limited to places with more than 200 tweets in
# total, shown as an annotated seaborn heatmap.
time_place = pd.crosstab(db3.yearmon, db3.no)
busy_places = time_place.sum() > 200
# Transposed: places become rows, months become columns.
time_place = time_place.loc[:, busy_places].T
time_place

f, ax = plt.subplots(figsize=(15, 9))
# NOTE(review): time_place is already places x months here, so this second
# ">200" filter acts on per-month totals rather than per-place totals --
# confirm that this is intended.
heat_input = time_place.loc[:, time_place.sum() > 200]
placeheat = sns.heatmap(
    heat_input,
    cmap="YlGnBu",
    linewidths=.6,
    fmt='g',
    ax=ax,
    annot=True,
)
placeheat.set_xticklabels(placeheat.get_xticklabels(), rotation=45)
placeheat.set_title("Tweets of Top 20 Most-Tweeted Places Over Time")
# Month x source tweet counts, shown side by side as absolute numbers and as
# each source's percentage share within the month.
cros_sour=pd.crosstab(db3.yearmon,db3.trans_sour)
# Keep only the sources whose total across all months exceeds 30.
cros_sour=cros_sour.loc[:,cros_sour.sum()>30]
# Row-wise (per month) percentage of each retained source.
cros_sour_per=cros_sour.apply(lambda x:x/x.sum()*100, axis=1)
# One figure with two panels sharing the x axis.
fs, axn = plt.subplots(1,2,sharex=True, sharey=False,figsize=(16, 6))
# Left panel: absolute counts (transposed, so sources are on the y axis).
ax1 = plt.subplot(1, 2, 1)
sns.heatmap(cros_sour.T, ax=ax1,cmap="YlGnBu")
ax1.set_xticklabels(ax1.get_xticklabels(),rotation=45)
ax1.set_title('Absolute Value')
# Right panel: percentages, with the y tick labels turned off.
ax2 = plt.subplot(1, 2, 2)
# NOTE(review): `annot` is not enabled, so `fmt` presumably has no effect here -- confirm.
sns.heatmap(cros_sour_per.T, yticklabels=False, fmt='.f',ax=ax2,cmap="GnBu")
ax2.set_xticklabels(ax2.get_xticklabels(),rotation=45)
ax2.set_title('Percentage')
# Plots restricted to the tweet sources retained in `cros_sour` (its columns
# are the source names).
db3[db3.trans_sour.isin(cros_sour.columns)]
# `top3places` is long-format (columns yearmon, no, user_screen_name), so the
# place names are the values of its `no` column. The previous
# `top3places.columns` compared places against those three header names,
# which made the place condition match nothing useful.
px.parallel_categories(
    db3[db3.trans_sour.isin(cros_sour.columns) | db3.no.isin(top3places['no'])],
    dimensions=["trans_sour", "eng_or_not", 'retweet_count', "no"],
    color="retweet_count",
    title="Sources and Language Co-corrence",
    color_continuous_scale=px.colors.sequential.GnBu,
)
# Retweet and favorite counts per retained source.
px.bar(db3[db3.trans_sour.isin(cros_sour.columns)], x="trans_sour", y="retweet_count", color="trans_sour", title="RT Counts of Different Sources")
px.bar(db3[db3.trans_sour.isin(cros_sour.columns)], x="trans_sour", y="favorite_count", color="trans_sour", title="Fav Counts of Different Sources")
# Scratch checks: a colorlover colour lookup, a list of colorscale names, the
# user-table schema, and two seaborn joint plots of favorite vs retweet
# counts.
rdylbu_10 = cl.scales['10']['div']['RdYlBu']
rdylbu_10[-3]

# Colorscale names kept as a bare expression; it has no effect on the script.
[
    'Greys', 'YlGnBu', 'Greens', 'YlOrRd', 'Bluered', 'RdBu',
    'Reds', 'Blues', 'Picnic', 'Rainbow', 'Portland', 'Jet',
    'Hot', 'Blackbody', 'Earth', 'Electric', 'Viridis', 'Cividis',
]

user_set.info()

# Joint plot on the non-outlier tweets as they are.
sns.jointplot(x="favorite_count", y="retweet_count", data=db4)

# Joint plot on tweets that have at least one of the two counts, with the
# missing values filled with 0.
has_any_count = db4['retweet_count'].notna() | db4['favorite_count'].notna()
sns.jointplot(
    x="favorite_count",
    y="retweet_count",
    data=db4[has_any_count].fillna(0),
)