#SilentSam EDA Revisited

a.k.a. How does my computer run out of memory, jkjk

In [46]:
import plotly 
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.offline as offline
import plotly_express as px
import cufflinks as cf
import colorlover as cl
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
sns.set()
plotly.tools.set_credentials_file(username='ReichY', api_key='v1zDclkuUYugJxCOUacG')
plotly.offline.init_notebook_mode(connected=True)
In [2]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

Just Loading the dataset that has beeen roughtly cleaned last time

In [4]:
#obsolete
db3=pd.read_csv("db3.csv",index_col="Unnamed: 0")
In [3]:
db3=pd.read_csv("Static Tweets with Norm Loc.csv",index_col="Unnamed: 0")
c:\python36\lib\site-packages\IPython\core\interactiveshell.py:2785: DtypeWarning:

Columns (0,22,41) have mixed types. Specify dtype option on import or set low_memory=False.

Then we separate the outliers into another dataframe

In [4]:
# No outlier dataset
db4=db3[db3['outlier_is']==1]
In [5]:
# Outliers dataset
db5=db3[db3['outlier_is']==-1]

Now we only look at the non-outliers

We fix the misconduct in the last EDA, only plot the data has retweet and favorite countsm

In [9]:
dips_data=[db4['retweet_count'][db4.retweet_count.notna()], db4['favorite_count'][db4.favorite_count.notna()]]
rt_data=[list(dips_data[0].values)]
fav_data=[list(dips_data[1].values)]
group_labels = ['Rt. Count', 'Fav. Count']
In [49]:
fig = ff.create_distplot(dips_data, group_labels, bin_size=20)
fig['layout'].update(title='Rt. Count & Fav. Count Distribution',autosize=True)
py.iplot(fig, filename='Rt. Count & Fav. Count Distribution')
The draw time for this plot will be slow for clients without much RAM.
Out[49]:
In [64]:
fig1 = ff.create_distplot(rt_data, group_labels=["Retweet Count"], bin_size=8)
fig1['layout'].update(title="Retweet Count Distribution",autosize=True)
py.iplot(fig1, filename='Retweet Count Distribution')
The draw time for this plot will be slow for clients without much RAM.
Out[64]:
In [63]:
fig2 = ff.create_distplot(fav_data, group_labels=["Favorite Count"], bin_size=1)
fig2['layout'].update(title="Favorite Count Distribution",autosize=True)
py.iplot(fig2, filename='Favorite Count Distribution')
Out[63]:

Plot the tweets that have both retweet counts and favorite counts

Problem:

  1. If a tweet only have either rt counts or fav counts, should I include it in to our dataset?
  2. If a tweet only have one of the field, should I fill the other field with 0?
In [62]:
px.scatter(db4[db4['favorite_count'].notna()], x="favorite_count", y="retweet_count", marginal_x="histogram", marginal_y="histogram",trendline="lowess", 
           title="Retweet Counts vs Favorite Counts (Tweets have Fav. Counts)")
In [68]:
px.scatter(db4[db4['retweet_count'].notna()], x="favorite_count", y="retweet_count", marginal_x="histogram", marginal_y="histogram",trendline="lowess", 
           title="Retweet Counts vs Favorite Counts (Tweets have RT. Counts)")
In [54]:
px.scatter(db4[db4['retweet_count'].notna() & db4['favorite_count'].notna()], x="favorite_count", y="retweet_count", marginal_x="histogram", marginal_y="histogram",trendline="ols", 
           title="Retweet Counts vs Favorite Counts (Only the tweet has both fields)")
In [64]:
db4[db4['favorite_count'].notna()].shape
Out[64]:
(8635, 34)
In [67]:
db4[db4['retweet_count'].notna()].shape
Out[67]:
(48975, 34)
In [65]:
db4[db4['retweet_count'].notna()& db4['favorite_count'].notna()].shape
Out[65]:
(4870, 34)

Read User Dataset

In [6]:
#obsolete
user_set=pd.read_csv("db4.csv")
In [6]:
user_set=pd.read_csv("Users with Nor Loc.csv")
c:\python36\lib\site-packages\IPython\core\interactiveshell.py:2785: DtypeWarning:

Columns (0,3) have mixed types. Specify dtype option on import or set low_memory=False.

In [14]:
corr=user_set.loc[:,["user_followers_count","user_friends_count","user_statuses_count","user_favourites_count",'created_days','user_listed_count',"tweets_num"]].corr()
In [16]:
fig6 = ff.create_annotated_heatmap(corr.as_matrix().round(2), colorscale="Greens",reversescale=True, showscale=True,
                                  x=["Followers","Friends","Posts","Favourites",'Created Days','Lists',"SilentSam Tweets"], 
                                   y=["Followers","Friends","Posts","Favourites",'Created Days','Lists',"SilentSam Tweets"],
                                )
fig6.layout.title = 'User Characteristics Correlation Matrix'
fig6.layout.update(height=700, width=700)
py.iplot(fig6, filename='annotated_heatmap_color')
c:\python36\lib\site-packages\ipykernel_launcher.py:1: FutureWarning:

Method .as_matrix will be removed in a future version. Use .values instead.

Out[16]:
In [13]:
px.scatter_matrix(data_frame=user_set, dimensions=["user_followers_count","user_friends_count","user_statuses_count","user_favourites_count",'created_days','user_listed_count',"tweets_num"],
                 labels={"user_followers_count":"Followers","user_friends_count":"Friends","user_statuses_count":"Posts","user_favourites_count":"Favorites",'created_days':"Created Days",'user_listed_count':"Lists","tweets_num":"SilentSam Tweets"},
                  title="User Chracters Scatter Matrix",
                 height=800)
In [19]:
px.parallel_coordinates(user_set[user_set.isoutlier_if==1], dimensions=["user_followers_count","user_friends_count","user_statuses_count","user_favourites_count",'user_listed_count',"tweets_num"], color="created_days", color_continuous_scale=px.colors.sequential.GnBu,
                       title="Parallel Coordinates Plot of User Characters", )
In [44]:
px.parallel_categories(data_frame=user_set.fillna("False"), dimensions=['eng_or_not',"user_geo_enabled","user_verified"], color="tweets_num",
                      color_continuous_scale=px.colors.sequential.GnBu,title="Parallel Categories Plot of User Characters" )
In [247]:
fig8=px.scatter_matrix(data_frame=user_set.fillna("False"), dimensions=["user_followers_count","user_friends_count","user_statuses_count","user_favourites_count",'created_days','user_listed_count'],
                 labels={"user_followers_count":"Followers","user_friends_count":"Friends","user_statuses_count":"Posts","user_favourites_count":"Favorites",'created_days':"Created Days",'user_listed_count':"Lists"},
                  title="User Chracters Scatter Matrix with Category",color="user_geo_enabled",symbol="user_verified",
                 height=1000,width=1200)
fig8.layout.legend['tracegroupgap']=2
fig8
In [279]:
fig9=px.scatter(y="lang_trans", x="user_statuses_count", data_frame=user_set, color="lang_trans", title="Languge User's Posts Count", height=800,hover_name="te
                xt")
fig9.layout['showlegend']=False
fig9
In [71]:
new_tw_long=db3.stack().reset_index()[db3.stack().reset_index().level_1.isin(["retweet_count","favorite_count"])]
In [76]:
new_tw_long=pd.merge(pd.merge(new_tw_long, db3, left_on="level_0", right_index=True),user_set, left_on="from_user", right_on="from_user").iloc[:,[0,1,2,-1]]
new_tw_long.columns=["tweet_id","type","value","eng_or_not"]
In [78]:
px.box(data_frame=new_tw_long, x="type",y="value", color="eng_or_not", title="RT Count and Fav. Count based on User Language Use")
In [83]:
new_user_set=user_set.set_index("user_screen_name")
new_long=new_user_set.stack().reset_index()[new_user_set.stack().reset_index().level_1.isin(["user_followers_count","user_friends_count","user_statuses_count","user_favourites_count",'created_days','user_listed_count','tweets_num'])]
new_long=pd.merge(new_long, user_set, on="user_screen_name").iloc[:,[0,1,2,-1,-3,-11,-13]]
new_long.columns=['user_screen_name',"type","value","eng_or_not","outlier","user_verified","user_geo_enabled"]
In [92]:
px.box(data_frame=new_long.fillna("False")[new_long["outlier"]==1], x="type",y="value", color="eng_or_not", facet_col="user_verified", facet_row="user_geo_enabled", title="User Characters Box Plots",
      notched=True, labels={"type":"User Characters"}, height=800)
In [7]:
db3['eng_or_not']= db3.lang_trans.map(lambda x: "English" if x in ["English","English UK"] else "Non-English")
In [8]:
compre_db=pd.merge(db3, user_set, left_on="user_screen_name", right_on="user_screen_name")
In [9]:
compre_db['user_geo_enabled']=compre_db.user_geo_enabled_x.fillna("False")
compre_db["user_verified"]=compre_db.user_verified_x.fillna("False")
compre_db['user_verified']=compre_db['user_verified'].astype("object")
In [13]:
compre_db.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 59218 entries, 0 to 59217
Data columns (total 85 columns):
id_str_x                       59217 non-null float64
from_user_x                    59217 non-null object
text_x                         59217 non-null object
time_x                         59216 non-null object
geo_coordinates_x              14 non-null object
user_lang_x                    59216 non-null object
in_reply_to_screen_name_x      2916 non-null object
from_user_id_str_x             59216 non-null float64
in_reply_to_status_id_str_x    2490 non-null float64
source_x                       59216 non-null object
user_followers_count_x         59183 non-null float64
user_friends_count_x           59115 non-null float64
user_location_x                45261 non-null object
entities_str_x                 59216 non-null object
place_x                        908 non-null object
retweet_count_x                51963 non-null float64
favorite_count_x               9274 non-null float64
user_description_x             54458 non-null object
user_created_at_x              59216 non-null object
user_geo_enabled_x             25624 non-null object
user_listed_count_x            52910 non-null float64
user_verified_x                59216 non-null object
user_statuses_count_x          59216 non-null float64
user_screen_name               59216 non-null object
user_favourites_count_x        59074 non-null float64
possibly_sensitive_x           232 non-null object
lang_trans_x                   59216 non-null object
date_x                         59216 non-null object
outlier_is_x                   59216 non-null float64
outlier_dbs_x                  59216 non-null float64
year                           59216 non-null float64
yearmon                        59216 non-null object
month                          59216 non-null float64
trans_sour                     59216 non-null object
pre_x                          38247 non-null object
no_x                           38246 non-null object
addr_x                         38240 non-null object
lat_x                          38240 non-null float64
long_x                         38240 non-null float64
point_x_x                      38240 non-null object
Accurarcy_x                    752 non-null object
eng_or_not_x                   59218 non-null object
Unnamed: 0                     59218 non-null object
tweets_num                     59216 non-null float64
id_str_y                       59216 non-null object
from_user_y                    59216 non-null object
text_y                         59216 non-null object
time_y                         59216 non-null object
geo_coordinates_y              14 non-null object
user_lang_y                    59216 non-null object
in_reply_to_screen_name_y      2351 non-null object
from_user_id_str_y             59216 non-null float64
in_reply_to_status_id_str_y    2094 non-null float64
source_y                       59216 non-null object
user_followers_count_y         59183 non-null float64
user_friends_count_y           59115 non-null float64
user_location_y                45261 non-null object
entities_str_y                 59216 non-null object
place_y                        977 non-null object
retweet_count_y                51690 non-null float64
favorite_count_y               8500 non-null float64
user_description_y             54458 non-null object
user_created_at_y              59216 non-null object
user_geo_enabled_y             25624 non-null object
user_listed_count_y            52910 non-null float64
user_verified_y                3243 non-null object
user_statuses_count_y          59216 non-null float64
user_favourites_count_y        59074 non-null float64
possibly_sensitive_y           190 non-null object
lang_trans_y                   59216 non-null object
date_y                         59216 non-null object
outlier_is_y                   59216 non-null float64
outlier_dbs_y                  59216 non-null float64
isoutlier_if                   59216 non-null float64
created_days                   59216 non-null float64
eng_or_not_y                   59216 non-null object
pre_y                          38247 non-null object
no_y                           38246 non-null object
addr_y                         38240 non-null object
lat_y                          38240 non-null float64
long_y                         38240 non-null float64
point_x_y                      38240 non-null object
Accurarcy_y                    6920 non-null object
user_geo_enabled               59218 non-null object
user_verified                  59218 non-null object
dtypes: float64(32), object(53)
memory usage: 38.9+ MB
In [26]:
px.scatter(data_frame=compre_db[compre_db.tweets_num.notna()], x="user_statuses_count_y", y="user_favourites_count_y", size="tweets_num", symbol="eng_or_not_x"
           ,log_x=True, size_max=60, log_y=True, color="user_verified",animation_frame="yearmon")
In [80]:
pd.merge(new_long, user_set, on="user_screen_name").info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 178794 entries, 0 to 178793
Data columns (total 37 columns):
user_screen_name             178794 non-null object
level_1                      178794 non-null object
0                            178794 non-null object
Unnamed: 0                   178794 non-null int64
tweets_num                   178794 non-null int64
id_str                       178794 non-null int64
from_user                    178794 non-null object
text                         178794 non-null object
time                         178794 non-null object
geo_coordinates              63 non-null object
user_lang                    178794 non-null object
in_reply_to_screen_name      3890 non-null object
from_user_id_str             178794 non-null int64
in_reply_to_status_id_str    3073 non-null float64
source                       178794 non-null object
user_followers_count         178705 non-null float64
user_friends_count           178532 non-null float64
user_location                132076 non-null object
entities_str                 178794 non-null object
place                        1622 non-null object
retweet_count                162158 non-null float64
favorite_count               13591 non-null float64
user_description             159813 non-null object
user_created_at              178794 non-null object
user_geo_enabled             76003 non-null object
user_listed_count            156341 non-null float64
user_verified                3841 non-null object
user_statuses_count          178794 non-null int64
user_favourites_count        178450 non-null float64
possibly_sensitive           583 non-null object
lang_trans                   178794 non-null object
date                         178794 non-null object
outlier_is                   178794 non-null int64
outlier_dbs                  178794 non-null int64
isoutlier_if                 178794 non-null int64
created_days                 178794 non-null float64
eng_or_not                   178794 non-null object
dtypes: float64(8), int64(8), object(21)
memory usage: 51.8+ MB
In [10]:
t=db3.groupby(["year","month"]).count()
tt=t.reset_index().iloc[:,0:3]
tt.columns=['year', 'month', 'tweet_count']
tt=tt.pivot('year','month','tweet_count')
tt=tt.fillna(0)
tt=tt.astype(int)
In [122]:
fig20=ff.create_annotated_heatmap(tt.values[::-1],  x=list(tt.columns), y=list(tt.index)[::-1],  annotation_text=tt.values[::-1],colorscale='YlGnBu',reversescale=True,
                                    showscale=True)
In [125]:
fig20['layout'].update(title='Number of Tweets about SilentSam in different months',autosize=True)

py.iplot(fig20, filename='Number of Tweets about SilentSam in Different Months')
Out[125]:
In [126]:
dd=db3.groupby(['year','month']).user_screen_name.nunique().reset_index()
dd=dd.pivot('year','month','user_screen_name')
dd=dd.fillna(0)
dd=dd.astype(int)
In [128]:
dd
Out[128]:
month 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 9.0 10.0 11.0 12.0
year
2016.0 0 5 12 6 0 2 1 0 2 4 1 4
2017.0 5 2 1 7 2 0 0 1812 387 225 355 40
2018.0 38 121 86 129 391 118 163 16573 2714 853 271 5559
2019.0 2182 1622 0 0 0 0 0 0 0 0 0 0
In [130]:
fig21=ff.create_annotated_heatmap(dd.values[::-1],  x=list(dd.columns), y=list(dd.index)[::-1],  annotation_text=dd.values[::-1],colorscale='YlGnBu',reversescale=True,
                                    showscale=True)
fig21['layout'].update(title='Number of Unique Users in Different Months',autosize=True)

py.iplot(fig21, filename='Number of Unique Users in Different Months')
Out[130]:
In [139]:
tt2=tt.stack().reset_index()
In [141]:
tt2.columns=['year',"month","value"]
In [146]:
tt2['mon']=tt2['month']*30
In [183]:
px.line_polar(tt2, r="value", theta="mon", color="year",log_r=True, title="Polar Line Plot of Tweets with Log Transformation")
In [185]:
db3.groupby("yearmon").id_str.count().reset_index()
Out[185]:
yearmon id_str
0 2016-02 7
1 2016-03 14
2 2016-04 8
3 2016-06 2
4 2016-07 1
5 2016-09 2
6 2016-10 4
7 2016-11 1
8 2016-12 4
9 2017-01 6
10 2017-02 6
11 2017-03 1
12 2017-04 7
13 2017-05 2
14 2017-08 3894
15 2017-09 1232
16 2017-10 507
17 2017-11 1082
18 2017-12 68
19 2018-01 81
20 2018-02 196
21 2018-03 130
22 2018-04 237
23 2018-05 767
24 2018-06 165
25 2018-07 255
26 2018-08 26791
27 2018-09 4896
28 2018-10 1814
29 2018-11 461
30 2018-12 11129
31 2019-01 3626
32 2019-02 1820
In [186]:
ee1=db3.groupby("yearmon").id_str.count().reset_index()
ee1['type']="tweets"
ee2=db3.groupby("yearmon").user_screen_name.nunique().reset_index()
ee2['type']="users"
In [191]:
ee1.columns=['yearmon',"value","type"]
ee2.columns=['yearmon',"value","type"]
eef=pd.concat([ee1,ee2])
In [196]:
px.line(eef, x="yearmon", y="value", color="type",log_y=False, title="Tweets and Users Over the Months")
In [197]:
px.line(eef, x="yearmon", y="value", color="type",log_y=True, title="Tweets and Users Over the Months with Log Transformation")
In [180]:
dddd=db3.groupby(by=['yearmon','lang_trans']).lang_trans.count().unstack().stack().reset_index()
dddd.columns=['yearmon','lang_trans','tweets']
ddddlog=dddd.copy()
ddddlog.tweets=np.log(ddddlog.tweets)
In [198]:
ddddlog
Out[198]:
yearmon lang_trans tweets
0 2016-02 English 1.945910
1 2016-03 English 2.639057
2 2016-04 English 2.079442
3 2016-06 English 0.693147
4 2016-07 English 0.000000
5 2016-09 English 0.693147
6 2016-10 English 1.386294
7 2016-11 English 0.000000
8 2016-12 English 1.386294
9 2017-01 English 1.791759
10 2017-02 English 1.791759
11 2017-03 English 0.000000
12 2017-04 English 1.945910
13 2017-05 English 0.693147
14 2017-08 Danish 0.000000
15 2017-08 English 8.261010
16 2017-08 English UK 2.397895
17 2017-08 German 1.945910
18 2017-08 Spanish; Castilian 1.609438
19 2017-09 English 7.114769
20 2017-09 French 0.000000
21 2017-09 Spanish; Castilian 0.000000
22 2017-10 English 6.228511
23 2017-11 English 6.984716
24 2017-11 French 0.000000
25 2017-11 Japanese 0.000000
26 2017-12 English 4.219508
27 2018-01 English 4.394449
28 2018-02 Dutch; Flemish 0.000000
29 2018-02 English 5.252273
... ... ... ...
90 2018-12 Catalan; Valencian 0.000000
91 2018-12 English 9.312175
92 2018-12 English UK 2.944439
93 2018-12 Finnish 0.000000
94 2018-12 French 1.945910
95 2018-12 German 1.609438
96 2018-12 Greek, Modern (1453-) 0.000000
97 2018-12 Italian 0.000000
98 2018-12 Japanese 0.693147
99 2018-12 LOLCATZ 0.000000
100 2018-12 Polish 0.000000
101 2018-12 Portuguese 0.000000
102 2018-12 Spanish; Castilian 2.772589
103 2018-12 Swedish 0.000000
104 2019-01 English 8.192017
105 2019-01 English UK 0.693147
106 2019-01 French 1.098612
107 2019-01 German 0.693147
108 2019-01 Italian 0.000000
109 2019-01 Norwegian 0.000000
110 2019-01 Spanish; Castilian 1.609438
111 2019-02 English 7.496652
112 2019-02 English UK 1.098612
113 2019-02 French 1.386294
114 2019-02 German 0.000000
115 2019-02 Hebrew 0.000000
116 2019-02 Italian 0.693147
117 2019-02 Portuguese 0.000000
118 2019-02 Spanish; Castilian 1.609438
119 2019-02 Swedish 0.000000

120 rows × 3 columns

In [199]:
px.line(dddd, x="yearmon", y="tweets", color="lang_trans",log_y=False, title="Tweets among Different Language Users")
In [201]:
db3.info()
<class 'pandas.core.frame.DataFrame'>
Index: 59218 entries, 0 to 59253
Data columns (total 42 columns):
id_str                       59217 non-null float64
from_user                    59217 non-null object
text                         59217 non-null object
time                         59216 non-null object
geo_coordinates              14 non-null object
user_lang                    59216 non-null object
in_reply_to_screen_name      2916 non-null object
from_user_id_str             59216 non-null float64
in_reply_to_status_id_str    2490 non-null float64
source                       59216 non-null object
user_followers_count         59183 non-null float64
user_friends_count           59115 non-null float64
user_location                45261 non-null object
entities_str                 59216 non-null object
place                        908 non-null object
retweet_count                51963 non-null float64
favorite_count               9274 non-null float64
user_description             54458 non-null object
user_created_at              59216 non-null object
user_geo_enabled             25624 non-null object
user_listed_count            52910 non-null float64
user_verified                59216 non-null object
user_statuses_count          59216 non-null float64
user_screen_name             59216 non-null object
user_favourites_count        59074 non-null float64
possibly_sensitive           232 non-null object
lang_trans                   59216 non-null object
date                         59216 non-null object
outlier_is                   59216 non-null float64
outlier_dbs                  59216 non-null float64
year                         59216 non-null float64
yearmon                      59216 non-null object
month                        59216 non-null float64
trans_sour                   59216 non-null object
pre                          38247 non-null object
no                           38246 non-null object
addr                         38240 non-null object
lat                          38240 non-null float64
long                         38240 non-null float64
point_x                      38240 non-null object
Accurarcy                    752 non-null object
eng_or_not                   59218 non-null object
dtypes: float64(16), object(26)
memory usage: 19.4+ MB
In [209]:
db3[db3.time.isna()]
Out[209]:
id_str from_user text time geo_coordinates user_lang in_reply_to_screen_name from_user_id_str in_reply_to_status_id_str source ... month trans_sour pre no addr lat long point_x Accurarcy eng_or_not
252 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN Non-English
NU, Canada 56.572731 -79.56259552 56 34m 21.8316s N, 79 33m 45.3439s W NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN Non-English

2 rows × 42 columns

In [11]:
px.scatter(db3[db3.time.notna()], x="favorite_count", y="retweet_count", color="lang_trans", animation_frame="yearmon",animation_group="id_str",title="Fav. and Rt. Counts Over time")
In [29]:
top3places=db3.groupby(["yearmon",'no']).count().sort_values(by='user_screen_name').groupby(level=0).tail(3).user_screen_name.reset_index()
top3places=top3places.sort_values('yearmon')
In [30]:
top3places
Out[30]:
yearmon no user_screen_name
7 2016-02 newton wave pool 1
5 2016-02 Durham, NC 1
6 2016-02 Seattle, WA 1
9 2016-03 Chapel Hill, North Carolina 1
26 2016-03 Chapel Hill, NC 5
8 2016-03 Carrboro, North Carolina 1
23 2016-04 VA Beach 2
0 2016-04 In my bag 1
4 2016-04 Mississauga Ontario Canada _??? 1
2 2016-06 Seymour, TN 1
1 2016-06 Durham Region & the GTA 1
3 2016-09 Ottawa, Ontario 1
10 2016-10 Calgary, Alberta, Canada 1
15 2016-10 Nashville, Tennessee 1
16 2016-10 Tennessee 1
17 2016-11 Ontario, Canada 1
19 2016-12 Philadelphia, PA 1
18 2016-12 Austin, TX 1
20 2017-01 B.C Canada 1
21 2017-01 Wisconsin 1
22 2017-02 North West, England 1
12 2017-04 New Westminster, BC 1
14 2017-04 Boston.ma 1
11 2017-04 Washington, D.C. 1
13 2017-05 Central NC 1
68 2017-08 North Carolina, USA 188
72 2017-08 Durham, NC 313
70 2017-08 Chapel Hill, NC 304
57 2017-09 Carrboro, North Carolina 54
54 2017-09 North Carolina, USA 46
... ... ... ...
45 2018-05 Carrboro, North Carolina 32
63 2018-05 Chapel Hill, NC 136
47 2018-05 Durham, NC 40
28 2018-06 North Carolina 7
29 2018-06 Carrboro, North Carolina 8
44 2018-06 Chapel Hill, NC 30
39 2018-07 Carrboro, North Carolina 25
34 2018-07 Raleigh, NC 14
48 2018-07 Chapel Hill, NC 40
80 2018-08 Chapel Hill, NC 998
78 2018-08 Durham, NC 717
81 2018-08 USA 1026
67 2018-09 Durham, NC 187
75 2018-09 Chapel Hill, NC 407
62 2018-09 Raleigh, NC 135
61 2018-10 Durham, NC 123
69 2018-10 Chapel Hill, NC 265
59 2018-10 Carrboro, North Carolina 69
58 2018-11 Chapel Hill, NC 68
52 2018-11 Durham, NC 43
38 2018-11 Carrboro, North Carolina 19
73 2018-12 North Carolina, USA 398
76 2018-12 Durham, NC 454
79 2018-12 Chapel Hill, NC 824
71 2019-01 Chapel Hill, NC 307
66 2019-01 Durham, NC 170
64 2019-01 Raleigh, NC 163
53 2019-02 USA 45
56 2019-02 Chapel Hill, NC 47
51 2019-02 North Carolina, USA 42

82 rows × 3 columns

In [31]:
px.scatter(top3places, x="yearmon", y="user_screen_name", color="no",size="user_screen_name", title="Top 2 User Places Over Time")
In [43]:
time_place=pd.crosstab(db3.yearmon, db3.no)
time_place=time_place.loc[:,time_place.sum()>200].T
In [38]:
time_place
Out[38]:
yearmon 2016-02 2016-03 2016-04 2016-06 2016-09 2016-10 2016-11 2016-12 2017-01 2017-02 ... 2018-05 2018-06 2018-07 2018-08 2018-09 2018-10 2018-11 2018-12 2019-01 2019-02
no
Boston, MA 0 0 0 0 0 0 0 0 0 0 ... 1 0 0 88 23 6 4 78 16 7
California, USA 0 0 0 0 0 0 0 0 0 0 ... 0 1 0 138 17 2 0 29 6 8
Carrboro, NC 0 0 0 0 0 0 0 0 0 0 ... 10 3 1 113 30 18 15 137 40 3
Carrboro, North Carolina 0 1 0 0 0 0 0 0 0 0 ... 32 8 25 150 115 69 19 158 64 12
Chapel Hill, NC 0 5 0 0 0 0 0 0 0 0 ... 136 30 40 998 407 265 68 824 307 47
Chapel Hill-London-Tokyo 0 0 0 0 0 0 0 0 0 0 ... 15 3 0 77 23 26 3 42 18 4
Charlotte, NC 0 0 0 0 0 0 0 0 0 0 ... 4 0 3 241 40 11 2 120 53 16
Durham, NC 1 1 0 0 0 0 0 0 0 0 ... 40 4 7 717 187 123 43 454 170 29
Greensboro, NC 0 0 0 0 0 0 0 0 0 0 ... 1 0 1 259 22 9 10 173 37 29
New York, NY 0 0 0 0 0 0 0 0 0 0 ... 0 1 3 123 17 5 0 46 16 9
North Carolina 0 0 0 0 0 0 0 0 0 0 ... 12 7 5 379 96 39 14 288 107 23
North Carolina, USA 0 0 0 0 0 0 0 0 0 0 ... 16 6 7 562 108 47 17 398 157 42
Orange County, NC 0 0 0 0 0 0 0 0 0 0 ... 2 0 5 69 28 13 4 28 30 6
Raleigh, N.C. 0 0 0 0 0 0 0 0 0 0 ... 5 2 4 121 33 17 7 44 24 5
Raleigh, NC 0 0 0 0 0 0 0 0 0 0 ... 19 1 14 657 135 47 6 348 163 28
Richmond, VA 0 0 0 0 0 0 0 0 0 0 ... 1 1 1 114 34 7 6 57 7 4
USA 0 0 0 0 0 0 0 0 0 0 ... 7 2 4 1026 119 37 4 179 47 45
Washington, DC 0 0 0 0 0 0 0 0 0 0 ... 4 0 1 219 33 9 5 105 33 15

18 rows × 31 columns

In [53]:
f, ax = plt.subplots(figsize=(15, 9))
placeheat=sns.heatmap(time_place.loc[:,time_place.sum()>200].T.T, cmap="YlGnBu",linewidths=.6,fmt='g',ax=ax,annot=True)
placeheat.set_xticklabels(placeheat.get_xticklabels(),rotation=45)
placeheat.set_title("Tweets of Top 20 Most-Tweeted Places Over Time")
Out[53]:
Text(0.5,1,'Tweets of Top 20 Most-Tweeted Places Over Time')
In [55]:
cros_sour=pd.crosstab(db3.yearmon,db3.trans_sour)
cros_sour=cros_sour.loc[:,cros_sour.sum()>30]
cros_sour_per=cros_sour.apply(lambda x:x/x.sum()*100, axis=1)
In [59]:
fs, axn = plt.subplots(1,2,sharex=True, sharey=False,figsize=(16, 6))
ax1 = plt.subplot(1, 2, 1)
sns.heatmap(cros_sour.T, ax=ax1,cmap="YlGnBu")
ax1.set_xticklabels(ax1.get_xticklabels(),rotation=45)
ax1.set_title('Absolute Value')
ax2 = plt.subplot(1, 2, 2)
sns.heatmap(cros_sour_per.T, yticklabels=False, fmt='.f',ax=ax2,cmap="GnBu")
ax2.set_xticklabels(ax2.get_xticklabels(),rotation=45)
ax2.set_title('Percentage')
Out[59]:
Text(0.5,1,'Percentage')
In [76]:
db3[db3.trans_sour.isin(cros_sour.columns)]
Out[76]:
id_str from_user text time geo_coordinates user_lang in_reply_to_screen_name from_user_id_str in_reply_to_status_id_str source ... month trans_sour pre no addr lat long point_x Accurarcy eng_or_not
0 1.099692e+18 miriammarkfield RT @jordangreentcb: I filed my stories about t... 2019-02-24 15:24:51 NaN en NaN 1.022726e+08 NaN <a href="http://twitter.com" rel="nofollow">Tw... ... 2.0 Twitter Web Client North Carolina North Carolina North Carolina, USA 35.672964 -79.039292 35 40m 22.67s N, 79 2m 21.4508s W True English
1 1.099563e+18 1st_Reduce_Harm RT @jordangreentcb: #silentsam https://t.co/55... 2019-02-24 06:53:50 NaN en NaN 9.645217e+17 NaN <a href="http://twitter.com/download/android" ... ... 2.0 Twitter for Android Not "the Midwest", THE NORTH. Not "the Midwest", THE NORTH. Midwest, Natrona County, Wyoming, USA 43.411391 -106.280075 43 24m 41.0076s N, 106 16m 48.27s W False English
2 1.099726e+18 SilentSamIAm RT @jordangreentcb: Antiracists tell neo-Confe... 2019-02-24 17:42:29 NaN en NaN 9.137753e+17 NaN <a href="http://twitter.com/download/iphone" r... ... 2.0 Twitter for iPhone NaN NaN NaN NaN NaN NaN NaN English
3 1.099630e+18 IGD_News RT @adaure: Drowned out by the chants of “go h... 2019-02-24 11:18:06 NaN en NaN 3.289440e+09 NaN <a href="http://twitter.com/download/android" ... ... 2.0 Twitter for Android NaN NaN NaN NaN NaN NaN NaN English
4 1.099634e+18 tartnyc RT @jordangreentcb: Antiracists tell neo-Confe... 2019-02-24 11:35:24 NaN en NaN 1.596175e+08 NaN <a href="https://mobile.twitter.com" rel="nofo... ... 2.0 Twitter Web App NaN NaN NaN NaN NaN NaN NaN English
5 1.100961e+18 DT_Sensual People that seek separation without tolerance ... 2019-02-28 03:28:14 NaN en NaN 3.767591e+09 NaN <a href="http://twitter.com/download/android" ... ... 2.0 Twitter for Android Cary, NC Cary, NC Cary, Wake County, North Carolina, USA 35.788305 -78.781196 35 47m 17.8966s N, 78 46m 52.307s W True English
6 1.099965e+18 bluemazatl RT @jordangreentcb: The coalition of neo-Confe... 2019-02-25 09:30:09 NaN en NaN 2.365186e+08 NaN <a href="http://twitter.com/download/android" ... ... 2.0 Twitter for Android Valsetz, Oregon Valsetz, Oregon Valsetz, Polk County, Oregon, USA 44.836235 -123.651337 44 50m 10.4464s N, 123 39m 4.81248s W True English
7 1.099699e+18 KendraElWa RT @jordangreentcb: The coalition of neo-Confe... 2019-02-24 15:56:03 NaN en NaN 1.548189e+09 NaN <a href="http://twitter.com/download/iphone" r... ... 2.0 Twitter for iPhone NaN NaN NaN NaN NaN NaN NaN English
8 1.100041e+18 brklwyr In awe of the great print/online work this yea... 2019-02-25 14:33:48 NaN en NaN 3.304751e+09 NaN <a href="http://twitter.com/download/iphone" r... ... 2.0 Twitter for iPhone NaN NaN NaN NaN NaN NaN NaN English
9 1.099630e+18 cujowasagoodboy RT @adaure: Drowned out by the chants of “go h... 2019-02-24 11:19:38 NaN en NaN 6.079942e+07 NaN <a href="http://twitter.com/download/android" ... ... 2.0 Twitter for Android Tucson, AZ Tucson, AZ Tucson, Pima County, Arizona, USA 32.221892 -110.926235 32 13m 18.8101s N, 110 55m 34.4471s W True English
10 1.099636e+18 M_Abdirizak93 RT @jordangreentcb: #silentsam https://t.co/55... 2019-02-24 11:43:40 NaN en NaN 2.419161e+09 NaN <a href="http://twitter.com" rel="nofollow">Tw... ... 2.0 Twitter Web Client Nairobi, Kenya Nairobi, Kenya Nairobi, Kenya -1.283253 36.817245 1 16m 59.7119s S, 36 49m 2.08164s E True English
11 1.099712e+18 talkingattheTV2 RT @jordangreentcb: The coalition of neo-Confe... 2019-02-24 16:45:50 NaN en NaN 2.535755e+09 NaN <a href="http://twitter.com/download/iphone" r... ... 2.0 Twitter for iPhone NaN NaN NaN NaN NaN NaN NaN English
12 1.099823e+18 margaretcmaurer RT @jordangreentcb: I filed my stories about t... 2019-02-25 00:08:44 NaN en NaN 9.971599e+17 NaN <a href="http://twitter.com/download/iphone" r... ... 2.0 Twitter for iPhone Chapel Hill, NC Chapel Hill, NC Chapel Hill, Orange County, North Carolina, USA 35.913154 -79.055780 35 54m 47.3551s N, 79 3m 20.808s W True English
13 1.099696e+18 takeactionch RT @AylingLindsay: Before the rally, antiracis... 2019-02-24 15:42:44 NaN en NaN 1.036645e+18 NaN <a href="http://twitter.com/download/iphone" r... ... 2.0 Twitter for iPhone Chapel Hill, NC Chapel Hill, NC Chapel Hill, Orange County, North Carolina, USA 35.913154 -79.055780 35 54m 47.3551s N, 79 3m 20.808s W True English
14 1.099688e+18 dhosterman RT @jordangreentcb: The coalition of neo-Confe... 2019-02-24 15:11:46 NaN en NaN 1.682676e+07 NaN <a href="http://twitter.com/download/android" ... ... 2.0 Twitter for Android Durham, NC Durham, NC Durham County, North Carolina, USA 36.018132 -78.875158 36 1m 5.27376s N, 78 52m 30.5695s W True English
15 1.099675e+18 LocoCravey RT @jordangreentcb: The coalition of neo-Confe... 2019-02-24 14:17:19 NaN en NaN 1.119392e+09 NaN <a href="http://twitter.com/download/android" ... ... 2.0 Twitter for Android Carrboro, North Carolina Carrboro, North Carolina Carrboro, Orange County, North Carolina, 27510... 35.910144 -79.075289 35 54m 36.5177s N, 79 4m 31.0422s W True English
16 1.099655e+18 226press RT @jordangreentcb: Antiracists tell neo-Confe... 2019-02-24 12:58:28 NaN en NaN 1.710163e+09 NaN <a href="http://twitter.com/download/iphone" r... ... 2.0 Twitter for iPhone Philadelphia, PA Philadelphia, PA Philadelphia, Philadelphia County, Pennsylvani... 39.952415 -75.163576 39 57m 8.69472s N, 75 9m 48.8718s W True English
17 1.099748e+18 constantnatalie RT @jordangreentcb: The coalition of neo-Confe... 2019-02-24 19:10:30 NaN en NaN 2.796604e+08 NaN <a href="http://twitter.com/download/android" ... ... 2.0 Twitter for Android chicago chicago Chicago, Cook County, Illinois, USA 41.875562 -87.624421 41 52m 32.0218s N, 87 37m 27.9163s W True English
18 1.099924e+18 tmorman RT @jordangreentcb: I filed my stories about t... 2019-02-25 06:49:56 NaN en NaN 5.231526e+07 NaN <a href="https://mobile.twitter.com" rel="nofo... ... 2.0 Twitter Web App Raleigh, NC, Earth Raleigh, NC, Earth Earth Fare, Moncreiffe Road, Brier Creek, Rale... 35.899533 -78.792262 35 53m 58.3195s N, 78 47m 32.1441s W False English
19 1.099634e+18 amymorto RT @jordangreentcb: Antiracists tell neo-Confe... 2019-02-24 11:36:17 NaN en NaN 1.727731e+08 NaN <a href="http://twitter.com/download/android" ... ... 2.0 Twitter for Android NaN NaN NaN NaN NaN NaN NaN English
20 1.100498e+18 acthistreview .@sams_reckoning is an amazing activist-histor... 2019-02-26 20:50:12 NaN en NaN 8.044166e+17 NaN <a href="http://twitter.com" rel="nofollow">Tw... ... 2.0 Twitter Web Client NaN NaN NaN NaN NaN NaN NaN English
21 1.099632e+18 Dj_Sepultourist RT @jordangreentcb: Antiracists tell neo-Confe... 2019-02-24 11:28:45 NaN en NaN 1.734498e+09 NaN <a href="http://twitter.com/download/iphone" r... ... 2.0 Twitter for iPhone Poop, yucatan Poop, yucatan Poop, Tixcacalcupul, Yucat??n, M??xico 20.330462 -88.401261 20 19m 49.6625s N, 88 24m 4.53888s W True English
22 1.099908e+18 tarheelesq RT @jordangreentcb: The coalition of neo-Confe... 2019-02-25 05:45:42 NaN en NaN 2.645063e+08 NaN <a href="http://twitter.com/#!/download/ipad" ... ... 2.0 Twitter for iPad NaN NaN NaN NaN NaN NaN NaN English
23 1.099677e+18 angry_barbaloot RT @jordangreentcb: I filed my stories about t... 2019-02-24 14:25:35 NaN en NaN 8.204912e+17 NaN <a href="http://twitter.com" rel="nofollow">Tw... ... 2.0 Twitter Web Client Off Off Offutt Air Force Base, Looking Glass Avenue, O... 41.118376 -95.910131 41 7m 6.15288s N, 95 54m 36.4712s W False English
24 1.099633e+18 condorscondor RT @jordangreentcb: Antiracists tell neo-Confe... 2019-02-24 11:32:41 NaN en NaN 1.416424e+09 NaN <a href="http://twitter.com/download/android" ... ... 2.0 Twitter for Android NaN NaN NaN NaN NaN NaN NaN English
25 1.099684e+18 chogmorson RT @jordangreentcb: The coalition of neo-Confe... 2019-02-24 14:52:34 NaN en NaN 2.214182e+09 NaN <a href="http://twitter.com" rel="nofollow">Tw... ... 2.0 Twitter Web Client North Carolina, USA North Carolina, USA North Carolina, USA 35.672964 -79.039292 35 40m 22.67s N, 79 2m 21.4508s W True English
26 1.099635e+18 AndreFastayrol RT @adaure: Drowned out by the chants of “go h... 2019-02-24 11:39:33 NaN fr NaN 3.624639e+08 NaN <a href="http://twitter.com/#!/download/ipad" ... ... 2.0 Twitter for iPad NaN NaN NaN NaN NaN NaN NaN Non-English
27 1.099654e+18 takethemdownnow RT @jordangreentcb: “May Turner, John Brown, a... 2019-02-24 12:55:48 NaN en NaN 9.032689e+17 NaN <a href="http://twitter.com/download/iphone" r... ... 2.0 Twitter for iPhone NaN NaN NaN NaN NaN NaN NaN English
28 1.099692e+18 magus721rn RT @jordangreentcb: Antiracists tell neo-Confe... 2019-02-24 15:26:52 NaN en NaN 7.060636e+07 NaN <a href="http://twitter.com/download/android" ... ... 2.0 Twitter for Android WAKANDA! WAKANDA! Wakanda, Kyabishonga kibirizi, Bukoba, Kagera,... -1.663241 31.401493 1 39m 47.6672s S, 31 24m 5.37408s E False English
29 1.099675e+18 RichmondDoc RT @jordangreentcb: The coalition of neo-Confe... 2019-02-24 14:20:36 NaN en NaN 2.086598e+07 NaN <a href="http://twitter.com/download/iphone" r... ... 2.0 Twitter for iPhone NaN NaN NaN NaN NaN NaN NaN English
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
59218 7.225928e+17 Move_The_Statue #SikesSitIn #TillmanHall #CalhounHonorsCollege... 2016-04-20 02:08:45 NaN en NaN 7.149915e+17 NaN <a href="http://twitter.com" rel="nofollow">Tw... ... 4.0 Twitter Web Client NaN NaN NaN NaN NaN NaN NaN English
59219 7.073531e+17 pme919 RT @iah_unc: #artsandsocialjustice grant deadl... 2016-03-08 23:51:34 NaN en NaN 6.723252e+06 NaN <a href="https://about.twitter.com/products/tw... ... 3.0 TweetDeck Durham, NC Durham, NC Durham County, North Carolina, USA 36.018132 -78.875158 36 1m 5.27376s N, 78 52m 30.5695s W NaN English
59222 7.856950e+17 jhollymc @courtenayrogers @GlenCasada @WSMVHayleyMason ... 2016-10-11 05:14:21 NaN en courtenayrogers 1.191871e+08 7.856735e+17 <a href="http://twitter.com/download/iphone" r... ... 10.0 Twitter for iPhone NaN NaN NaN NaN NaN NaN NaN English
59223 7.073527e+17 unc_cfe RT @iah_unc: #artsandsocialjustice grant deadl... 2016-03-08 23:49:57 NaN en NaN 7.602359e+07 NaN <a href="https://about.twitter.com/products/tw... ... 3.0 TweetDeck Chapel Hill, NC Chapel Hill, NC Chapel Hill, Orange County, North Carolina, USA 35.913154 -79.055780 35 54m 47.3551s N, 79 3m 20.808s W NaN English
59224 7.863478e+17 knightstivender Here at the @franklinhomepg candidate forum. O... 2016-10-13 00:28:25 NaN en NaN 1.544421e+07 NaN <a href="http://twitter.com/download/iphone" r... ... 10.0 Twitter for iPhone Nashville, Tennessee Nashville, Tennessee Nashville-Davidson, Davidson County, Tennessee... 36.162230 -86.774353 36 9m 44.0266s N, 86 46m 27.6712s W NaN English
59227 7.033214e+17 CPJ_UNC What to do with #SilentSam at @UNC? Libbie Wei... 2016-02-26 20:51:12 NaN en NaN 3.092623e+09 NaN <a href="http://twitter.com" rel="nofollow">Tw... ... 2.0 Twitter Web Client Chapel Hill, North Carolina Chapel Hill, North Carolina Chapel Hill, Orange County, North Carolina, USA 35.913154 -79.055780 35 54m 47.3551s N, 79 3m 20.808s W NaN English
59228 7.410846e+17 laurenps_ "Y'alls stories from high school make me glad ... 2016-06-10 02:48:33 NaN en NaN 3.351230e+08 NaN <a href="http://twitter.com/download/iphone" r... ... 6.0 Twitter for iPhone Seymour, TN Seymour, TN Seymour, Sevier County, Tennessee, 37865, USA 35.881048 -83.776618 35 52m 51.7728s N, 83 46m 35.8258s W NaN English
59229 7.199931e+17 Robert_Smalls62 New Research Guide on the #SilentSam Confedera... 2016-04-12 21:58:26 NaN en NaN 7.161498e+17 NaN <a href="http://twitter.com" rel="nofollow">Tw... ... 4.0 Twitter Web Client NaN NaN NaN NaN NaN NaN NaN English
59230 6.950692e+17 WilsonforSBP A: Jim Carr #SilentSam dedication an example o... 2016-02-04 02:19:38 NaN en NaN 4.711040e+09 NaN <a href="http://twitter.com" rel="nofollow">Tw... ... 2.0 Twitter Web Client NaN NaN NaN NaN NaN NaN NaN English
59231 7.482184e+17 _thepopshop 󾀿Canada Day Strawberries with shots of #Silent... 2016-06-29 19:15:49 NaN en NaN 1.129817e+09 NaN <a href="http://www.facebook.com/twitter" rel=... ... 6.0 Facebook Durham Region & the GTA Durham Region & the GTA Trent University Durham GTA, Vancouver Court, ... 43.889869 -78.890732 43 53m 23.5271s N, 78 53m 26.6349s W NaN English
59232 6.950765e+17 WilsonforSBP "If we want to move Silent Sam, we have to mov... 2016-02-04 02:48:40 NaN en NaN 4.711040e+09 NaN <a href="http://twitter.com" rel="nofollow">Tw... ... 2.0 Twitter Web Client NaN NaN NaN NaN NaN NaN NaN English
59233 8.061298e+17 Big_G_09 @sec1968 Just got outta Edmonton and Arizona! ... 2016-12-06 13:35:04 NaN en sec1968 1.060183e+09 8.061295e+17 <a href="http://twitter.com" rel="nofollow">Tw... ... 12.0 Twitter Web Client NaN NaN NaN NaN NaN NaN NaN English
59234 7.877145e+17 calgaryhester @sambradd P.S. I like that there's a range of ... 2016-10-16 18:59:05 NaN en drawing_change 7.561168e+08 7.877103e+17 <a href="http://twitter.com" rel="nofollow">Tw... ... 10.0 Twitter Web Client Calgary, Alberta, Canada Calgary, Alberta, Canada Calgary, Alberta, Canada 51.025327 -114.049869 51 1m 31.1763s N, 114 2m 59.5265s W NaN English
59235 7.043884e+17 cdwyer0213 "Silent Sam may be without ammunition, but he ... 2016-02-29 19:31:00 NaN en NaN 2.604985e+07 NaN <a href="http://twitter.com/download/iphone" r... ... 2.0 Twitter for iPhone Durham, NC Durham, NC Durham County, North Carolina, USA 36.018132 -78.875158 36 1m 5.27376s N, 78 52m 30.5695s W NaN English
59236 7.862504e+17 tndp Tennesseans need to know where candidates stan... 2016-10-12 18:01:28 NaN en NaN 1.991617e+07 NaN <a href="https://about.twitter.com/products/tw... ... 10.0 TweetDeck Tennessee Tennessee Tennessee, USA 35.773008 -86.282008 35 46m 22.8274s N, 86 16m 55.2292s W NaN English
59237 7.997624e+17 Avilyst Silent sam Please be good to me tonight 😜😂 #Si... 2016-11-18 23:53:07 NaN en NaN 3.301059e+09 NaN <a href="http://twitter.com/download/iphone" r... ... 11.0 Twitter for iPhone Ontario, Canada Ontario, Canada Ontario, Canada 50.000678 -86.000977 50 0m 2.4408s N, 86 0m 3.5172s W NaN English
59239 7.130287e+17 UNChistory RT @iah_unc: Faculty Fellow @UNChistory Fitzhu... 2016-03-24 15:44:29 NaN en NaN 2.493137e+08 NaN <a href="http://twitter.com" rel="nofollow">Tw... ... 3.0 Twitter Web Client Chapel Hill, NC Chapel Hill, NC Chapel Hill, Orange County, North Carolina, USA 35.913154 -79.055780 35 54m 47.3551s N, 79 3m 20.808s W NaN English
59240 8.048933e+17 Schmocki_Boi @SI_ChrisBallard @samhinkie great article. If... 2016-12-03 03:41:35 NaN en SI_ChrisBallard 3.457689e+08 8.040047e+17 <a href="http://twitter.com/download/android" ... ... 12.0 Twitter for Android Philadelphia, PA Philadelphia, PA Philadelphia, Philadelphia County, Pennsylvani... 39.952415 -75.163576 39 57m 8.69472s N, 75 9m 48.8718s W NaN English
59241 6.960907e+17 Surreyissafe @CBCNews she looks like a plastic blow up doll... 2016-02-06 21:58:59 NaN en CBCNews 2.393043e+09 6.960835e+17 <a href="http://twitter.com/download/iphone" r... ... 2.0 Twitter for iPhone newton wave pool newton wave pool Newton Wave Pool, 13730, 72 Avenue, Newton, Su... 49.132854 -122.842398 49 7m 58.2737s N, 122 50m 32.632s W NaN English
59242 8.075521e+17 ljhhrpr Silent Sam!in the am fuck yeah!\r\r\n#silentsa... 2016-12-10 11:46:41 NaN en NaN 8.048924e+17 NaN <a href="http://twitter.com/download/android" ... ... 12.0 Twitter for Android NaN NaN NaN NaN NaN NaN NaN English
59243 7.065833e+17 TarHeelAltRight @jdmar3 @ChapelHillNews1 Ah yes, employing an ... 2016-03-06 20:52:51 NaN en jdmar3 7.060160e+17 7.065479e+17 <a href="http://twitter.com" rel="nofollow">Tw... ... 3.0 Twitter Web Client NaN NaN NaN NaN NaN NaN NaN English
59244 7.127749e+17 iah_unc Faculty Fellow @UNChistory Fitzhugh Brundage d... 2016-03-23 22:55:56 NaN en NaN 3.581632e+08 NaN <a href="http://twitter.com" rel="nofollow">Tw... ... 3.0 Twitter Web Client Chapel Hill, NC Chapel Hill, NC Chapel Hill, Orange County, North Carolina, USA 35.913154 -79.055780 35 54m 47.3551s N, 79 3m 20.808s W NaN English
59245 7.065353e+17 TriangleEditor RT @jdmar3: A monument to unknown dead people ... 2016-03-06 17:41:48 NaN en NaN 5.501892e+07 NaN <a href="https://about.twitter.com/products/tw... ... 3.0 TweetDeck Chapel Hill, NC Chapel Hill, NC Chapel Hill, Orange County, North Carolina, USA 35.913154 -79.055780 35 54m 47.3551s N, 79 3m 20.808s W NaN English
59246 7.073456e+17 AshHeffe RT @iah_unc: #artsandsocialjustice grant deadl... 2016-03-08 23:21:35 NaN en NaN 2.185474e+09 NaN <a href="http://twitter.com/download/iphone" r... ... 3.0 Twitter for iPhone NaN NaN NaN NaN NaN NaN NaN English
59247 7.065476e+17 TarHeelAltRight @jdmar3 @ChapelHillNews1 Because #SilentSam is... 2016-03-06 18:30:56 NaN en jdmar3 7.060160e+17 7.065298e+17 <a href="http://twitter.com" rel="nofollow">Tw... ... 3.0 Twitter Web Client NaN NaN NaN NaN NaN NaN NaN English
59248 7.570521e+17 niiikkkiii_mn I shall drown myself in this beautiful bottle ... 2016-07-24 04:17:36 NaN en NaN 4.055911e+09 NaN <a href="http://twitter.com/download/iphone" r... ... 7.0 Twitter for iPhone NaN NaN NaN NaN NaN NaN NaN English
59249 6.950764e+17 WilsonforSBP A: What will you do to support the students in... 2016-02-04 02:48:17 NaN en NaN 4.711040e+09 NaN <a href="http://twitter.com" rel="nofollow">Tw... ... 2.0 Twitter Web Client NaN NaN NaN NaN NaN NaN NaN English
59250 7.178766e+17 viejas46 @RealGlenMacnow That would be like #SilentSam. 2016-04-07 01:48:10 NaN en RealGlenMacnow 3.924814e+07 7.178748e+17 <a href="http://twitter.com" rel="nofollow">Tw... ... 4.0 Twitter Web Client VA Beach VA Beach V Beach, D550, Al?_??tepe, Eceabat, ??anakkale... 40.044048 26.183953 40 2m 38.5714s N, 26 11m 2.22936s E NaN English
59252 7.087297e+17 haley_nm I apologize to anyone that I was talking to la... 2016-03-12 19:01:45 NaN en NaN 3.278226e+08 NaN <a href="http://twitter.com/download/iphone" r... ... 3.0 Twitter for iPhone NaN NaN NaN NaN NaN NaN NaN English
59253 7.065352e+17 LocoCravey RT @jdmar3: A monument to unknown dead people ... 2016-03-06 17:41:31 NaN en NaN 1.119392e+09 NaN <a href="http://twitter.com" rel="nofollow">Tw... ... 3.0 Twitter Web Client Carrboro, North Carolina Carrboro, North Carolina Carrboro, Orange County, North Carolina, 27510... 35.910144 -79.075289 35 54m 36.5177s N, 79 4m 31.0422s W NaN English

58746 rows × 42 columns

In [84]:
px.parallel_categories(db3[db3.trans_sour.isin(cros_sour.columns) | db3.no.isin(top3places.columns)],dimensions=["trans_sour","eng_or_not",'retweet_count',"no"], color="retweet_count",
                      title="Sources and Language Co-corrence",color_continuous_scale=px.colors.sequential.GnBu)
In [88]:
px.bar(db3[db3.trans_sour.isin(cros_sour.columns)], x="trans_sour", y="retweet_count", color="trans_sour", title="RT Counts of Different Sources")
In [ ]:
px.bar(db3[db3.trans_sour.isin(cros_sour.columns)], x="trans_sour", y="favorite_count", color="trans_sour", title="Fav Counts of Different Sources")
In [90]:
cl.scales['10']['div']['RdYlBu'][-3]
Out[90]:
'rgb(116,173,209)'
In [ ]:
['Greys', 'YlGnBu', 'Greens', 'YlOrRd', 'Bluered', 'RdBu',
            'Reds', 'Blues', 'Picnic', 'Rainbow', 'Portland', 'Jet',
            'Hot', 'Blackbody', 'Earth', 'Electric', 'Viridis', 'Cividis']
In [33]:
user_set.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26097 entries, 0 to 26096
Data columns (total 35 columns):
Unnamed: 0                   26097 non-null int64
user_screen_name             26097 non-null object
tweets_num                   26097 non-null int64
id_str                       26097 non-null int64
from_user                    26097 non-null object
text                         26097 non-null object
time                         26097 non-null object
geo_coordinates              9 non-null object
user_lang                    26097 non-null object
in_reply_to_screen_name      571 non-null object
from_user_id_str             26097 non-null int64
in_reply_to_status_id_str    450 non-null float64
source                       26097 non-null object
user_followers_count         26077 non-null float64
user_friends_count           26047 non-null float64
user_location                19186 non-null object
entities_str                 26097 non-null object
place                        234 non-null object
retweet_count                23660 non-null float64
favorite_count               1977 non-null float64
user_description             23241 non-null object
user_created_at              26097 non-null object
user_geo_enabled             10984 non-null object
user_listed_count            22346 non-null float64
user_verified                549 non-null object
user_statuses_count          26097 non-null int64
user_favourites_count        26033 non-null float64
possibly_sensitive           85 non-null object
lang_trans                   26097 non-null object
date                         26097 non-null object
outlier_is                   26097 non-null int64
outlier_dbs                  26097 non-null int64
isoutlier_if                 26097 non-null int64
created_days                 26097 non-null float64
eng_or_not                   26097 non-null object
dtypes: float64(8), int64(8), object(19)
memory usage: 7.0+ MB
In [24]:
sns.jointplot(x="favorite_count", y="retweet_count", data=db4)
c:\python36\lib\site-packages\scipy\stats\stats.py:1713: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

Out[24]:
<seaborn.axisgrid.JointGrid at 0x25dbf571c88>
In [53]:
sns.jointplot(x="favorite_count", y="retweet_count", data=db4[db4['retweet_count'].notna() | db4['favorite_count'].notna()].fillna(0))
c:\python36\lib\site-packages\scipy\stats\stats.py:1713: FutureWarning:

Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

Out[53]:
<seaborn.axisgrid.JointGrid at 0x25dc5cc6668>