[12]
%load_ext watermark
%watermark -a "Romell D.Z." -u -d -p tweepy,scipy,nltk,gensim,sklearn,networkx,textblob,spacy
The watermark extension is already loaded. To reload it, use: %reload_ext watermark Romell D.Z. last updated: 2019-08-25 tweepy 3.6.0 scipy 1.1.0 nltk 3.2.5 gensim 3.4.0 sklearn 0.20.0 networkx 2.3 textblob 0.15.1 spacy 2.0.12
[48]
import os
import tweepy
from tweepy import Stream

import netrc

from unidecode import unidecode
import re

from tweepy import StreamListener

%matplotlib inline

from pprint import pprint
import pyprind
import pandas as pd

import random
from scipy.stats import beta as beta_distribution
import numpy as np

from nltk.corpus import stopwords 
from nltk import word_tokenize
from nltk.data import load
from nltk.stem import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import string

import gensim
from gensim import corpora

import pyLDAvis
import pyLDAvis.gensim  # don't skip this

from gensim.models.ldamodel import LdaModel
from string import punctuation
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV

from textblob import Word
from textblob import TextBlob

from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.simplefilter('ignore')

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

import advertools as adv

import networkx as nx

import spacy
from spacy import displacy
nlp = spacy.load('es_core_news_sm')
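The Spanish model must be installed once before spacy.load can find it (one-time shell command):

# python -m spacy download es_core_news_sm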
[3]
auth = netrc.netrc()
ckey,_,csecret=auth.authenticators('tweet_api')
atoken,_,asecret=auth.authenticators('tweet_secret')
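For reference, a minimal sketch of the ~/.netrc entries the two authenticators calls above expect; the machine names come from the code, the placeholder values are assumptions (the discarded middle tuple element is netrc's optional account field):

machine tweet_api
    login <consumer-key>
    password <consumer-secret>
machine tweet_secret
    login <access-token>
    password <access-token-secret>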

BoundingBox

[4]
# LIMA_GEO_LOCATION_BOUNDING_BOX = [-77.1785277831,-12.1531578397,-76.8967618806,-11.9288928156]
LIMA_GEO_LOCATION_BOUNDING_BOX = [ -81.802362,-17.525482, -69.774343,-3.226278]
auth = tweepy.OAuthHandler(ckey, csecret)
auth.set_access_token(atoken, asecret)
api = tweepy.API(auth)
[5]
NUMBER_OF_TWEETS = 1000
pbar = pyprind.ProgBar(NUMBER_OF_TWEETS)
[6]
class FiniteStreamListener(StreamListener):
    
    def __init__(self, number_of_tweets):
        self.number_of_tweets = number_of_tweets
        self.tweets = []
        self.tweets_dict = []
        super(FiniteStreamListener,self).__init__()
        
    def on_status(self, status):
        if len(self.tweets) < self.number_of_tweets:
            self.tweets_dict.append(status._json)
            place = status._json['place']['name'] if(status._json['place']) else ""
            self.tweets.append({'date':status.created_at,
                                'text':status.text,
                                'location':place,
                                'followers':status._json['user']['followers_count']})
            pbar.update()
        else:
            return False
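A sketch of an on_error handler that could be added inside FiniteStreamListener; on_error is part of tweepy's StreamListener API, and returning False on HTTP 420 disconnects instead of hammering the rate limit (the print is an assumption):

    def on_error(self, status_code):
        # 420 means the client is being rate limited; stop reconnect attempts
        if status_code == 420:
            return False
        print('Stream error: {}'.format(status_code))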
[7]
finite_stream_listener = FiniteStreamListener(number_of_tweets=NUMBER_OF_TWEETS)
streaming_api = Stream(auth=auth, listener=finite_stream_listener,timeout=60)

The streaming API doesn't allow filtering by location and keyword simultaneously.

[8]
EMOTICONS = ">:] :-) :) :o) :] :3 :c) :> =] 8) =) :} :^) "
EMOTICONS = EMOTICONS.strip().split(' ')
# streaming_api.filter(track=EMOTICONS, async=True)
# note: the `async` argument was renamed `is_async` in tweepy >= 3.7,
# since `async` became a reserved word in Python 3.7
streaming_api.filter(locations=LIMA_GEO_LOCATION_BOUNDING_BOX,async=True)
[12]
print(len(finite_stream_listener.tweets))
56
0% [##############################] 100% | ETA: 00:00:00 Total time elapsed: 01:05:42
[13]
np.save('tweets_dict',finite_stream_listener.tweets_dict)
[17]
def make_lowercase(tweet):
    return tweet.lower()

def remove_diacritics(tweet):
    return unidecode(tweet)

def remove_non_alpha_characters(tweet):
    return ''.join(character for character in tweet if character.isalpha() or character == ' ')

def remove_web_site(tweet):
    # \S+ (rather than \w+) also consumes '://...' when applied to raw text
    return re.sub(r'http\S+', '', tweet)
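A quick sanity check of the cleaning chain on a made-up tweet (the sample string is an assumption; the functions are applied in the same order as the pipeline below):

sample = 'Vamos PERÚ!! https://t.co/abc123 #voley'
for f in [make_lowercase, remove_non_alpha_characters, remove_web_site]:
    sample = f(sample)
print(sample)  # -> 'vamos perú  voley' (note the leftover double space)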
[18]
tweets_df = pd.DataFrame.from_dict(finite_stream_listener.tweets)
tweets_df.rename(columns={'text':'Tweets'},inplace=True)
[19]
tweets_df['word_count'] = tweets_df['Tweets'].apply(lambda x: len(str(x).split(" ")))
tweets_df['char_count'] = tweets_df['Tweets'].str.len()

def avg_word(sentence):
    words = sentence.split()
    return (sum(len(word) for word in words)/len(words))

tweets_df['avg_word'] = tweets_df['Tweets'].apply(lambda x: avg_word(x))
tweets_df['hastags'] = tweets_df['Tweets'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
tweets_df['numerics'] = tweets_df['Tweets'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
[20]
cleaned_tweets = list(tweets_df['Tweets'])
for cleaning_function in \
    [make_lowercase, 
#      remove_diacritics,
     remove_non_alpha_characters,
     remove_web_site]:
    cleaned_tweets = [cleaning_function(tweet) for tweet in cleaned_tweets]
[21]
random.sample(cleaned_tweets,5)
['ultima noche de semanademayordomia en la iasdesperanza aprendiendo sobre la importancia de ser fiel a dios con l ',
 'não existe boy legal em salvador asklgbt',
 'movistarvoleype vamos perú no podemos flaquear más adelante los equipos son más fuertes sabemos de su espíritu de lucha arriba perú',
 'noticiamerica tanto lío por ese pequeño lapsus',
 'pobre chica la cuerda siempre se rompe por el lado mas débil']
[22]
KEYWORD='lima'
number_of_occurences = sum(KEYWORD in tweet for tweet in cleaned_tweets)
print('Our keyword appears {} times'.format(number_of_occurences))
print('Our keyword appeared in {}% of the tweets'.format(100 * number_of_occurences/NUMBER_OF_TWEETS))
Our keyword appears 41 times
Our keyword appeared in 4.1% of the tweets
[23]
pprint([tweet for tweet in cleaned_tweets if KEYWORD in tweet][:5])
['cooking with fire in peru today cookingclass seerundo lima peru '
 'southamerica lima peru ',
 'shopping for our cooking class today mercado seerundo lima peru '
 'southamerica mercado n de surquillo ',
 'im at the beer place in lima ',
 'im at centro cultural ricardo palma munimiraflores in miraflores lima ',
 'estoy listo y esperándote barbon amado una nueva jornada juntos hsmn '
 'octubremesmorado en sheraton lima hot ']
[24]
indices_of_tweets_containing_keyword=[index for index, tweet in enumerate(cleaned_tweets) if KEYWORD in tweet]
print('Indices of the tweets containing the keyword: %s' % indices_of_tweets_containing_keyword)
Indices of the tweets containing the keyword: [6, 23, 103, 107, 116, 144, 149, 173, 211, 219, 237, 252, 254, 260, 277, 330, 373, 389, 392, 411, 421, 512, 522, 550, 653, 667, 684, 726, 727, 736, 743, 749, 821, 867, 877, 879, 884, 907, 912, 927, 969]
[25]
distances_between_indices_of_tweets_containing_keyword = [
    second_index - first_index for first_index, second_index in \
        zip(indices_of_tweets_containing_keyword[:-1], indices_of_tweets_containing_keyword[1:])
]
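If tweets containing the keyword arrived independently with probability p, these gaps should be roughly geometric with mean 1/p; a quick hedged check against that expectation:

gaps = distances_between_indices_of_tweets_containing_keyword
print('mean gap: {:.1f}, expected 1/p: {:.1f}'.format(
    np.mean(gaps), NUMBER_OF_TWEETS / number_of_occurences))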
[26]
pd.Series(distances_between_indices_of_tweets_containing_keyword).hist()
plt.savefig('snapshot/lima_tweets_hist.png')
[27]
alpha = 1 + number_of_occurences
beta = 1 + (NUMBER_OF_TWEETS - number_of_occurences)

x_values = np.linspace(0, 1, 1002)[1:-1]
pdf_y_values = beta_distribution(alpha, beta).pdf(x_values)
cdf_y_values = np.cumsum(pdf_y_values) / np.sum(pdf_y_values)
[28]
plt.figure(figsize=(18, 6))
plt.subplot(121)
plt.plot(x_values, pdf_y_values, label=(r'$\alpha=%.1f,\ \beta=%.1f$' % (alpha, beta)))

plt.xlim(0, 1)
plt.xlabel('Probability of tweet containing keyword')
plt.ylabel('Probability density')
plt.title('Beta Distribution PDF')
plt.legend(loc=1)

plt.subplot(122)
plt.plot(x_values, cdf_y_values)
plt.xlim(0, 1)
plt.ylim(0, 1.005)
plt.yticks(np.linspace(0, 1, 21))
plt.xlabel('Probability of tweet containing keyword')
plt.ylabel('Cumulative probability')
plt.title('Beta Distribution CDF')
plt.savefig('snapshot/Beta Distribution CDF.png');
[29]
ix = [n for n, b in enumerate((cdf_y_values > .05) & (cdf_y_values < .95)) if b]
[30]
range_ =cdf_y_values[ix]
[31]
# the first/last grid points whose CDF lies strictly inside (.05, .95)
# bracket the central 90% credible interval
fifth_percentile, ninety_fifth_percentile = x_values[ix[0]], x_values[ix[-1]]
[32]
print('With 90% certainty, the true probability lies between {} and {}'.format(
    round(fifth_percentile, 10), round(ninety_fifth_percentile, 10)))
With 90% certainty, the true probability lies between 0.010989011 and 0.000999001
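For comparison, scipy can return the 90% credible interval directly from the posterior, without the grid-based CDF above (a sketch; the values will differ slightly from the discretized version):

lower, upper = beta_distribution(alpha, beta).interval(0.90)
print('90% credible interval: ({:.4f}, {:.4f})'.format(lower, upper))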
[33]
def compute_total_probability_that_probability_less_than_p(p):
    return max(cumulative_prob for cumulative_prob, x_value in zip(cdf_y_values, x_values) if x_value < p)

print('The probability that the true probability is > .1 is: {}'.format(
    1 - compute_total_probability_that_probability_less_than_p(.1)))
The probability that the true probability is > .1 is: 1.9441115384211116e-12
[34]
tweets_df.groupby('location').agg({'location':len}).sort_values('location').iloc[-15:].plot(kind='barh')
plt.title('Most Frequent locations')
plt.xlabel('Count')
plt.tight_layout()
plt.savefig('snapshot/most Frequent locations.png');
[35]
tweets_df.to_csv('lima_tweets.csv',index=None)
[36]
tweets_df = pd.read_csv('lima_tweets.csv')
[39]
stop = stopwords.words('spanish')
[40]
doc_complete = tweets_df.Tweets.values
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()
def clean(doc):
    # drop stopwords, strip punctuation, then lemmatize whatever remains
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

doc_clean = [clean(doc).split() for doc in doc_complete]   
tweets_df.dropna(inplace=True)
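WordNetLemmatizer is trained on English and mostly passes Spanish tokens through unchanged; a hedged alternative using the Spanish spaCy pipeline already loaded as nlp (clean_spacy is a hypothetical drop-in for clean above):

def clean_spacy(doc):
    # stopword/punctuation filtering as above, but Spanish lemmas from spaCy
    return ' '.join(token.lemma_ for token in nlp(doc.lower())
                    if token.text not in stop and not token.is_punct)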
[41]
tweets_df['Tweets_clean'] = pd.Series(doc_clean).apply(lambda x:' '.join(x))
tweets_df['word_count'] = tweets_df['Tweets_clean'].apply(lambda x: len(str(x).split(" ")))
tweets_df['char_count'] = tweets_df['Tweets_clean'].str.len()
[42]
def avg_word(sentence):
    words = sentence.split()
    if len(words)==0:
        return 0
    return (sum(len(word) for word in words)/len(words))

tweets_df['avg_word'] = tweets_df['Tweets_clean'].apply(lambda x: avg_word(x))
tweets_df['hastags'] = tweets_df['Tweets'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
tweets_df['numerics'] = tweets_df['Tweets'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
tweets_df.head()
date followers location Tweets word_count char_count avg_word hastags numerics Tweets_clean
0 2018-10-20 01:57:03 92 Lima @elpanfletope @AlanGarciaPeru chancho no come ... 5 48 8.800000 0 0 elpanfletope alangarciaperu chancho come chancho
1 2018-10-20 01:57:10 32 Lince @exitosape @JulianaOxenford A DONDE DICE ... 3 30 9.333333 0 0 exitosape julianaoxenford dice
2 2018-10-20 01:57:12 195 Brasil @anajuliachs kkkk 1 mês 4 22 4.750000 0 1 anajuliachs kkkk 1 mês
3 2018-10-20 01:57:17 228 Chimbote Ultima noche de #SemanaDeMayordomia en la @IAS... 11 105 8.636364 1 0 ultima noche semanademayordomia iasdesperanza ...
4 2018-10-20 01:57:18 123 San Miguel @fernando_roman1 Jajaja 2 21 10.000000 0 0 fernandoroman1 jajaja
[43]
freq = pd.Series(' '.join(tweets_df['Tweets_clean']).split()).value_counts()[:10]
freq.plot(kind='barh')
plt.title('Most Frequent words')
plt.xlabel('Count')
plt.tight_layout()
plt.savefig('snapshot/most Frequent words.png');
[44]
pprint(tweets_df['Tweets_clean'][:2])
0    elpanfletope alangarciaperu chancho come chancho
1                       exitosape julianaoxenford dice
Name: Tweets_clean, dtype: object
[45]
dictionary = corpora.Dictionary(tweets_df['Tweets_clean'].apply(lambda x:x.split()))

# Convert the list of documents (corpus) into a document-term matrix using the dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
[46]
pprint(doc_term_matrix[:2])
[[(0, 1), (1, 2), (2, 1), (3, 1)], [(4, 1), (5, 1), (6, 1)]]
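Each pair in the matrix is (token_id, count); the ids map back to tokens through the dictionary:

pprint([(dictionary[token_id], count) for token_id, count in doc_term_matrix[0]])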
[47]
Lda = gensim.models.ldamodel.LdaModel
ldamodel = Lda(doc_term_matrix, num_topics=3, id2word = dictionary, passes=50)
[48]
from pprint import pprint
pprint(ldamodel.print_topics(num_topics=3, num_words=3))
[(0, '0.006*"peru" + 0.006*"lima" + 0.005*"–"'),
 (1, '0.004*"lima" + 0.004*"in" + 0.003*"fiscal"'),
 (2, '0.005*"si" + 0.003*"canaln" + 0.002*"amor"')]
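To see which topic dominates an individual tweet, the model can score one bag-of-words at a time (a sketch):

pprint(ldamodel.get_document_topics(doc_term_matrix[0]))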
[49]
# from gensim.test.utils import datapath
# fname = datapath("lda_lima_tweet_model")
ldamodel.save("lda_lima_tweet_model")
[50]
from gensim.models.ldamodel import LdaModel
ldamodel = LdaModel.load("lda_lima_tweet_model")
[51]
doc_lda = ldamodel[doc_term_matrix]
[52]
print('Perplexity: ', ldamodel.log_perplexity(doc_term_matrix))  # per-word likelihood bound; perplexity = 2**(-bound), so higher (less negative) is better
Perplexity: -8.825555626039103
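Topic coherence is often a more interpretable quality signal than perplexity; gensim ships a CoherenceModel for this (a sketch using c_v coherence):

from gensim.models.coherencemodel import CoherenceModel

coherence_model = CoherenceModel(model=ldamodel, texts=doc_clean,
                                 dictionary=dictionary, coherence='c_v')
print('Coherence (c_v):', coherence_model.get_coherence())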
[53]
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary)
vis
[55]
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    # despite the name, this collects the deduplicated *head* tokens whose
    # POS tag starts with one of the allowed tags
    texts_out = []
    for sent in texts:
        doc_ = nlp(sent)
        texts_out.append(list(set([str(c.head) for c in doc_ if c.head.tag_.startswith(tuple(allowed_postags))])))
    return texts_out

lemmatization(tweets_df['Tweets_clean'][:5],['VERB'])
[[], ['dice'], [], ['aprendiendo'], []]
[56]
def join_comma(row_list):
    if row_list == []:
        return np.NaN
    else:
        return ', '.join(row_list)

tweets_df['ACTIONS']=pd.Series(lemmatization(tweets_df['Tweets_clean'],['VERB'])).apply(join_comma)
tweets_df['NOUNS']=pd.Series(lemmatization(tweets_df['Tweets_clean'],['NOUN'])).apply(join_comma)
[57]
tweets_df[['Tweets_clean','NOUNS','ACTIONS']].head()
Tweets_clean NOUNS ACTIONS
0 elpanfletope alangarciaperu chancho come chancho elpanfletope NaN
1 exitosape julianaoxenford dice exitosape dice
2 anajuliachs kkkk 1 mês mês NaN
3 ultima noche semanademayordomia iasdesperanza ... noche aprendiendo
4 fernandoroman1 jajaja jajaja NaN
[58]
tweets_df.head()
date followers location Tweets word_count char_count avg_word hastags numerics Tweets_clean ACTIONS NOUNS
0 2018-10-20 01:57:03 92 Lima @elpanfletope @AlanGarciaPeru chancho no come ... 5 48 8.800000 0 0 elpanfletope alangarciaperu chancho come chancho NaN elpanfletope
1 2018-10-20 01:57:10 32 Lince @exitosape @JulianaOxenford A DONDE DICE ... 3 30 9.333333 0 0 exitosape julianaoxenford dice dice exitosape
2 2018-10-20 01:57:12 195 Brasil @anajuliachs kkkk 1 mês 4 22 4.750000 0 1 anajuliachs kkkk 1 mês NaN mês
3 2018-10-20 01:57:17 228 Chimbote Ultima noche de #SemanaDeMayordomia en la @IAS... 11 105 8.636364 1 0 ultima noche semanademayordomia iasdesperanza ... aprendiendo noche
4 2018-10-20 01:57:18 123 San Miguel @fernando_roman1 Jajaja 2 21 10.000000 0 0 fernandoroman1 jajaja NaN jajaja
[59]
tweets_df.to_csv('tweets_solutions.csv',index=None)
[8]
tweets_df = pd.read_csv('tweets_solutions.csv')
tweets_df.dropna(inplace=True)
[9]
tweets_df['Tweets_clean']
1                         exitosape julianaoxenford dice
3      ultima noche semanademayordomia iasdesperanza ...
6      cooking with fire in peru today cookingclass s...
7      show música vivo tecnópolis cierra temporada 2...
8                      maldita perra perdón alteré 💁🏻‍♀️
9      mininterperu pcmperu vizcarrhagan algo denle s...
10     🇪🇨🇪🇨quito ecuador 🇪🇨🇪🇨 vemos mañana eslae http...
11     sasha71396634 ernestojx hninurta gabospeed94 k...
12     ¡conoce proceso exhumación caso mascarilla com...
15     ronaldomendes triste fim da baleia bora fazer ...
17     mt colrichardkemp pt quinze mulheres disseram ...
19     kerch mourns victim of college massacre a surv...
21     pensar convento san francisco reposan restos f...
22     reaccionando ❌te puse perder ❌ javierramireze ...
23     shopping for our cooking class today mercado s...
26     acompañamos autores alvarobisama marcelo mella...
28     sturt0208 canaln martinvizcarrac arrugador fuj...
31     🎗️la prevención mejor tratamiento🎗️🙆‍♀️ cáncer...
32     tatahcomenta onde tendo live sigam anapaularen...
34     richardacunan tarde reacción sensatez tiempo h...
35     tô levando tempo necessário p entender preciso...
36     sí devolvió libropero parte cumpliócon debíaah...
39     cara palo jc rodríguez preguntando álvaro sala...
40     capital967 mauriciomulder vergüenza dan person...
41     story time i almost made my mom amp i late for...
43     malibulox pra mim não aparece sigam anapaulare...
44     210002ws2080 wind225°at00kmh gust00kmh t176°cd...
45     sentimientos encontrados respecto quedo 3 cosa...
47     emelecmax fefecuador pueden esperar hijoputa n...
48     varios días observado sesgo fujimorista optand...
                             ...                        
946            increíble cantidad bobadas twiteaba ex😂😂😂
947             conozco hace poquito tomé cariño solcito
948    vale verga gente alguien importa convierto pri...
949    gracias queridas amigas rcreadores romanticism...
950                                  heyjeans buen viaje
951    sirenita 💕🐚🐠 sirena famosa toda posando nosotr...
952    diariocorreo do hermanitos hermanita menor car...
954    jhueb bradgehlosu troncarternlu exactly right ...
955    acordo perfeito httpstcogcgczwtd4z belissima m...
959    gabrielitaaa10 rosamariabartra alvarosarco mil...
960          tu date cuenta vale va salir toda si quiero
961     meyastos salvajedigital pánico sabe cochinaditas
962    repost mheremer ・・・ name something hot and wet...
964    larryportera idlr puedes adelantar poquito fiz...
965    tiowalo9 labandadel86 edad colombia estan viaj...
971    kelitagrand2 xileone si pues “tipa” q veas exc...
974    acordo perfeito httpstcogcgczwtd4z maisvoce ma...
977                    fernandarrocha toda hora nem xuxa
978    tubinocarlos ja cómo orinas gente amoral digni...
980             dando argumentos solicite asilo político
982    ayacucho casi lleva 3 pto huancayo empató 33 v...
985    quem diria 20102018 sérgio lobo sessentou ning...
987    tiowalo9 labandadel86 tener miedo poner jugado...
988    convocape milagrosleivag mulderrctm aún ustede...
989    noticiastvperu richardacunan ustedes sarta cor...
990                     gente do céu cuzco é muito longe
993    xileone luzsalgador tvperupe imagínate q indig...
994    abrazo inmenso aquí cielo ¡feliz cumpleaños ab...
996    feliz siento orgullosa pertenecer gran familia...
997    padre caraquista pedro quiere magallanero pped...
Name: Tweets_clean, Length: 523, dtype: object
[67]
tweets_df['Tweets_clean'] = tweets_df['Tweets_clean'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
print(tweets_df.shape)
tweets_df['Tweets_clean'].head()
(523, 12)
1                       exitosape julianaoxenford dice
3    ultima noche semanademayordomia iasdesperanza ...
6    cooking with fire in peru today cookingclass s...
7    show música vivo tecnópolis cierra temporada 2...
8                    maldita perra perdón alteré 💁🏻‍♀️
Name: Tweets_clean, dtype: object
[68]
# displacy.serve(doc, style="dep")
[74]
doc = nlp(tweets_df['Tweets_clean'][1])
from IPython.display import HTML, Image, display
displacy.render(doc, style="dep",jupyter=True,options={'distance':100})
[dependency-parse render: exitosape/NOUN, julianaoxenford/ADJ, dice/VERB, with nsubj and amod arcs]
[75]
displacy.render(doc, style="ent",jupyter=True,)
[entity render of 'exitosape julianaoxenford dice': no entities highlighted]
[69]
TextBlob(tweets_df['Tweets_clean'][1]).ngrams(2)
[WordList(['exitosape', 'julianaoxenford']),
 WordList(['julianaoxenford', 'dice'])]
[70]
tf1 = (tweets_df['Tweets_clean']).apply(lambda x: pd.value_counts(x.split(" "))).sum(axis = 0).reset_index()
tf1.columns = ['words','tf']
for i,word in enumerate(tf1['words']):
    # regex=False: some tokens contain regex metacharacters
    tf1.loc[i, 'idf'] = np.log(tweets_df.shape[0]/(len(tweets_df[tweets_df['Tweets_clean'].str.contains(word, regex=False)])))

tf1['tfidf'] = tf1['tf'] * tf1['idf']
print(tf1.shape)
tf1.head(10)
(3630, 4)
words tf idf tfidf
0 dice 7.0 3.774675 26.422724
1 julianaoxenford 1.0 6.259581 6.259581
2 exitosape 2.0 5.566434 11.132869
3 httpstcogmo19ienh6 1.0 6.259581 6.259581
4 iasdesperanza 1.0 6.259581 6.259581
5 l… 1.0 3.551531 3.551531
6 fiel 1.0 6.259581 6.259581
7 noche 3.0 4.873287 14.619861
8 semanademayordomia 1.0 6.259581 6.259581
9 dios 3.0 4.650144 13.950431
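Note the hand-rolled idf above is ln(N/df); sklearn's TfidfVectorizer used later defaults to the smoothed variant ln((1+N)/(1+df)) + 1 and then L2-normalizes each row, so absolute values won't match. A worked comparison with hypothetical counts:

N, df = 523, 12  # hypothetical document counts for illustration
idf_plain = np.log(N / df)                    # what the loop above computes
idf_smooth = np.log((1 + N) / (1 + df)) + 1   # TfidfVectorizer default (smooth_idf=True)
print(idf_plain, idf_smooth)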
[71]
stop = set(stopwords.words('spanish'))
stop |= set(['lima','si','ser'])
[72]
tfidf = TfidfVectorizer(max_features=1000, lowercase=True, analyzer='word', stop_words= stop,ngram_range=(1,1))
train_vect = tfidf.fit_transform(tweets_df['Tweets_clean'])
train_vect
<523x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 2517 stored elements in Compressed Sparse Row format>
[73]
from sklearn.feature_extraction.text import CountVectorizer
bow = CountVectorizer(max_features=1000, lowercase=True, ngram_range=(1,1),analyzer = "word")
train_bow = bow.fit_transform(tweets_df['Tweets_clean'])
print(train_bow.shape)
train_bow
(523, 1000)
<523x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 2618 stored elements in Compressed Sparse Row format>
[74]
from sklearn.metrics.pairwise import linear_kernel

def find_similar(tfidf_matrix, index, top_n = 5):
    cosine_similarities = linear_kernel(tfidf_matrix[index:index+1], tfidf_matrix).flatten()
    related_docs_indices = [i for i in cosine_similarities.argsort()[::-1] if i != index]
    return [(doc_index, cosine_similarities[doc_index]) for doc_index in related_docs_indices][:top_n]
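Because TfidfVectorizer L2-normalizes rows by default (norm='l2'), linear_kernel here is exactly cosine similarity; a quick check (a sketch):

from sklearn.metrics.pairwise import cosine_similarity

assert np.allclose(linear_kernel(train_vect[:5], train_vect[:5]),
                   cosine_similarity(train_vect[:5], train_vect[:5]))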
[81]
tweet = tweets_df.sample(1)
tweet
date followers location Tweets word_count char_count avg_word hastags numerics Tweets_clean ACTIONS NOUNS
542 2018-10-20 02:32:30 159 Brasil @RogerioVilela @pauloap O problema é, quantos serão mortos??? 7 53 6.714286 0 0 rogeriovilela pauloap problema é quantos serão mortos serão problema, quantos
[82]
print(tweet['Tweets'].values)
['@RogerioVilela @pauloap O problema é, quantos serão mortos???']
[87]
# keep the positional index of the sampled tweet within tweets_df / train_vect
tweet_pos = tweets_df.index.get_loc(tweet.index[0])
[89]
pd.options.display.max_colwidth = 120
vals = pd.DataFrame()
for index, score in find_similar(train_vect, tweet_pos, top_n=5):
    # attach the similarity score to the matched row itself
    vals = vals.append(tweets_df.iloc[index:index+1].assign(score=score))

vals[['Tweets','score']].head()
Tweets score
845 @sigridbazan Se agranda mi admiración Sigrid. Lo dices porque lo dices. NaN
440 NaN 0.565055
452 @exitosape \nNo sé qué es lo que sucede en #Exitosa que invitan a adefesios como ese seudoanalista politico de apell... NaN
234 NaN 0.535451
900 Para mí aquí siempre dice "MI ALMA" NaN
[90]
corpus = nlp('\n'.join(tweets_df['NOUNS'].dropna()))
[91]
visited = {}
nouns = []
for word in corpus:
    if word.pos_.startswith('N') and len(word.string) < 15 and len(word.string) > 2:
        token = word.string.strip().lower()
        if token in visited:
            visited[token] += 1
            continue
        else:
            visited[token] = 1
            nouns.append(word)
nouns = sorted(nouns, key=lambda w: -visited[w.string.strip().lower()])[:150]
pd.DataFrame([[w.text, visited[w.string.strip().lower()]] for w in nouns], columns=['Noun', 'Freq'])
Noun Freq
0 canaln 13
1 pra 6
2 gente 6
3 amor 6
4 patas 6
5 casa 5
6 franpetrozzi 5
7 juez 5
8 seguridad 4
9 caso 4
10 mama 4
11 cosas 4
12 país 4
13 día 4
14 vizcarra 4
15 persona 4
16 procesion 4
17 pueblo 3
18 señor 3
19 hora 3
20 tema 3
21 fútbol 3
22 fuerza 3
23 gracias 3
24 from 3
25 equipos 3
26 años 3
27 veces 3
28 presidente 3
29 milibrujita 3
... ... ...
120 tratamiento 1
121 tumor 1
122 tatahcomenta 1
123 reacción 1
124 voz 1
125 perdoar 1
126 tempo 1
127 minhas 1
128 orejas 1
129 rodríguez 1
130 personajes 1
131 made 1
132 they 1
133 sentimientos 1
134 narradores 1
135 emelecmax 1
136 ghibellini 1
137 linares 1
138 chantaje 1
139 momentos 1
140 derecho 1
141 scattered 1
142 elcomerciocom 1
143 panoramaptv 1
144 niñito 1
145 página 1
146 video 1
147 prisión 1
148 verdad 1
149 josé 1

150 rows × 2 columns

[92]
def plot_with_labels(low_dim_embs, labels, filename='snapshot/lima_words_TSNE.png'):
    assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    plt.figure(figsize=(18, 18))  # in inches
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y, s=2.0)
        plt.annotate(label, xy=(x, y), xytext=(5, 2),
                     textcoords='offset points', ha='right', va='bottom')
    plt.tight_layout()
    plt.savefig(filename)
    plt.show()

# Creating the tsne plot [Warning: will take time]
tsne = TSNE(perplexity=50.0, n_components=2, init='pca', n_iter=10000)

low_dim_embedding = tsne.fit_transform(np.array([word.vector for word in nouns]))

# Finally plotting and saving the fig 
plot_with_labels(low_dim_embedding, [word.text for word in nouns])
[93]
tweets_df = pd.read_csv('tweets_solutions.csv')
tweets_df.head(1)
date followers location Tweets word_count char_count avg_word hastags numerics Tweets_clean ACTIONS NOUNS
0 2018-10-20 01:57:03 92 Lima @elpanfletope @AlanGarciaPeru chancho no come chancho 5 48 8.8 0 0 elpanfletope alangarciaperu chancho come chancho NaN elpanfletope
[94]
hashtag_summary = adv.extract_hashtags(tweets_df['Tweets'])
hashtag_summary.keys()
dict_keys(['hashtags', 'hashtags_flat', 'hashtag_counts', 'hashtag_freq', 'top_hashtags', 'overview'])
[95]
hashtag_summary['overview']
{'num_posts': 1000,
 'num_hashtags': 344,
 'hashtags_per_post': 0.344,
 'unique_hashtags': 238}
[96]
hashtag_summary['hashtags'][:20]
[[],
 [],
 [],
 ['#semanademayordomia'],
 [],
 [],
 ['#cookingclass', '#seerundo', '#lima', '#peru', '#southamerica'],
 [],
 [],
 [],
 ['#eslae'],
 [],
 ['#comprometidosconlaverdad', '#forensesec'],
 [],
 [],
 [],
 [],
 [],
 [],
 []]
[97]
hashtag_summary['hashtag_counts'][:20]
[0, 0, 0, 1, 0, 0, 5, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0]
[98]
hashtag_summary['hashtag_freq'][:20]
[(0, 841),
 (1, 93),
 (2, 30),
 (3, 7),
 (4, 6),
 (5, 5),
 (6, 10),
 (7, 5),
 (8, 1),
 (9, 2)]
[101]
plt.figure(facecolor='#ebebeb', figsize=(11, 8))
plt.bar([x[0] for x in hashtag_summary['hashtag_freq'][:15]],
        [x[1] for x in hashtag_summary['hashtag_freq'][:15]])
plt.title('Hashtag frequency')
plt.xlabel('Hashtags per tweet')
plt.ylabel('Number of tweets')
plt.yscale('log')
plt.grid(alpha=0.5)
plt.gca().set_frame_on(False)
[102]
hashtag_summary['top_hashtags'][:10]
[('#lima', 9),
 ('#mulderrctm', 9),
 ('#22díasparaveracaché', 6),
 ('#cristomoreno', 6),
 ('#señordelosmilagros', 6),
 ('#elorigendelorigen', 5),
 ('#esviernesyyonecesito', 5),
 ('#mesmorado', 5),
 ('#turrondedoñapepa', 5),
 ('#anticuchos', 5)]
[103]
plt.figure(facecolor='#ebebeb', figsize=(8, 12))
plt.barh([x[0] for x in hashtag_summary['top_hashtags'][2:][:5]][::-1],
         [x[1] for x in hashtag_summary['top_hashtags'][2:][:5]][::-1])
plt.title('Top Hashtags')
# plt.xticks(range(3))
plt.grid(alpha=0.5)
plt.gca().set_frame_on(False)
[104]
emoji_summary = adv.extract_emoji(tweets_df['Tweets'])
emoji_summary.keys()
dict_keys(['emoji', 'emoji_text', 'emoji_flat', 'emoji_flat_text', 'emoji_counts', 'emoji_freq', 'top_emoji', 'top_emoji_text', 'overview'])
[105]
emoji_summary['overview']
{'num_posts': 1000,
 'num_emoji': 511,
 'emoji_per_post': 0.511,
 'unique_emoji': 132}
[106]
emoji_summary['emoji'][50:80]
[['📻', '📣'],
 [],
 [],
 [],
 [],
 [],
 ['🤔'],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['😦'],
 [],
 [],
 [],
 [],
 [],
 [],
 ['😀'],
 [],
 [],
 [],
 []]
[107]
emoji_summary['emoji_text'][50:80]
[['radio', 'megaphone'],
 [],
 [],
 [],
 [],
 [],
 ['thinking face'],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['frowning face with open mouth'],
 [],
 [],
 [],
 [],
 [],
 [],
 ['grinning face'],
 [],
 [],
 [],
 []]
[108]
emoji_summary['emoji_flat'][:10]
['💁🏻\u200d♀️', '🇪🇨', '🇪🇨', '🇪🇨', '🇪🇨', '🤔', '😆', '😆', '😆', '😆']
[109]
emoji_summary['emoji_flat_text'][:10]
['woman tipping hand light skin tone',
 'Ecuador',
 'Ecuador',
 'Ecuador',
 'Ecuador',
 'thinking face',
 'grinning squinting face',
 'grinning squinting face',
 'grinning squinting face',
 'grinning squinting face']
[110]
list(zip(emoji_summary['emoji_flat'][:10], emoji_summary['emoji_flat_text'][:10]))
[('💁🏻\u200d♀️', 'woman tipping hand light skin tone'),
 ('🇪🇨', 'Ecuador'),
 ('🇪🇨', 'Ecuador'),
 ('🇪🇨', 'Ecuador'),
 ('🇪🇨', 'Ecuador'),
 ('🤔', 'thinking face'),
 ('😆', 'grinning squinting face'),
 ('😆', 'grinning squinting face'),
 ('😆', 'grinning squinting face'),
 ('😆', 'grinning squinting face')]
[111]
emoji_summary['emoji_counts'][:15]
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 4, 0, 0, 0, 1]
[112]
emoji_summary['emoji_freq'][:15]
[(0, 808),
 (1, 108),
 (2, 27),
 (3, 23),
 (4, 11),
 (5, 9),
 (6, 2),
 (8, 1),
 (9, 2),
 (10, 1),
 (11, 2),
 (13, 1),
 (15, 1),
 (18, 1),
 (23, 1)]
[115]
plt.figure(facecolor='#ebebeb', figsize=(8, 8))
plt.bar([x[0] for x in emoji_summary['emoji_freq'][:15]],
        [x[1] for x in emoji_summary['emoji_freq'][:15]])
plt.title('Emoji frequency')
plt.xlabel('Emoji per tweet')
plt.ylabel('Number of tweets')
plt.yscale('log')
plt.grid(alpha=0.5)
plt.gca().set_frame_on(False)
[116]
emoji_summary['top_emoji'][:8]
[('😭', 39),
 ('😂', 37),
 ('🤣', 32),
 ('❤', 21),
 ('♥', 20),
 ('😍', 17),
 ('🤤', 14),
 ('😻', 14)]
[117]
emoji_summary['top_emoji_text'][:8]
[('loudly crying face', 39),
 ('face with tears of joy', 37),
 ('rolling on the floor laughing', 32),
 ('red heart', 21),
 ('heart suit', 20),
 ('smiling face with heart-eyes', 17),
 ('drooling face', 14),
 ('smiling cat face with heart-eyes', 14)]
[118]
plt.figure(facecolor='#ebebeb', figsize=(8, 8))
plt.barh([x[0] for x in emoji_summary['top_emoji_text'][:8]][::-1],
         [x[1] for x in emoji_summary['top_emoji_text'][:8]][::-1])
plt.title('Top Emoji')
plt.grid(alpha=0.5)
plt.gca().set_frame_on(False)
[119]
mention_summary = adv.extract_mentions(tweets_df.Tweets)
mention_summary.keys()
dict_keys(['mentions', 'mentions_flat', 'mention_counts', 'mention_freq', 'top_mentions', 'overview'])
[120]
mention_summary['overview']
{'num_posts': 1000,
 'num_mentions': 886,
 'mentions_per_post': 0.886,
 'unique_mentions': 503}
[121]
mention_summary['mentions'][:15]
[['@elpanfletope', '@alangarciaperu'],
 ['@exitosape', '@julianaoxenford'],
 ['@anajuliachs'],
 ['@iasdesperanza'],
 ['@fernando_roman1'],
 [],
 [],
 [],
 [],
 ['@mininterperu', '@pcmperu', '@vizcarrhagan'],
 [],
 ['@sasha71396634',
  '@ernesto_jx',
  '@hninurta',
  '@gabospeed94',
  '@karla_ugaz',
  '@vero_mendoza_f'],
 [],
 [],
 []]
[122]
mention_summary['mentions_flat'][:10]
['@elpanfletope',
 '@alangarciaperu',
 '@exitosape',
 '@julianaoxenford',
 '@anajuliachs',
 '@iasdesperanza',
 '@fernando_roman1',
 '@mininterperu',
 '@pcmperu',
 '@vizcarrhagan']
[123]
mention_summary['mention_counts'][:20]
[2, 2, 1, 1, 1, 0, 0, 0, 0, 3, 0, 6, 0, 0, 0, 1, 1, 1, 0, 0]
[124]
mention_summary['mention_freq'][:15]
[(0, 484),
 (1, 341),
 (2, 105),
 (3, 29),
 (4, 11),
 (5, 6),
 (6, 4),
 (7, 10),
 (8, 10)]
[131]
plt.figure(facecolor='#ebebeb', figsize=(8, 8))
plt.bar([x[0] for x in mention_summary['mention_freq'][:15]],
        [x[1] for x in mention_summary['mention_freq'][:15]])
plt.title('Mention frequency')
plt.xlabel('Mention per tweet')
plt.ylabel('Number of tweets')
plt.grid(alpha=0.5)
plt.yscale('log')
plt.gca().set_frame_on(False)
plt.savefig('snapshot/Mention Frequency.png');
[132]
mention_summary['top_mentions'][:10]
[('@canaln_', 31),
 ('@franpetrozzi', 11),
 ('@idl_r', 10),
 ('@rppnoticias', 10),
 ('@rosamariabartra', 9),
 ('@keikofujimori', 8),
 ('@ximena_casanova', 7),
 ('@policiaperu', 7),
 ('@milagrosleivag', 7),
 ('@abbu25', 7)]
[137]
plt.figure(facecolor='#ebebeb', figsize=(8, 8))
plt.barh([x[0] for x in mention_summary['top_mentions'][:15]][::-1],
         [x[1] for x in mention_summary['top_mentions'][:15]][::-1])
plt.title('Top Mentions')
plt.grid(alpha=0.5)
plt.gca().set_frame_on(False)
plt.savefig('snapshot/Top Mentions.png');
[138]
tweets_df.columns
Index(['date', 'followers', 'location', 'Tweets', 'word_count', 'char_count',
       'avg_word', 'hastags', 'numerics', 'Tweets_clean', 'ACTIONS', 'NOUNS'],
      dtype='object')
[139]
extracted_tweets =  (tweets_df[['Tweets', 'followers']]
 .assign(hashtags=hashtag_summary['hashtags'],
         hashcounts=hashtag_summary['hashtag_counts'],
         mentions=mention_summary['mentions'],
         mention_count=mention_summary['mention_counts'],
         emoji=emoji_summary['emoji'],
         emoji_text=emoji_summary['emoji_text'],
         emoji_count=emoji_summary['emoji_counts'],))
extracted_tweets.head()
Tweets followers hashtags hashcounts mentions mention_count emoji emoji_text emoji_count
0 @elpanfletope @AlanGarciaPeru chancho no come chancho 92 [] 0 [@elpanfletope, @alangarciaperu] 2 [] [] 0
1 @exitosape @JulianaOxenford A DONDE DICE ... 32 [] 0 [@exitosape, @julianaoxenford] 2 [] [] 0
2 @anajuliachs kkkk 1 mês 195 [] 0 [@anajuliachs] 1 [] [] 0
3 Ultima noche de #SemanaDeMayordomia en la @IASDEsperanza, aprendiendo sobre la importancia de ser fiel a Dios con l…... 228 [#semanademayordomia] 1 [@iasdesperanza] 1 [] [] 0
4 @fernando_roman1 Jajaja 123 [] 0 [@fernando_roman1] 1 [] [] 0
[140]
extracted_tweets.columns
Index(['Tweets', 'followers', 'hashtags', 'hashcounts', 'mentions',
       'mention_count', 'emoji', 'emoji_text', 'emoji_count'],
      dtype='object')
[141]
word_freq_hash = adv.word_frequency(extracted_tweets['hashtags'].str.join(' '), 
                                    extracted_tweets['followers'].fillna(0))#.sort_values(['abs_freq'], ascending=False).head(20)
word_freq_hash.head(10)
word abs_freq wtd_freq rel_value
0 #lima 9 430823 47869.0
1 #drogas 2 295632 147816.0
2 #microcomercializar 2 295632 147816.0
3 #chorrillos 1 147819 147819.0
4 #esviernesyyonecesito 5 147727 29545.0
5 #renuncia 1 107446 107446.0
6 #pérez 1 107446 107446.0
7 #tablitasexcel 2 92850 46425.0
8 #22díasparaveracaché 6 40669 6778.0
9 #burnthestagethemovieinvzla 1 39846 39846.0
[142]
extracted_tweets[extracted_tweets['hashtags'].str.join(' ')
                  .str.contains('lima',case=False)]
Tweets followers hashtags hashcounts mentions mention_count emoji emoji_text emoji_count
6 Cooking with fire in Peru today #cookingclass #seerundo #lima #peru #southamerica @ Lima, Peru https://t.co/llmzIMjMUo 13 [#cookingclass, #seerundo, #lima, #peru, #southamerica] 5 [] 0 [] [] 0
23 Shopping for our cooking class today #mercado #seerundo #lima #peru #southamerica @ Mercado N#1 de Surquillo https:/... 13 [#mercado, #seerundo, #lima, #peru, #southamerica, #1] 6 [] 0 [] [] 0
252 Feels good to be back home 🇵🇪 #peru #larcomar #lima @ Lima, Peru https://t.co/agFso0gRcs 247 [#peru, #larcomar, #lima] 3 [] 0 [🇵🇪] [Peru] 1
330 #esviernesyyonecesito es ahora una tendencia en #Lima\n\nhttps://t.co/UOmWB9sTSw https://t.co/oZBAUS7GPe 107446 [#esviernesyyonecesito, #lima] 2 [] 0 [] [] 0
389 By @mariotestino 🙌🏼 \n.\n.\n.\n.\n.\n.\n.\n.\n#igersperu #lima #museum #musee #peru #mariotestino #photography #mode... 213 [#igersperu, #lima, #museum, #musee, #peru, #mariotestino, #photography, #mode] 8 [@mariotestino] 1 [🙌🏼] [raising hands medium-light skin tone] 1
512 #pérez es ahora una tendencia en #Lima\n\nhttps://t.co/CXgZJw73Ty https://t.co/a7wfyKchQU 107446 [#pérez, #lima] 2 [] 0 [] [] 0
653 Pequeña\n#traveler #explorer #flower #lima #pic #nice #photograpy https://t.co/82r1wgHu3w 553 [#traveler, #explorer, #flower, #lima, #pic, #nice, #photograpy] 7 [] 0 [] [] 0
749 #renuncia es ahora una tendencia en #Lima\n\nhttps://t.co/DhxjeOKeix https://t.co/61jYXcNSwk 107446 [#renuncia, #lima] 2 [] 0 [] [] 0
821 'cerradura', 'desconocidos' y 'amedrentamiento' es ahora una tendencia en #Lima\n\nhttps://t.co/zpEunuSxrb https://t... 107446 [#lima] 1 [] 0 [] [] 0
[143]
word_freq_mention = adv.word_frequency(extracted_tweets['mentions'].str.join(' '), 
                                       extracted_tweets['followers'].fillna(0))
                                    #.sort_values(['abs_freq'], ascending=False).head(20)
word_freq_mention.head(10)
word abs_freq wtd_freq rel_value
0 @gissellereyes 1 214460 214460.0
1 @ucatolicaec 3 52617 17539.0
2 @macara_oficial 1 46425 46425.0
3 @dcm_online 1 44161 44161.0
4 @brasil247 1 44161 44161.0
5 @terranoticiasbr 1 44160 44160.0
6 @canaln_ 31 31715 1023.0
7 @jacquelinabravo 1 31299 31299.0
8 @alokadalis 2 27305 13652.0
9 @colrichardkemp 1 23611 23611.0
[144]
word_freq_emoji = adv.word_frequency(extracted_tweets['emoji'].str.join(' '), 
                                       extracted_tweets['followers'].fillna(0))#.sort_values(['abs_freq'], ascending=False).head(20)
word_freq_emoji.head(10)
word abs_freq wtd_freq rel_value
0 😭 39 434614 11144.0
1 🖤 1 377740 377740.0
2 🤔 7 304059 43437.0
3 📣 3 295880 98627.0
4 🚔 2 295632 147816.0
5 👮🏿‍♀️ 2 295632 147816.0
6 👮🏿‍♂️ 2 295632 147816.0
7 👉 7 221454 31636.0
8 😝 2 214546 107273.0
9 🇪🇨 4 153264 38316.0
[145]
[adv.emoji_dict.emoji_dict[k] for k in word_freq_emoji['word'][:10]]
[':loudly_crying_face:',
 ':black_heart:',
 ':thinking_face:',
 ':megaphone:',
 ':oncoming_police_car:',
 ':woman_police_officer_dark_skin_tone:',
 ':man_police_officer_dark_skin_tone:',
 ':backhand_index_pointing_right:',
 ':squinting_face_with_tongue:',
 ':Ecuador:']
[146]
word_freq_emoji[:10].assign(emoji_text=[adv.emoji_dict.emoji_dict[k] for k in word_freq_emoji['word'][:10]])
word abs_freq wtd_freq rel_value emoji_text
0 😭 39 434614 11144.0 :loudly_crying_face:
1 🖤 1 377740 377740.0 :black_heart:
2 🤔 7 304059 43437.0 :thinking_face:
3 📣 3 295880 98627.0 :megaphone:
4 🚔 2 295632 147816.0 :oncoming_police_car:
5 👮🏿‍♀️ 2 295632 147816.0 :woman_police_officer_dark_skin_tone:
6 👮🏿‍♂️ 2 295632 147816.0 :man_police_officer_dark_skin_tone:
7 👉 7 221454 31636.0 :backhand_index_pointing_right:
8 😝 2 214546 107273.0 :squinting_face_with_tongue:
9 🇪🇨 4 153264 38316.0 :Ecuador:
[147]
sotu_retweets = np.load('tweets_dict.npy', allow_pickle=True)  # object array of status dicts
[152]
def buildDataFrameFromDict(mapping):
    """Flatten one level of nested dicts into 'outer-inner' columns."""
    rows = []
    for record in mapping:
        flat = {}
        for key, item in record.items():
            if isinstance(item, dict):
                for inner_key, value in item.items():
                    flat[key + '-' + inner_key] = value
            else:
                flat[key] = item
        rows.append(flat)
    return pd.DataFrame(rows)

sotu = buildDataFrameFromDict(sotu_retweets)
sotu.head()
contributors coordinates coordinates-coordinates coordinates-type created_at display_text_range entities-hashtags entities-media entities-symbols entities-urls ... user-profile_text_color user-profile_use_background_image user-protected user-screen_name user-statuses_count user-time_zone user-translator_type user-url user-utc_offset user-verified
0 None NaN NaN NaN Sat Oct 20 01:57:03 +0000 2018 [30, 53] [] NaN [] [] ... 333333 True False geriitx 293 None none https://m.facebook.com/geriita.ab?__user=1571199766 None False
1 None NaN NaN NaN Sat Oct 20 01:57:10 +0000 2018 [28, 44] [] NaN [] [] ... 333333 True False MurgaRodolfo 256 None none None None False
2 None NaN NaN NaN Sat Oct 20 01:57:12 +0000 2018 [13, 23] [] NaN [] [] ... 333333 True False GusRodrigues4 752 None none None None False
3 None NaN NaN NaN Sat Oct 20 01:57:17 +0000 2018 [0, 140] [{'text': 'SemanaDeMayordomia', 'indices': [16, 35]}] NaN [] [{'url': 'https://t.co/gMo19IEnh6', 'expanded_url': 'https://twitter.com/i/web/status/1053465144875991045', 'display... ... 333333 True False IASDEsperanza 698 None none None None False
4 None NaN NaN NaN Sat Oct 20 01:57:18 +0000 2018 [17, 23] [] NaN [] [] ... 333333 True False mcthuglife666 1168 None none None None False

5 rows × 125 columns
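pandas can flatten the same nested dicts with its built-in json_normalize (a sketch; it recurses fully and joins keys with '.' rather than the '-' used above):

from pandas.io.json import json_normalize

sotu_alt = json_normalize(list(sotu_retweets))
print(sotu_alt.shape)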

[156]
for c in sotu.columns:
    print(c)
contributors coordinates coordinates-coordinates coordinates-type created_at display_text_range entities-hashtags entities-media entities-symbols entities-urls entities-user_mentions extended_entities-media extended_tweet-display_text_range extended_tweet-entities extended_tweet-extended_entities extended_tweet-full_text favorite_count favorited filter_level geo geo-coordinates geo-type id id_str in_reply_to_screen_name in_reply_to_status_id in_reply_to_status_id_str in_reply_to_user_id in_reply_to_user_id_str is_quote_status lang place-attributes place-bounding_box place-country place-country_code place-full_name place-id place-name place-place_type place-url possibly_sensitive quote_count quoted_status-contributors quoted_status-coordinates quoted_status-created_at quoted_status-display_text_range quoted_status-entities quoted_status-extended_entities quoted_status-extended_tweet quoted_status-favorite_count quoted_status-favorited quoted_status-filter_level quoted_status-geo quoted_status-id quoted_status-id_str quoted_status-in_reply_to_screen_name quoted_status-in_reply_to_status_id quoted_status-in_reply_to_status_id_str quoted_status-in_reply_to_user_id quoted_status-in_reply_to_user_id_str quoted_status-is_quote_status quoted_status-lang quoted_status-place quoted_status-possibly_sensitive quoted_status-quote_count quoted_status-quoted_status_id quoted_status-quoted_status_id_str quoted_status-reply_count quoted_status-retweet_count quoted_status-retweeted quoted_status-source quoted_status-text quoted_status-truncated quoted_status-user quoted_status_id quoted_status_id_str quoted_status_permalink-display quoted_status_permalink-expanded quoted_status_permalink-url reply_count retweet_count retweeted source text timestamp_ms truncated user-contributors_enabled user-created_at user-default_profile user-default_profile_image user-description user-favourites_count user-follow_request_sent user-followers_count user-following user-friends_count user-geo_enabled user-id user-id_str user-is_translator user-lang user-listed_count user-location user-name user-notifications user-profile_background_color user-profile_background_image_url user-profile_background_image_url_https user-profile_background_tile user-profile_banner_url user-profile_image_url user-profile_image_url_https user-profile_link_color user-profile_sidebar_border_color user-profile_sidebar_fill_color user-profile_text_color user-profile_use_background_image user-protected user-screen_name user-statuses_count user-time_zone user-translator_type user-url user-utc_offset user-verified
[159]
# the flattened frame has no retweeted_status columns, so simulate retweet
# sources by assigning each tweet a random screen name from a 10-user sample
data = sotu.sample(10)['user-screen_name']
sotu['retweeted_status-user-screen_name'] = np.random.choice(data, len(sotu))
[162]
sotu[['user-screen_name','retweeted_status-user-screen_name']].head()
user-screen_name retweeted_status-user-screen_name
0 geriitx hhbacigalupo
1 MurgaRodolfo hhbacigalupo
2 GusRodrigues4 mpauta
3 IASDEsperanza hhbacigalupo
4 mcthuglife666 casidi13243
[161]
G_rt = nx.from_pandas_edgelist(
    sotu,
    source = 'user-screen_name', 
    target = 'retweeted_status-user-screen_name',
    create_using = nx.DiGraph())
    
print('Nodes in RT network:', len(G_rt.nodes()))
print('Edges in RT network:', len(G_rt.edges()))
Nodes in RT network: 505
Edges in RT network: 855
[166]
G_reply = nx.from_pandas_edgelist(
    sotu,
    source = 'user-screen_name', 
    target = 'in_reply_to_screen_name',
    create_using = nx.DiGraph())
    
print('Nodes in reply network:', len(G_reply.nodes()))

print('Edges in reply network:', len(G_reply.edges()))
Nodes in reply network: 796
Edges in reply network: 708
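A couple of quick structural summaries of the retweet graph (a sketch):

print('RT network density:', nx.density(G_rt))
print('RT weakly connected components:', nx.number_weakly_connected_components(G_rt))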
[177]
pos = nx.random_layout(G_rt)

sizes = [x[1] for x in G_rt.degree()]

nx.draw_networkx(G_rt, pos, 
    with_labels = False, 
    node_size = sizes,
    width = 0.1, alpha = 0.7,
    arrowsize = 2, linewidths = 0)

plt.axis('off')  # hide axes before saving so they don't appear in the PNG
plt.savefig('snapshot/lima_tweets_influencing_graph.png')
plt.show()
[168]
pos = nx.random_layout(G_reply)
sizes = [x[1] for x in G_reply.degree()]

nx.draw_networkx(G_reply, pos, 
    with_labels = False, 
    node_size = sizes,
    width = 0.1, alpha = 0.7,
    arrowsize = 2, linewidths = 0)

plt.axis('off'); plt.show()
[172]
column_names = ['screen_name', 'degree_centrality']
rt_centrality = nx.in_degree_centrality(G_rt)

reply_centrality = nx.in_degree_centrality(G_reply)

rt = pd.DataFrame(list(rt_centrality.items()), columns = column_names)
reply = pd.DataFrame(list(reply_centrality.items()), columns = column_names)

display(rt.sort_values('degree_centrality', ascending = False).head())

display(reply.sort_values('degree_centrality', ascending = False).head())
screen_name degree_centrality
1 hhbacigalupo 0.208333
12 NOTIELMOMENTO 0.178571
7 casidi13243 0.176587
4 mpauta 0.174603
17 Rockmanita 0.168651
screen_name degree_centrality
7 None 0.441509
105 canalN_ 0.015094
83 SolCn 0.005031
400 CesarNakazaki 0.005031
448 IDL_R 0.003774
[171]
column_names = ['screen_name', 'betweenness_centrality']
# Generate betweenness centrality for retweets 
rt_centrality = nx.betweenness_centrality(G_rt)

# Generate betweenness centrality for replies 
reply_centrality = nx.betweenness_centrality(G_reply)

# Store centralities in data frames
rt = pd.DataFrame(list(rt_centrality.items()), columns = column_names)
reply = pd.DataFrame(list(reply_centrality.items()), columns = column_names)

# Print first five results in descending order of centrality
display(rt.sort_values('betweenness_centrality', ascending = False).head())

# Print first five results in descending order of centrality
display(reply.sort_values('betweenness_centrality', ascending = False).head())
screen_name betweenness_centrality
12 NOTIELMOMENTO 0.004589
27 FedericoClemen8 0.004563
10 marioporlavida 0.003482
17 Rockmanita 0.003354
1 hhbacigalupo 0.003119
screen_name betweenness_centrality
413 mindreaux 0.000003
442 famasem 0.000003
405 Ldavidesc 0.000002
0 geriitx 0.000000
526 rolo18al 0.000000
[174]
column_names = ['screen_name', 'degree']

degree_rt = pd.DataFrame(list(G_rt.in_degree()), columns = column_names)
degree_reply = pd.DataFrame(list(G_reply.in_degree()), columns = column_names)

ratio = degree_rt.merge(degree_reply, on = 'screen_name', suffixes = ('_rt', '_reply'))

ratio['ratio'] = ratio['degree_reply'] / ratio['degree_rt']

ratio = ratio[ratio['degree_rt'] >= 5]

display(ratio.sort_values('ratio', ascending = False).head())
screen_name degree_rt degree_reply ratio
1 hhbacigalupo 105 0 0.0
4 mpauta 88 0 0.0
7 casidi13243 89 0 0.0
10 marioporlavida 82 0 0.0
12 NOTIELMOMENTO 90 0 0.0
[ ]