# birdspotter/BirdSpotter.py
"""BirdSpotter is a module which provides an influence and bot detection toolkit for Twitter."""

import json
import pandas as pd
import pickle as pk
import lzma
import numpy as np
from .utils import *
# from utils import *
import traceback
import collections.abc
from xgboost.sklearn import XGBClassifier
import xgboost as xgb
import os
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer


class BirdSpotter:
    """Influence and bot detection toolkit for Twitter dumps.

    This module takes a Twitter JSON dump and extracts bot and influence
    metrics for the users. It requires a labelled dataset of bots to do bot
    detection. It exposes processed data from the tweet dumps.
    """

    def __init__(self):
        self.word2vecEmbeddings = None

    def setWord2VecEmbeddings(self, embeddings=None, forceReload=True):
        """Sets the word2vec embeddings.

        Sets the word2vec embeddings if they haven't already been set, either
        through a python dict-like object or a path to a pickle or GloVe text
        file.

        Parameters
        ----------
        embeddings : dict or str
            Either a python mapping object or a path to a pickle or GloVe
            text file of the w2v embeddings.
        forceReload : boolean
            If True the module's embeddings are overridden; otherwise, if
            they already exist in the module, they aren't reloaded.
        """
        if not forceReload and self.word2vecEmbeddings is not None:
            return
        if embeddings is None:
            # Fall back to the word2vec embeddings bundled as an .xz archive.
            print("Loading Word2Vec Embeddings...")
            with lzma.open("word2vec.xz", "r") as f:
                self.word2vecEmbeddings = json.loads(f.read())
            print("Finished loading Word2Vec Embeddings")
        elif isinstance(embeddings, str):
            embeddingsPath = embeddings
            _, fileextension = os.path.splitext(embeddingsPath)
            if fileextension == '.pickle':
                print("Loading Word2Vec Embeddings...")
                with open(embeddingsPath, "rb") as f:
                    self.word2vecEmbeddings = pk.load(f)
                print("Finished loading Word2Vec Embeddings")
            elif fileextension == '.txt':
                print("Loading Word2Vec Embeddings...")
                with open(embeddingsPath, "r") as f:
                    # GloVe text format: one word per line, followed by the
                    # components of its vector.
                    model = {}
                    for line in f:
                        splitLine = line.split()
                        word = splitLine[0]
                        embedding = np.array([float(val) for val in splitLine[1:]])
                        model[word] = embedding
                    self.word2vecEmbeddings = model
                print("Finished loading Word2Vec Embeddings")
        elif isinstance(embeddings, collections.abc.Mapping):
            self.word2vecEmbeddings = embeddings

    def extractTweets(self, filePath, tweetLimit=None, embeddings=None):
        """Extracts tweets from a JSON dump into a pandas dataframe.

        Parameters
        ----------
        filePath : str
            The path to the JSON Twitter dump to be loaded.
        tweetLimit : int
            Sets a limit on the number of tweets read.
        embeddings : dict or str
            Either a python mapping object or a path to a pickle or GloVe
            text file of the w2v embeddings.

        Returns
        -------
        DataFrame
            A dataframe of the features for each user.
        """
        # Appending to a DataFrame line by line is inefficient, because it
        # generates a new dataframe each time. It is better to gather the
        # entire list and then concat.
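        # Illustrative sketch of that pattern (hypothetical rows, not part of
        # the pipeline): a single construction from a list of dicts,
        #     rows = [{'a': 1}, {'a': 2}]
        #     df = pd.DataFrame(rows)          # one allocation
        # rather than df = df.append(row) inside the loop, which copies the
        # whole frame on every iteration.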
        user_list = []
        tweet_list = []
        w2v_content_list = []
        w2v_description_list = []
        cascade_list = []
        with open(filePath) as f:
            for i, line in enumerate(f, 1):
                if tweetLimit is not None and tweetLimit < i:
                    break
                j = json.loads(line)
                try:
                    temp_user = {}
                    temp_tweet = {}
                    temp_content = {'status_text': j['text'], 'user_id': j['user']['id']}
                    temp_description = {'description': j['user']['description'], 'user_id': j['user']['id']}
                    temp_cascade = {}
                    if 'retweeted_status' in j:
                        temp_cascade['cascade_id'] = j['retweeted_status']['id']
                        temp_cascade['original_created_at'] = j['retweeted_status']['created_at']
                        temp_cascade['created_at'] = j['created_at']
                        temp_cascade['retweeted'] = True
                    else:
                        temp_cascade['cascade_id'] = j['id']
                        temp_cascade['original_created_at'] = j['created_at']
                        temp_cascade['created_at'] = j['created_at']
                        temp_cascade['retweeted'] = False
                    temp_cascade['follower_count'] = j['user']['followers_count']
                    temp_cascade['status_text'] = j['text']
                    temp_cascade['screen_name'] = j['user']['screen_name']
                    temp_cascade['user_id'] = j['user']['id']
                    temp_user['user_id'] = j['user']['id']
                    temp_tweet['user_id'] = j['user']['id']
                    temp_user.update(getTextFeatures('name', j['user']['name']))
                    temp_user.update(getTextFeatures('location', j['user']['location']))
                    temp_user.update(getTextFeatures('description', j['user']['description']))
                    for key in ['statuses_count', 'listed_count', 'friends_count', 'followers_count']:
                        temp_user[key] = j['user'][key]
                    temp_user['verified'] = 1 if j['user']['verified'] else 0
                    temp_user['ff_ratio'] = (temp_user['followers_count'] + 1) / (temp_user['followers_count'] + temp_user['friends_count'] + 1)
                    temp_user['years_on_twitter'] = (datetime.now() - datetime.strptime(j['user']['created_at'], '%a %b %d %H:%M:%S +0000 %Y')).days / 365
                    temp_user['statuses_rate'] = (temp_user['statuses_count'] + 1) / (temp_user['years_on_twitter'] + .001)
                    temp_user['tweets_to_followers'] = (temp_user['statuses_count'] + 1) / (temp_user['followers_count'] + 1)
                    temp_user['retweet_count'] = j['retweet_count']
                    temp_user['favorite_count'] = j['favorite_count']
                    temp_user['favourites_count'] = j['user']['favourites_count']
                    temp_tweet.update(getTextFeatures('status_text', j['text']))
                    # Count as an original tweet only if it is neither a retweet nor a quote.
                    temp_tweet['n_tweets'] = 0 if ('retweeted_status' in j or 'quoted_status_id' in j) else 1
                    temp_tweet['n_retweets'] = 1 if 'retweeted_status' in j else 0
                    temp_tweet['n_quotes'] = 1 if 'quoted_status_id' in j else 0
                    temp_tweet['n_timeofday'] = hourofweekday(j['created_at'])
                    temp_tweet.update(getSource(j['source']))
                    user_list.append(temp_user)
                    tweet_list.append(temp_tweet)
                    w2v_content_list.append(temp_content)
                    w2v_description_list.append(temp_description)
                    cascade_list.append(temp_cascade)
                except Exception as err:
                    traceback.print_tb(err.__traceback__)
        # We assume that user data doesn't change much; if it does, we take
        # the latest values as our features.
        userDataframe = pd.DataFrame(user_list).fillna(0).set_index('user_id')
        userDataframe = userDataframe[~userDataframe.index.duplicated(keep='last')]
        tweetDataframe = pd.DataFrame(tweet_list).fillna(0).set_index('user_id')
        n_retweets = tweetDataframe['n_retweets'].groupby('user_id').sum()
        n_quoted = tweetDataframe['n_quotes'].groupby('user_id').sum()
        tweetDataframe = tweetDataframe.groupby('user_id').mean()
        tweetDataframe['n_retweets'] = n_retweets
        tweetDataframe['n_quotes'] = n_quoted
        self.cascadeDataframe = pd.DataFrame(cascade_list).fillna(0)
        contentDataframe = pd.DataFrame(w2v_content_list).set_index('user_id')
        descriptionDataframe = pd.DataFrame(w2v_description_list).set_index('user_id')
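        # Each tweet repeats its author's profile, so descriptions are
        # de-duplicated per user below, keeping the latest row. A minimal
        # sketch of the idiom, with hypothetical data:
        #     s = pd.DataFrame({'d': ['old', 'new']},
        #                      index=pd.Index([7, 7], name='user_id'))
        #     s[~s.index.duplicated(keep='last')]   # keeps only the 'new' row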
        descriptionDataframe = descriptionDataframe[~descriptionDataframe.index.duplicated(keep='last')]
        # If embeddings is None then setWord2VecEmbeddings will automatically use the .xz file to supply embeddings.
        self.setWord2VecEmbeddings(embeddings, forceReload=False)
        w2vDataframe = self.__computeVectors(contentDataframe, descriptionDataframe)
        self.featureDataframe = userDataframe.join(tweetDataframe)
        if self.word2vecEmbeddings is not None:
            self.featureDataframe = self.featureDataframe.join(w2vDataframe)
        # Computes the features for all the hashtags. It is currently not protected from namespace errors.
        self.hashtagdf = self.__computeHashtagFeatures(contentDataframe)
        self.featureDataframe = self.featureDataframe.join(self.hashtagdf)
        return self.featureDataframe

    def getBotAnnotationTemplate(self, filename="annotationTemplate.csv"):
        """Writes a CSV with the list of users and a blank "isbot" column to be annotated.

        A helper function which outputs a CSV to be annotated by a human. The
        output is a list of users with a blank "isbot" column.

        Parameters
        ----------
        filename : str
            The name of the file to write the CSV to.

        Returns
        -------
        DataFrame
            A dataframe of the users, with their screen names and a blank
            "isbot" column.
        """
        csv_data = (self.cascadeDataframe.groupby(['screen_name', 'user_id'])
                    .apply(lambda d: '').reset_index(name='isbot'))
        csv_data.to_csv(filename)
        return csv_data

    def __computeHashtagFeatures(self, contentdf):
        """Computes the hashtag tfidf features as a dataframe"""
        hashtagSeries = contentdf['status_text'].str.findall(r'(?

# [The remainder of BirdSpotter.py -- including the rest of
# __computeHashtagFeatures, the __computeVectors method called above, and the
# XGBoost-based bot-detection methods implied by the imports -- along with the
# head of birdspotter/utils.py, is stored compressed in the wheel and is not
# recoverable. The dump resumes mid-way through a utils.py text helper, whose
# surviving tail reads:
#     |\?|@|\[|\\|\]|^|_|`|{|||}|~]""", x)
#     return words + digs + punc
# ]

# birdspotter/utils.py (recovered portion)

# Imports implied by the recovered functions below:
from datetime import datetime
import re
import string


def hourofweekday(datestring):
    """Hours elapsed since the start of the week (Monday 00:00)."""
    d = datetime.strptime(datestring, '%a %b %d %H:%M:%S +0000 %Y')
    return d.weekday() * 24 + d.hour + d.minute / 60 + d.second / 3600


def grep(sourcestring, pattern):
    return 1 if pattern.lower() in sourcestring.lower() else 0


def getSource(sourcestring):
    sources = [('google', 'google'), ('ifttt', 'IFTTT'), ('facebook', 'facebook'),
               ('ipad', 'for iPad'), ('lite', 'twitter lite'), ('hootsuite', 'hootsuite'),
               ('android', 'android'), ('webclient', 'web client'), ('iphone', 'iphone')]
    return {x: grep(sourcestring, y) for x, y in sources}


def getURLs(string):
    url = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', string)
    return url
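# Quick sanity checks for the helpers above (illustrative values only; the
# timestamp is a made-up example in Twitter's created_at format):
#     >>> hourofweekday('Mon Apr 06 15:24:15 +0000 2020')   # Monday 15:24:15
#     15.4042 (approx.)
#     >>> getSource('<a href="http://twitter.com/download/iphone">Twitter for iPhone</a>')['iphone']
#     1
#     >>> getURLs('see https://example.com for details')
#     ['https://example.com']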
# No idea what lowersp or capsp is, but assume it means percentage
def getTextFeatures(key, text):
    res = {}
    if text is not None:
        res[key+'_n_chars'] = len(text)
        res[key+'_n_commas'] = text.count(",")
        res[key+'_n_digits'] = sum([x.isdigit() for x in list(text)])
        res[key+'_n_exclaims'] = sum([x == '!' for x in list(text)])
        res[key+'_n_extraspaces'] = sum([x == ' ' for x in list(text)])
        res[key+'_n_hashtags'] = sum([x == '#' for x in list(text)])
        res[key+'_n_lowers'] = sum([x.islower() for x in list(text)])
        res[key+'_n_mentions'] = sum([x == '@' for x in list(text)])
        res[key+'_n_periods'] = sum([x == '.' for x in list(text)])
        if key != 'name':
            res[key+'_n_urls'] = len(getURLs(text))
        res[key+'_n_words'] = len(re.sub(r"[^\w]", " ", text).split())
        res[key+'_n_caps'] = sum([x.isupper() for x in list(text)])
        # Count characters outside the ASCII range.
        res[key+'_n_nonasciis'] = sum([ord(x) >= 128 for x in list(text)])
        res[key+'_n_puncts'] = sum([x in string.punctuation for x in list(text)])
        res[key+'_n_charsperword'] = (len(text) + 1) / (res[key+'_n_words'] + 1)
        res[key+'_n_lowersp'] = (res[key+'_n_lowers'] + 1) / (res[key+'_n_chars'] + 1)
        res[key+'_n_capsp'] = (res[key+'_n_caps'] + 1) / (res[key+'_n_chars'] + 1)
    return res
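# Example end-to-end usage of the toolkit (a minimal sketch; 'dump.json' and
# 'w2v.pickle' are hypothetical paths, and only the methods visible in this
# dump are exercised):
#
#     from birdspotter.BirdSpotter import BirdSpotter
#
#     bs = BirdSpotter()
#     features = bs.extractTweets('dump.json', tweetLimit=10000,
#                                 embeddings='w2v.pickle')
#     bs.getBotAnnotationTemplate('annotationTemplate.csv')  # CSV for human labelling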