# botspot/BotSpot.py
""" A module to find the botness of users from a twitter json dump. """

import json
import pandas as pd
import pickle as pk
import numpy as np
from datetime import datetime
from .utils import *
# from utils import *
from .featureLists import *
# from featureLists import *
import traceback
# collections.Mapping was removed in Python 3.10; use collections.abc.Mapping.
import collections.abc
from xgboost.sklearn import XGBClassifier
import xgboost as xgb
import os
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer


class BotSpot:

    def __init__(self):
        self.user_feature_list = user_feature_list
        self.tweet_feature_list = tweet_feature_list
        self.word2vecEmbeddings = None
        self.dataHashtags = None
        # These are populated by extractTweets, getBotness and the classifier
        # methods; initialise them here so the `is None` checks below cannot
        # raise AttributeError before they have been set.
        self.clf = None
        self.featureDataframe = None
        self.cascadeDataframe = None
        self.botnessDataframe = None

    def setWord2VecEmbeddings(self, embeddings=None, embeddingsPath=None, forceReload=True):
        if not forceReload and self.word2vecEmbeddings is not None:
            return
        if embeddings is not None and embeddingsPath is not None:
            raise Exception("Please only specify one source for the Word2Vec embeddings.")
        elif embeddings is not None and isinstance(embeddings, collections.abc.Mapping):
            self.word2vecEmbeddings = embeddings
        elif embeddingsPath is not None:
            _, fileextension = os.path.splitext(embeddingsPath)
            if fileextension == '.pickle':
                print("Loading Word2Vec Embeddings...")
                with open(embeddingsPath, "rb") as f:
                    self.word2vecEmbeddings = pk.load(f)
                print("Finished loading Word2Vec Embeddings")
            elif fileextension == '.txt':
                print("Loading Word2Vec Embeddings...")
                with open(embeddingsPath, "r") as f:
                    # Plain-text (GloVe-style) format: one word per line,
                    # followed by its vector components.
                    model = {}
                    for line in f:
                        splitLine = line.split()
                        word = splitLine[0]
                        embedding = np.array([float(val) for val in splitLine[1:]])
                        model[word] = embedding
                    self.word2vecEmbeddings = model
                print("Finished loading Word2Vec Embeddings")
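    # A minimal usage sketch for the loader above (the file name and vector
    # size are hypothetical; any pickled {word: vector} dict or GloVe-style
    # .txt file should work):
    #
    #   bs = BotSpot()
    #   bs.setWord2VecEmbeddings(embeddingsPath='glove.twitter.27B.50d.txt')
    #   # or, with an in-memory mapping:
    #   bs.setWord2VecEmbeddings(embeddings={'hello': np.zeros(50)})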
    def extractTweets(self, filePath, tweetLimit=None, embeddings=None, embeddingsPath=None, hashtagFilter=None):
        """Extracts tweets from a json dump into a pandas dataframe"""
        # Appending to a DataFrame line by line is inefficient, because it
        # creates a new dataframe each time. It is better to collect the rows
        # in plain lists and build the dataframes once at the end.
        user_list = []
        tweet_list = []
        w2v_content_list = []
        w2v_description_list = []
        cascade_list = []
        with open(filePath) as f:
            for i, line in enumerate(f, 1):
                if tweetLimit is not None and tweetLimit < i:
                    break
                j = json.loads(line)
                try:
                    temp_user = {}
                    temp_tweet = {}
                    temp_content = {'status_text': j['text'], 'user_id': j['user']['id']}
                    temp_description = {'description': j['user']['description'], 'user_id': j['user']['id']}
                    temp_cascade = {}

                    if 'retweeted_status' in j:
                        temp_cascade['cascade_id'] = j['retweeted_status']['id']
                        temp_cascade['original_created_at'] = j['retweeted_status']['created_at']
                        temp_cascade['created_at'] = j['created_at']
                        temp_cascade['retweeted'] = True
                    else:
                        temp_cascade['cascade_id'] = j['id']
                        temp_cascade['original_created_at'] = j['created_at']
                        temp_cascade['created_at'] = j['created_at']
                        temp_cascade['retweeted'] = False
                    temp_cascade['follower_count'] = j['user']['followers_count']
                    temp_cascade['status_text'] = j['text']
                    temp_cascade['screen_name'] = j['user']['screen_name']
                    temp_cascade['user_id'] = j['user']['id']

                    temp_user['user_id'] = j['user']['id']
                    temp_tweet['user_id'] = j['user']['id']

                    temp_user.update(getTextFeatures('name', j['user']['name']))
                    temp_user.update(getTextFeatures('location', j['user']['location']))
                    temp_user.update(getTextFeatures('description', j['user']['description']))
                    for key in ['statuses_count', 'listed_count', 'friends_count', 'followers_count']:
                        temp_user[key] = j['user'][key]
                    temp_user['verified'] = 1 if j['user']['verified'] else 0
                    temp_user['ff_ratio'] = (temp_user['followers_count'] + 1)/(temp_user['followers_count'] + temp_user['friends_count'] + 1)
                    temp_user['years_on_twitter'] = (datetime.now() - datetime.strptime(j['user']['created_at'], '%a %b %d %H:%M:%S +0000 %Y')).days/365
                    temp_user['statuses_rate'] = (temp_user['statuses_count'] + 1)/(temp_user['years_on_twitter'] + .001)
                    temp_user['tweets_to_followers'] = (temp_user['statuses_count'] + 1)/(temp_user['followers_count'] + 1)
                    temp_user['retweet_count'] = j['retweet_count']
                    temp_user['favorite_count'] = j['favorite_count']
                    temp_user['favourites_count'] = j['user']['favourites_count']

                    temp_tweet.update(getTextFeatures('status_text', j['text']))
                    # An original tweet is one that is neither a retweet nor a
                    # quote. (The original condition used the misspelt key
                    # 'quoted_status_is' and flagged tweets that were both
                    # retweets and quotes, which can never hold.)
                    temp_tweet['n_tweets'] = 1 if 'retweeted_status' not in j and 'quoted_status_id' not in j else 0
                    temp_tweet['n_retweets'] = 1 if 'retweeted_status' in j else 0
                    temp_tweet['n_quotes'] = 1 if 'quoted_status_id' in j else 0
                    temp_tweet['n_timeofday'] = hourofweekday(j['created_at'])
                    temp_tweet.update(getSource(j['source']))

                    user_list.append(temp_user)
                    tweet_list.append(temp_tweet)
                    w2v_content_list.append(temp_content)
                    w2v_description_list.append(temp_description)
                    cascade_list.append(temp_cascade)
                except Exception as err:
                    traceback.print_tb(err.__traceback__)

        # We are assuming that user data doesn't change much and if it does,
        # we take the 'latest' values as our features.
        userDataframe = pd.DataFrame(user_list).fillna(0).set_index('user_id')
        userDataframe = userDataframe[~userDataframe.index.duplicated(keep='last')]

        tweetDataframe = pd.DataFrame(tweet_list).fillna(0).set_index('user_id')
        # Counts are summed per user; all other tweet features are averaged.
        n_retweets = tweetDataframe['n_retweets'].groupby('user_id').sum()
        n_quoted = tweetDataframe['n_quotes'].groupby('user_id').sum()
        tweetDataframe = tweetDataframe.groupby('user_id').mean()
        tweetDataframe['n_retweets'] = n_retweets
        tweetDataframe['n_quotes'] = n_quoted

        self.cascadeDataframe = pd.DataFrame(cascade_list).fillna(0)

        # We need to filter out the features we don't have in our training set
        userDataframe = userDataframe[self.user_feature_list]
        tweetDataframe = tweetDataframe[self.tweet_feature_list]

        contentDataframe = pd.DataFrame(w2v_content_list).set_index('user_id')
        descriptionDataframe = pd.DataFrame(w2v_description_list).set_index('user_id')
        descriptionDataframe = descriptionDataframe[~descriptionDataframe.index.duplicated(keep='last')]

        if embeddingsPath is not None or embeddings is not None:
            self.setWord2VecEmbeddings(embeddings, embeddingsPath, forceReload=False)

        self.featureDataframe = userDataframe.join(tweetDataframe)
        # Embeddings may also have been set earlier via setWord2VecEmbeddings,
        # so only compute and join the word2vec features when they exist.
        if self.word2vecEmbeddings is not None:
            w2vDataframe = self.__computeVectors(contentDataframe, descriptionDataframe)
            self.featureDataframe = self.featureDataframe.join(w2vDataframe)
        if hashtagFilter is not None:
            self.dataHashtags = hashtagFilter
            hashtagdf = self.__computeHashtagFeatures(contentDataframe)[hashtagFilter]
            self.featureDataframe = self.featureDataframe.join(hashtagdf)
        return self.featureDataframe
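    # Input sketch for extractTweets above: the dump is expected in JSON-lines
    # form, one tweet object per line, carrying at least the fields the parser
    # reads (the values below are illustrative only):
    #
    #   {"id": 1, "text": "hello #world", "source": "...",
    #    "created_at": "Wed Oct 10 20:19:24 +0000 2018",
    #    "retweet_count": 0, "favorite_count": 0,
    #    "user": {"id": 7, "name": "...", "screen_name": "...", "location": "",
    #             "description": "", "verified": false, "statuses_count": 1,
    #             "listed_count": 0, "friends_count": 2, "followers_count": 3,
    #             "favourites_count": 0,
    #             "created_at": "Mon Jan 01 00:00:00 +0000 2018"}}
    #
    # Retweets additionally carry a "retweeted_status" object, and quotes a
    # "quoted_status_id", which drive the cascade and n_retweets/n_quotes logic.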
    def __computeHashtagFeatures(self, contentdf):
        # The findall pattern was truncated in this archive after "(?"; a
        # hashtag lookbehind is assumed here as a plausible reconstruction.
        hashtagSeries = contentdf['status_text'].str.findall(r'(?<=#)\w+')
        # [The remainder of this method, the __computeVectors method, and the
        # classifier-loading code were lost when the archive was extracted and
        # cannot be recovered from the surviving text.]

    def trainClassifierModel(self, botrnot, botTarget, params=None, saveFileName=None):
        # The opening of this method was also lost in extraction; the name and
        # signature here are assumed from the surviving body, which uses
        # botrnot (training features), botTarget (labels), params (xgboost
        # parameters) and saveFileName. Only the surviving tail follows.
        # We want to check that all the features we extract are present in the
        # training set.
        if len(set(self.featureDataframe.columns.values) - set(botrnot.columns.values)) > 0:
            raise Exception("Some requested features are missing from the training set, namely ",
                            set(self.featureDataframe.columns.values) - set(botrnot.columns.values))
        botrnot = botrnot[self.featureDataframe.columns.values]
        train = xgb.DMatrix(botrnot.values, botTarget.values, feature_names=botrnot.columns.values)
        self.clf = xgb.train(params, train, 80)
        if saveFileName is not None:
            self.clf.save_model(saveFileName)

    def getBotness(self):
        if self.clf is None:
            raise Exception("The classifier has not been loaded yet")
        if self.featureDataframe is None:
            raise Exception("Tweets haven't been extracted yet")
        test = xgb.DMatrix(self.featureDataframe.values, feature_names=self.featureDataframe.columns.values)
        bdf = pd.DataFrame()
        bdf['botness'] = self.clf.predict(test)
        bdf['user_id'] = self.featureDataframe.index
        self.botnessDataframe = bdf.set_index('user_id')
        return self.botnessDataframe

    def composeData(self):
        new = None
        if self.botnessDataframe is not None and self.cascadeDataframe is not None:
            # Attach each cascade entry's botness score by looking up user_id.
            new = self.botnessDataframe.loc[self.cascadeDataframe['user_id']].reset_index()['botness']
            new = pd.concat([self.cascadeDataframe, new], ignore_index=True, axis=1)
            new.columns = list(self.cascadeDataframe.columns.values) + ['botness']
            self.composedDataframe = new
        return new
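# A minimal end-to-end sketch of the pipeline above. The file names are
# hypothetical, and because the classifier-loading method was lost in
# extraction, this sketch loads a saved xgboost model directly onto bs.clf.
if __name__ == '__main__':
    bs = BotSpot()
    bs.extractTweets('tweets.jsonl', tweetLimit=10000)  # hypothetical dump
    bs.clf = xgb.Booster(model_file='botspot.model')    # hypothetical pretrained model
    botness = bs.getBotness()                           # per-user botness scores
    combined = bs.composeData()                         # cascades joined with botness
    print(botness.head())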
# botspot/__init__.py
__version__ = '0.1.2'
from .BotSpot import BotSpot


# botspot/featureLists.py
user_feature_list = [
    'location_n_periods', 'description_n_capsp', 'description_n_caps',
    'name_n_exclaims', 'location_n_chars', 'location_n_exclaims',
    'description_n_extraspaces', 'name_n_periods', 'location_n_lowersp',
    'location_n_words', 'description_n_lowersp', 'description_n_periods',
    'location_n_urls', 'description_n_mentions', 'name_n_puncts',
    'location_n_capsp', 'description_n_chars', 'verified', 'name_n_lowers',
    'description_n_urls', 'tweets_to_followers', 'ff_ratio',
    'description_n_puncts', 'description_n_charsperword',
    'location_n_charsperword', 'description_n_words', 'description_n_digits',
    'description_n_commas', 'name_n_lowersp', 'location_n_extraspaces',
    'location_n_puncts', 'name_n_mentions', 'name_n_words', 'name_n_caps',
    'name_n_extraspaces', 'name_n_commas', 'description_n_exclaims',
    'location_n_commas', 'name_n_digits', 'location_n_digits',
    'name_n_charsperword', 'description_n_lowers', 'name_n_chars',
    'name_n_capsp', 'location_n_mentions', 'location_n_caps',
    'location_n_lowers',
]
# Features computed by extractTweets but excluded from the model:
# + ['favourites_count']
# + ['location_n_hashtags', 'location_n_nonasciis', 'listed_count',
#    'retweet_count', 'statuses_rate', 'friends_count', 'name_n_hashtags',
#    'description_n_nonasciis', 'favorite_count', 'description_n_hashtags',
#    'followers_count', 'statuses_count', 'years_on_twitter',
#    'name_n_nonasciis']

tweet_feature_list = [
    'status_text_n_capsp', 'status_text_n_exclaims', 'status_text_n_urls',
    'status_text_n_extraspaces', 'status_text_n_lowers',
    'status_text_n_charsperword', 'status_text_n_commas',
    'status_text_n_words', 'status_text_n_hashtags',
    'status_text_n_mentions', 'n_tweets', 'status_text_n_digits',
    'status_text_n_caps', 'status_text_n_puncts', 'status_text_n_lowersp',
    'status_text_n_chars', 'status_text_n_periods',
]
# + ['google', 'facebook', 'ifttt', 'hootsuite', 'ipad', 'iphone',
#    'webclient', 'android', 'lite', 'n_quotes', 'status_text_n_nonasciis',
#    'n_timeofday', 'n_retweets']


# botspot/utils.py
from datetime import datetime
import re
import string


def parse(x):
    if type(x) != str:
        return []
    x = re.sub("https?:", "", x)
    x = x.lower()
    words = re.findall("[a-z]+", x)
    digs = re.findall("\\d", x)
    punc = re.findall(r"""[!|"|#|$|%|&|\'|(|)|\*|\+|,|-|.|\/|:|;|<|=|>|\?|@|\[|\\|\]|^|_|`|{|||}|~]""", x)
    return words + digs + punc


def hourofweekday(datestring):
    # Fractional hour-of-week: Monday 00:00 maps to 0, Sunday 23:59:59 to ~168.
    d = datetime.strptime(datestring, '%a %b %d %H:%M:%S +0000 %Y')
    # The original divided seconds by 360; an hour has 3600 seconds.
    return d.weekday()*24 + d.hour + d.minute/60 + d.second/3600
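# Worked example for hourofweekday (values checked by hand):
# 'Wed Oct 10 20:19:24 +0000 2018' falls on weekday 2 (Wednesday), so
#   hourofweekday('Wed Oct 10 20:19:24 +0000 2018')
#   == 2*24 + 20 + 19/60 + 24/3600 ≈ 68.323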
def grep(sourcestring, pattern):
    return 1 if pattern.lower() in sourcestring.lower() else 0


def getSource(sourcestring):
    sources = [('google', 'google'), ('ifttt', 'IFTTT'), ('facebook', 'facebook'),
               ('ipad', 'for iPad'), ('lite', 'twitter lite'), ('hootsuite', 'hootsuite'),
               ('android', 'android'), ('webclient', 'web client'), ('iphone', 'iphone')]
    return {x: grep(sourcestring, y) for x, y in sources}


def getURLs(text):
    # The parameter was originally named 'string', shadowing the string module.
    url = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\), ]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)
    return url


# No idea what lowersp or capsp is, but assume it means percentage
def getTextFeatures(key, text):
    res = {}
    if text is not None:
        res[key+'_n_chars'] = len(text)
        res[key+'_n_commas'] = text.count(",")
        res[key+'_n_digits'] = sum([x.isdigit() for x in list(text)])
        res[key+'_n_exclaims'] = sum([x == '!' for x in list(text)])
        res[key+'_n_extraspaces'] = sum([x == ' ' for x in list(text)])
        res[key+'_n_hashtags'] = sum([x == '#' for x in list(text)])
        res[key+'_n_lowers'] = sum([x.islower() for x in list(text)])
        res[key+'_n_mentions'] = sum([x == '@' for x in list(text)])
        res[key+'_n_periods'] = sum([x == '.' for x in list(text)])
        if key != 'name':
            res[key+'_n_urls'] = len(getURLs(text))
        res[key+'_n_words'] = len(re.sub(r"[^\w]", " ", text).split())
        res[key+'_n_caps'] = sum([x.isupper() for x in list(text)])
        # The original counted characters with ord(x) < 128, i.e. the ASCII
        # ones; a "nonasciis" count should be the complement.
        res[key+'_n_nonasciis'] = sum([ord(x) > 127 for x in list(text)])
        res[key+'_n_puncts'] = sum([x in string.punctuation for x in list(text)])
        res[key+'_n_charsperword'] = (len(text)+1)/(res[key+'_n_words']+1)
        res[key+'_n_lowersp'] = (res[key+'_n_lowers']+1)/(res[key+'_n_chars'] + 1)
        res[key+'_n_capsp'] = (res[key+'_n_caps'] + 1)/(res[key+'_n_chars'] + 1)
    return res
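# A minimal sanity check for the helpers above (expected values worked out by
# hand; the sample strings are invented for illustration):
if __name__ == '__main__':
    feats = getTextFeatures('name', 'Bot Hunter!!')
    assert feats['name_n_exclaims'] == 2
    assert feats['name_n_words'] == 2
    assert feats['name_n_caps'] == 2      # 'B' and 'H'
    assert 'name_n_urls' not in feats     # url counting is skipped for 'name'
    print(getSource('Twitter for iPhone'))
    # -> {'google': 0, 'ifttt': 0, 'facebook': 0, 'ipad': 0, 'lite': 0,
    #     'hootsuite': 0, 'android': 0, 'webclient': 0, 'iphone': 1}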