import urllib.request
import json

import numpy as np

from banapedia.wapi.WikipediaQuery import BlockQuery
from banapedia.Ban import *


class BanList:
    """Collection of bans loaded from a JSON file or fetched via BlockQuery.

    Attributes set by ``__init__``:
        dict_list: the raw ban records (each record is indexed with the
            keys "user", "timestamp" and "expiry").
        ban_list: one ``Ban`` object built from each raw record.
    """

    def __init__(self, data_file, samples=30000, samples_by_query=500,
                 from_internet=False):
        """Load the raw ban records and build the matching ``Ban`` objects.

        Args:
            data_file: path of a JSON file holding the ban records; only
                read when ``from_internet`` is false.
            samples: total number of bans to request when fetching.
            samples_by_query: maximum number of bans requested per query.
            from_internet: when true, fetch the records with
                ``fetch_multipart`` instead of reading ``data_file``.
        """
        if from_internet:
            # Install a process-wide opener that uses the proxies reported
            # by urllib, so later urllib requests go through them.
            proxy = urllib.request.ProxyHandler(urllib.request.getproxies())
            opener = urllib.request.build_opener(proxy)
            urllib.request.install_opener(opener)
            self.dict_list = self.fetch_multipart(samples, samples_by_query)
        else:
            with open(data_file, "r", encoding="utf-8") as ban_dict_file:
                self.dict_list = json.load(ban_dict_file)

        # `datetime` and `ISO_TIMESTAMP` are expected to come from the
        # star import of banapedia.Ban -- TODO confirm.
        self.ban_list = [
            Ban(
                ban_dict["user"],
                datetime.strptime(ban_dict["timestamp"], ISO_TIMESTAMP),
                datetime.strptime(ban_dict["expiry"], ISO_TIMESTAMP),
            )
            for ban_dict in self.dict_list
        ]

    def fetch_multipart(self, n, query_limit):
        """Fetch up to ``n`` ban records, ``query_limit`` at most per query.

        Queries are chained with the continuation token found in each
        result. Fetching stops early when a result carries no bans or no
        continuation token, so fewer than ``n`` records may be returned.

        Args:
            n: total number of bans wanted.
            query_limit: maximum number of bans requested per query.

        Returns:
            The list of raw ban records, in the order they were received.

        Raises:
            ValueError: if ``query_limit`` is not positive (the fetch loop
                could never make progress).
        """
        if query_limit < 1:
            raise ValueError("query_limit must be a positive integer")

        ban_dict_list = []
        n_fetched = 0
        continue_token = None

        print("[INFO]", "Fetching %d bans" % n)
        while n_fetched < n:
            to_fetch = min(query_limit, n - n_fetched)
            query = BlockQuery(
                bkprop=["user", "timestamp", "expiry"],
                bkshow=["temp", "ip"],
                limit=to_fetch,
                continue_token=continue_token,
            )
            results = query.fetch_result()
            blocks = results["query"]["blocks"]
            ban_dict_list.extend(blocks)
            # Count what was actually received, not what was requested.
            n_fetched += len(blocks)
            print("[INFO]", "Fetched %d over %d bans" % (n_fetched, n))

            if not blocks:
                # Nothing came back: asking again would not make progress.
                break
            try:
                continue_token = (
                    results["query-continue"]["blocks"]["bkcontinue"]
                )
            except KeyError:
                # No continuation token: this was the last page.
                break

        print("[INFO]", "Bans fetching complete")
        return ban_dict_list

    def write_to_file(self, outfile):
        """Dump the raw ban records to ``outfile`` as tab-indented JSON."""
        with open(outfile, "w", encoding="utf-8") as ban_dict_file:
            json.dump(self.dict_list, ban_dict_file, indent="\t")

    def get_durations(self):
        """Return ``get_duration()`` of every ban, in ``ban_list`` order."""
        return [ban.get_duration() for ban in self.ban_list]

    def get_countries(self):
        """Return ``get_country_code()`` of every ban, in ``ban_list`` order."""
        return [ban.get_country_code() for ban in self.ban_list]

    def __iter__(self):
        """Iterate over the raw ban records (not the ``Ban`` objects)."""
        return iter(self.dict_list)

    def items(self):
        """Delegate to ``self.dict_list.items()``."""
        # NOTE(review): __init__ iterates dict_list as a sequence of
        # records, and a list has no .items(), so this raises
        # AttributeError in that case. Intended semantics unclear --
        # confirm before relying on it.
        return self.dict_list.items()

    def by_country(self):
        """Group the ``Ban`` objects by their country code.

        Returns:
            A dict mapping each ``get_country_code()`` value to the list of
            bans that reported it, in ``ban_list`` order.
        """
        bans_by_country = {}
        for ban in self.ban_list:
            bans_by_country.setdefault(ban.get_country_code(), []).append(ban)
        return bans_by_country

    def average_ban_by_country(self):
        """Return the mean ban duration per country, divided by 30.

        The division by 30 converts the mean to months; this presumes
        ``get_duration()`` is expressed in days -- TODO confirm.
        """
        return {
            country: np.mean([ban.get_duration() for ban in bans]) / 30
            for country, bans in self.by_country().items()
        }