class BanList:
    """Collection of Wikipedia bans, read from a JSON file or fetched online.

    Attributes set by ``__init__``:
        dict_list: the raw ban records (dicts with at least the ``"user"``,
            ``"timestamp"`` and ``"expiry"`` keys), exactly as loaded from
            the data file or returned by the block queries.
        ban_list: one ``Ban`` object built from each record of ``dict_list``.
    """

    def __init__(self, data_file, samples=30000, samples_by_query=500,
                 from_internet=False):
        """Load the ban records and build the matching ``Ban`` objects.

        Args:
            data_file: path of a JSON file holding a list of ban records;
                only read when ``from_internet`` is false.
            samples: total number of bans to fetch when ``from_internet``
                is true.
            samples_by_query: maximum number of bans requested per query
                when ``from_internet`` is true.
            from_internet: fetch the records through ``BlockQuery`` instead
                of reading ``data_file``.
        """
        if from_internet:
            # Route every later urllib request through the system proxies.
            # NOTE: install_opener() changes the process-wide default opener.
            proxy = urllib.request.ProxyHandler(urllib.request.getproxies())
            opener = urllib.request.build_opener(proxy)
            urllib.request.install_opener(opener)
            self.dict_list = self.fetch_multipart(samples, samples_by_query)
        else:
            # JSON is UTF-8 by specification; do not depend on the locale.
            with open(data_file, "r", encoding="utf-8") as ban_dict_file:
                self.dict_list = json.load(ban_dict_file)

        self.ban_list = [
            Ban(
                ban_dict["user"],
                datetime.strptime(ban_dict["timestamp"], ISO_TIMESTAMP),
                datetime.strptime(ban_dict["expiry"], ISO_TIMESTAMP),
            )
            for ban_dict in self.dict_list
        ]

    def fetch_multipart(self, n, query_limit):
        """Fetch up to ``n`` ban records, ``query_limit`` at most per query.

        Queries are chained with the continuation token of the previous
        answer. Fetching stops early when the server returns no record or
        no continuation token, so fewer than ``n`` records may be returned.

        Args:
            n: total number of ban records wanted.
            query_limit: maximum number of records requested per query.

        Returns:
            The list of raw ban records, in the order they were received.
        """
        ban_dict_list = []
        n_fetched = 0
        continue_token = None

        print("[INFO]", "Fetching %d bans" % n)
        while n_fetched < n:
            to_fetch = min(query_limit, n - n_fetched)
            query = BlockQuery(
                bkprop=["user", "timestamp", "expiry"],
                bkshow=["temp", "ip"],
                limit=to_fetch,
                continue_token=continue_token,
            )
            results = query.fetch_result()
            blocks = results["query"]["blocks"]
            ban_dict_list.extend(blocks)
            # Count what was actually received, not what was asked for.
            n_fetched += len(blocks)
            print("[INFO]", "Fetched %d over %d bans" % (n_fetched, n))

            # The last page of results carries no continuation entry.
            continue_token = (
                results.get("query-continue", {})
                .get("blocks", {})
                .get("bkcontinue")
            )
            if not blocks or continue_token is None:
                break

        print("[INFO]", "Bans fetching complete")
        return ban_dict_list

    def write_to_file(self, outfile):
        """Dump the raw ban records to ``outfile`` as tab-indented JSON."""
        with open(outfile, "w") as ban_dict_file:
            json.dump(self.dict_list, ban_dict_file, indent="\t")

    def get_durations(self):
        """Return ``get_duration()`` of every ban, in ``ban_list`` order."""
        return [ban.get_duration() for ban in self.ban_list]

    def get_countries(self):
        """Return ``get_country_code()`` of every ban, in ``ban_list`` order."""
        return [ban.get_country_code() for ban in self.ban_list]

    def __iter__(self):
        """Iterate over the raw ban records (not the ``Ban`` objects)."""
        return iter(self.dict_list)

    def items(self):
        """Delegate to ``dict_list.items()``.

        NOTE(review): ``__init__`` always leaves a list in ``dict_list`` and
        lists have no ``items()``, so this raises ``AttributeError`` as
        written. The intended result is unclear; confirm before changing.
        """
        return self.dict_list.items()

    def by_country(self):
        """Group the ``Ban`` objects by country code.

        Returns:
            A dict mapping each ``get_country_code()`` value to the list of
            bans that returned it, in ``ban_list`` order.
        """
        bans_by_country = {}
        for ban in self.ban_list:
            bans_by_country.setdefault(ban.get_country_code(), []).append(ban)
        return bans_by_country

    def average_ban_by_country(self):
        """Return the mean ban duration per country, divided by 30.

        The division by 30 converts the mean "in months", which presumes
        that ``Ban.get_duration()`` yields a number of days (TODO confirm).

        Returns:
            A dict mapping each country code to its scaled mean duration.
        """
        return {
            country: np.mean([ban.get_duration() for ban in bans]) / 30
            for country, bans in self.by_country().items()
        }