"""Render charts and print summary statistics about Wikipedia bans.

Running this script loads a list of ban records from ``BAN_FILE``, then
writes three SVG charts (a histogram of ban durations, a world map of ban
counts per country and a world map of the average ban duration per country)
and finally prints a few statistics about the ban durations.
"""

# The wildcard import is intentionally kept first: the explicit imports below
# must keep taking precedence over any identically named symbol it exports.
# It is expected to provide ``fetch_multipart_ban_dict`` and ``map_bans``.
from banapedia.Ban import *
from collections import Counter
import json
import pygal
import numpy as np
import urllib.request

__author__ = 'pacien'

BAN_MAP_FILE = "output/ban-map.svg"
BAN_DURATION_MAP_FILE = "output/ban-duration-map.svg"
HIST_FILE = "output/histogram.svg"

BAN_FILE = "resources/ban_list.json"

SAMPLES = 30000
SAMPLES_BY_QUERY = 500


def configure_proxy():
    """Install a global urllib opener that uses the system proxy settings."""
    proxy = urllib.request.ProxyHandler(urllib.request.getproxies())
    opener = urllib.request.build_opener(proxy)
    urllib.request.install_opener(opener)


def load_from_internet():
    """Configure the proxy, then fetch the ban records remotely.

    Returns whatever ``fetch_multipart_ban_dict(SAMPLES, SAMPLES_BY_QUERY)``
    returns (presumably a JSON-serialisable list of ban dicts, since the
    result is meant to be passed to ``write_to_local`` -- TODO confirm).
    """
    configure_proxy()
    return fetch_multipart_ban_dict(SAMPLES, SAMPLES_BY_QUERY)


def load_from_local():
    """Return the ban records deserialised from the JSON file ``BAN_FILE``."""
    # Explicit encoding: JSON is UTF-8 by specification, so do not depend on
    # the platform's locale encoding.
    with open(BAN_FILE, "r", encoding="utf-8") as ban_dict_file:
        return json.load(ban_dict_file)


def write_to_local(ban_dict_list):
    """Serialise ``ban_dict_list`` to ``BAN_FILE`` as tab-indented JSON."""
    with open(BAN_FILE, "w", encoding="utf-8") as ban_dict_file:
        json.dump(ban_dict_list, ban_dict_file, indent="\t")


# Uncomment the two lines below to refresh the local cache from the network.
# ban_dict_list = load_from_internet()
# write_to_local(ban_dict_list)

ban_dict_list = load_from_local()
ban_list = map_bans(ban_dict_list)


########## HISTOGRAM ##########

# Durations are presumably expressed in days (the summary at the end of the
# script prints them as days).
ban_durations = [ban.get_duration() for ban in ban_list]

# 51 bin edges placed at whole-day roundings of 1..51 average months
# (365/12 days each) give 50 month-wide bins. np.histogram does not count
# values that fall outside the outermost edges.
(ban_durations_bars, bins) = np.histogram(
    ban_durations,
    bins=[round(365 / 12 * x) for x in range(1, 50 + 2)],
)

print("[INFO]", "Generating histogram")
bar_chart = pygal.Bar(legend_at_bottom=True)
# NOTE(review): the titles report the SAMPLES constant, not the number of
# records actually loaded from BAN_FILE -- confirm they always match.
bar_chart.title = "Active Wikipedia bans by duration (%d samples)" % SAMPLES
# Materialise the labels: in Python 3 ``map`` returns a one-shot iterator,
# which would be exhausted after the chart reads it once.
bar_chart.x_labels = [str(label) for label in range(1, len(ban_durations_bars) + 1)]
bar_chart.add("Number of active bans", ban_durations_bars)
bar_chart.render_to_file(HIST_FILE)
print("[INFO]", "Histogram generation complete")


########## NB BAN MAP ##########

def count_by_country(ban_list):
    """Return a ``Counter`` mapping each country code to its number of bans."""
    return Counter(ban.get_country_code() for ban in ban_list)


nb_bans_by_country = count_by_country(ban_list)

print("[INFO]", "Generating ban map")
worldmap_chart = pygal.Worldmap(legend_at_bottom=True)
worldmap_chart.title = "World active Wikipedia bans by country (%d samples)" % SAMPLES
worldmap_chart.add("Active bans", nb_bans_by_country)
worldmap_chart.render_to_file(BAN_MAP_FILE)
print("[INFO]", "Ban map generation complete")


########## BAN DURATION MAP ##########

def group_by_country(ban_list):
    """Return a dict mapping each country code to the list of its bans.

    Bans keep the order in which they appear in ``ban_list``.
    """
    ban_duration_by_country = {}
    for ban in ban_list:
        ban_duration_by_country.setdefault(ban.get_country_code(), []).append(ban)
    return ban_duration_by_country


def calc_average_ban_by_country(ban_by_country_dict):
    """Return a dict mapping each country to the mean duration of its bans.

    ``ban_by_country_dict`` maps a country code to a list of ban objects, as
    produced by ``group_by_country``.
    """
    return {
        country: np.mean([ban.get_duration() for ban in bans])
        for country, bans in ban_by_country_dict.items()
    }


ban_duration_by_country = group_by_country(ban_list)
average_ban_duration_by_country = calc_average_ban_by_country(ban_duration_by_country)
# Convert the averages to months for display.
# NOTE(review): this uses 30 days per month whereas the histogram bins use
# 365/12; kept as-is so the rendered values do not change.
average_ban_duration_by_country = {
    country: duration / 30
    for country, duration in average_ban_duration_by_country.items()
}

print("[INFO]", "Generating ban duration map")
worldmap_chart = pygal.Worldmap(legend_at_bottom=True)
worldmap_chart.title = "Average Wikipedia ban duration by country (%d samples)" % SAMPLES
worldmap_chart.add("Average ban duration (months)", average_ban_duration_by_country)
worldmap_chart.render_to_file(BAN_DURATION_MAP_FILE)
print("[INFO]", "Ban duration map generation complete")


print("\nTHIS WAS A TRIUMPH!")
print("I'M MAKING A NOTE HERE:")
print("HUGE [SUCCESS]\n")

print("Some additional stats about ban durations:")
print("  Mean: %.2f days" % np.mean(ban_durations))
print("  Median: %.2f days" % np.median(ban_durations))
print("  Deviation: %.2f" % np.std(ban_durations))
print("  Variance: %.2f" % np.var(ban_durations))