diff options
author | Pacien TRAN-GIRARD | 2014-10-24 01:45:28 +0200 |
---|---|---|
committer | Pacien TRAN-GIRARD | 2014-10-24 01:45:28 +0200 |
commit | 5e4f38688e4c14846b8264970a79c482c1ca7012 (patch) | |
tree | 6d150aa8f26eed939aabc9de1db3239a46cb7e05 /main.py | |
download | wikistats-5e4f38688e4c14846b8264970a79c482c1ca7012.tar.gz |
Initial commit
Diffstat (limited to 'main.py')
-rw-r--r-- | main.py | 119 |
1 files changed, 119 insertions, 0 deletions
@@ -0,0 +1,119 @@ | |||
1 | from banapedia.Ban import * | ||
2 | from collections import Counter | ||
3 | import json | ||
4 | import pygal | ||
5 | import numpy as np | ||
6 | import urllib.request | ||
7 | |||
8 | __author__ = 'pacien' | ||
9 | |||
10 | |||
11 | BAN_MAP_FILE = "output/ban-map.svg" | ||
12 | BAN_DURATION_MAP_FILE = "output/ban-duration-map.svg" | ||
13 | HIST_FILE = "output/histogram.svg" | ||
14 | |||
15 | BAN_FILE = "resources/ban_list.json" | ||
16 | |||
17 | SAMPLES = 30000 | ||
18 | SAMPLES_BY_QUERY = 500 | ||
19 | |||
20 | |||
def configure_proxy():
    """Install a global urllib opener that goes through the detected proxies.

    The proxy mapping is taken from ``urllib.request.getproxies()`` and the
    resulting opener is registered with ``urllib.request.install_opener``,
    so later ``urllib.request.urlopen`` calls use it.
    """
    detected_proxies = urllib.request.getproxies()
    proxy_handler = urllib.request.ProxyHandler(detected_proxies)
    urllib.request.install_opener(urllib.request.build_opener(proxy_handler))
25 | |||
26 | |||
def load_from_internet():
    """Set up the proxy, then fetch and return the ban records.

    Delegates the download to ``fetch_multipart_ban_dict`` with the
    module-level ``SAMPLES`` and ``SAMPLES_BY_QUERY`` settings and returns
    whatever that helper returns.
    """
    configure_proxy()
    fetched_bans = fetch_multipart_ban_dict(SAMPLES, SAMPLES_BY_QUERY)
    return fetched_bans
30 | |||
31 | |||
def load_from_local():
    """Return the parsed JSON content of the local cache file ``BAN_FILE``.

    The file is decoded explicitly as UTF-8 (the standard JSON encoding)
    instead of relying on the platform's default locale encoding.

    Raises:
        OSError: if ``BAN_FILE`` cannot be opened.
        json.JSONDecodeError: if the file does not contain valid JSON.
    """
    with open(BAN_FILE, "r", encoding="utf-8") as ban_dict_file:
        return json.load(ban_dict_file)
35 | |||
36 | |||
def write_to_local(ban_dict_list):
    """Serialize ``ban_dict_list`` as tab-indented JSON into ``BAN_FILE``.

    The file is written explicitly as UTF-8 so the cache does not depend on
    the platform's default locale encoding. Any existing file is overwritten.

    Args:
        ban_dict_list: JSON-serializable data to store.

    Raises:
        OSError: if ``BAN_FILE`` cannot be opened for writing.
        TypeError: if the data is not JSON-serializable.
    """
    with open(BAN_FILE, "w", encoding="utf-8") as ban_dict_file:
        json.dump(ban_dict_list, ban_dict_file, indent="\t")
40 | |||
41 | |||
# To refresh the local cache from the network, uncomment the two lines below:
# ban_dict_list = load_from_internet()
# write_to_local(ban_dict_list)

# Default: reuse the records already stored in BAN_FILE.
ban_dict_list = load_from_local()

# Convert the raw records into ban objects; the charts below call
# get_duration() and get_country_code() on them.
# (map_bans is not defined in this file -- presumably it comes from the
# banapedia.Ban star import.)
ban_list = map_bans(ban_dict_list)
48 | |||
49 | |||
########## HISTOGRAM ##########

# Bin edges at multiples of an average month (365/12), for months 1 to 51,
# i.e. 50 bins. np.histogram ignores values outside the outermost edges.
# NOTE(review): this assumes get_duration() returns days -- confirm.
month_edges = [round(365 / 12 * month) for month in range(1, 52)]
ban_durations, bins = np.histogram(
    [ban.get_duration() for ban in ban_list],
    bins=month_edges,
)

print("[INFO]", "Generating histogram")
bar_chart = pygal.Bar(legend_at_bottom=True)
bar_chart.title = "Active Wikipedia bans by duration (%d samples)" % SAMPLES
# One label per bin, numbered from 1.
bar_chart.x_labels = map(str, range(1, len(ban_durations) + 1))
bar_chart.add("Number of active bans", ban_durations)
bar_chart.render_to_file(HIST_FILE)
print("[INFO]", "Histogram generation complete")
62 | |||
63 | |||
64 | ########## NB BAN MAP ########## | ||
65 | |||
def count_by_country(ban_list):
    """Count how many bans fall under each country code.

    Args:
        ban_list: iterable of objects exposing ``get_country_code()``.

    Returns:
        A ``collections.Counter`` mapping each country code to the number of
        bans that reported it.
    """
    return Counter(ban.get_country_code() for ban in ban_list)
69 | |||
# Number of bans per country code, used as the data series of the world map.
nb_bans_by_country = count_by_country(ban_list)

print("[INFO]", "Generating ban map")
worldmap_chart = pygal.Worldmap(legend_at_bottom=True)
worldmap_chart.title = (
    "World active Wikipedia bans by country (%d samples)" % SAMPLES
)
worldmap_chart.add("Active bans", nb_bans_by_country)
worldmap_chart.render_to_file(BAN_MAP_FILE)
print("[INFO]", "Ban map generation complete")
78 | |||
79 | |||
80 | ########## BAN DURATION MAP ########## | ||
81 | |||
def group_by_country(ban_list):
    """Group ban objects by their country code.

    Args:
        ban_list: iterable of objects exposing ``get_country_code()``.

    Returns:
        A plain ``dict`` mapping each country code to the list of ban objects
        that reported it. Keys appear in first-seen order and each list keeps
        the input order.
    """
    bans_by_country = {}

    for ban in ban_list:
        # setdefault creates the bucket on first sight of a country code,
        # replacing the separate "not in d.keys()" check and insert.
        bans_by_country.setdefault(ban.get_country_code(), []).append(ban)

    return bans_by_country
94 | |||
95 | |||
def calc_average_ban_by_country(ban_by_country_dict):
    """Compute the mean ban duration for every country.

    Args:
        ban_by_country_dict: mapping of country code to a list of objects
            exposing ``get_duration()``.

    Returns:
        A dict mapping each country code to the ``np.mean`` of its bans'
        durations.
    """
    return {
        country: np.mean([ban.get_duration() for ban in bans])
        for country, bans in ban_by_country_dict.items()
    }
104 | |||
# Despite its name, this maps each country code to its list of ban objects.
ban_duration_by_country = group_by_country(ban_list)

# Per-country mean duration, divided by 30 to express it in months.
# NOTE(review): the histogram bins use 365/12 as the month length while this
# uses 30 -- confirm which one is intended.
average_ban_duration_by_country = {
    country: duration / 30
    for country, duration
    in calc_average_ban_by_country(ban_duration_by_country).items()
}

print("[INFO]", "Generating ban duration map")
worldmap_chart = pygal.Worldmap(legend_at_bottom=True)
worldmap_chart.title = (
    "Average Wikipedia ban duration by country (%d samples)" % SAMPLES
)
worldmap_chart.add(
    "Average ban duration (months)",
    average_ban_duration_by_country,
)
worldmap_chart.render_to_file(BAN_DURATION_MAP_FILE)
print("[INFO]", "Ban duration map generation complete")

print("\nTHIS WAS A TRIUMPH!")
print("I'M MAKING A NOTE HERE:")
print("HUGE [SUCCESS]")