diff options
-rw-r--r-- | banapedia/Ban.py | 40 | ||||
-rw-r--r-- | bandict/__init__.py | 91 | ||||
-rw-r--r-- | main.py | 83 |
3 files changed, 101 insertions, 113 deletions
diff --git a/banapedia/Ban.py b/banapedia/Ban.py index d8666b4..4714274 100644 --- a/banapedia/Ban.py +++ b/banapedia/Ban.py | |||
@@ -34,43 +34,3 @@ class Ban: | |||
34 | 34 | ||
35 | self.country_code = country_code | 35 | self.country_code = country_code |
36 | return country_code | 36 | return country_code |
37 | |||
38 | |||
def map_ban(ban_dict):
    """Build a ``Ban`` from one raw block record.

    ``ban_dict`` must provide the keys ``"user"``, ``"timestamp"`` and
    ``"expiry"``; both date strings are parsed with ``ISO_TIMESTAMP``
    before being passed to the ``Ban`` constructor.
    """
    user = ban_dict["user"]
    started_at = datetime.strptime(ban_dict["timestamp"], ISO_TIMESTAMP)
    expires_at = datetime.strptime(ban_dict["expiry"], ISO_TIMESTAMP)
    return Ban(user, started_at, expires_at)
45 | |||
46 | |||
def map_bans(ban_dict_list):
    """Convert each raw record of ``ban_dict_list`` with ``map_ban``.

    Returns a new list holding the results in input order.
    """
    return [map_ban(record) for record in ban_dict_list]
53 | |||
54 | |||
def fetch_multipart_ban_dict(n, query_limit):
    """Fetch up to ``n`` raw block records through repeated ``BlockQuery`` calls.

    Each request asks for at most ``query_limit`` records and passes along
    the continuation token returned by the previous response.

    Fetching stops as soon as ``n`` records have been collected or the
    response no longer carries a ``bkcontinue`` token, so the returned list
    can be shorter than ``n``.

    Returns the list of record dicts in the order they were received.
    """
    ban_dict_list = []
    continue_token = None

    print("[INFO]", "Fetching %d bans" % n)
    while len(ban_dict_list) < n:
        to_fetch = min(query_limit, n - len(ban_dict_list))
        query = BlockQuery(
            bkprop=["user", "timestamp", "expiry"],
            bkshow=["temp", "ip"],
            limit=to_fetch,
            continue_token=continue_token,
        )
        results = query.fetch_result()
        # Progress is measured by what was actually received, not by what
        # was requested: a batch may come back shorter than ``to_fetch``.
        ban_dict_list.extend(results["query"]["blocks"])
        print("[INFO]", "Fetched %d over %d bans" % (len(ban_dict_list), n))

        # The last batch has no continuation entry; indexing it directly
        # would raise KeyError, so look it up defensively and stop.
        continue_token = (
            results.get("query-continue", {})
            .get("blocks", {})
            .get("bkcontinue")
        )
        if continue_token is None:
            break

    print("[INFO]", "Bans fetching complete")
    return ban_dict_list
diff --git a/bandict/__init__.py b/bandict/__init__.py new file mode 100644 index 0000000..389ae0e --- /dev/null +++ b/bandict/__init__.py | |||
@@ -0,0 +1,91 @@ | |||
1 | import urllib.request | ||
2 | import json | ||
3 | import numpy as np | ||
4 | from banapedia.wapi.WikipediaQuery import BlockQuery | ||
5 | from banapedia.Ban import * | ||
6 | |||
7 | |||
class BanList:
    """Collection of bans loaded from a JSON file or fetched online.

    Two parallel views are kept:

    * ``dict_list`` -- the raw records (dicts with the keys ``"user"``,
      ``"timestamp"`` and ``"expiry"``);
    * ``ban_list`` -- the ``Ban`` objects built from those records, in the
      same order.
    """

    def __init__(self, data_file, samples=30000, samples_by_query=500,
                 from_internet=False):
        """Load the ban records and build the matching ``Ban`` objects.

        data_file: path of the JSON file read when ``from_internet`` is
            false (it is not read otherwise).
        samples: number of records requested when fetching online.
        samples_by_query: maximum number of records per request.
        from_internet: fetch the records online instead of reading
            ``data_file``.
        """
        if from_internet:
            # Install a global opener using the proxies reported by
            # urllib.request.getproxies() before any request is made.
            proxy = urllib.request.ProxyHandler(urllib.request.getproxies())
            opener = urllib.request.build_opener(proxy)
            urllib.request.install_opener(opener)
            self.dict_list = self.fetch_multipart(samples, samples_by_query)
        else:
            with open(data_file, "r", encoding="utf-8") as ban_dict_file:
                self.dict_list = json.load(ban_dict_file)

        self.ban_list = [
            Ban(
                ban_dict["user"],
                datetime.strptime(ban_dict["timestamp"], ISO_TIMESTAMP),
                datetime.strptime(ban_dict["expiry"], ISO_TIMESTAMP),
            )
            for ban_dict in self.dict_list
        ]

    def fetch_multipart(self, n, query_limit):
        """Fetch up to ``n`` raw records, ``query_limit`` at most per request.

        Fetching stops once ``n`` records were collected or the response no
        longer carries a ``bkcontinue`` token, so the returned list can be
        shorter than ``n``.
        """
        ban_dict_list = []
        continue_token = None

        print("[INFO]", "Fetching %d bans" % n)
        while len(ban_dict_list) < n:
            to_fetch = min(query_limit, n - len(ban_dict_list))
            query = BlockQuery(
                bkprop=["user", "timestamp", "expiry"],
                bkshow=["temp", "ip"],
                limit=to_fetch,
                continue_token=continue_token,
            )
            results = query.fetch_result()
            # Count what was received rather than what was requested.
            ban_dict_list.extend(results["query"]["blocks"])
            print("[INFO]",
                  "Fetched %d over %d bans" % (len(ban_dict_list), n))

            # The final batch has no continuation entry; a direct lookup
            # would raise KeyError, so probe for it and stop when absent.
            continue_token = (
                results.get("query-continue", {})
                .get("blocks", {})
                .get("bkcontinue")
            )
            if continue_token is None:
                break

        print("[INFO]", "Bans fetching complete")
        return ban_dict_list

    def write_to_file(self, outfile):
        """Write the raw records to ``outfile`` as tab-indented JSON."""
        with open(outfile, "w", encoding="utf-8") as ban_dict_file:
            json.dump(self.dict_list, ban_dict_file, indent="\t")

    def get_durations(self):
        """Return ``get_duration()`` of every ban, in list order."""
        return [ban.get_duration() for ban in self.ban_list]

    def get_countries(self):
        """Return ``get_country_code()`` of every ban, in list order."""
        return [ban.get_country_code() for ban in self.ban_list]

    def __iter__(self):
        """Iterate over the raw record dicts (not the ``Ban`` objects)."""
        return iter(self.dict_list)

    def items(self):
        """Return ``(index, record)`` pairs for the raw records.

        ``dict_list`` is a list, which has no ``items()`` method, so
        delegating to it always raised AttributeError; positions are used
        as keys instead.
        """
        return list(enumerate(self.dict_list))

    def by_country(self):
        """Group the ``Ban`` objects by country code.

        Returns a dict mapping each country code to the list of bans that
        reported it, preserving encounter order.
        """
        ban_duration_by_country = {}
        for ban in self.ban_list:
            ban_duration_by_country.setdefault(
                ban.get_country_code(), []
            ).append(ban)
        return ban_duration_by_country

    def average_ban_by_country(self):
        """Return the mean ban duration per country, divided by 30.

        NOTE(review): the division by 30 converts to months only if
        ``Ban.get_duration()`` returns days -- confirm against ``Ban``.
        """
        return {
            country: np.mean([ban.get_duration() for ban in bans]) / 30
            for country, bans in self.by_country().items()
        }
@@ -1,9 +1,8 @@ | |||
1 | from banapedia.Ban import * | 1 | from banapedia.Ban import * |
2 | import bandict | ||
2 | from collections import Counter | 3 | from collections import Counter |
3 | import json | ||
4 | import pygal | 4 | import pygal |
5 | import numpy as np | 5 | import numpy as np |
6 | import urllib.request | ||
7 | 6 | ||
8 | __author__ = 'pacien' | 7 | __author__ = 'pacien' |
9 | 8 | ||
@@ -12,44 +11,15 @@ BAN_MAP_FILE = "output/ban-map.svg" | |||
12 | BAN_DURATION_MAP_FILE = "output/ban-duration-map.svg" | 11 | BAN_DURATION_MAP_FILE = "output/ban-duration-map.svg" |
13 | HIST_FILE = "output/histogram.svg" | 12 | HIST_FILE = "output/histogram.svg" |
14 | 13 | ||
15 | BAN_FILE = "resources/ban_list.json" | ||
16 | |||
17 | SAMPLES = 30000 | 14 | SAMPLES = 30000 |
18 | SAMPLES_BY_QUERY = 500 | ||
19 | |||
20 | |||
def configure_proxy():
    """Install a global urllib opener built from ``getproxies()``.

    Every later ``urllib.request.urlopen`` call in the process goes through
    the proxies reported by ``urllib.request.getproxies()``.
    """
    urllib.request.install_opener(
        urllib.request.build_opener(
            urllib.request.ProxyHandler(urllib.request.getproxies())
        )
    )
25 | |||
26 | |||
def load_from_internet():
    """Set up the proxy, then fetch ``SAMPLES`` ban records online.

    Records are requested ``SAMPLES_BY_QUERY`` at a time through
    ``fetch_multipart_ban_dict``; its result is returned unchanged.
    """
    configure_proxy()
    fetched_records = fetch_multipart_ban_dict(SAMPLES, SAMPLES_BY_QUERY)
    return fetched_records
30 | |||
31 | |||
def load_from_local():
    """Return the parsed JSON content of ``BAN_FILE``."""
    with open(BAN_FILE, "r") as source:
        payload = json.load(source)
    return payload
35 | |||
36 | |||
def write_to_local(ban_dict_list):
    """Overwrite ``BAN_FILE`` with ``ban_dict_list`` as tab-indented JSON."""
    with open(BAN_FILE, "w") as sink:
        json.dump(obj=ban_dict_list, fp=sink, indent="\t")
40 | |||
41 | |||
42 | # ban_dict_list = load_from_internet() | ||
43 | # write_to_local(ban_dict_list) | ||
44 | |||
45 | ban_dict_list = load_from_local() | ||
46 | 15 | ||
47 | ban_list = map_bans(ban_dict_list) | 16 | BAN_FILE = "resources/ban_list.json" |
48 | 17 | ||
18 | ban_dict_list = bandict.BanList(BAN_FILE) | ||
49 | 19 | ||
50 | ########## HISTOGRAM ########## | 20 | # ======== HISTOGRAM ======= # |
51 | 21 | ||
52 | ban_durations = [ban.get_duration() for ban in ban_list] | 22 | ban_durations = ban_dict_list.get_durations() |
53 | (ban_durations_bars, bins) = np.histogram(ban_durations, bins=[round(365/12*x) for x in range(1, 50+2)]) | 23 | (ban_durations_bars, bins) = np.histogram(ban_durations, bins=[round(365/12*x) for x in range(1, 50+2)]) |
54 | 24 | ||
55 | print("[INFO]", "Generating histogram") | 25 | print("[INFO]", "Generating histogram") |
@@ -60,13 +30,10 @@ bar_chart.add("Number of active bans", ban_durations_bars) | |||
60 | bar_chart.render_to_file(HIST_FILE) | 30 | bar_chart.render_to_file(HIST_FILE) |
61 | print("[INFO]", "Histogram generation complete") | 31 | print("[INFO]", "Histogram generation complete") |
62 | 32 | ||
63 | ########## NB BAN MAP ########## | 33 | # ======= NB BAN MAP ======= # |
64 | |||
def count_by_country(ban_list):
    """Count the bans of ``ban_list`` per country code.

    Returns a ``Counter`` keyed by the value of ``get_country_code()``.
    """
    return Counter(ban.get_country_code() for ban in ban_list)
68 | 34 | ||
69 | nb_bans_by_country = count_by_country(ban_list) | 35 | country_ban_list = ban_dict_list.get_countries() |
36 | nb_bans_by_country = Counter(country_ban_list) | ||
70 | 37 | ||
71 | print("[INFO]", "Generating ban map") | 38 | print("[INFO]", "Generating ban map") |
72 | worldmap_chart = pygal.Worldmap(legend_at_bottom=True) | 39 | worldmap_chart = pygal.Worldmap(legend_at_bottom=True) |
@@ -76,35 +43,9 @@ worldmap_chart.render_to_file(BAN_MAP_FILE) | |||
76 | print("[INFO]", "Ban map generation complete") | 43 | print("[INFO]", "Ban map generation complete") |
77 | 44 | ||
78 | 45 | ||
79 | ########## BAN DURATION MAP ########## | 46 | # ======= BAN DURATION MAP ======= # |
80 | |||
def group_by_country(ban_list):
    """Bucket the bans of ``ban_list`` by country code.

    Returns a dict mapping each value of ``get_country_code()`` to the list
    of bans that reported it, in encounter order.
    """
    grouped = {}
    for ban in ban_list:
        grouped.setdefault(ban.get_country_code(), []).append(ban)
    return grouped
93 | 47 | ||
94 | 48 | average_ban_duration_by_country = ban_dict_list.average_ban_by_country() | |
def calc_average_ban_by_country(ban_by_country_dict):
    """Compute the mean ban duration for every country.

    ``ban_by_country_dict`` maps a country code to a list of bans; the
    result maps the same codes to ``np.mean`` of their ``get_duration()``
    values.
    """
    return {
        country: np.mean([ban.get_duration() for ban in bans])
        for country, bans in ban_by_country_dict.items()
    }
103 | |||
104 | ban_duration_by_country = group_by_country(ban_list) | ||
105 | average_ban_duration_by_country = calc_average_ban_by_country(ban_duration_by_country) | ||
106 | |||