diff options
author | Minijackson | 2014-10-24 10:20:14 +0200 |
---|---|---|
committer | Minijackson | 2014-10-24 10:20:14 +0200 |
commit | a95864928cd74df172f172d7c18b4d89e82548e1 (patch) | |
tree | 7a2632e9f5172f36187995d1521228bb5e4e45d8 /bandict | |
parent | a63aafe88b55c1c500646f5c93e9ff16780d37ca (diff) | |
download | wikistats-a95864928cd74df172f172d7c18b4d89e82548e1.tar.gz |
Moving to object type ban dict
Diffstat (limited to 'bandict')
-rw-r--r-- | bandict/__init__.py | 91 |
1 files changed, 91 insertions, 0 deletions
diff --git a/bandict/__init__.py b/bandict/__init__.py new file mode 100644 index 0000000..389ae0e --- /dev/null +++ b/bandict/__init__.py | |||
@@ -0,0 +1,91 @@ | |||
1 | import urllib.request | ||
2 | import json | ||
3 | import numpy as np | ||
4 | from banapedia.wapi.WikipediaQuery import BlockQuery | ||
5 | from banapedia.Ban import * | ||
6 | |||
7 | |||
class BanList:
    """A collection of bans, loaded from a JSON file or fetched online.

    Two parallel views are kept:

    * ``dict_list`` -- the raw ban dictionaries (keys ``"user"``,
      ``"timestamp"`` and ``"expiry"`` are read by ``__init__``).
    * ``ban_list`` -- one ``Ban`` object built from each raw dictionary.
    """

    def __init__(self, data_file, samples=30000, samples_by_query=500,
                 from_internet=False):
        """Load the bans and build the matching ``Ban`` objects.

        Args:
            data_file: Path of a JSON file holding a list of ban
                dictionaries. Only read when ``from_internet`` is false.
            samples: Number of bans to fetch when ``from_internet`` is true.
            samples_by_query: Maximum number of bans requested per query
                when ``from_internet`` is true.
            from_internet: Fetch the bans through ``BlockQuery`` instead of
                reading ``data_file``.
        """
        if from_internet:
            # Route requests through the system-configured proxies.
            # install_opener() replaces the process-wide default opener.
            proxy = urllib.request.ProxyHandler(urllib.request.getproxies())
            opener = urllib.request.build_opener(proxy)
            urllib.request.install_opener(opener)
            self.dict_list = self.fetch_multipart(samples, samples_by_query)
        else:
            with open(data_file, "r", encoding="utf-8") as ban_dict_file:
                self.dict_list = json.load(ban_dict_file)

        self.ban_list = [
            Ban(
                ban_dict["user"],
                datetime.strptime(ban_dict["timestamp"], ISO_TIMESTAMP),
                datetime.strptime(ban_dict["expiry"], ISO_TIMESTAMP),
            )
            for ban_dict in self.dict_list
        ]

    def fetch_multipart(self, n, query_limit):
        """Fetch up to ``n`` ban dictionaries, ``query_limit`` at a time.

        Queries are chained with the continuation token returned by the
        previous one. Fetching stops early when a response carries no
        continuation token, i.e. when there is nothing left to fetch, so
        fewer than ``n`` bans may be returned.

        Args:
            n: Total number of bans wanted.
            query_limit: Maximum number of bans requested per query.

        Returns:
            The list of raw ban dictionaries, in the order received.
        """
        ban_dict_list = []
        n_fetched = 0
        continue_token = None

        print("[INFO]", "Fetching %d bans" % n)
        while n_fetched < n:
            to_fetch = min(query_limit, n - n_fetched)
            query = BlockQuery(
                bkprop=["user", "timestamp", "expiry"],
                bkshow=["temp", "ip"],
                limit=to_fetch,
                continue_token=continue_token,
            )
            results = query.fetch_result()
            blocks = results["query"]["blocks"]
            ban_dict_list.extend(blocks)
            # Count what was actually received: a query may return fewer
            # entries than requested.
            n_fetched += len(blocks)
            print("[INFO]", "Fetched %d over %d bans" % (n_fetched, n))

            # The continuation element is absent from the last page; without
            # a token there is nothing more to ask for.
            continue_token = (
                results.get("query-continue", {})
                .get("blocks", {})
                .get("bkcontinue")
            )
            if continue_token is None:
                break

        print("[INFO]", "Bans fetching complete")
        return ban_dict_list

    def write_to_file(self, outfile):
        """Dump the raw ban dictionaries to ``outfile`` as tab-indented JSON."""
        with open(outfile, "w", encoding="utf-8") as ban_dict_file:
            json.dump(self.dict_list, ban_dict_file, indent="\t")

    def get_durations(self):
        """Return ``get_duration()`` of every ban, in list order."""
        return [ban.get_duration() for ban in self.ban_list]

    def get_countries(self):
        """Return ``get_country_code()`` of every ban, in list order."""
        return [ban.get_country_code() for ban in self.ban_list]

    def __iter__(self):
        """Iterate over the raw ban dictionaries (not the ``Ban`` objects)."""
        return iter(self.dict_list)

    def items(self):
        """Delegate to ``self.dict_list.items()``.

        NOTE(review): ``__init__`` always treats ``dict_list`` as a list of
        dictionaries, and a list has no ``items()`` method, so this raises
        ``AttributeError`` unless ``dict_list`` has been replaced by a
        mapping. Intended behaviour to be confirmed.
        """
        return self.dict_list.items()

    def by_country(self):
        """Group the ``Ban`` objects by country code.

        Returns:
            A dict mapping each country code to the list of bans whose
            ``get_country_code()`` returned it, in list order.
        """
        bans_by_country = {}
        for ban in self.ban_list:
            bans_by_country.setdefault(ban.get_country_code(), []).append(ban)
        return bans_by_country

    def average_ban_by_country(self):
        """Return the mean ban duration per country, divided by 30.

        The division converts the mean to months, presumably from a
        duration expressed in days -- the unit of ``Ban.get_duration()`` is
        not visible here; TODO confirm.

        Returns:
            A dict mapping each country code to its scaled mean duration.
        """
        return {
            country: np.mean([ban.get_duration() for ban in bans]) / 30
            for country, bans in self.by_country().items()
        }