summaryrefslogtreecommitdiff
path: root/bandict
diff options
context:
space:
mode:
Diffstat (limited to 'bandict')
-rw-r--r--bandict/__init__.py91
1 files changed, 91 insertions, 0 deletions
diff --git a/bandict/__init__.py b/bandict/__init__.py
new file mode 100644
index 0000000..389ae0e
--- /dev/null
+++ b/bandict/__init__.py
@@ -0,0 +1,91 @@
1import urllib.request
2import json
3import numpy as np
4from banapedia.wapi.WikipediaQuery import BlockQuery
5from banapedia.Ban import *
6
7
class BanList:
    """Collection of bans loaded from a JSON file or fetched from the wiki API.

    Attributes:
        dict_list: the raw ban records (one dict per ban, each with at least
            the ``"user"``, ``"timestamp"`` and ``"expiry"`` keys).
        ban_list: ``Ban`` objects built from ``dict_list``, in the same order.
    """

    def __init__(self, data_file, samples=30000, samples_by_query=500,
                 from_internet=False):
        """Load the ban records and build the matching ``Ban`` objects.

        Args:
            data_file: path of a JSON file holding the list of ban records.
                Only read when ``from_internet`` is false.
            samples: total number of bans to fetch when ``from_internet``
                is true.
            samples_by_query: maximum number of bans requested per query
                when ``from_internet`` is true.
            from_internet: fetch the records through ``BlockQuery`` instead
                of reading ``data_file``.
        """
        if from_internet:
            # Honour the system proxy settings. install_opener() makes this
            # opener the process-wide default for urllib.request.urlopen().
            proxy = urllib.request.ProxyHandler(urllib.request.getproxies())
            opener = urllib.request.build_opener(proxy)
            urllib.request.install_opener(opener)
            self.dict_list = self.fetch_multipart(samples, samples_by_query)
        else:
            with open(data_file, "r", encoding="utf-8") as ban_dict_file:
                self.dict_list = json.load(ban_dict_file)

        # `datetime` and `ISO_TIMESTAMP` are provided by the star import
        # from banapedia.Ban at the top of the file.
        self.ban_list = [
            Ban(
                ban_dict["user"],
                datetime.strptime(ban_dict["timestamp"], ISO_TIMESTAMP),
                datetime.strptime(ban_dict["expiry"], ISO_TIMESTAMP),
            )
            for ban_dict in self.dict_list
        ]

    def fetch_multipart(self, n, query_limit):
        """Fetch up to ``n`` ban records, ``query_limit`` at most per query.

        Stops early when the API reports no continuation token (no further
        results) or returns an empty batch, so fewer than ``n`` records may
        be returned.

        Args:
            n: total number of ban records wanted.
            query_limit: maximum number of records requested per query;
                must be at least 1.

        Returns:
            The list of raw ban dicts, in the order they were received.

        Raises:
            ValueError: if ``query_limit`` is lower than 1 (the loop could
                never make progress).
        """
        if query_limit < 1:
            raise ValueError("query_limit must be at least 1")

        ban_dict_list = []
        n_fetched = 0
        continue_token = None

        print("[INFO]", "Fetching %d bans" % n)
        while n_fetched < n:
            to_fetch = min(query_limit, n - n_fetched)
            query = BlockQuery(
                bkprop=["user", "timestamp", "expiry"],
                bkshow=["temp", "ip"],
                limit=to_fetch,
                continue_token=continue_token,
            )
            results = query.fetch_result()
            blocks = results["query"]["blocks"]
            ban_dict_list.extend(blocks)
            # Count what was actually received, not what was requested.
            n_fetched += len(blocks)
            print("[INFO]", "Fetched %d over %d bans" % (n_fetched, n))

            # The continuation entry is absent on the last page of results.
            continue_token = (
                results.get("query-continue", {})
                .get("blocks", {})
                .get("bkcontinue")
            )
            if continue_token is None or not blocks:
                break

        print("[INFO]", "Bans fetching complete")
        return ban_dict_list

    def write_to_file(self, outfile):
        """Dump the raw ban records to ``outfile`` as tab-indented JSON."""
        with open(outfile, "w", encoding="utf-8") as ban_dict_file:
            json.dump(self.dict_list, ban_dict_file, indent="\t")

    def get_durations(self):
        """Return the duration of every ban, in ``ban_list`` order."""
        return [ban.get_duration() for ban in self.ban_list]

    def get_countries(self):
        """Return the country code of every ban, in ``ban_list`` order."""
        return [ban.get_country_code() for ban in self.ban_list]

    def __iter__(self):
        """Iterate over the raw ban records (``dict_list``)."""
        return iter(self.dict_list)

    def items(self):
        """Delegate to ``dict_list.items()``.

        NOTE(review): ``__init__`` indexes each element of ``dict_list`` by
        key, which implies ``dict_list`` is a list; a list has no
        ``items()``, so this call raises ``AttributeError`` in that case.
        Looks like a leftover from a dict-based design -- confirm the
        intended contract before relying on it.
        """
        return self.dict_list.items()

    def by_country(self):
        """Group the ``Ban`` objects by country code.

        Returns:
            A dict mapping each country code to the list of bans reporting
            that code, preserving ``ban_list`` order within each group.
        """
        bans_by_country = {}
        for ban in self.ban_list:
            bans_by_country.setdefault(ban.get_country_code(), []).append(ban)
        return bans_by_country

    def average_ban_by_country(self):
        """Return the mean ban duration per country, divided by 30.

        The division by 30 converts to months, presumably from a duration
        expressed in days -- TODO confirm the unit of ``Ban.get_duration()``.

        Returns:
            A dict mapping each country code to its scaled mean duration.
        """
        return {
            country: np.mean([ban.get_duration() for ban in bans]) / 30
            for country, bans in self.by_country().items()
        }