summary | refs | log | tree | commit | diff
path: root/wikibania
diff options
context:
space:
mode:
Diffstat (limited to 'wikibania')
-rw-r--r-- wikibania/__init__.py 1
-rw-r--r-- wikibania/api/Query.py 24
-rw-r--r-- wikibania/api/__init__.py 1
-rw-r--r-- wikibania/ban/Ban.py 37
-rw-r--r-- wikibania/ban/BanDB.py 50
-rw-r--r-- wikibania/ban/BanDBWrapper.py 25
-rw-r--r-- wikibania/ban/__init__.py 1
-rw-r--r-- wikibania/wapi/WikipediaQuery.py 48
-rw-r--r-- wikibania/wapi/__init__.py 1
9 files changed, 188 insertions, 0 deletions
diff --git a/wikibania/__init__.py b/wikibania/__init__.py
new file mode 100644
index 0000000..792d600
--- /dev/null
+++ b/wikibania/__init__.py
@@ -0,0 +1 @@
#
diff --git a/wikibania/api/Query.py b/wikibania/api/Query.py
new file mode 100644
index 0000000..0928478
--- /dev/null
+++ b/wikibania/api/Query.py
@@ -0,0 +1,24 @@
1import urllib.parse
2import urllib.request
3import json
4
5
class Query:
    """Send a set of parameters to a URL as a POST body and return the reply.

    The request settings are kept as plain public attributes
    (``base_url``, ``params``, ``encoding``) so they can be adjusted between
    calls.
    """

    def __init__(self, base_url="", params=None, encoding="utf8"):
        """Store the request settings.

        Args:
            base_url: URL the request is sent to.
            params: Mapping of form parameters. A fresh empty dict is used
                when omitted, so instances never share a default.
            encoding: Codec used both to encode the request body and to
                decode the response.
        """
        if params is None:
            params = {}

        self.base_url = base_url
        self.params = params
        self.encoding = encoding

    def fetch_raw_result(self):
        """Perform the request and return the response body as text.

        Returns:
            The response body decoded with ``self.encoding``.

        Raises:
            Whatever ``urllib.request.urlopen`` raises for an unreachable or
            failing URL (for example ``urllib.error.URLError``).
        """
        post_query = urllib.parse.urlencode(self.params).encode(self.encoding)
        # The response object wraps an open connection: close it
        # deterministically instead of waiting for garbage collection.
        with urllib.request.urlopen(self.base_url, post_query) as document:
            return document.read().decode(self.encoding)
20
21
class JSONQuery(Query):
    """A ``Query`` whose response body is expected to be a JSON document."""

    def fetch_result(self):
        """Fetch the raw response text and return it parsed as JSON."""
        raw_document = self.fetch_raw_result()
        return json.loads(raw_document)
diff --git a/wikibania/api/__init__.py b/wikibania/api/__init__.py
new file mode 100644
index 0000000..792d600
--- /dev/null
+++ b/wikibania/api/__init__.py
@@ -0,0 +1 @@
#
diff --git a/wikibania/ban/Ban.py b/wikibania/ban/Ban.py
new file mode 100644
index 0000000..e06ca89
--- /dev/null
+++ b/wikibania/ban/Ban.py
@@ -0,0 +1,37 @@
1from datetime import datetime
2
3import pygeoip
4
5
# Default strftime/strptime pattern for the "timestamp" and "expiry" fields.
ISO_TIMESTAMP = "%Y-%m-%dT%H:%M:%SZ"


class Ban:
    """A single ban: who was banned, when it started and when it expires."""

    def __init__(self, geoip_looker, user=None, timestamp=None, expiry=None, timestamp_format=ISO_TIMESTAMP):
        """Create a ban record.

        Args:
            geoip_looker: Object exposing ``country_code_by_addr(addr)``;
                used by ``lookup_country_code``.
            user: Identifier of the banned user; it is handed to the GeoIP
                lookup as an address.
            timestamp: ``datetime`` at which the ban started.
            expiry: ``datetime`` at which the ban ends.
            timestamp_format: strftime/strptime pattern used by ``items``
                and ``hydrate``.
        """
        self.geoip_looker = geoip_looker
        self.timestamp_format = timestamp_format

        self.user = user
        self.timestamp = timestamp
        self.expiry = expiry

    def items(self):
        """Return the ban as a dict of plain strings.

        ``timestamp`` and ``expiry`` must already be set (they are formatted
        with ``strftime``); calling this on an empty ban raises
        ``AttributeError``.
        """
        # Use the configured format rather than the module default so that
        # items() and hydrate() honour the constructor argument.
        return {
            "user": self.user,
            "timestamp": self.timestamp.strftime(self.timestamp_format),
            "expiry": self.expiry.strftime(self.timestamp_format),
        }

    def hydrate(self, ban_dict):
        """Fill this ban from a dict with "user", "timestamp" and "expiry" keys.

        Raises:
            KeyError: If one of the three keys is missing.
            ValueError: If a date string does not match ``timestamp_format``.
        """
        self.user = ban_dict["user"]
        self.timestamp = datetime.strptime(ban_dict["timestamp"], self.timestamp_format)
        self.expiry = datetime.strptime(ban_dict["expiry"], self.timestamp_format)

    def calc_duration(self):
        """Return the ban length in whole days (partial days are dropped)."""
        return (self.expiry - self.timestamp).days

    def lookup_country_code(self):
        """Return the lower-cased country code for ``user``.

        Returns the literal ``"UNKNOWN"`` when the lookup raises
        ``pygeoip.GeoIPError``.
        """
        try:
            return self.geoip_looker.country_code_by_addr(self.user).lower()
        except pygeoip.GeoIPError:
            return "UNKNOWN"
diff --git a/wikibania/ban/BanDB.py b/wikibania/ban/BanDB.py
new file mode 100644
index 0000000..e83aa3c
--- /dev/null
+++ b/wikibania/ban/BanDB.py
@@ -0,0 +1,50 @@
1import json
2
3from wikibania.ban.Ban import Ban
4from wikibania.wapi.WikipediaQuery import BlockQuery
5
6
class BanDB:
    """In-memory collection of ``Ban`` objects with JSON and API loaders."""

    def __init__(self, geoip_looker):
        """Create an empty database.

        Args:
            geoip_looker: Passed to every ``Ban`` created by ``load``.
        """
        self.geoip_looker = geoip_looker
        self.bans = []

    def list(self):
        """Return the internal list of bans (not a copy)."""
        return self.bans

    def load(self, ban_list):
        """Append one ``Ban`` per entry of ``ban_list``.

        Each entry is a dict accepted by ``Ban.hydrate``.
        """
        for entry in ban_list:
            ban = Ban(self.geoip_looker)
            ban.hydrate(entry)
            self.bans.append(ban)

    def load_file(self, file_name):
        """Append the bans stored as a JSON list in ``file_name``."""
        with open(file_name, "r", encoding="utf-8") as file:
            entries = json.load(file)
        self.load(entries)

    def dump(self):
        """Return every ban as a plain dict (see ``Ban.items``)."""
        return [ban.items() for ban in self.bans]

    def dump_file(self, file_name):
        """Write every ban to ``file_name`` as a JSON list."""
        ban_list = self.dump()
        with open(file_name, "w", encoding="utf-8") as file:
            json.dump(ban_list, file)

    def fetch(self, nb_samples, query_limit=500, continue_token=None):
        """Download up to ``nb_samples`` bans through ``BlockQuery``.

        Requests are issued in batches of at most ``query_limit`` entries
        until enough samples were requested or the API reports that nothing
        is left. Nothing is requested when ``nb_samples`` is not positive.

        Args:
            nb_samples: Total number of entries to request.
            query_limit: Maximum number of entries per request.
            continue_token: Continuation token to resume from, if any.

        Raises:
            ValueError: If ``query_limit`` is smaller than 1 (no batch could
                ever make progress).
        """
        if query_limit < 1:
            raise ValueError("query_limit must be at least 1")

        remaining = nb_samples
        # Iterate instead of recursing: one stack frame per batch would hit
        # the recursion limit for large sample counts.
        while remaining > 0:
            batch_size = min(remaining, query_limit)

            query = BlockQuery(
                properties=["user", "timestamp", "expiry"],
                show=["temp", "ip"],
                limit=batch_size,
                continue_token=continue_token,
            )
            results = query.fetch_result()
            self.load(results["query"]["blocks"])

            remaining -= batch_size
            if remaining <= 0:
                break

            continue_token = self._extract_continue_token(results)
            if continue_token is None:
                # The API has no further entries; stop instead of failing.
                break

    @staticmethod
    def _extract_continue_token(results):
        """Return the ``bkcontinue`` token of an API response, or ``None``.

        Both the top-level ``"continue"`` object and the older
        ``"query-continue"`` -> ``"blocks"`` object are understood.
        NOTE(review): only ``bkcontinue`` is forwarded because ``BlockQuery``
        accepts a single token; confirm that is enough for this endpoint.
        """
        current = results.get("continue") or {}
        if "bkcontinue" in current:
            return current["bkcontinue"]

        legacy = (results.get("query-continue") or {}).get("blocks") or {}
        return legacy.get("bkcontinue")
diff --git a/wikibania/ban/BanDBWrapper.py b/wikibania/ban/BanDBWrapper.py
new file mode 100644
index 0000000..8f396b0
--- /dev/null
+++ b/wikibania/ban/BanDBWrapper.py
@@ -0,0 +1,25 @@
1from collections import defaultdict
2
3import numpy
4
5
class BanDBWrapper:
    """Statistics helpers computed over the bans of a ban database."""

    def __init__(self, ban_db):
        """Wrap ``ban_db``, an object whose ``list()`` returns the bans."""
        self.ban_db = ban_db

    def get_all_durations(self):
        """Return the duration of every ban, in database order."""
        durations = []
        for ban in self.ban_db.list():
            durations.append(ban.calc_duration())
        return durations

    def get_all_countries(self):
        """Return the country code of every ban, in database order."""
        countries = []
        for ban in self.ban_db.list():
            countries.append(ban.lookup_country_code())
        return countries

    def get_durations_by_country(self):
        """Return one ``(country code, duration)`` tuple per ban."""
        pairs = []
        for ban in self.ban_db.list():
            country = ban.lookup_country_code()
            duration = ban.calc_duration()
            pairs.append((country, duration))
        return pairs

    def calc_average_duration_by_country(self):
        """Return a dict mapping each country code to its mean ban duration."""
        grouped = {}
        for country, duration in self.get_durations_by_country():
            grouped.setdefault(country, []).append(duration)

        averages = {}
        for country, durations in grouped.items():
            averages[country] = numpy.mean(durations)
        return averages
diff --git a/wikibania/ban/__init__.py b/wikibania/ban/__init__.py
new file mode 100644
index 0000000..792d600
--- /dev/null
+++ b/wikibania/ban/__init__.py
@@ -0,0 +1 @@
#
diff --git a/wikibania/wapi/WikipediaQuery.py b/wikibania/wapi/WikipediaQuery.py
new file mode 100644
index 0000000..3f544b6
--- /dev/null
+++ b/wikibania/wapi/WikipediaQuery.py
@@ -0,0 +1,48 @@
1from ..api.Query import JSONQuery
2
# Endpoint every WikipediaQuery is sent to.
WIKIPEDIA_QUERY_BASE_URL = "https://en.wikipedia.org/w/api.php"
# Separator used to join multi-valued parameters into a single string.
LIST_SEPARATOR = "|"


class WikipediaQuery(JSONQuery):
    """JSON query against the Wikipedia API endpoint with ``action=query``."""

    def __init__(self, params=None):
        """Build the query.

        Args:
            params: Extra request parameters. The mapping is copied, so the
                caller's dict is left untouched. ``"action"`` and
                ``"format"`` are always forced to ``"query"`` and ``"json"``.
        """
        merged = {} if params is None else dict(params)
        merged.update({
            "action": "query",
            "format": "json",
        })
        super().__init__(base_url=WIKIPEDIA_QUERY_BASE_URL, params=merged)
17
18
class ListQuery(WikipediaQuery):
    """Wikipedia query that sets the ``list`` request parameter."""

    def __init__(self, list_name, params=None):
        """Build the query.

        Args:
            list_name: Value sent as the ``"list"`` parameter.
            params: Extra request parameters. The mapping is copied, so the
                caller's dict is left untouched; a ``"list"`` key in it is
                overridden by ``list_name``.
        """
        merged = {} if params is None else dict(params)
        merged["list"] = list_name
        super().__init__(merged)
28
29
class BlockQuery(ListQuery):
    """List query for the ``"blocks"`` list."""

    def __init__(self, properties=None, show=None, sort="newer", limit=500, continue_token=None):
        """Build the query.

        Args:
            properties: Values joined with ``LIST_SEPARATOR`` and sent as
                ``"bkprop"``; treated as empty when omitted.
            show: Values joined with ``LIST_SEPARATOR`` and sent as
                ``"bkshow"``; treated as empty when omitted.
            sort: Value sent as ``"bkdir"``.
            limit: Value sent as ``"bklimit"``.
            continue_token: Sent as ``"bkcontinue"`` only when not ``None``.
        """
        wanted_properties = [] if properties is None else properties
        wanted_filters = [] if show is None else show

        params = dict(
            bkprop=LIST_SEPARATOR.join(wanted_properties),
            bkshow=LIST_SEPARATOR.join(wanted_filters),
            bkdir=sort,
            bklimit=limit,
        )
        if continue_token is not None:
            params["bkcontinue"] = continue_token

        super().__init__("blocks", params=params)
diff --git a/wikibania/wapi/__init__.py b/wikibania/wapi/__init__.py
new file mode 100644
index 0000000..792d600
--- /dev/null
+++ b/wikibania/wapi/__init__.py
@@ -0,0 +1 @@
#