"""Wikibania: fetch block entries from the Wikipedia API and derive ban statistics.

Reconstructed as a single module from the patch
"Very cleaner, much class, such readable, wow"
(commit bdf9099df8c2a4636b0ad0e710b73330877eef37, Pacien TRAN-GIRARD,
Fri, 24 Oct 2014 19:59:05 +0200), which created the following files:

    wikibania/__init__.py            (comment only)
    wikibania/api/__init__.py        (comment only)
    wikibania/api/Query.py           -> Query, JSONQuery
    wikibania/ban/__init__.py        (comment only)
    wikibania/ban/Ban.py             -> ISO_TIMESTAMP, Ban
    wikibania/ban/BanDB.py           -> BanDB
    wikibania/ban/BanDBWrapper.py    -> BanDBWrapper
    wikibania/wapi/__init__.py       (comment only)
    wikibania/wapi/WikipediaQuery.py -> WikipediaQuery, ListQuery, BlockQuery
"""

import json
import urllib.parse
import urllib.request
from collections import defaultdict
from datetime import datetime

import numpy

try:
    import pygeoip
except ImportError:
    # The GeoIP looker is injected into Ban, so pygeoip itself is only needed
    # for its exception type. Without the package there is nothing to catch.
    _GEOIP_ERRORS = ()
else:
    _GEOIP_ERRORS = (pygeoip.GeoIPError,)


ISO_TIMESTAMP = "%Y-%m-%dT%H:%M:%SZ"
WIKIPEDIA_QUERY_BASE_URL = "https://en.wikipedia.org/w/api.php"
LIST_SEPARATOR = "|"
UNKNOWN_COUNTRY = "UNKNOWN"


# --------------------------------------------------------------------------
# Generic HTTP queries (originally wikibania/api/Query.py)
# --------------------------------------------------------------------------

class Query:
    """A POST request sent to ``base_url`` with url-encoded ``params``."""

    def __init__(self, base_url="", params=None, encoding="utf8"):
        """Store the target URL, the POST parameters and the text encoding.

        :param base_url: URL the request is sent to.
        :param params: mapping of POST parameters; ``None`` means no parameter.
        :param encoding: encoding used for both the request body and the
            decoding of the response.
        """
        self.base_url = base_url
        self.params = {} if params is None else params
        self.encoding = encoding

    def fetch_raw_result(self):
        """Send the request and return the response body as text.

        :raises urllib.error.URLError: propagated from ``urlopen``.
        """
        post_query = urllib.parse.urlencode(self.params).encode(self.encoding)
        # The context manager guarantees the connection is released even if
        # reading or decoding fails.
        with urllib.request.urlopen(self.base_url, post_query) as document:
            return document.read().decode(self.encoding)


class JSONQuery(Query):
    """A :class:`Query` whose response body is a JSON document."""

    def fetch_result(self):
        """Send the request and return the decoded JSON response."""
        return json.loads(self.fetch_raw_result())


# --------------------------------------------------------------------------
# Wikipedia API queries (originally wikibania/wapi/WikipediaQuery.py)
# --------------------------------------------------------------------------

class WikipediaQuery(JSONQuery):
    """``action=query`` request against the English Wikipedia API, in JSON."""

    def __init__(self, params=None):
        """Build the query.

        :param params: extra request parameters. The mapping is copied, so
            the caller's object is never modified; ``action`` and ``format``
            are always forced to ``query`` and ``json``.
        """
        query_params = dict(params or {})
        query_params.update({
            "action": "query",
            "format": "json",
        })
        super().__init__(base_url=WIKIPEDIA_QUERY_BASE_URL, params=query_params)


class ListQuery(WikipediaQuery):
    """A :class:`WikipediaQuery` with the ``list`` parameter set."""

    def __init__(self, list_name, params=None):
        """Build the query.

        :param list_name: value sent as the ``list`` parameter.
        :param params: extra request parameters (copied, never mutated).
        """
        list_params = dict(params or {})
        list_params["list"] = list_name
        super().__init__(list_params)


class BlockQuery(ListQuery):
    """A :class:`ListQuery` on the ``blocks`` list."""

    def __init__(self, properties=None, show=None, sort="newer", limit=500, continue_token=None):
        """Build the query.

        :param properties: names joined with ``|`` and sent as ``bkprop``.
        :param show: names joined with ``|`` and sent as ``bkshow``.
        :param sort: value sent as ``bkdir``.
        :param limit: value sent as ``bklimit``.
        :param continue_token: when not ``None``, sent as ``bkcontinue`` to
            resume a previous listing.
        """
        params = {
            "bkprop": LIST_SEPARATOR.join(properties or []),
            "bkshow": LIST_SEPARATOR.join(show or []),
            "bkdir": sort,
            "bklimit": limit,
        }

        if continue_token is not None:
            params["bkcontinue"] = continue_token

        super().__init__("blocks", params=params)


# --------------------------------------------------------------------------
# Ban model (originally wikibania/ban/Ban.py)
# --------------------------------------------------------------------------

class Ban:
    """A single block entry: who was blocked, from when, until when."""

    def __init__(self, geoip_looker, user=None, timestamp=None, expiry=None, timestamp_format=ISO_TIMESTAMP):
        """Create a ban.

        :param geoip_looker: object exposing ``country_code_by_addr(addr)``.
        :param user: value passed to the looker when resolving the country.
        :param timestamp: ``datetime`` at which the ban starts.
        :param expiry: ``datetime`` at which the ban ends.
        :param timestamp_format: ``strftime``/``strptime`` pattern used by
            :meth:`items` and :meth:`hydrate`.
        """
        self.geoip_looker = geoip_looker
        self.timestamp_format = timestamp_format

        self.user = user
        self.timestamp = timestamp
        self.expiry = expiry

    def items(self):
        """Return the ban as a dict with both dates formatted as strings."""
        # Use the instance's format (not the module default) so that
        # items() and hydrate() are inverse of each other.
        return {
            "user": self.user,
            "timestamp": self.timestamp.strftime(self.timestamp_format),
            "expiry": self.expiry.strftime(self.timestamp_format),
        }

    def hydrate(self, ban_dict):
        """Fill the ban from a dict shaped like the one :meth:`items` returns.

        :raises KeyError: if ``user``, ``timestamp`` or ``expiry`` is missing.
        :raises ValueError: if a date does not match ``timestamp_format``.
        """
        self.user = ban_dict["user"]
        self.timestamp = datetime.strptime(ban_dict["timestamp"], self.timestamp_format)
        self.expiry = datetime.strptime(ban_dict["expiry"], self.timestamp_format)

    def calc_duration(self):
        """Return the number of whole days between start and expiry."""
        return (self.expiry - self.timestamp).days

    def lookup_country_code(self):
        """Return the lower-cased country code of ``user``.

        ``"UNKNOWN"`` is returned when the looker raises a pygeoip error or
        yields no code at all (``None`` or an empty string).
        """
        try:
            country_code = self.geoip_looker.country_code_by_addr(self.user)
        except _GEOIP_ERRORS:
            return UNKNOWN_COUNTRY

        if not country_code:
            # A missing code is treated like a failed lookup rather than
            # crashing on None.lower() or producing an empty-string bucket.
            return UNKNOWN_COUNTRY
        return country_code.lower()


# --------------------------------------------------------------------------
# Ban collection (originally wikibania/ban/BanDB.py)
# --------------------------------------------------------------------------

class BanDB:
    """In-memory collection of :class:`Ban` objects."""

    def __init__(self, geoip_looker):
        """Create an empty collection.

        :param geoip_looker: looker handed to every :class:`Ban` created here.
        """
        self.geoip_looker = geoip_looker
        self.bans = []

    def list(self):
        """Return the internal list of bans (not a copy)."""
        return self.bans

    def load(self, ban_list):
        """Append one :class:`Ban` per dict in ``ban_list``."""
        for entry in ban_list:
            ban = Ban(self.geoip_looker)
            ban.hydrate(entry)
            self.bans.append(ban)

    def load_file(self, file_name):
        """Append the bans stored as a JSON list in ``file_name``."""
        with open(file_name, "r", encoding="utf-8") as file:
            entries = json.load(file)
        self.load(entries)

    def dump(self):
        """Return every ban as a dict (see :meth:`Ban.items`)."""
        return [ban.items() for ban in self.bans]

    def dump_file(self, file_name):
        """Write the bans to ``file_name`` as a JSON list."""
        ban_list = self.dump()
        with open(file_name, "w", encoding="utf-8") as file:
            json.dump(ban_list, file)

    def fetch(self, nb_samples, query_limit=500, continue_token=None):
        """Download up to ``nb_samples`` block entries and load them.

        Entries are requested page by page, at most ``query_limit`` per
        request. Fetching stops early, without error, when the API returns
        no entries or no continuation token.

        :param nb_samples: maximum number of entries to download.
        :param query_limit: maximum number of entries per request.
        :param continue_token: token from a previous response to resume from.
        """
        remaining = nb_samples

        # Iterative paging: no recursion depth tied to the number of pages.
        while remaining > 0:
            query = BlockQuery(
                properties=["user", "timestamp", "expiry"],
                show=["temp", "ip"],
                limit=min(remaining, query_limit),
                continue_token=continue_token,
            )
            results = query.fetch_result()

            entries = results["query"]["blocks"]
            self.load(entries)
            remaining -= len(entries)

            continue_token = self._extract_continue_token(results)
            if not entries or continue_token is None:
                break

    @staticmethod
    def _extract_continue_token(results):
        """Return the ``bkcontinue`` token of a response, or ``None``.

        The MediaWiki API documents two continuation layouts: a top-level
        ``"continue"`` object and the older ``"query-continue"`` object keyed
        by list name. Both are accepted.
        """
        modern = results.get("continue", {})
        legacy = results.get("query-continue", {}).get("blocks", {})
        return modern.get("bkcontinue", legacy.get("bkcontinue"))


# --------------------------------------------------------------------------
# Statistics (originally wikibania/ban/BanDBWrapper.py)
# --------------------------------------------------------------------------

class BanDBWrapper:
    """Aggregations computed over the bans of a :class:`BanDB`."""

    def __init__(self, ban_db):
        """:param ban_db: object whose ``list()`` returns the bans to analyse."""
        self.ban_db = ban_db

    def get_all_durations(self):
        """Return the duration in days of every ban, in storage order."""
        return [ban.calc_duration() for ban in self.ban_db.list()]

    def get_all_countries(self):
        """Return the country code of every ban, in storage order."""
        return [ban.lookup_country_code() for ban in self.ban_db.list()]

    def get_durations_by_country(self):
        """Return one ``(country_code, duration_in_days)`` pair per ban."""
        return [(ban.lookup_country_code(), ban.calc_duration()) for ban in self.ban_db.list()]

    def calc_average_duration_by_country(self):
        """Return a dict mapping each country code to its mean ban duration."""
        ban_durations_by_country = defaultdict(list)

        for country, ban_duration in self.get_durations_by_country():
            ban_durations_by_country[country].append(ban_duration)

        return {
            country: numpy.mean(ban_durations)
            for country, ban_durations in ban_durations_by_country.items()
        }