From bdf9099df8c2a4636b0ad0e710b73330877eef37 Mon Sep 17 00:00:00 2001 From: Pacien TRAN-GIRARD Date: Fri, 24 Oct 2014 19:59:05 +0200 Subject: Very cleaner, much class, such readable, wow --- banapedia/Ban.py | 36 -------------- banapedia/__init__.py | 1 - banapedia/api/Query.py | 23 --------- banapedia/api/__init__.py | 0 banapedia/wapi/WikipediaQuery.py | 42 ---------------- banapedia/wapi/__init__.py | 0 bandict/__init__.py | 91 ----------------------------------- main.py | 101 +++++++++++++++++++++++++++------------ rapport.md | 2 +- rapport.pdf | Bin 86060 -> 0 bytes sysproxy.py | 7 +++ wikibania/__init__.py | 1 + wikibania/api/Query.py | 24 ++++++++++ wikibania/api/__init__.py | 1 + wikibania/ban/Ban.py | 37 ++++++++++++++ wikibania/ban/BanDB.py | 50 +++++++++++++++++++ wikibania/ban/BanDBWrapper.py | 25 ++++++++++ wikibania/ban/__init__.py | 1 + wikibania/wapi/WikipediaQuery.py | 48 +++++++++++++++++++ wikibania/wapi/__init__.py | 1 + 20 files changed, 266 insertions(+), 225 deletions(-) delete mode 100644 banapedia/Ban.py delete mode 100644 banapedia/__init__.py delete mode 100644 banapedia/api/Query.py delete mode 100644 banapedia/api/__init__.py delete mode 100644 banapedia/wapi/WikipediaQuery.py delete mode 100644 banapedia/wapi/__init__.py delete mode 100644 bandict/__init__.py delete mode 100644 rapport.pdf create mode 100644 sysproxy.py create mode 100644 wikibania/__init__.py create mode 100644 wikibania/api/Query.py create mode 100644 wikibania/api/__init__.py create mode 100644 wikibania/ban/Ban.py create mode 100644 wikibania/ban/BanDB.py create mode 100644 wikibania/ban/BanDBWrapper.py create mode 100644 wikibania/ban/__init__.py create mode 100644 wikibania/wapi/WikipediaQuery.py create mode 100644 wikibania/wapi/__init__.py diff --git a/banapedia/Ban.py b/banapedia/Ban.py deleted file mode 100644 index 4714274..0000000 --- a/banapedia/Ban.py +++ /dev/null @@ -1,36 +0,0 @@ -from banapedia.wapi.WikipediaQuery import BlockQuery -from datetime import datetime -import pygeoip - -__author__ = 'pacien' - - -GEOIP_FILE = "/usr/share/GeoIP/GeoIP.dat" -geoip = pygeoip.GeoIP(GEOIP_FILE) - -ISO_TIMESTAMP = "%Y-%m-%dT%H:%M:%SZ" - - -class Ban: - def __init__(self, ip, start, end): - self.ip = ip - self.start = start - self.end = end - self.country_code = None - - def get_duration(self): - return (self.end - self.start).days - - def get_country_code(self): - if self.country_code is not None: - return self.country_code - - country_code = "" - - try: - country_code = geoip.country_code_by_addr(self.ip).lower() - except pygeoip.GeoIPError: - print("[ERROR]", "Could not determine country for ip", self.ip) - - self.country_code = country_code - return country_code diff --git a/banapedia/__init__.py b/banapedia/__init__.py deleted file mode 100644 index 8b13789..0000000 --- a/banapedia/__init__.py +++ /dev/null @@ -1 +0,0 @@ - diff --git a/banapedia/api/Query.py b/banapedia/api/Query.py deleted file mode 100644 index 7453df9..0000000 --- a/banapedia/api/Query.py +++ /dev/null @@ -1,23 +0,0 @@ -import urllib.parse -import urllib.request -import json - -__author__ = 'pacien' - - -class Query: - def __init__(self, base_url="", params={}, encoding="utf8"): - self.base_url = base_url - self.params = params - self.encoding = encoding - - def fetch_raw_result(self): - post_query = urllib.parse.urlencode(self.params) - post_query = post_query.encode(self.encoding) - document = urllib.request.urlopen(self.base_url, post_query) - return document.read().decode(self.encoding) - - -class JSONQuery(Query): - def fetch_result(self): - return json.loads(self.fetch_raw_result()) diff --git a/banapedia/api/__init__.py b/banapedia/api/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/banapedia/wapi/WikipediaQuery.py b/banapedia/wapi/WikipediaQuery.py deleted file mode 100644 index d3d2f94..0000000 --- a/banapedia/wapi/WikipediaQuery.py +++ /dev/null @@ -1,42 +0,0 @@ -from ..api.Query import JSONQuery - -__author__ = 'pacien' - -WIKIPEDIA_QUERY_BASE_URL = "https://en.wikipedia.org/w/api.php" -LIST_SEPARATOR = "|" -DEFAULT_BKPROP = ["id", "user", "userid", "by", "byid", "timestamp", "expiry", "reason", "range", "flags"] -DEFAULT_BKSHOW = ["account", "temp", "ip", "range"] - - -class WikipediaQuery(JSONQuery): - def __init__(self, params={}): - params.update({ - "action": "query", - "format": "json", - }) - JSONQuery.__init__(self, base_url=WIKIPEDIA_QUERY_BASE_URL, params=params) - - -class ListQuery(WikipediaQuery): - def __init__(self, list_name, params={}): - params.update({ - "list": list_name, - }) - WikipediaQuery.__init__(self, params) - - -class BlockQuery(ListQuery): - def __init__(self, bkprop=DEFAULT_BKPROP, bkshow=DEFAULT_BKSHOW, bkdir="newer", limit=500, continue_token=None): - params = { - "bkprop": LIST_SEPARATOR.join(bkprop), - "bkshow": LIST_SEPARATOR.join(bkshow), - "bkdir": bkdir, - "bklimit": limit, - } - - if continue_token is not None: - params.update({"bkcontinue": continue_token}) - - ListQuery.__init__(self, "blocks", params=params) - - diff --git a/banapedia/wapi/__init__.py b/banapedia/wapi/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/bandict/__init__.py b/bandict/__init__.py deleted file mode 100644 index 389ae0e..0000000 --- a/bandict/__init__.py +++ /dev/null @@ -1,91 +0,0 @@ -import urllib.request -import json -import numpy as np -from banapedia.wapi.WikipediaQuery import BlockQuery -from banapedia.Ban import * - - -class BanList(): - - def __init__(self, data_file, samples=30000, samples_by_query=500, - from_internet=False): - if from_internet: - proxy = urllib.request.ProxyHandler(urllib.request.getproxies()) - opener = urllib.request.build_opener(proxy) - urllib.request.install_opener(opener) - self.dict_list = self.fetch_multipart(samples, samples_by_query) - else: - with open(data_file, "r") as ban_dict_file: - self.dict_list = json.load(ban_dict_file) - self.ban_list = [] - for ban_dict in self.dict_list: - self.ban_list.append(Ban( - ban_dict["user"], - datetime.strptime(ban_dict["timestamp"], ISO_TIMESTAMP), - datetime.strptime(ban_dict["expiry"], ISO_TIMESTAMP), - )) - - def fetch_multipart(self, n, query_limit): - ban_dict_list = [] - n_fetched = 0 - continue_token = None - - print("[INFO]", "Fetching %d bans" % n) - while n_fetched < n: - to_fetch = min(query_limit, n - n_fetched) - query = BlockQuery( - bkprop=["user", "timestamp", "expiry"], - bkshow=["temp", "ip"], - limit=to_fetch, - continue_token=continue_token, - ) - results = query.fetch_result() - ban_dict_list.extend(results["query"]["blocks"]) - continue_token = results["query-continue"]["blocks"]["bkcontinue"] - n_fetched += to_fetch - print("[INFO]", "Fetched %d over %d bans" % (n_fetched, n)) - - print("[INFO]", "Bans fetching complete") - return ban_dict_list - - def write_to_file(self, outfile): - with open(outfile, "w") as ban_dict_file: - json.dump(self.dict_list, ban_dict_file, indent="\t") - - def get_durations(self): - return [ban.get_duration() for ban in self.ban_list] - - def get_countries(self): - return [ban.get_country_code() for ban in self.ban_list] - - def __iter__(self): - return self.dict_list.__iter__() - - def items(self): - return self.dict_list.items() - - def by_country(self): - ban_duration_by_country = {} - - for ban in self.ban_list: - country_code = ban.get_country_code() - - if country_code not in ban_duration_by_country.keys(): - ban_duration_by_country[country_code] = [] - - ban_duration_by_country[country_code].append(ban) - - return ban_duration_by_country - - def average_ban_by_country(self): - average_ban_duration_ctry = {} - - for country, bans in self.by_country().items(): - average = np.mean([ban.get_duration() for ban in bans]) - average_ban_duration_ctry[country] = average - - # In months - average_ban_duration_ctry = {country: duration/30 - for country, duration in - average_ban_duration_ctry.items()} - return average_ban_duration_ctry diff --git a/main.py b/main.py index 0d12a1e..da7c6ae 100644 --- a/main.py +++ b/main.py @@ -1,62 +1,101 @@ -from banapedia.Ban import * -import bandict from collections import Counter +import webbrowser + import pygal import numpy as np +import pygeoip + +from wikibania.ban.BanDB import BanDB +from wikibania.ban.BanDBWrapper import BanDBWrapper +import sysproxy + + +# PARAMS + +GEOIP_DB = "/usr/share/GeoIP/GeoIP.dat" BAN_MAP_FILE = "output/ban-map.svg" BAN_DURATION_MAP_FILE = "output/ban-duration-map.svg" HIST_FILE = "output/histogram.svg" +STATS_FILE = "output/stats.txt" + +BAN_DB_FILE = "resources/ban_list.json" + +FETCH_SAMPLES = 2000 +FETCH_DB = False +DUMP_DB = False +LOAD_DB = False + +OPEN_FILES = False + + +# SETUP -SAMPLES = 1000 +sysproxy.configure_system_proxy() +geoip_looker = pygeoip.GeoIP(GEOIP_DB) -BAN_FILE = "resources/ban_list.json" +ban_db = BanDB(geoip_looker) -ban_dict_list = bandict.BanList(BAN_FILE, samples=SAMPLES, from_internet=True) +if FETCH_DB: + ban_db.fetch(FETCH_SAMPLES) +if LOAD_DB: + ban_db.load_file(BAN_DB_FILE) +if DUMP_DB: + ban_db.dump_file(BAN_DB_FILE) -# ======== HISTOGRAM ======= # +ban_db_wrapper = BanDBWrapper(ban_db) -ban_durations = ban_dict_list.get_durations() -(ban_durations_bars, bins) = np.histogram(ban_durations, - bins=[round(365/12*x) - for x in range(1, 50+2) - ] - ) -print("[INFO]", "Generating histogram") +# HISTOGRAM + +ban_durations = ban_db_wrapper.get_all_durations() +duration_bins = [round(365 / 12 * x) for x in range(1, 50 + 2)] +(ban_durations_bars, bins) = np.histogram(ban_durations, bins=duration_bins) + bar_chart = pygal.Bar(legend_at_bottom=True) -bar_chart.title = "Active Wikipedia bans by duration (%d samples)" % SAMPLES -bar_chart.x_labels = map(str, range(1, len(ban_durations_bars)+1)) +bar_chart.title = "Active Wikipedia bans by duration (%d samples)" % len(ban_db.list()) +bar_chart.x_labels = map(str, range(1, len(ban_durations_bars) + 1)) bar_chart.add("Number of active bans", ban_durations_bars) bar_chart.render_to_file(HIST_FILE) -print("[INFO]", "Histogram generation complete") -# ======= NB BAN MAP ======= # +if OPEN_FILES: + webbrowser.open(HIST_FILE, 2) + + +# BAN DURATION STATS + +with open(STATS_FILE, "w") as stats_file: + stats_file.write("Mean: %.2f days\n" % np.mean(ban_durations)) + stats_file.write("Median: %.2f days\n" % np.median(ban_durations)) + stats_file.write("Deviation: %.2f\n" % np.std(ban_durations)) + stats_file.write("Variance: %.2f\n" % np.var(ban_durations)) -country_ban_list = ban_dict_list.get_countries() +if OPEN_FILES: + webbrowser.open(STATS_FILE, 2) + + +# NB BAN MAP + +country_ban_list = ban_db_wrapper.get_all_countries() nb_bans_by_country = Counter(country_ban_list) -print("[INFO]", "Generating ban map") worldmap_chart = pygal.Worldmap(legend_at_bottom=True) -worldmap_chart.title = "World active Wikipedia bans by country (%d samples)" % SAMPLES +worldmap_chart.title = "World active Wikipedia bans by country (%d samples)" % len(ban_db.list()) worldmap_chart.add("Active bans", nb_bans_by_country) worldmap_chart.render_to_file(BAN_MAP_FILE) -print("[INFO]", "Ban map generation complete") + +if OPEN_FILES: + webbrowser.open(BAN_MAP_FILE, 2) -# ======= BAN DURATION MAP ======= # +# BAN DURATION MAP -average_ban_duration_by_country = ban_dict_list.average_ban_by_country() +average_ban_duration_by_country = ban_db_wrapper.calc_average_duration_by_country() -print("[INFO]", "Generating ban duration map") worldmap_chart = pygal.Worldmap(legend_at_bottom=True) -worldmap_chart.title = "Average Wikipedia ban duration by country (%d samples)" % SAMPLES +worldmap_chart.title = "Average Wikipedia ban duration by country (%d samples)" % len(ban_db.list()) worldmap_chart.add("Average ban duration (months)", average_ban_duration_by_country) worldmap_chart.render_to_file(BAN_DURATION_MAP_FILE) -print("[INFO]", "Ban duration map generation complete") -print("Some additional stats about ban durations:") -print(" Mean: %.2f days" % np.mean(ban_durations)) -print(" Median: %.2f days" % np.median(ban_durations)) -print(" Deviation: %.2f" % np.std(ban_durations)) -print(" Variance: %.2f" % np.var(ban_durations)) +if OPEN_FILES: + webbrowser.open(BAN_DURATION_MAP_FILE, 2) diff --git a/rapport.md b/rapport.md index da8baff..6984654 100644 --- a/rapport.md +++ b/rapport.md @@ -10,7 +10,7 @@ Auteurs : TL;DR ----- -Banapedia ressence les bannissements effectifs d'utilisateurs anonymes de +Wikibania ressence les bannissements effectifs d'utilisateurs anonymes de Wikipédia, l'encyclopédie libre. diff --git a/rapport.pdf b/rapport.pdf deleted file mode 100644 index 6902ab2..0000000 Binary files a/rapport.pdf and /dev/null differ diff --git a/sysproxy.py b/sysproxy.py new file mode 100644 index 0000000..81867ad --- /dev/null +++ b/sysproxy.py @@ -0,0 +1,7 @@ +import urllib.request + + +def configure_system_proxy(): + proxy = urllib.request.ProxyHandler(urllib.request.getproxies()) + opener = urllib.request.build_opener(proxy) + urllib.request.install_opener(opener) diff --git a/wikibania/__init__.py b/wikibania/__init__.py new file mode 100644 index 0000000..792d600 --- /dev/null +++ b/wikibania/__init__.py @@ -0,0 +1 @@ +# diff --git a/wikibania/api/Query.py b/wikibania/api/Query.py new file mode 100644 index 0000000..0928478 --- /dev/null +++ b/wikibania/api/Query.py @@ -0,0 +1,24 @@ +import urllib.parse +import urllib.request +import json + + +class Query: + def __init__(self, base_url="", params=None, encoding="utf8"): + if params is None: + params = {} + + self.base_url = base_url + self.params = params + self.encoding = encoding + + def fetch_raw_result(self): + post_query = urllib.parse.urlencode(self.params) + post_query = post_query.encode(self.encoding) + document = urllib.request.urlopen(self.base_url, post_query) + return document.read().decode(self.encoding) + + +class JSONQuery(Query): + def fetch_result(self): + return json.loads(self.fetch_raw_result()) diff --git a/wikibania/api/__init__.py b/wikibania/api/__init__.py new file mode 100644 index 0000000..792d600 --- /dev/null +++ b/wikibania/api/__init__.py @@ -0,0 +1 @@ +# diff --git a/wikibania/ban/Ban.py b/wikibania/ban/Ban.py new file mode 100644 index 0000000..e06ca89 --- /dev/null +++ b/wikibania/ban/Ban.py @@ -0,0 +1,37 @@ +from datetime import datetime + +import pygeoip + + +ISO_TIMESTAMP = "%Y-%m-%dT%H:%M:%SZ" + + +class Ban: + def __init__(self, geoip_looker, user=None, timestamp=None, expiry=None, timestamp_format=ISO_TIMESTAMP): + self.geoip_looker = geoip_looker + self.timestamp_format = timestamp_format + + self.user = user + self.timestamp = timestamp + self.expiry = expiry + + def items(self): + return { + "user": self.user, + "timestamp": self.timestamp.strftime(ISO_TIMESTAMP), + "expiry": self.expiry.strftime(ISO_TIMESTAMP), + } + + def hydrate(self, ban_dict): + self.user = ban_dict["user"] + self.timestamp = datetime.strptime(ban_dict["timestamp"], ISO_TIMESTAMP) + self.expiry = datetime.strptime(ban_dict["expiry"], ISO_TIMESTAMP) + + def calc_duration(self): + return (self.expiry - self.timestamp).days + + def lookup_country_code(self): + try: + return self.geoip_looker.country_code_by_addr(self.user).lower() + except pygeoip.GeoIPError: + return "UNKNOWN" diff --git a/wikibania/ban/BanDB.py b/wikibania/ban/BanDB.py new file mode 100644 index 0000000..e83aa3c --- /dev/null +++ b/wikibania/ban/BanDB.py @@ -0,0 +1,50 @@ +import json + +from wikibania.ban.Ban import Ban +from wikibania.wapi.WikipediaQuery import BlockQuery + + +class BanDB: + def __init__(self, geoip_looker): + self.geoip_looker = geoip_looker + self.bans = [] + + def list(self): + return self.bans + + def load(self, ban_list): + for entry in ban_list: + ban = Ban(self.geoip_looker) + ban.hydrate(entry) + self.bans.append(ban) + + def load_file(self, file_name): + with open(file_name, "r") as file: + entries = json.load(file) + self.load(entries) + + def dump(self): + return [ban.items() for ban in self.bans] + + def dump_file(self, file_name): + with open(file_name, "w") as file: + ban_list = self.dump() + json.dump(ban_list, file) + + def fetch(self, nb_samples, query_limit=500, continue_token=None): + fetch = min(nb_samples, query_limit) + + query = BlockQuery( + properties=["user", "timestamp", "expiry"], + show=["temp", "ip"], + limit=fetch, + continue_token=continue_token, + ) + results = query.fetch_result() + + entries = results["query"]["blocks"] + self.load(entries) + + if nb_samples - fetch > 0: + continue_token = results["query-continue"]["blocks"]["bkcontinue"] + self.fetch(nb_samples - fetch, query_limit, continue_token) diff --git a/wikibania/ban/BanDBWrapper.py b/wikibania/ban/BanDBWrapper.py new file mode 100644 index 0000000..8f396b0 --- /dev/null +++ b/wikibania/ban/BanDBWrapper.py @@ -0,0 +1,25 @@ +from collections import defaultdict + +import numpy + + +class BanDBWrapper: + def __init__(self, ban_db): + self.ban_db = ban_db + + def get_all_durations(self): + return [ban.calc_duration() for ban in self.ban_db.list()] + + def get_all_countries(self): + return [ban.lookup_country_code() for ban in self.ban_db.list()] + + def get_durations_by_country(self): + return [(ban.lookup_country_code(), ban.calc_duration()) for ban in self.ban_db.list()] + + def calc_average_duration_by_country(self): + ban_durations_by_country = defaultdict(list) + + for country, ban_duration in self.get_durations_by_country(): + ban_durations_by_country[country].append(ban_duration) + + return {country: numpy.mean(ban_durations) for country, ban_durations in ban_durations_by_country.items()} diff --git a/wikibania/ban/__init__.py b/wikibania/ban/__init__.py new file mode 100644 index 0000000..792d600 --- /dev/null +++ b/wikibania/ban/__init__.py @@ -0,0 +1 @@ +# diff --git a/wikibania/wapi/WikipediaQuery.py b/wikibania/wapi/WikipediaQuery.py new file mode 100644 index 0000000..3f544b6 --- /dev/null +++ b/wikibania/wapi/WikipediaQuery.py @@ -0,0 +1,48 @@ +from ..api.Query import JSONQuery + +WIKIPEDIA_QUERY_BASE_URL = "https://en.wikipedia.org/w/api.php" +LIST_SEPARATOR = "|" + + +class WikipediaQuery(JSONQuery): + def __init__(self, params=None): + if params is None: + params = {} + + params.update({ + "action": "query", + "format": "json", + }) + super(WikipediaQuery, self).__init__(base_url=WIKIPEDIA_QUERY_BASE_URL, params=params) + + +class ListQuery(WikipediaQuery): + def __init__(self, list_name, params=None): + if params is None: + params = {} + + params.update({ + "list": list_name, + }) + super(ListQuery, self).__init__(params) + + +class BlockQuery(ListQuery): + def __init__(self, properties=None, show=None, sort="newer", limit=500, continue_token=None): + if properties is None: + properties = [] + + if show is None: + show = [] + + params = { + "bkprop": LIST_SEPARATOR.join(properties), + "bkshow": LIST_SEPARATOR.join(show), + "bkdir": sort, + "bklimit": limit, + } + + if continue_token is not None: + params.update({"bkcontinue": continue_token}) + + super(BlockQuery, self).__init__("blocks", params=params) diff --git a/wikibania/wapi/__init__.py b/wikibania/wapi/__init__.py new file mode 100644 index 0000000..792d600 --- /dev/null +++ b/wikibania/wapi/__init__.py @@ -0,0 +1 @@ +# -- cgit v1.2.3