diff options
author | Pacien TRAN-GIRARD | 2014-10-24 18:05:48 +0000 |
---|---|---|
committer | Pacien TRAN-GIRARD | 2014-10-24 18:05:48 +0000 |
commit | 4403fda939ef42aeffeccb343d74f3dc3b840f91 (patch) | |
tree | 63fd704f15f3030f1455aad0ef92403c5d093c70 | |
parent | 16529a0d212e1387eacd590c0e5e1b1a13dc2641 (diff) | |
parent | bdf9099df8c2a4636b0ad0e710b73330877eef37 (diff) | |
download | wikistats-4403fda939ef42aeffeccb343d74f3dc3b840f91.tar.gz |
Merge branch 'refactor' into 'master'
Refactor
See merge request !1
-rw-r--r-- | banapedia/Ban.py | 36 | ||||
-rw-r--r-- | banapedia/__init__.py | 1 | ||||
-rw-r--r-- | banapedia/api/__init__.py | 0 | ||||
-rw-r--r-- | banapedia/wapi/WikipediaQuery.py | 42 | ||||
-rw-r--r-- | banapedia/wapi/__init__.py | 0 | ||||
-rw-r--r-- | bandict/__init__.py | 91 | ||||
-rw-r--r-- | main.py | 101 | ||||
-rw-r--r-- | rapport.md | 2 | ||||
-rw-r--r-- | rapport.pdf | bin | 86060 -> 0 bytes | |||
-rw-r--r-- | sysproxy.py | 7 | ||||
-rw-r--r-- | wikibania/__init__.py | 1 | ||||
-rw-r--r-- | wikibania/api/Query.py (renamed from banapedia/api/Query.py) | 7 | ||||
-rw-r--r-- | wikibania/api/__init__.py | 1 | ||||
-rw-r--r-- | wikibania/ban/Ban.py | 37 | ||||
-rw-r--r-- | wikibania/ban/BanDB.py | 50 | ||||
-rw-r--r-- | wikibania/ban/BanDBWrapper.py | 25 | ||||
-rw-r--r-- | wikibania/ban/__init__.py | 1 | ||||
-rw-r--r-- | wikibania/wapi/WikipediaQuery.py | 48 | ||||
-rw-r--r-- | wikibania/wapi/__init__.py | 1 |
19 files changed, 246 insertions, 205 deletions
diff --git a/banapedia/Ban.py b/banapedia/Ban.py deleted file mode 100644 index 4714274..0000000 --- a/banapedia/Ban.py +++ /dev/null | |||
@@ -1,36 +0,0 @@ | |||
1 | from banapedia.wapi.WikipediaQuery import BlockQuery | ||
2 | from datetime import datetime | ||
3 | import pygeoip | ||
4 | |||
5 | __author__ = 'pacien' | ||
6 | |||
7 | |||
# Filesystem path of the GeoIP database handed to pygeoip below.
GEOIP_FILE = "/usr/share/GeoIP/GeoIP.dat"
# Module-level pygeoip handle, created at import time and used by
# Ban.get_country_code for address-to-country lookups.
geoip = pygeoip.GeoIP(GEOIP_FILE)

# strptime pattern for timestamps shaped like "2014-10-24T18:05:48Z".
ISO_TIMESTAMP = "%Y-%m-%dT%H:%M:%SZ"
12 | |||
13 | |||
class Ban:
    """One ban record: the banned address, its time window and its country.

    The country code starts out as ``None`` and is resolved on demand by
    :meth:`get_country_code`, which stores the result on the instance.
    """

    def __init__(self, ip, start, end):
        """Remember the banned ``ip`` and the ``start``/``end`` of the ban."""
        self.ip, self.start, self.end = ip, start, end
        self.country_code = None

    def get_duration(self):
        """Return the ``days`` component of ``end - start``."""
        elapsed = self.end - self.start
        return elapsed.days

    def get_country_code(self):
        """Return the lower-cased country code for ``ip``, looking it up once.

        When the lookup raises ``pygeoip.GeoIPError`` an error line is printed
        and the empty string is stored and returned instead.
        """
        if self.country_code is None:
            try:
                resolved = geoip.country_code_by_addr(self.ip).lower()
            except pygeoip.GeoIPError:
                print("[ERROR]", "Could not determine country for ip", self.ip)
                resolved = ""
            # Cache the outcome (including "") so the lookup is not repeated.
            self.country_code = resolved

        return self.country_code
diff --git a/banapedia/__init__.py b/banapedia/__init__.py deleted file mode 100644 index 8b13789..0000000 --- a/banapedia/__init__.py +++ /dev/null | |||
@@ -1 +0,0 @@ | |||
1 | |||
diff --git a/banapedia/api/__init__.py b/banapedia/api/__init__.py deleted file mode 100644 index e69de29..0000000 --- a/banapedia/api/__init__.py +++ /dev/null | |||
diff --git a/banapedia/wapi/WikipediaQuery.py b/banapedia/wapi/WikipediaQuery.py deleted file mode 100644 index d3d2f94..0000000 --- a/banapedia/wapi/WikipediaQuery.py +++ /dev/null | |||
@@ -1,42 +0,0 @@ | |||
1 | from ..api.Query import JSONQuery | ||
2 | |||
3 | __author__ = 'pacien' | ||
4 | |||
# Endpoint that WikipediaQuery passes to JSONQuery as its base URL.
WIKIPEDIA_QUERY_BASE_URL = "https://en.wikipedia.org/w/api.php"
# Delimiter BlockQuery uses to join multi-valued parameters into one string.
LIST_SEPARATOR = "|"
# Default values BlockQuery joins into its "bkprop" parameter.
DEFAULT_BKPROP = ["id", "user", "userid", "by", "byid", "timestamp", "expiry", "reason", "range", "flags"]
# Default values BlockQuery joins into its "bkshow" parameter.
DEFAULT_BKSHOW = ["account", "temp", "ip", "range"]
9 | |||
10 | |||
class WikipediaQuery(JSONQuery):
    """JSON query sent to ``WIKIPEDIA_QUERY_BASE_URL`` with ``action=query``."""

    def __init__(self, params=None):
        """Build the query from ``params`` plus the fixed action/format pair.

        Args:
            params: optional mapping of extra request parameters. It is
                copied, never modified; ``"action"`` and ``"format"`` are
                always overridden with ``"query"`` and ``"json"``.
        """
        # A ``{}`` default would be created once and shared by every call, and
        # updating the argument in place would also leak keys into the
        # caller's dict; work on a private copy instead.
        merged = dict(params) if params is not None else {}
        merged.update({
            "action": "query",
            "format": "json",
        })
        JSONQuery.__init__(self, base_url=WIKIPEDIA_QUERY_BASE_URL, params=merged)
18 | |||
19 | |||
class ListQuery(WikipediaQuery):
    """Wikipedia query that targets one ``list`` module."""

    def __init__(self, list_name, params=None):
        """Build the query for ``list_name``.

        Args:
            list_name: value sent as the ``"list"`` request parameter.
            params: optional mapping of extra request parameters. It is
                copied, never modified.
        """
        # Copy rather than update in place: a shared ``{}`` default would keep
        # the "list" value (and anything added further down the chain) from
        # one instance to the next.
        merged = dict(params) if params is not None else {}
        merged["list"] = list_name
        WikipediaQuery.__init__(self, merged)
26 | |||
27 | |||
class BlockQuery(ListQuery):
    """List query against the ``"blocks"`` list."""

    def __init__(self, bkprop=DEFAULT_BKPROP, bkshow=DEFAULT_BKSHOW, bkdir="newer", limit=500, continue_token=None):
        """Assemble the ``bk*`` request parameters and delegate to ListQuery.

        ``bkprop`` and ``bkshow`` are joined with ``LIST_SEPARATOR``;
        ``limit`` is sent as ``"bklimit"``. ``"bkcontinue"`` is only sent
        when ``continue_token`` is not ``None``.
        """
        block_params = dict(
            bkprop=LIST_SEPARATOR.join(bkprop),
            bkshow=LIST_SEPARATOR.join(bkshow),
            bkdir=bkdir,
            bklimit=limit,
        )

        if continue_token is not None:
            block_params["bkcontinue"] = continue_token

        ListQuery.__init__(self, "blocks", params=block_params)
41 | |||
42 | |||
diff --git a/banapedia/wapi/__init__.py b/banapedia/wapi/__init__.py deleted file mode 100644 index e69de29..0000000 --- a/banapedia/wapi/__init__.py +++ /dev/null | |||
diff --git a/bandict/__init__.py b/bandict/__init__.py deleted file mode 100644 index 389ae0e..0000000 --- a/bandict/__init__.py +++ /dev/null | |||
@@ -1,91 +0,0 @@ | |||
1 | import urllib.request | ||
2 | import json | ||
3 | import numpy as np | ||
4 | from banapedia.wapi.WikipediaQuery import BlockQuery | ||
5 | from banapedia.Ban import * | ||
6 | |||
7 | |||
class BanList:
    """Set of bans loaded from a JSON dump or fetched through BlockQuery.

    Attributes:
        dict_list: raw ban records; each one is read through its ``"user"``,
            ``"timestamp"`` and ``"expiry"`` keys.
        ban_list: ``Ban`` objects built from ``dict_list``, in the same order.
    """

    def __init__(self, data_file, samples=30000, samples_by_query=500,
                 from_internet=False):
        """Load the raw records, then turn each of them into a ``Ban``.

        Args:
            data_file: path of the JSON file read when ``from_internet`` is
                false.
            samples: number of bans requested when fetching.
            samples_by_query: maximum number of bans requested per query.
            from_internet: fetch the records instead of reading ``data_file``.
        """
        if from_internet:
            # Route urllib through the system proxies. Note that
            # install_opener replaces the process-wide default opener.
            proxy = urllib.request.ProxyHandler(urllib.request.getproxies())
            opener = urllib.request.build_opener(proxy)
            urllib.request.install_opener(opener)
            self.dict_list = self.fetch_multipart(samples, samples_by_query)
        else:
            with open(data_file, "r") as ban_dict_file:
                self.dict_list = json.load(ban_dict_file)

        self.ban_list = [
            Ban(
                ban_dict["user"],
                datetime.strptime(ban_dict["timestamp"], ISO_TIMESTAMP),
                datetime.strptime(ban_dict["expiry"], ISO_TIMESTAMP),
            )
            for ban_dict in self.dict_list
        ]

    @staticmethod
    def _continue_token(results):
        """Return the ``bkcontinue`` token of a response, or None if absent.

        Looks under ``"query-continue" -> "blocks"`` first, then under the
        top-level ``"continue"`` object.
        """
        legacy = results.get("query-continue", {}).get("blocks", {})
        token = legacy.get("bkcontinue")
        if token is None:
            token = results.get("continue", {}).get("bkcontinue")
        return token

    def fetch_multipart(self, n, query_limit):
        """Fetch up to ``n`` ban records, at most ``query_limit`` per query.

        Stops early, returning what was collected, when a response carries no
        continuation token or no records.

        Returns:
            The list of raw ban records, in the order they were received.
        """
        ban_dict_list = []
        continue_token = None

        print("[INFO]", "Fetching %d bans" % n)
        while len(ban_dict_list) < n:
            to_fetch = min(query_limit, n - len(ban_dict_list))
            query = BlockQuery(
                bkprop=["user", "timestamp", "expiry"],
                bkshow=["temp", "ip"],
                limit=to_fetch,
                continue_token=continue_token,
            )
            results = query.fetch_result()
            batch = results["query"]["blocks"]
            ban_dict_list.extend(batch)
            # Progress is counted from what was actually received, not from
            # what was asked for.
            print("[INFO]", "Fetched %d over %d bans" % (len(ban_dict_list), n))

            continue_token = self._continue_token(results)
            # The last page has no continuation token; indexing it blindly
            # raised KeyError. An empty page would otherwise loop forever.
            if continue_token is None or not batch:
                break

        print("[INFO]", "Bans fetching complete")
        return ban_dict_list

    def write_to_file(self, outfile):
        """Dump the raw ban records to ``outfile`` as tab-indented JSON."""
        with open(outfile, "w") as ban_dict_file:
            json.dump(self.dict_list, ban_dict_file, indent="\t")

    def get_durations(self):
        """Return the duration of every ban, in ``ban_list`` order."""
        return [ban.get_duration() for ban in self.ban_list]

    def get_countries(self):
        """Return the country code of every ban, in ``ban_list`` order."""
        return [ban.get_country_code() for ban in self.ban_list]

    def __iter__(self):
        """Iterate over the raw ban records."""
        return iter(self.dict_list)

    def items(self):
        # NOTE(review): __init__ iterates dict_list as a sequence of records,
        # and a list has no .items(), so this raises AttributeError as
        # written. Left unchanged because the intended result is unclear.
        return self.dict_list.items()

    def by_country(self):
        """Group the ``Ban`` objects by country code.

        Returns:
            A dict mapping each country code to the list of its bans.
        """
        bans_by_country = {}
        for ban in self.ban_list:
            bans_by_country.setdefault(ban.get_country_code(), []).append(ban)
        return bans_by_country

    def average_ban_by_country(self):
        """Return the mean ban duration per country, divided by 30.

        Durations come from ``Ban.get_duration``; dividing by 30 expresses
        the mean in 30-day months.
        """
        return {
            country: np.mean([ban.get_duration() for ban in bans]) / 30
            for country, bans in self.by_country().items()
        }
@@ -1,62 +1,101 @@ | |||
1 | from banapedia.Ban import * | ||
2 | import bandict | ||
3 | from collections import Counter | 1 | from collections import Counter |
2 | import webbrowser | ||
3 | |||
4 | import pygal | 4 | import pygal |
5 | import numpy as np | 5 | import numpy as np |
6 | import pygeoip | ||
7 | |||
8 | from wikibania.ban.BanDB import BanDB | ||
9 | from wikibania.ban.BanDBWrapper import BanDBWrapper | ||
10 | import sysproxy | ||
11 | |||
12 | |||
13 | # PARAMS | ||
14 | |||
15 | GEOIP_DB = "/usr/share/GeoIP/GeoIP.dat" | ||
6 | 16 | ||
7 | BAN_MAP_FILE = "output/ban-map.svg" | 17 | BAN_MAP_FILE = "output/ban-map.svg" |
8 | BAN_DURATION_MAP_FILE = "output/ban-duration-map.svg" | 18 | BAN_DURATION_MAP_FILE = "output/ban-duration-map.svg" |
9 | HIST_FILE = "output/histogram.svg" | 19 | HIST_FILE = "output/histogram.svg" |
20 | STATS_FILE = "output/stats.txt" | ||
21 | |||
22 | BAN_DB_FILE = "resources/ban_list.json" | ||
23 | |||
24 | FETCH_SAMPLES = 2000 | ||
25 | FETCH_DB = False | ||
26 | DUMP_DB = False | ||
27 | LOAD_DB = False | ||
28 | |||
29 | OPEN_FILES = False | ||
30 | |||
31 | |||