summary | refs | log | tree | commit | diff
path: root/src/downloader
diff options
context:
space:
mode:
author: Pacien TRAN-GIRARD, 2014-10-22 10:46:11 +0200
committer: Pacien TRAN-GIRARD, 2014-10-22 10:46:11 +0200
commit: 413f7faa5b3235ea0e00b68132b7e92917831dee (patch)
tree: c2269ef3ad8fc801a0234b61f68f487aadd34dda /src/downloader
parent: 046384051a6fd2aeab692263e9d533d363e45572 (diff)
parent: 1ae5575c25242c538d30bd303092f99b9e78b716 (diff)
download: wikistats-413f7faa5b3235ea0e00b68132b7e92917831dee.tar.gz
Merge remote-tracking branch 'origin/master'
Resolved conflicts: src/downloader/__init__.py
Diffstat (limited to 'src/downloader')
-rw-r--r-- src/downloader/__init__.py 108
1 files changed, 39 insertions, 69 deletions
diff --git a/src/downloader/__init__.py b/src/downloader/__init__.py
index f256ddc..15fba41 100644
--- a/src/downloader/__init__.py
+++ b/src/downloader/__init__.py
@@ -11,7 +11,6 @@ import urllib.parse
11 11
12class Downloader(): 12class Downloader():
13 """Class used to download a given webpage considering system proxy""" 13 """Class used to download a given webpage considering system proxy"""
14
15 def __init__(self): 14 def __init__(self):
16 """ Downloader class constructor """ 15 """ Downloader class constructor """
17 self.proxy = urllib.request.ProxyHandler(urllib.request.getproxies()) 16 self.proxy = urllib.request.ProxyHandler(urllib.request.getproxies())
@@ -32,39 +31,6 @@ class WikimediaAPI():
32 """ 31 """
33 Class used to generate wikimedia API urls for several uses 32 Class used to generate wikimedia API urls for several uses
34 """ 33 """
35
36 RCNAMESPACES_CODES = {
37 "(Main)": "0",
38 "Talk": "1",
39 "User talk": "2",
40 "Wikipedia": "3",
41 "Wikipedia talk": "4",
42 "File": "5",
43 "File talk": "6",
44 "MediaWiki": "7",
45 "MediaWiki talk": "8",
46 "Template": "9",
47 "Template talk": "10",
48 "Help": "11",
49 "Help talk": "12",
50 "Category": "13",
51 "Category talk": "14",
52 # Custom Wikipedia namespaces
53 "Portal": "100",
54 "Portal talk": "101",
55 "Book": "108",
56 "Book talk": "109",
57 "Draft": "118",
58 "Draft talk": "119",
59 "Education Program": "446",
60 "Education Program talk": "447",
61 "TimedText": "710",
62 "TimedText talk": "711",
63 "Module": "828",
64 "Module talk": "829",
65 "Topic": "2600"
66 }
67
68 def __init__(self, endpoint="http://en.wikipedia.org/w/api.php", 34 def __init__(self, endpoint="http://en.wikipedia.org/w/api.php",
69 return_format="json"): 35 return_format="json"):
70 """ 36 """
@@ -80,18 +46,6 @@ class WikimediaAPI():
80 self.endpoint = endpoint 46 self.endpoint = endpoint
81 self.return_format = return_format 47 self.return_format = return_format
82 48
83 def gen_query_url(self, parms):
84 """
85 Generate the query URL.
86
87 :param parms: URL parameters dict
88 :return: query URL
89 """
90 parms["action"] = "query"
91 parms["format"] = self.return_format
92 parms_str = urllib.parse.urlencode(parms)
93 return urllib.parse.urljoin(self.endpoint, "?" + parms_str)
94
95 def get_recent_changes(self, namespace="(Main)"): 49 def get_recent_changes(self, namespace="(Main)"):
96 """ 50 """
97 Get the url corresponding to the latest changes made to the wiki. 51 Get the url corresponding to the latest changes made to the wiki.
@@ -102,27 +56,43 @@ class WikimediaAPI():
102 others. It will be converted to an int corresponding to the rcnamespace 56 others. It will be converted to an int corresponding to the rcnamespace
103 parameter. See https://meta.wikimedia.org/wiki/Help:Namespace 57 parameter. See https://meta.wikimedia.org/wiki/Help:Namespace
104 """ 58 """
105 return self.gen_query_url({ 59 rcnamespaces = {
60 "(Main)": "0",
61 "Talk": "1",
62 "User talk": "2",
63 "Wikipedia": "3",
64 "Wikipedia talk": "4",
65 "File": "5",
66 "File talk": "6",
67 "MediaWiki": "7",
68 "MediaWiki talk": "8",
69 "Template": "9",
70 "Template talk": "10",
71 "Help": "11",
72 "Help talk": "12",
73 "Category": "13",
74 "Category talk": "14",
75 # Custom Wikipedia namespaces
76 "Portal": "100",
77 "Portal talk": "101",
78 "Book": "108",
79 "Book talk": "109",
80 "Draft": "118",
81 "Draft talk": "119",
82 "Education Program": "446",
83 "Education Program talk": "447",
84 "TimedText": "710",
85 "TimedText talk": "711",
86 "Module": "828",
87 "Module talk": "829",
88 "Topic": "2600"
89 }
90
91 url_params = {
92 "action": "query",
106 "list": "recentchanges", 93 "list": "recentchanges",
107 "rcnamespace": WikimediaAPI.RCNAMESPACES_CODES[namespace], 94 "format": self.return_format,
108 }) 95 "rcnamespace": rcnamespaces[namespace],
109 96 }
110 def get_geo_pages(self, lat, long, radius, limit=500, namespace="(Main)"): 97 url_params_str = urllib.parse.urlencode(url_params)
111 """ 98 return urllib.parse.urljoin(self.endpoint, "?" + url_params_str)
112 Generate the query URL performing a geographic search of articles
113 located near the given coordinates.
114
115 :param lat: Latitude
116 :param long: Longitude
117 :param radius: radius (in m) to look inside
118 :param limit: maximum number of results (max 500)
119 :param namespace: namespace to restrict the search in
120 :return:
121 """
122 return self.gen_query_url({
123 "list": "geosearch",
124 "gscoord": "%f|%f" % (lat, long),
125 "gsradius": "%f" % radius,
126 "gsnamespace": WikimediaAPI.RCNAMESPACES_CODES[namespace],
127 "gslimit": limit,
128 })