summaryrefslogtreecommitdiff
path: root/src/downloader
diff options
context:
space:
mode:
Diffstat (limited to 'src/downloader')
-rw-r--r--src/downloader/__init__.py103
1 files changed, 85 insertions, 18 deletions
diff --git a/src/downloader/__init__.py b/src/downloader/__init__.py
index 15fba41..3c7706f 100644
--- a/src/downloader/__init__.py
+++ b/src/downloader/__init__.py
@@ -7,6 +7,7 @@ program, including manipulation of the wikimedia API.
7 7
8import urllib.request 8import urllib.request
9import urllib.parse 9import urllib.parse
10import json
10 11
11 12
12class Downloader(): 13class Downloader():
@@ -16,16 +17,61 @@ class Downloader():
16 self.proxy = urllib.request.ProxyHandler(urllib.request.getproxies()) 17 self.proxy = urllib.request.ProxyHandler(urllib.request.getproxies())
17 self.opener = urllib.request.build_opener(self.proxy) 18 self.opener = urllib.request.build_opener(self.proxy)
18 urllib.request.install_opener(self.opener) 19 urllib.request.install_opener(self.opener)
20 self.data_types = {
21 "revisions": "rv",
22 "recentchanges": "rc",
23 "contributors": "pc"
24 }
25
26 def download(self, endpoint, data_type, params={}, limit=0):
27 """
28 Download the given URL with POST parameters and return the source code
29 with rccontinue support. If limit equals 0, then it assumes that the
30 limit is given within the parameters
31 """
32 if data_type in self.data_types.keys():
33 prefix = self.data_types[data_type]
34 limit_name = prefix + "limit"
35 continue_name = prefix + "continue"
36
37 if limit == 0:
38 limit = params[limit_name]
39 else:
40 params[limit_name] = limit
41
42 if limit <= 500:
43 url = self.compile_url(endpoint, params)
44 result = urllib.request.urlopen(url).read().decode("utf8")
45 return [json.loads(result)]
46 else:
47 results = []
48 # Each 500 limits
49 while limit > 0:
50 # Support for numbers like 1542
51 if limit > 500:
52 limit -= 500
53 temp_limit = 500
54 else:
55 temp_limit = limit
56 limit = 0
19 57
20 def download(self, url): 58 temp_result = self.download(endpoint, data_type, params, temp_limit)
21 """ Download the given URL and return the source code """ 59 if "query-continue" in temp_result[0].keys():
22 return urllib.request.urlopen(url).read().decode("utf8") 60 params[continue_name] = temp_result[0]["query-continue"][data_type][continue_name]
61 else:
62 limit = 0
63 results.append(temp_result)
64 return results
23 65
24 def download_in_file(self, url, output_file_path): 66 def download_in_file(self, url, output_file_path):
25 """ Download the given URL and write to the given file """ 67 """ Download the given URL and write to the given file """
26 with open(output_file_path, "w") as output_file: 68 with open(output_file_path, "w") as output_file:
27 output_file.write(self.download(url)) 69 output_file.write(self.download(url))
28 70
71 def compile_url(self, endpoint, params={}):
72 url_params_str = urllib.parse.urlencode(params)
73 return urllib.parse.urljoin(endpoint, "?" + url_params_str)
74
29 75
30class WikimediaAPI(): 76class WikimediaAPI():
31 """ 77 """
@@ -45,18 +91,7 @@ class WikimediaAPI():
45 """ 91 """
46 self.endpoint = endpoint 92 self.endpoint = endpoint
47 self.return_format = return_format 93 self.return_format = return_format
48 94 self.namespaces = {
49 def get_recent_changes(self, namespace="(Main)"):
50 """
51 Get the url corresponding to the latest changes made to the wiki.
52 (https://www.mediawiki.org/wiki/API:Recentchanges)
53
54 The namespace is used to restrict the results to a certain level. It
55 can be (Main) which is the default one, "Wikipedia", "File" or
56 others. It will be converted to an int corresponding to the rcnamespace
57 parameter. See https://meta.wikimedia.org/wiki/Help:Namespace
58 """
59 rcnamespaces = {
60 "(Main)": "0", 95 "(Main)": "0",
61 "Talk": "1", 96 "Talk": "1",
62 "User talk": "2", 97 "User talk": "2",
@@ -88,11 +123,43 @@ class WikimediaAPI():
88 "Topic": "2600" 123 "Topic": "2600"
89 } 124 }
90 125
126 def get_recent_changes(self, namespace="(Main)"):
127 """
128 Get the url corresponding to the latest changes made to the wiki.
129 (https://www.mediawiki.org/wiki/API:Recentchanges)
130
131 The namespace is used to restrict the results to a certain level. It
132 can be (Main) which is the default one, "Wikipedia", "File" or
133 others. It will be converted to an int corresponding to the rcnamespace
134 parameter. See https://meta.wikimedia.org/wiki/Help:Namespace
135 """
136
91 url_params = { 137 url_params = {
92 "action": "query", 138 "action": "query",
93 "list": "recentchanges", 139 "list": "recentchanges",
94 "format": self.return_format, 140 "format": self.return_format,
95 "rcnamespace": rcnamespaces[namespace], 141 "rcnamespace": self.namespaces[namespace],
142 }
143 return self.endpoint, url_params
144
145 def get_contributors(self, page="Main_Page", namespace="(Main)"):
146 """
147 Get the url corresponding to the contributors of a given page or list
148 of pages.
149 (https://www.mediawiki.org/wiki/API:Properties#contributors_.2F_pc)
150
151 Use the 'page' parameter to specify the Wikipedia page(s)
152
153 The namespace is used to restrict the results to a certain level. It
154 can be (Main) which is the default one, "Wikipedia", "File" or
155 others. It will be converted to an int corresponding to the pcnamespace
156 parameter. See https://meta.wikimedia.org/wiki/Help:Namespace
157 """
158
159 url_params = {
160 "action": "query",
161 "prop": "contributors",
162 "format": self.return_format,
163 "titles": page,
96 } 164 }
97 url_params_str = urllib.parse.urlencode(url_params) 165 return self.endpoint, url_params
98 return urllib.parse.urljoin(self.endpoint, "?" + url_params_str)