From ceb12bbde074e7de52619508a8bc3ebacf0b4f85 Mon Sep 17 00:00:00 2001 From: Minijackson Date: Thu, 23 Oct 2014 17:36:03 +0200 Subject: Adding support for request continue --- src/downloader/__init__.py | 103 +++++++++++++++++++++++++++++++++++++-------- 1 file changed, 85 insertions(+), 18 deletions(-) (limited to 'src') diff --git a/src/downloader/__init__.py b/src/downloader/__init__.py index 15fba41..3c7706f 100644 --- a/src/downloader/__init__.py +++ b/src/downloader/__init__.py @@ -7,6 +7,7 @@ program, including manipulation of the wikimedia API. import urllib.request import urllib.parse +import json class Downloader(): @@ -16,16 +17,61 @@ class Downloader(): self.proxy = urllib.request.ProxyHandler(urllib.request.getproxies()) self.opener = urllib.request.build_opener(self.proxy) urllib.request.install_opener(self.opener) + self.data_types = { + "revisions": "rv", + "recentchanges": "rc", + "contributors": "pc" + } + + def download(self, endpoint, data_type, params={}, limit=0): + """ + Download the given URL with POST parameters and return the source code + with rccontinue support. 
If limit equals 0, then it assumes that the + limit is given within the parameters + """ + if data_type in self.data_types.keys(): + prefix = self.data_types[data_type] + limit_name = prefix + "limit" + continue_name = prefix + "continue" + + if limit == 0: + limit = params[limit_name] + else: + params[limit_name] = limit + + if limit <= 500: + url = self.compile_url(endpoint, params) + result = urllib.request.urlopen(url).read().decode("utf8") + return [json.loads(result)] + else: + results = [] + # Each 500 limits + while limit > 0: + # Support for numbers like 1542 + if limit > 500: + limit -= 500 + temp_limit = 500 + else: + temp_limit = limit + limit = 0 - def download(self, url): - """ Download the given URL and return the source code """ - return urllib.request.urlopen(url).read().decode("utf8") + temp_result = self.download(endpoint, data_type, params, temp_limit) + if "query-continue" in temp_result[0].keys(): + params[continue_name] = temp_result[0]["query-continue"][data_type][continue_name] + else: + limit = 0 + results.append(temp_result) + return results def download_in_file(self, url, output_file_path): """ Download the given URL and write to the given file """ with open(output_file_path, "w") as output_file: output_file.write(self.download(url)) + def compile_url(self, endpoint, params={}): + url_params_str = urllib.parse.urlencode(params) + return urllib.parse.urljoin(endpoint, "?" + url_params_str) + class WikimediaAPI(): """ @@ -45,18 +91,7 @@ class WikimediaAPI(): """ self.endpoint = endpoint self.return_format = return_format - - def get_recent_changes(self, namespace="(Main)"): - """ - Get the url corresponding to the latest changes made to the wiki. - (https://www.mediawiki.org/wiki/API:Recentchanges) - - The namespace is used to restrict the results to a certain level. It - can be (Main) which is the default one, "Wikipedia", "File" or - others. It will be converted to an int corresponding to the rcnamespace - parameter. 
See https://meta.wikimedia.org/wiki/Help:Namespace - """ - rcnamespaces = { + self.namespaces = { "(Main)": "0", "Talk": "1", "User talk": "2", @@ -88,11 +123,43 @@ class WikimediaAPI(): "Topic": "2600" } + def get_recent_changes(self, namespace="(Main)"): + """ + Get the url corresponding to the latest changes made to the wiki. + (https://www.mediawiki.org/wiki/API:Recentchanges) + + The namespace is used to restrict the results to a certain level. It + can be (Main) which is the default one, "Wikipedia", "File" or + others. It will be converted to an int corresponding to the rcnamespace + parameter. See https://meta.wikimedia.org/wiki/Help:Namespace + """ + url_params = { "action": "query", "list": "recentchanges", "format": self.return_format, - "rcnamespace": rcnamespaces[namespace], + "rcnamespace": self.namespaces[namespace], + } + return self.endpoint, url_params + + def get_contributors(self, page="Main_Page", namespace="(Main)"): + """ + Get the url corresponding to the contributors of a given page or list + of pages. + (https://www.mediawiki.org/wiki/API:Properties#contributors_.2F_pc) + + Use the 'page' parameter to specify the Wikipedia page(s) + + The namespace is used to restrict the results to a certain level. It + can be (Main) which is the default one, "Wikipedia", "File" or + others. It will be converted to an int corresponding to the pcnamespace + parameter. See https://meta.wikimedia.org/wiki/Help:Namespace + """ + + url_params = { + "action": "query", + "prop": "contributors", + "format": self.return_format, + "titles": page, } - url_params_str = urllib.parse.urlencode(url_params) - return urllib.parse.urljoin(self.endpoint, "?" + url_params_str) + return self.endpoint, url_params -- cgit v1.2.3