From ceb12bbde074e7de52619508a8bc3ebacf0b4f85 Mon Sep 17 00:00:00 2001
From: Minijackson
Date: Thu, 23 Oct 2014 17:36:03 +0200
Subject: Adding support for request continue

---
 src/downloader/__init__.py | 103 +++++++++++++++++++++++++++++++++++++--------
 1 file changed, 85 insertions(+), 18 deletions(-)

(limited to 'src')

diff --git a/src/downloader/__init__.py b/src/downloader/__init__.py
index 15fba41..3c7706f 100644
--- a/src/downloader/__init__.py
+++ b/src/downloader/__init__.py
@@ -7,6 +7,7 @@ program, including manipulation of the wikimedia API.
 
 import urllib.request
 import urllib.parse
+import json
 
 
 class Downloader():
@@ -16,16 +17,61 @@ class Downloader():
         self.proxy = urllib.request.ProxyHandler(urllib.request.getproxies())
         self.opener = urllib.request.build_opener(self.proxy)
         urllib.request.install_opener(self.opener)
+        self.data_types = {
+            "revisions": "rv",
+            "recentchanges": "rc",
+            "contributors": "pc"
+        }
+
+    def download(self, endpoint, data_type, params={}, limit=0):
+        """
+        Download the given URL with POST parameters and return the source code
+        with rccontinue support. If limit equals 0, it is assumed that the
+        limit is given within the parameters.
+        """
+        if data_type in self.data_types.keys():
+            prefix = self.data_types[data_type]
+            limit_name = prefix + "limit"
+            continue_name = prefix + "continue"
+
+            if limit == 0:
+                limit = params[limit_name]
+            else:
+                params[limit_name] = limit
+
+        if limit <= 500:
+            url = self.compile_url(endpoint, params)
+            result = urllib.request.urlopen(url).read().decode("utf8")
+            return [json.loads(result)]
+        else:
+            results = []
+            # Fetch in batches of at most 500 results per request
+            while limit > 0:
+                # Support for numbers like 1542
+                if limit > 500:
+                    limit -= 500
+                    temp_limit = 500
+                else:
+                    temp_limit = limit
+                    limit = 0
 
-    def download(self, url):
-        """ Download the given URL and return the source code """
-        return urllib.request.urlopen(url).read().decode("utf8")
+                temp_result = self.download(endpoint, data_type, params, temp_limit)
+                if "query-continue" in temp_result[0].keys():
+                    params[continue_name] = temp_result[0]["query-continue"][data_type][continue_name]
+                else:
+                    limit = 0
+                results.append(temp_result)
+            return results
 
     def download_in_file(self, url, output_file_path):
         """ Download the given URL and write to the given file """
         with open(output_file_path, "w") as output_file:
             output_file.write(self.download(url))
 
+    def compile_url(self, endpoint, params={}):
+        url_params_str = urllib.parse.urlencode(params)
+        return urllib.parse.urljoin(endpoint, "?" + url_params_str)
+
 
 class WikimediaAPI():
     """
@@ -45,18 +91,7 @@ class WikimediaAPI():
         """
         self.endpoint = endpoint
         self.return_format = return_format
-
-    def get_recent_changes(self, namespace="(Main)"):
-        """
-        Get the url corresponding to the latest changes made to the wiki.
-        (https://www.mediawiki.org/wiki/API:Recentchanges)
-
-        The namespace is used to restrict the results to a certain level. It
-        can be (Main) which is the default one, "Wikipedia", "File" or
-        others. It will be converted to an int corresponding to the rcnamespace
-        parameter. See https://meta.wikimedia.org/wiki/Help:Namespace
-        """
-        rcnamespaces = {
+        self.namespaces = {
             "(Main)": "0",
             "Talk": "1",
             "User talk": "2",
@@ -88,11 +123,43 @@ class WikimediaAPI():
             "Topic": "2600"
         }
 
+    def get_recent_changes(self, namespace="(Main)"):
+        """
+        Get the endpoint and parameters for the wiki's latest changes.
+        (https://www.mediawiki.org/wiki/API:Recentchanges)
+
+        The namespace is used to restrict the results to a certain level. It
+        can be (Main) which is the default one, "Wikipedia", "File" or
+        others. It will be converted to an int corresponding to the rcnamespace
+        parameter. See https://meta.wikimedia.org/wiki/Help:Namespace
+        """
+
         url_params = {
             "action": "query",
             "list": "recentchanges",
             "format": self.return_format,
-            "rcnamespace": rcnamespaces[namespace],
+            "rcnamespace": self.namespaces[namespace],
+        }
+        return self.endpoint, url_params
+
+    def get_contributors(self, page="Main_Page", namespace="(Main)"):
+        """
+        Get the endpoint and parameters for the contributors of a given page or
+        list of pages.
+        (https://www.mediawiki.org/wiki/API:Properties#contributors_.2F_pc)
+
+        Use the 'page' parameter to specify the Wikipedia page(s)
+
+        The namespace is used to restrict the results to a certain level. It
+        can be (Main) which is the default one, "Wikipedia", "File" or
+        others. It will be converted to an int corresponding to the pcnamespace
+        parameter. See https://meta.wikimedia.org/wiki/Help:Namespace
+        """
+
+        url_params = {
+            "action": "query",
+            "prop": "contributors",
+            "format": self.return_format,
+            "titles": page,
         }
-        url_params_str = urllib.parse.urlencode(url_params)
-        return urllib.parse.urljoin(self.endpoint, "?" + url_params_str)
+        return self.endpoint, url_params
-- 
cgit v1.2.3