From b2b0a0603957b9581efd23debbc42d9281c63300 Mon Sep 17 00:00:00 2001
From: Minijackson
Date: Wed, 15 Oct 2014 11:44:03 +0200
Subject: Using real system proxies + rcnamespace + docs + default values

- Adding __init__ docs
- Adding WikimediaAPI class constructor default values for endpoint and
  format
- Real usage of rcnamespaces with namespace conversion from string to int
---
Notes (review revision; text in this section is ignored by git-am):
- rcnamespaces: added the missing built-in "User" namespace (id 2) and
  renumbered "User talk" .. "Category talk" to ids 3..15 so each name maps
  to its MediaWiki built-in namespace id. Without "User", every entry in
  that range was one too low (e.g. "File" resolved to 5, which is
  "Wikipedia talk").
- The generated URL now sends "rcnamespace" instead of "namespace";
  list=recentchanges only reads the rc-prefixed parameter, so the filter
  was never applied.
- The diffstat and the last hunk header account for the one extra added
  line; the post-image blob id on the "index" line predates this revision.
- Line structure was restored from a whitespace-collapsed copy; blank
  context lines are inferred from the hunk line counts.

 src/downloader/__init__.py | 65 ++++++++++++++++++++++++++++++++++++----------
 1 file changed, 51 insertions(+), 14 deletions(-)

diff --git a/src/downloader/__init__.py b/src/downloader/__init__.py
index cfcf20a..4dd6169 100644
--- a/src/downloader/__init__.py
+++ b/src/downloader/__init__.py
@@ -6,15 +6,13 @@ program, including manipulation of the wikimedia API.
 """
 
 import urllib.request
-# For system proxy
-import os
 
 
 class Downloader():
     """Class used to download a given webpage considering system proxy"""
     def __init__(self):
-        self.proxy_address = os.environ.get("HTTP_Proxy")
-        self.proxy = urllib.request.ProxyHandler({'http': self.proxy_address})
+        """ Downloader class constructor """
+        self.proxy = urllib.request.ProxyHandler(urllib.request.getproxies())
         self.opener = urllib.request.build_opener(self.proxy)
         urllib.request.install_opener(self.opener)
 
@@ -31,13 +29,19 @@ class Downloader():
 class WikimediaAPI():
     """
     Class used to generate wikimedia API urls for several uses
-
-    The endpoint for this project should be "http://en.wikipedia.org/w/api.php"
-    but can be other wiki api endpoint made with the Wikimedia software.
-    The return_format can be one of json, php, wddx, xml, yaml, raw, txt, dbg,
-    dump or none.
     """
-    def __init__(self, endpoint, return_format):
+    def __init__(self, endpoint="http://en.wikipedia.org/w/api.php",
+                 return_format="json"):
+        """
+        WikimediaAPI class constructor
+
+        The endpoint for this project should be
+        "http://en.wikipedia.org/w/api.php" but it can be any other wiki
+        api endpoint made with the Wikimedia software.
+
+        The return_format can be one of json, php, wddx, xml, yaml, raw, txt,
+        dbg, dump or none.
+        """
         self.endpoint = endpoint
         self.return_format = return_format
 
@@ -47,8 +51,41 @@ class WikimediaAPI():
         (https://www.mediawiki.org/wiki/API:Recentchanges)
 
         The namespace is used to restrict the results to a certain level. It
-        can be "(Main)" which is the default one, "Wikipedia", "File" or
-        others. See https://meta.wikimedia.org/wiki/Help:Namespace
+        can be (Main) which is the default one, "Wikipedia", "File" or
+        others. It will be converted to an int corresponding to the rcnamespace
+        parameter. See https://meta.wikimedia.org/wiki/Help:Namespace
         """
-        return self.base_url + "?action=query&list=recentchanges&format="\
-               + self.return_format + "&namespace=" + namespace
+        rcnamespaces = {
+            "(Main)": "0",
+            "Talk": "1",
+            "User": "2",
+            "User talk": "3",
+            "Wikipedia": "4",
+            "Wikipedia talk": "5",
+            "File": "6",
+            "File talk": "7",
+            "MediaWiki": "8",
+            "MediaWiki talk": "9",
+            "Template": "10",
+            "Template talk": "11",
+            "Help": "12",
+            "Help talk": "13",
+            "Category": "14",
+            "Category talk": "15",
+            # Custom Wikipedia namespaces
+            "Portal": "100",
+            "Portal talk": "101",
+            "Book": "108",
+            "Book talk": "109",
+            "Draft": "118",
+            "Draft talk": "119",
+            "Education Program": "446",
+            "Education Program talk": "447",
+            "TimedText": "710",
+            "TimedText talk": "711",
+            "Module": "828",
+            "Module talk": "829",
+            "Topic": "2600"
+        }
+        return self.endpoint + "?action=query&list=recentchanges&format="\
+               + self.return_format + "&rcnamespace=" + rcnamespaces[namespace]
-- 
cgit v1.2.3