""" Init file of the downloader module. The downloader module is used to take care of the downloading part of the program, including manipulation of the wikimedia API. """ import urllib.request # For system proxy import os class Downloader(): """Class used to download a given webpage considering system proxy""" def __init__(self): self.proxy_address = os.environ.get("HTTP_Proxy") self.proxy = urllib.request.ProxyHandler({'http': self.proxy_address}) self.opener = urllib.request.build_opener(self.proxy) urllib.request.install_opener(self.opener) def download(self, url): """ Download the given URL and return the source code """ return urllib.request.urlopen(url).read().decode("utf8") def download_in_file(self, url, output_file_path): """ Download the given URL and write to the given file """ with open(output_file_path, "w") as output_file: output_file.write(self.download(url)) class WikimediaAPI(): """ Class used to generate wikimedia API urls for several uses The endpoint for this project should be "http://en.wikipedia.org/w/api.php" but can be other wiki api endpoint made with the Wikimedia software. The return_format can be one of json, php, wddx, xml, yaml, raw, txt, dbg, dump or none. """ def __init__(self, endpoint, return_format): self.endpoint = endpoint self.return_format = return_format def get_recent_changes(self, namespace="(Main)"): """ Get the url corresponding to the latest changes made to the wiki. (https://www.mediawiki.org/wiki/API:Recentchanges) The namespace is used to restrict the results to a certain level. It can be "(Main)" which is the default one, "Wikipedia", "File" or others. See https://meta.wikimedia.org/wiki/Help:Namespace """ return self.base_url + "?action=query&list=recentchanges&format="\ + self.return_format + "&namespace=" + namespace