summaryrefslogtreecommitdiff
path: root/src/downloader/__init__.py
blob: cfcf20a7a5fa4e1c7d19f96bcc37a516e59f8fcd (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
"""
Init file of the downloader module.

The downloader module takes care of the downloading part of the program,
including building request URLs for the MediaWiki API.
"""

# For system proxy
import os
import urllib.parse
import urllib.request


class Downloader():
    """
    Class used to download a given webpage considering the configured proxy

    The proxy address is read from the "HTTP_Proxy" environment variable when
    the instance is created. When the variable is unset or empty, no proxy is
    registered and http connections are made directly.

    Attributes:
        proxy_address: value of "HTTP_Proxy", or None when it is not set.
        proxy: the urllib ProxyHandler built from proxy_address.
        opener: the urllib opener used for every download; it is also
            installed as the process-wide default urllib opener.
    """
    def __init__(self):
        self.proxy_address = os.environ.get("HTTP_Proxy")
        # Registering a None/empty proxy makes urllib fail as soon as an http
        # URL is opened, so only map "http" when an address is configured.
        proxies = {}
        if self.proxy_address:
            proxies['http'] = self.proxy_address
        self.proxy = urllib.request.ProxyHandler(proxies)
        self.opener = urllib.request.build_opener(self.proxy)
        # Kept so that plain urllib.request.urlopen() calls made elsewhere
        # go through the same opener.
        urllib.request.install_opener(self.opener)

    def download(self, url):
        """
        Download the given URL and return the source code

        The response body is decoded as UTF-8 and returned as a str. Errors
        raised by urllib (or by the decoding) propagate to the caller.
        """
        # Use this instance's opener and close the response deterministically.
        with self.opener.open(url) as response:
            return response.read().decode("utf8")

    def download_in_file(self, url, output_file_path):
        """
        Download the given URL and write to the given file

        The file is written as UTF-8, matching the decoding done by
        download(). The download happens before the file is opened, so a
        failed download does not create or truncate the output file.
        """
        content = self.download(url)
        with open(output_file_path, "w", encoding="utf8") as output_file:
            output_file.write(content)


class WikimediaAPI():
    """
    Class used to generate MediaWiki API urls for several uses

    The endpoint for this project should be "http://en.wikipedia.org/w/api.php"
    but can be any other wiki API endpoint served by the MediaWiki software.
    The return_format can be one of json, php, wddx, xml, yaml, raw, txt, dbg,
    dump or none.
    """
    def __init__(self, endpoint, return_format):
        self.endpoint = endpoint
        self.return_format = return_format

    def get_recent_changes(self, namespace="(Main)"):
        """
        Get the url corresponding to the latest changes made to the wiki.
        (https://www.mediawiki.org/wiki/API:Recentchanges)

        The namespace is used to restrict the results to a certain level. It
        can be "(Main)" which is the default one, "Wikipedia", "File" or
        others. See https://meta.wikimedia.org/wiki/Help:Namespace

        The namespace is percent-encoded (parentheses are kept as is) so that
        names containing spaces, such as "User talk", still give a valid URL.

        NOTE(review): the MediaWiki documentation linked above describes the
        namespace filter as "rcnamespace" with numeric ids; this method emits
        "namespace=<name>" -- confirm the server honours it.
        """
        # The URL is built from self.endpoint, the attribute set in __init__.
        return (
            self.endpoint
            + "?action=query&list=recentchanges&format=" + self.return_format
            + "&namespace=" + urllib.parse.quote(namespace, safe="()")
        )