src/downloader/__init__.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128

"""
Init file of the downloader module.

The downloader module takes care of the downloading part of the program,
including building the request URLs for the MediaWiki API.
"""

import urllib.request
import urllib.parse


class Downloader():
    """
    Class used to download a given webpage considering system proxy

    Creating an instance builds a urllib opener around the proxies reported
    by ``urllib.request.getproxies()`` and installs it as the process-wide
    default opener, which is what ``urllib.request.urlopen`` uses afterwards.
    """

    def __init__(self):
        """ Downloader class constructor """
        self.proxy = urllib.request.ProxyHandler(urllib.request.getproxies())
        self.opener = urllib.request.build_opener(self.proxy)
        # Installed globally: download() goes through urllib.request.urlopen,
        # which picks up the installed opener.
        urllib.request.install_opener(self.opener)

    def download(self, url):
        """
        Download the given URL and return the source code

        :param url: URL to fetch
        :return: the response body decoded as UTF-8
        :raises UnicodeDecodeError: if the body is not valid UTF-8
        """
        # The context manager closes the response (and its underlying
        # connection) even when read() or decode() raises.
        with urllib.request.urlopen(url) as response:
            return response.read().decode("utf8")

    def download_in_file(self, url, output_file_path):
        """
        Download the given URL and write to the given file

        The download is completed before the output file is opened, so a
        failing download does not truncate an already existing file.

        :param url: URL to fetch
        :param output_file_path: path of the file to (over)write
        """
        content = self.download(url)
        # Write with the same encoding the content was decoded with instead
        # of the platform default, which may be unable to represent it.
        with open(output_file_path, "w", encoding="utf8") as output_file:
            output_file.write(content)


class WikimediaAPI():
    """
    Class used to generate wikimedia API urls for several uses

    The methods only build URLs; they do not perform any request.
    """

    # Namespace name -> namespace number, as the string sent in the URL.
    # Numbers 0-15 are the built-in MediaWiki namespaces (even = subject
    # namespace, odd = its talk namespace).
    RCNAMESPACES_CODES = {
        "(Main)": "0",
        "Talk": "1",
        "User": "2",
        "User talk": "3",
        "Wikipedia": "4",
        "Wikipedia talk": "5",
        "File": "6",
        "File talk": "7",
        "MediaWiki": "8",
        "MediaWiki talk": "9",
        "Template": "10",
        "Template talk": "11",
        "Help": "12",
        "Help talk": "13",
        "Category": "14",
        "Category talk": "15",
        # Custom Wikipedia namespaces
        "Portal": "100",
        "Portal talk": "101",
        "Book": "108",
        "Book talk": "109",
        "Draft": "118",
        "Draft talk": "119",
        "Education Program": "446",
        "Education Program talk": "447",
        "TimedText": "710",
        "TimedText talk": "711",
        "Module": "828",
        "Module talk": "829",
        "Topic": "2600"
    }

    def __init__(self, endpoint="http://en.wikipedia.org/w/api.php",
                 return_format="json"):
        """
        WikimediaAPI class constructor

        The endpoint for this project should be
        "http://en.wikipedia.org/w/api.php" but it can be any other wiki
        api endpoint made with the MediaWiki software.

        The return_format can be one of json, php, wddx, xml, yaml, raw, txt,
        dbg, dump or none.

        :param endpoint: URL of the wiki's api.php
        :param return_format: value sent as the "format" URL parameter
        """
        self.endpoint = endpoint
        self.return_format = return_format

    def gen_query_url(self, parms):
        """
        Generate the query URL.

        The "action" and "format" parameters are always set by this method
        and override any value given in parms. The given dict itself is left
        untouched.

        :param parms: URL parameters dict
        :return: query URL
        """
        # Work on a copy so the caller's dict is not modified.
        query = dict(parms)
        query["action"] = "query"
        query["format"] = self.return_format
        query_str = urllib.parse.urlencode(query)
        # Joining with a query-only reference keeps the endpoint's path and
        # replaces its query string.
        return urllib.parse.urljoin(self.endpoint, "?" + query_str)

    def get_recent_changes(self, namespace="(Main)"):
        """
        Get the url corresponding to the latest changes made to the wiki.
        (https://www.mediawiki.org/wiki/API:Recentchanges)

        The namespace is used to restrict the results to a certain level. It
        can be (Main) which is the default one, "Wikipedia", "File" or
        others. It is converted to the namespace number sent as the
        rcnamespace parameter. See
        https://meta.wikimedia.org/wiki/Help:Namespace

        :param namespace: a key of RCNAMESPACES_CODES
        :return: query URL
        :raises KeyError: if namespace is not a key of RCNAMESPACES_CODES
        """
        return self.gen_query_url({
            "list": "recentchanges",
            "rcnamespace": WikimediaAPI.RCNAMESPACES_CODES[namespace],
        })

    def get_geo_pages(self, lat, long, radius, limit=500, namespace="(Main)"):
        """
        Generate the query URL performing a geographic search of articles
        located near the given coordinates.
        (https://www.mediawiki.org/wiki/Extension:GeoData)

        :param lat: Latitude
        :param long: Longitude
        :param radius: radius (in m) to look inside, rounded to the nearest
            integer
        :param limit: maximum number of results (max 500)
        :param namespace: namespace to restrict the search in, a key of
            RCNAMESPACES_CODES
        :return: query URL
        :raises KeyError: if namespace is not a key of RCNAMESPACES_CODES
        """
        return self.gen_query_url({
            "list": "geosearch",
            "gscoord": "%f|%f" % (lat, long),
            # gsradius is an integer parameter of the API, so a value such
            # as "1000.000000" must not be sent.
            "gsradius": "%d" % round(radius),
            "gsnamespace": WikimediaAPI.RCNAMESPACES_CODES[namespace],
            "gslimit": limit,
        })