diff options
Diffstat (limited to 'src/downloader/__init__.py')
-rw-r--r-- | src/downloader/__init__.py | 54 |
1 files changed, 54 insertions, 0 deletions
diff --git a/src/downloader/__init__.py b/src/downloader/__init__.py new file mode 100644 index 0000000..cfcf20a --- /dev/null +++ b/src/downloader/__init__.py | |||
@@ -0,0 +1,54 @@ | |||
1 | """ | ||
2 | Init file of the downloader module. | ||
3 | |||
4 | The downloader module is used to take care of the downloading part of the | ||
5 | program, including manipulation of the wikimedia API. | ||
6 | """ | ||
7 | |||
8 | import urllib.request | ||
9 | # For system proxy | ||
10 | import os | ||
11 | |||
12 | |||
class Downloader():
    """Class used to download a given webpage considering system proxy"""

    def __init__(self):
        """
        Build the URL opener and install it as urllib's global default.

        The proxy address is read from the ``HTTP_Proxy`` environment
        variable and kept in ``self.proxy_address`` (``None`` when unset).
        """
        self.proxy_address = os.environ.get("HTTP_Proxy")
        if self.proxy_address:
            self.proxy = urllib.request.ProxyHandler(
                {'http': self.proxy_address})
        else:
            # Registering a None proxy URL makes urllib fail while parsing it
            # on the first HTTP request. Without an argument, ProxyHandler
            # uses the proxies urllib detects by itself (possibly none).
            self.proxy = urllib.request.ProxyHandler()
        self.opener = urllib.request.build_opener(self.proxy)
        urllib.request.install_opener(self.opener)

    def download(self, url):
        """
        Download the given URL and return the source code.

        The response body is decoded as UTF-8 and returned as a string.
        Errors raised by urllib (e.g. ``urllib.error.URLError``) and
        ``UnicodeDecodeError`` are propagated to the caller.
        """
        # The context manager closes the connection even if read() fails.
        with self.opener.open(url) as response:
            return response.read().decode("utf8")

    def download_in_file(self, url, output_file_path):
        """
        Download the given URL and write to the given file.

        The file is written as UTF-8, matching the decoding done by
        ``download``. An existing file is overwritten.
        """
        # Fetch first so a failed download does not truncate an existing file.
        content = self.download(url)
        with open(output_file_path, "w", encoding="utf-8") as output_file:
            output_file.write(content)
29 | |||
30 | |||
class WikimediaAPI():
    """
    Class used to generate wikimedia API urls for several uses

    The endpoint for this project should be "http://en.wikipedia.org/w/api.php"
    but can be any other wiki API endpoint powered by the MediaWiki software.
    The return_format can be one of json, php, wddx, xml, yaml, raw, txt, dbg,
    dump or none.
    """

    def __init__(self, endpoint, return_format):
        """Store the API endpoint URL and the requested response format."""
        self.endpoint = endpoint
        self.return_format = return_format

    def get_recent_changes(self, namespace="(Main)"):
        """
        Get the url corresponding to the latest changes made to the wiki.
        (https://www.mediawiki.org/wiki/API:Recentchanges)

        The namespace is used to restrict the results to a certain level. It
        can be "(Main)" which is the default one, "Wikipedia", "File" or
        others. See https://meta.wikimedia.org/wiki/Help:Namespace

        Returns the URL as a string; no request is performed here.
        """
        # The URL is built from self.endpoint, the attribute set in __init__.
        # NOTE(review): the namespace is appended verbatim (not URL-encoded)
        # and sent as "namespace"; the linked API page should be checked for
        # the exact parameter name and accepted values.
        return self.endpoint + "?action=query&list=recentchanges&format="\
            + self.return_format + "&namespace=" + namespace