summaryrefslogtreecommitdiff
path: root/src/downloader/__init__.py
diff options
context:
space:
mode:
Diffstat (limited to 'src/downloader/__init__.py')
-rw-r--r--src/downloader/__init__.py54
1 files changed, 54 insertions, 0 deletions
diff --git a/src/downloader/__init__.py b/src/downloader/__init__.py
new file mode 100644
index 0000000..cfcf20a
--- /dev/null
+++ b/src/downloader/__init__.py
@@ -0,0 +1,54 @@
1"""
2Init file of the downloader module.
3
4The downloader module is used to take care of the downloading part of the
5program, including manipulation of the wikimedia API.
6"""
7
8import urllib.request
9# For system proxy
10import os
11
12
class Downloader:
    """Download web pages, sending HTTP requests through the system proxy.

    Creating an instance installs its opener process-wide (see
    ``urllib.request.install_opener``), so later ``urlopen`` calls anywhere
    in the process go through the same proxy configuration.
    """

    def __init__(self):
        # "HTTP_Proxy" is the historical lookup of this class and keeps
        # priority; urllib's own detection covers the usual "http_proxy" /
        # "HTTP_PROXY" spellings when it is unset or empty.
        self.proxy_address = (
            os.environ.get("HTTP_Proxy")
            or urllib.request.getproxies().get("http")
        )
        # Register a proxy only when one is configured: a None address would
        # make the handler fail as soon as an http:// URL is requested.
        proxies = {"http": self.proxy_address} if self.proxy_address else {}
        self.proxy = urllib.request.ProxyHandler(proxies)
        self.opener = urllib.request.build_opener(self.proxy)
        urllib.request.install_opener(self.opener)

    def download(self, url):
        """Download the given URL and return its body decoded as UTF-8.

        Raises whatever ``urllib.request.urlopen`` raises on failure, and
        ``UnicodeDecodeError`` when the body is not valid UTF-8.
        """
        # The context manager closes the response (and its socket) even when
        # reading or decoding fails.
        with urllib.request.urlopen(url) as response:
            return response.read().decode("utf8")

    def download_in_file(self, url, output_file_path):
        """Download the given URL and write its text to the given file.

        The file is written as UTF-8, independently of the locale encoding.
        """
        # Fetch first so that a failed download does not truncate or create
        # the output file.
        content = self.download(url)
        with open(output_file_path, "w", encoding="utf-8") as output_file:
            output_file.write(content)
29
30
class WikimediaAPI:
    """
    Class used to generate MediaWiki API urls for several uses

    The endpoint for this project should be "http://en.wikipedia.org/w/api.php"
    but can be any other wiki API endpoint served by the MediaWiki software.
    The return_format can be one of json, php, wddx, xml, yaml, raw, txt, dbg,
    dump or none.
    """

    def __init__(self, endpoint, return_format):
        # Base URL of the api.php script; every generated URL starts with it.
        self.endpoint = endpoint
        # Value sent as the "format" query parameter.
        self.return_format = return_format

    def get_recent_changes(self, namespace="(Main)"):
        """
        Get the url corresponding to the latest changes made to the wiki.
        (https://www.mediawiki.org/wiki/API:Recentchanges)

        The namespace is used to restrict the results to a certain level. It
        can be "(Main)" which is the default one, "Wikipedia", "File" or
        others. See https://meta.wikimedia.org/wiki/Help:Namespace

        Returns the URL as a string; nothing is downloaded here.

        NOTE(review): the API page linked above appears to name this filter
        "rcnamespace" and to expect numeric namespace ids; the original
        "namespace=<name>" pair is kept unchanged here -- confirm before
        relying on the filter.
        """
        from urllib.parse import quote

        # Percent-encode the values so that names containing spaces or other
        # reserved characters (e.g. "User talk") still give a valid URL.
        # Parentheses stay literal so the default "(Main)" is emitted as-is.
        return (
            self.endpoint
            + "?action=query&list=recentchanges&format="
            + quote(self.return_format, safe="()")
            + "&namespace="
            + quote(namespace, safe="()")
        )