summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/downloader/__init__.py64
1 files changed, 50 insertions, 14 deletions
diff --git a/src/downloader/__init__.py b/src/downloader/__init__.py
index cfcf20a..4dd6169 100644
--- a/src/downloader/__init__.py
+++ b/src/downloader/__init__.py
@@ -6,15 +6,13 @@ program, including manipulation of the wikimedia API.
6""" 6"""
7 7
8import urllib.request 8import urllib.request
9# For system proxy
10import os
11 9
12 10
13class Downloader(): 11class Downloader():
14 """Class used to download a given webpage considering system proxy""" 12 """Class used to download a given webpage considering system proxy"""
15 def __init__(self): 13 def __init__(self):
16 self.proxy_address = os.environ.get("HTTP_Proxy") 14 """ Downloader class constructor """
17 self.proxy = urllib.request.ProxyHandler({'http': self.proxy_address}) 15 self.proxy = urllib.request.ProxyHandler(urllib.request.getproxies())
18 self.opener = urllib.request.build_opener(self.proxy) 16 self.opener = urllib.request.build_opener(self.proxy)
19 urllib.request.install_opener(self.opener) 17 urllib.request.install_opener(self.opener)
20 18
@@ -31,13 +29,19 @@ class Downloader():
31class WikimediaAPI(): 29class WikimediaAPI():
32 """ 30 """
33 Class used to generate wikimedia API urls for several uses 31 Class used to generate wikimedia API urls for several uses
34
35 The endpoint for this project should be "http://en.wikipedia.org/w/api.php"
36 but can be other wiki api endpoint made with the Wikimedia software.
37 The return_format can be one of json, php, wddx, xml, yaml, raw, txt, dbg,
38 dump or none.
39 """ 32 """
40 def __init__(self, endpoint, return_format): 33 def __init__(self, endpoint="http://en.wikipedia.org/w/api.php",
34 return_format="json"):
35 """
36 WikimediaAPI class constructor
37
38 The endpoint for this project should be
39 "http://en.wikipedia.org/w/api.php" but it can be any other wiki
40 api endpoint made with the Wikimedia software.
41
42 The return_format can be one of json, php, wddx, xml, yaml, raw, txt,
43 dbg, dump or none.
44 """
41 self.endpoint = endpoint 45 self.endpoint = endpoint
42 self.return_format = return_format 46 self.return_format = return_format
43 47
@@ -47,8 +51,40 @@ class WikimediaAPI():
47 (https://www.mediawiki.org/wiki/API:Recentchanges) 51 (https://www.mediawiki.org/wiki/API:Recentchanges)
48 52
49 The namespace is used to restrict the results to a certain level. It 53 The namespace is used to restrict the results to a certain level. It
50 can be "(Main)" which is the default one, "Wikipedia", "File" or 54 can be (Main) which is the default one, "Wikipedia", "File" or
51 others. See https://meta.wikimedia.org/wiki/Help:Namespace 55 others. It will be converted to an int corresponding to the rcnamespace
56 parameter. See https://meta.wikimedia.org/wiki/Help:Namespace
52 """ 57 """
53 return self.base_url + "?action=query&list=recentchanges&format="\ 58 rcnamespaces = {
54 + self.return_format + "&namespace=" + namespace 59 "(Main)": "0",
60 "Talk": "1",
61 "User talk": "2",
62 "Wikipedia": "3",
63 "Wikipedia talk": "4",
64 "File": "5",
65 "File talk": "6",
66 "MediaWiki": "7",
67 "MediaWiki talk": "8",
68 "Template": "9",
69 "Template talk": "10",
70 "Help": "11",
71 "Help talk": "12",
72 "Category": "13",
73 "Category talk": "14",
74 # Custom Wikipedia namespaces
75 "Portal": "100",
76 "Portal talk": "101",
77 "Book": "108",
78 "Book talk": "109",
79 "Draft": "118",
80 "Draft talk": "119",
81 "Education Program": "446",
82 "Education Program talk": "447",
83 "TimedText": "710",
84 "TimedText talk": "711",
85 "Module": "828",
86 "Module talk": "829",
87 "Topic": "2600"
88 }
89 return self.endpoint + "?action=query&list=recentchanges&format="\
90 + self.return_format + "&namespace=" + rcnamespaces[namespace]