diff options
Diffstat (limited to 'src/downloader')
-rw-r--r-- | src/downloader/__init__.py | 103 |
1 files changed, 85 insertions, 18 deletions
diff --git a/src/downloader/__init__.py b/src/downloader/__init__.py index 15fba41..3c7706f 100644 --- a/src/downloader/__init__.py +++ b/src/downloader/__init__.py | |||
@@ -7,6 +7,7 @@ program, including manipulation of the wikimedia API. | |||
7 | 7 | ||
8 | import urllib.request | 8 | import urllib.request |
9 | import urllib.parse | 9 | import urllib.parse |
10 | import json | ||
10 | 11 | ||
11 | 12 | ||
12 | class Downloader(): | 13 | class Downloader(): |
@@ -16,16 +17,61 @@ class Downloader(): | |||
16 | self.proxy = urllib.request.ProxyHandler(urllib.request.getproxies()) | 17 | self.proxy = urllib.request.ProxyHandler(urllib.request.getproxies()) |
17 | self.opener = urllib.request.build_opener(self.proxy) | 18 | self.opener = urllib.request.build_opener(self.proxy) |
18 | urllib.request.install_opener(self.opener) | 19 | urllib.request.install_opener(self.opener) |
20 | self.data_types = { | ||
21 | "revisions": "rv", | ||
22 | "recentchanges": "rc", | ||
23 | "contributors": "pc" | ||
24 | } | ||
25 | |||
26 | def download(self, endpoint, data_type, params={}, limit=0): | ||
27 | """ | ||
28 | Download the given URL with POST parameters and return the source code | ||
29 | with rccontinue support. If limit equals 0, then it is assumed that the | ||
30 | limit is given within the parameters | ||
31 | """ | ||
32 | if data_type in self.data_types.keys(): | ||
33 | prefix = self.data_types[data_type] | ||
34 | limit_name = prefix + "limit" | ||
35 | continue_name = prefix + "continue" | ||
36 | |||
37 | if limit == 0: | ||
38 | limit = params[limit_name] | ||
39 | else: | ||
40 | params[limit_name] = limit | ||
41 | |||
42 | if limit <= 500: | ||
43 | url = self.compile_url(endpoint, params) | ||
44 | result = urllib.request.urlopen(url).read().decode("utf8") | ||
45 | return [json.loads(result)] | ||
46 | else: | ||
47 | results = [] | ||
48 | # Fetch the results in batches of at most 500 | ||
49 | while limit > 0: | ||
50 | # Support for numbers like 1542 | ||
51 | if limit > 500: | ||
52 | limit -= 500 | ||
53 | temp_limit = 500 | ||
54 | else: | ||
55 | temp_limit = limit | ||
56 | limit = 0 | ||
19 | 57 | ||
20 | def download(self, url): | 58 | temp_result = self.download(endpoint, data_type, params, temp_limit) |
21 | """ Download the given URL and return the source code """ | 59 | if "query-continue" in temp_result[0].keys(): |
22 | return urllib.request.urlopen(url).read().decode("utf8") | 60 | params[continue_name] = temp_result[0]["query-continue"][data_type][continue_name] |
61 | else: | ||
62 | limit = 0 | ||
63 | results.append(temp_result) | ||
64 | return results | ||
23 | 65 | ||
24 | def download_in_file(self, url, output_file_path): | 66 | def download_in_file(self, url, output_file_path): |
25 | """ Download the given URL and write to the given file """ | 67 | """ Download the given URL and write to the given file """ |
26 | with open(output_file_path, "w") as output_file: | 68 | with open(output_file_path, "w") as output_file: |
27 | output_file.write(self.download(url)) | 69 | output_file.write(self.download(url)) |
28 | 70 | ||
71 | def compile_url(self, endpoint, params={}): | ||
72 | url_params_str = urllib.parse.urlencode(params) | ||
73 | return urllib.parse.urljoin(endpoint, "?" + url_params_str) | ||
74 | |||
29 | 75 | ||
30 | class WikimediaAPI(): | 76 | class WikimediaAPI(): |
31 | """ | 77 | """ |
@@ -45,18 +91,7 @@ class WikimediaAPI(): | |||
45 | """ | 91 | """ |
46 | self.endpoint = endpoint | 92 | self.endpoint = endpoint |
47 | self.return_format = return_format | 93 | self.return_format = return_format |
48 | 94 | self.namespaces = { | |
49 | def get_recent_changes(self, namespace="(Main)"): | ||
50 | """ | ||
51 | Get the url corresponding to the latest changes made to the wiki. | ||
52 | (https://www.mediawiki.org/wiki/API:Recentchanges) | ||
53 | |||
54 | The namespace is used to restrict the results to a certain level. It | ||
55 | can be (Main) which is the default one, "Wikipedia", "File" or | ||
56 | others. It will be converted to an int corresponding to the rcnamespace | ||
57 | parameter. See https://meta.wikimedia.org/wiki/Help:Namespace | ||
58 | """ | ||
59 | rcnamespaces = { | ||
60 | "(Main)": "0", | 95 | "(Main)": "0", |
61 | "Talk": "1", | 96 | "Talk": "1", |
62 | "User talk": "2", | 97 | "User talk": "2", |
@@ -88,11 +123,43 @@ class WikimediaAPI(): | |||
88 | "Topic": "2600" | 123 | "Topic": "2600" |
89 | } | 124 | } |
90 | 125 | ||
126 | def get_recent_changes(self, namespace="(Main)"): | ||
127 | """ | ||
128 | Get the url corresponding to the latest changes made to the wiki. | ||
129 | (https://www.mediawiki.org/wiki/API:Recentchanges) | ||
130 | |||
131 | The namespace is used to restrict the results to a certain level. It | ||
132 | can be (Main) which is the default one, "Wikipedia", "File" or | ||
133 | others. It will be converted to an int corresponding to the rcnamespace | ||
134 | parameter. See https://meta.wikimedia.org/wiki/Help:Namespace | ||
135 | """ | ||
136 | |||
91 | url_params = { | 137 | url_params = { |
92 | "action": "query", | 138 | "action": "query", |
93 | "list": "recentchanges", | 139 | "list": "recentchanges", |
94 | "format": self.return_format, | 140 | "format": self.return_format, |
95 | "rcnamespace": rcnamespaces[namespace], | 141 | "rcnamespace": self.namespaces[namespace], |
142 | } | ||
143 | return self.endpoint, url_params | ||
144 | |||
145 | def get_contributors(self, page="Main_Page", namespace="(Main)"): | ||
146 | """ | ||
147 | Get the url corresponding to the contributors of a given page or list | ||
148 | of pages. | ||
149 | (https://www.mediawiki.org/wiki/API:Properties#contributors_.2F_pc) | ||
150 | |||
151 | Use the 'page' parameter to specify the Wikipedia page(s) | ||
152 | |||
153 | The namespace is used to restrict the results to a certain level. It | ||
154 | can be (Main) which is the default one, "Wikipedia", "File" or | ||
155 | others. It will be converted to an int corresponding to the pcnamespace | ||
156 | parameter. See https://meta.wikimedia.org/wiki/Help:Namespace | ||
157 | """ | ||
158 | |||
159 | url_params = { | ||
160 | "action": "query", | ||
161 | "prop": "contributors", | ||
162 | "format": self.return_format, | ||
163 | "titles": page, | ||
96 | } | 164 | } |
97 | url_params_str = urllib.parse.urlencode(url_params) | 165 | return self.endpoint, url_params |
98 | return urllib.parse.urljoin(self.endpoint, "?" + url_params_str) | ||