From 9539c83b76244e21a562143fc1c22c3000968e5d Mon Sep 17 00:00:00 2001
From: Minijackson
Date: Thu, 23 Oct 2014 20:43:03 +0200
Subject: Adding gathering of data (watchers, revisions) + beginning of histogram

---
Notes on this revision of the patch:
- The line structure was rebuilt from a whitespace-collapsed copy. The
  indentation of the context lines of the src/downloader/__init__.py hunk
  is a best guess and has to be checked against the real file.
- src/main.py: explicit UTF-8 encoding for the written files, flushed
  progress line, no KeyError on page entries without "revisions", chart
  titles without embedded indentation, RES_DIR used for every output path.
- src/histogram/__init__.py: docstrings only.
- The blob ids of the "index" lines of the two new files were dropped since
  the content of these files changed.

 src/downloader/__init__.py |  1 -
 src/histogram/__init__.py  | 32 ++++++++++++++++++++
 src/main.py                | 73 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 105 insertions(+), 1 deletion(-)
 create mode 100644 src/histogram/__init__.py
 create mode 100644 src/main.py

(limited to 'src')

diff --git a/src/downloader/__init__.py b/src/downloader/__init__.py
index baabb36..831ea06 100644
--- a/src/downloader/__init__.py
+++ b/src/downloader/__init__.py
@@ -57,7 +57,6 @@ class Downloader():
                 limit = 0
 
             temp_result = self.download(endpoint, data_type, params, temp_limit)
-            print(list(temp_result[0].keys()))
             if "query-continue" in temp_result[0].keys():
                 params[continue_name] = temp_result[0]["query-continue"][data_type][continue_name]
             else:
diff --git a/src/histogram/__init__.py b/src/histogram/__init__.py
new file mode 100644
--- /dev/null
+++ b/src/histogram/__init__.py
@@ -0,0 +1,32 @@
+"""
+Module used to generate the histogram
+"""
+import pygal
+
+
+class Histogram():
+    """Class used to generate an Histogram
+
+    Only the storage of the chart parameters is implemented for now,
+    generate() does not render anything yet.
+    """
+
+    def __init__(self, outfile, title="", data=""):
+        """Store the chart parameters and create an empty pygal chart
+
+        outfile -- output path given by the caller (stored, not used yet)
+        title -- title of the chart (stored, not used yet)
+        data -- data to plot, can also be set later with feed()
+        """
+        self.outfile = outfile
+        self.title = title
+        self.data = data
+        self.hist = pygal.Histogram()
+
+    def feed(self, data):
+        """Replace the stored data"""
+        self.data = data
+
+    def generate(self):
+        """Placeholder: the rendering is not implemented yet"""
+        pass
diff --git a/src/main.py b/src/main.py
new file mode 100644
--- /dev/null
+++ b/src/main.py
@@ -0,0 +1,73 @@
+#!/usr/bin/env python
+# -*- coding: UTF-8 -*-
+"""
+Gather the watchers and revisions counts of the pages found by the
+geosearch request and dump them in RES_DIR
+"""
+
+import downloader
+import histogram
+import json
+
+# Constants
+RES_DIR = "../res/"
+
+# Gather data
+api = downloader.WikimediaAPI()
+down = downloader.Downloader()
+
+endpoint, geosearch_params = api.get_pages_around()
+geosearch_results = down.download(endpoint, "geosearch",
+                                  geosearch_params, limit=50)
+
+# The results come as a list of fragments, each holding some of the pages
+pages_title = []
+for result_fragment in geosearch_results:
+    for page_properties in result_fragment["query"]["geosearch"]:
+        pages_title.append(page_properties["title"])
+
+# Explicit encoding: page titles are not restricted to ASCII
+with open(RES_DIR + "Pages", "w", encoding="utf-8") as f:
+    f.write('\n'.join(pages_title))
+
+data_count = len(pages_title)
+data = {}
+
+for i, title in enumerate(pages_title, start=1):
+    # flush: the line has no newline, it would stay in the buffer otherwise
+    print("\rGathering data, please wait: " + str(100*i/data_count) + "%",
+          end="", flush=True)
+    endpoint, watchers_params = api.get_watchers(title)
+    endpoint, revisions_params = api.get_revisions(title)
+    watchers_results = down.download(endpoint, "info",
+                                     watchers_params, limit=500)
+    revisions_results = down.download(endpoint, "revisions",
+                                      revisions_params, limit=100000)
+
+    # Sum the revisions over every result fragment; a page entry without
+    # a "revisions" key counts for zero instead of raising a KeyError
+    page_revisions = 0
+    for revisions_result in revisions_results:
+        for page_info in revisions_result["query"]["pages"].values():
+            page_revisions += len(page_info.get("revisions", []))
+
+    # A page entry may lack the "watchers" key: default to 0
+    for page_info in watchers_results[0]["query"]["pages"].values():
+        data[page_info["title"]] = {
+            "watchers": page_info.get("watchers", 0),
+            "revisions": page_revisions
+        }
+
+with open(RES_DIR + "data-out", "w", encoding="utf-8") as f:
+    f.write(json.dumps(data, indent=4, ensure_ascii=False))
+
+# Make histograms
+# Each title is a single literal: a backslash continuation inside the
+# string would embed the indentation of the next line in the title
+watcher_revisions = histogram.Histogram(
+    RES_DIR + "warev.svg",
+    title="Nombre de watchers par rapport au nombre de révisions")
+
+watcher_contributers = histogram.Histogram(
+    RES_DIR + "waco.svg",
+    title="Nombre de watchers par rapport au nombre de contributeurs")
-- 
cgit v1.2.3