Zensursula und negative Verantwortungsattribution - Netzpolitik
Erfolgreiche ePaper selbst erstellen
Machen Sie aus Ihren PDF Publikationen ein blätterbares Flipbook mit unserer einzigartigen Google optimierten e-Paper Software.
Anhang 1 – Der Topsy-Crawler
# -*- coding: utf-8 -*-<br />
#!/usr/bin/python<br />
### Crawler for topsy<br />
### Keven Richly and Lutz Mache, 2012<br />
import datetime<br />
import json<br />
import sqlite3<br />
import threading<br />
import time<br />
import urllib2<br />
from multiprocessing import Queue<br />
### variables
# Toggle console logging of responses / outgoing requests.
print_response_info = True
print_request_info = True
# When True, (re)create the target table before crawling.
create_new_table = False
# Width of one crawl time window in seconds (604800 s = 7 days, times 10).
request_window = 604800 * 10
# Maximum number of API requests to issue (Topsy rate limit headroom).
max_request = 950
# NOTE(review): placeholder path from the original listing — set before running.
db_path = '/…/main.sqlite'
table_name = 'z_original_topsy_data'
### request parameters
tstart = 1217541600  # Unix timestamp: 1 August 2008
tend = 1325286000  # Unix timestamp: 31 December 2011
searchterm = 'zensursula'
# NOTE: shadows the builtin `type`; kept because later code reads this global.
type = 'tweet'
page = 1
perpage = 100  # results per page (Topsy API maximum)
### classes<br />
class Crawler(threading.Thread):<br />
def __init__(self, t1, t2, results):<br />
self.t1 = t1<br />
self.t2 = t2<br />
self.results = results<br />
threading.Thread.__init__(self)<br />
# returns the request url corresponding to the defined parameters<br />
def get_request(self, start, end, term, page, type, perpage):<br />
return 'http://otter.topsy.com/search.json?q=' + searchterm + '&sort_method=date&perpage='<br />
+ str(perpage) + '&page=' + str(page) + '&mintime=' + str(start) + '&maxtime='<br />
+ str(end) + '&type=' + type<br />
class Page_Crawler(Crawler):
    """Crawler thread that fetches one result page of a time window."""

    def __init__(self, t1, t2, current_page, count, results, errors, window):
        # errors: queue of failed [t1, t2, page] triples for later retry;
        # count: queue receiving the number of items fetched per page.
        self.errors = errors
        self.page = current_page
        self.count = count
        self.window = window
        Crawler.__init__(self, t1, t2, results)

    # executes the given query and catches exceptions
    def execute_query(self, query):
        """Fetch `query` and decode the JSON response.

        On any failure the affected window/page is pushed to the error
        queue and None is returned (fix: the original fell through and
        implicitly returned None without the caller guarding for it).
        """
        try:
            return json.load(urllib2.urlopen(query))
        except Exception as e:  # fix: `except Exception, e` is Py2-only syntax
            print('Error: query processing:')
            print(e)
            print('actual time window was added to error list: ')
            print('min: ' + str(self.t1))
            print('max: ' + str(self.t2))
            self.errors.put([self.t1, self.t2, self.page])
            return None

    def run(self):
        # fix: original called bare `get_request(...)` -> NameError;
        # it is an inherited method and must be called on self.
        topsy_response = self.execute_query(self.get_request(
            self.t1, self.t2, searchterm, self.page, type, perpage))
        # fix: guard the failure path so a failed request does not raise
        # TypeError on `None['response']`.
        if topsy_response is None:
            return
        items = topsy_response['response']['list']
        if len(items) > 0:
            self.results.put(items)
            self.count.put(len(items))
class Timewindow_Crawler(Crawler):<br />
105/148