A simple scraper for sicris.si
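The script below pulls the list of researcher IDs from the SICRIS REST endpoint, then fetches each researcher's full record and stores it in a local MongoDB instance, logging progress along the way.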
#!/usr/bin/python3
# -*- coding: UTF-8 -*-
__author__ = "Jani Šumak <[email protected]>"
__version__ = "1.0"
import datetime
import logging
import json
import time
from pymongo import MongoClient
import requests
# Start the timer (time.clock() was removed in Python 3.8, so use perf_counter)
start = time.perf_counter()
# Set up logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)  # make the logger.info() calls below visible
out = logging.StreamHandler()
f = logging.FileHandler("ResearcherCrawler.log")
formatter = logging.Formatter(
    "%(asctime)s - %(levelname)s - %(message)s"
)
out.setFormatter(formatter)
f.setFormatter(formatter)
logger.addHandler(out)
logger.addHandler(f)
# Prepare the crawler
start_url = "http://www.sicris.si"
rest_url = "http://www.sicris.si/Common/rest.aspx?"
# The params are special because the API is not an API by
# modern standards, so we format the query string by hand
fields = "fields="
# The session ID is hard-coded; it may need refreshing if the server invalidates it
session_id = "sessionID=1234CRIS12002B01B01A03IZUMBFICDOSKJHS588Nn44131"
entity = "entity="
method_call = "methodCall="
country = "country=SI_JSON"
# Open a session; fetching the landing page first picks up the site cookies
s = requests.Session()
s.headers.update({"User-Agent": "ResearcherCrawler/1.0 ([email protected])"})
r = s.get(start_url)
# Get the researcher IDs
r_fields = fields + "rsrid"
r_method_call = method_call + "auto=%20and%20lang=slv"
r_entity = entity + "RSR"
params = "&".join([r_fields, session_id, r_entity, r_method_call, country])
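# The join produces a query string like:
# "fields=rsrid&sessionID=...&entity=RSR&methodCall=auto=%20and%20lang=slv&country=SI_JSON"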
r = s.get(rest_url + params)
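# The JSON payload sits on the last line of the response; earlier lines are noise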
researchers = r.text.splitlines()[-1]
researchers_list = json.loads(researchers)
# Set up the database
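# MongoClient() with no arguments connects to the default mongodb://localhost:27017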
client = MongoClient()
db = client.db
# Get the researchers
print(researchers_list[:5])
res_entity = entity + "rsr"
for res in researchers_list:
    rsr_id = res["RSRID"]
    res_method_call = "methodCall=id={} and lang=slv".format(rsr_id)
    # fields= is left empty here (the rsrid-only filter above is dropped)
    params = "&".join([fields, session_id, res_entity, res_method_call, country])
    # Wait a bit between requests to go easy on the server
    time.sleep(5)
    r = s.get(rest_url + params)
    # The record arrives on the last line; strip the wrapping brackets
    # so the single JSON object parses cleanly
    person = r.text.splitlines()[-1]
    person_json = json.loads(person[1:-1])
    try:
        db.researchers.insert_one(person_json)
        logger.info("Inserted %s into database", rsr_id)
    except Exception:
        logger.warning("Failed to insert %s", rsr_id)
end = time.perf_counter()
value = end - start
timestamp = datetime.datetime.now()
print("The program has ended")
print("Time: {}".format(timestamp.strftime("%Y-%m-%d %H:%M:%S")))
print("Elapsed: {:.2f} seconds".format(value))
print("Written {} JSON objects".format(len(researchers_list)))