Last active
July 22, 2019 20:21
-
-
Save keithrozario/a071280dc2691e9f175959996053d656 to your computer and use it in GitHub Desktop.
Script to scrape PRU website for candidates
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import json
import operator
import os
import re
import time

import requests
from bs4 import BeautifulSoup

import pru_14_json  # https://calon.spr.gov.my/pru14_json.js
# Collect every 'parlimen' seat entry found under each key of
# pru_14_json.json_data into a single flat list.
# Data source: https://calon.spr.gov.my/pru14_json.js
parlimen_seats = [
    seat
    for state in pru_14_json.json_data
    for seat in pru_14_json.json_data[state]['parlimen']
]
# Collect every 'dun' (state seat) entry found under each key of
# pru_14_json.json_data into a single flat list.
state_seats = [
    seat
    for state in pru_14_json.json_data
    for seat in pru_14_json.json_data[state]['dun']
]
# Browser-like request headers sent with every request to calon.spr.gov.my.
# NOTE(review): the Cookie value carries a hard-coded PHPSESSID captured from
# a browser session; presumably the form token is tied to that session --
# confirm the site still accepts it before relying on this script.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'br' is deliberately not advertised: requests only decodes Brotli when
    # an optional brotli package is installed, so a Brotli-encoded response
    # could otherwise reach BeautifulSoup / json.loads still compressed.
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-US,en;q=0.5',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Cookie': '_ga=GA1.3.1001500445.1524900877; _gid=GA1.3.593399681.1524900877; PHPSESSID=56433b80694baae3aad91e66563c3484',
    'DNT': '1',
    'Host': 'calon.spr.gov.my',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:59.0) Gecko/20100101 Firefox/59.0',
}
# Initial request to the landing page to obtain the first token, which is
# read from the <input id="to_spr_ken"> element of the returned HTML.
# The request method (POST) is kept exactly as the script has always used it.
# A timeout prevents the script from hanging forever on a stalled connection,
# and raise_for_status() surfaces HTTP errors instead of parsing an error page.
r = requests.post('https://calon.spr.gov.my/', headers=headers, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.content, "html.parser")
token_input = soup.find('input', {'id': 'to_spr_ken'})
if token_input is None:
    # Fail with a clear message rather than an AttributeError on None.
    raise RuntimeError(
        "Token field 'to_spr_ken' not found in the response from "
        "https://calon.spr.gov.my/"
    )
token = token_input.get('value')
# Fetch the candidates ('calon') for every parliament seat.
# Each request posts the seat id together with the current token; the JSON
# reply carries the candidate list plus a fresh token for the next request.
records = []
for seat in parlimen_seats:
    payload = {'kod': str(seat['id']),
               'token': token}
    # timeout: do not let one stalled request block the whole run.
    r = requests.post(url='https://calon.spr.gov.my/ajax.php', data=payload,
                      headers=headers, allow_redirects=True, timeout=30)
    # Stop on an HTTP error instead of trying to parse an error page as JSON.
    r.raise_for_status()
    response_json = json.loads(r.text)
    for calon in response_json['calon']:
        record = {'seat': seat, 'calon': calon, 'id': seat['kerusi_id']}
        records.append(record)
        print(record)
    token = response_json['token']  # refresh token
    time.sleep(1)  # don't overload the site hashtag:#responsibleScraper
records.sort(key=operator.itemgetter('id'))  # sort by seat id, e.g P.001
# Write the parliament candidates out to election_results/parlimen.csv,
# one row per candidate, preceded by a header row.
# Create the output directory first so a missing folder does not throw away
# the results of the (slow) scrape above.
os.makedirs("election_results", exist_ok=True)
# newline="" is required by the csv module; without it the writer's own
# line terminators get translated again and produce blank rows on Windows.
with open("election_results/parlimen.csv", "w", newline="") as csvfile:
    csv_writer = csv.writer(csvfile, delimiter=',',
                            quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerow(["Seat ID",
                         "Seat Name",
                         "Candidate Name",
                         "Candidate Ballot Name",
                         "Candidate Party"])
    for record in records:
        csv_writer.writerow([record['seat']['kerusi_id'],
                             record['seat']['name'],
                             record['calon']['nama'],
                             record['calon']['nama_undi'],
                             record['calon']['parti']])
# Fetch the candidates ('calon') for every state seat.
# Continues with the token left over from the previous request; every JSON
# reply carries a fresh token for the next one.
records = []
for seat in state_seats:
    payload = {'kod': str(seat['id']),
               'token': token}
    # timeout: do not let one stalled request block the whole run.
    r = requests.post(url='https://calon.spr.gov.my/ajax.php', data=payload,
                      headers=headers, allow_redirects=True, timeout=30)
    # Stop on an HTTP error instead of trying to parse an error page as JSON.
    r.raise_for_status()
    response_json = json.loads(r.text)
    for calon in response_json['calon']:
        record = {'seat': seat, 'calon': calon, 'id': seat['state_id']}
        records.append(record)
        print(record)
    token = response_json['token']  # refresh token
    time.sleep(1)  # responsible Scraper
records.sort(key=operator.itemgetter('id'))  # sort by state_id
# Write the state-seat candidates out to election_results/state.csv, one row
# per candidate, preceded by a header row. The state name is looked up from
# pru_14_json.state_mapping using the seat's state_id.
# Create the output directory first so a missing folder does not throw away
# the results of the (slow) scrape above.
os.makedirs("election_results", exist_ok=True)
# newline="" is required by the csv module; without it the writer's own
# line terminators get translated again and produce blank rows on Windows.
with open("election_results/state.csv", "w", newline="") as csvfile:
    csv_writer = csv.writer(csvfile, delimiter=',',
                            quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerow(["State",
                         "Seat ID",
                         "Seat Name",
                         "Candidate Name",
                         "Candidate Ballot Name",
                         "Candidate Party"])
    for record in records:
        csv_writer.writerow([pru_14_json.state_mapping[record['seat']['state_id']],
                             record['seat']['kerusi_id'],
                             record['seat']['name'],
                             record['calon']['nama'],
                             record['calon']['nama_undi'],
                             record['calon']['parti']])
# end like a boss
print("Keith is awesome!")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Updated to a much better version that uses the csv module instead of crafting the CSV 'by hand' :)
Also included the full JS from the PRU website, with all mappings and IDs.