Last active
June 8, 2019 07:11
-
-
Save dlebech/1c17f80fdbbdbbf85a4a6b8c7f740376 to your computer and use it in GitHub Desktop.
Extract photos and names of members of Danish parliament
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Public Domain CC0 license. https://creativecommons.org/publicdomain/zero/1.0/
# Run this file first, e.g.:
# $ scrapy runspider ft_extract.py -o members.json
#
# It will probably stop working if the URLs of the contact list change.
# Worked in spring of 2019.
import scrapy | |
import re | |
from urllib.parse import urlparse, urlunparse | |
# Greedy capture: grabs everything between the first "(" and the last ")"
# found on a line of the title.
title_matcher = re.compile(
    r'\((.*)\)'
)
def extract_image_url(image_src):
    """Return ``image_src`` with its query string and fragment removed."""
    parts = urlparse(image_src)
    # Blank the last two URL components and reassemble the rest unchanged.
    return urlunparse(parts._replace(query='', fragment=''))
def extract_party(title):
    """Return the parenthesised part of a member title, e.g. the party code.

    Args:
        title: Title text such as ``'Some title (S)'``. May be empty or None.

    Returns:
        The text captured by ``title_matcher`` for the first match, or
        ``None`` when the title contains no parenthesised part.
    """
    # search() yields the same first match that findall()[0] would, but
    # lets us return None instead of raising IndexError on titles without
    # parentheses, so one odd row does not abort the whole scrape.
    match = title_matcher.search(title or '')
    return match.group(1) if match else None
def extract_member(tr):
    """Build a member record from one row of the contact table.

    Args:
        tr: Selector for a single table row.

    Returns:
        A dict with the keys ``name``, ``title``, ``party`` and ``photo``,
        or ``None`` when the row has no first name. ``photo`` is ``None``
        when the row contains no ``<img>`` with a ``src`` attribute.
    """
    def cell_text(data_title):
        # extract_first() returns None for a missing cell; default to ''
        # so .strip() cannot raise AttributeError on such rows.
        selector = f'td[data-title="{data_title}"] ::text'
        return tr.css(selector).extract_first(default='').strip()

    first_name = cell_text('Fornavn')
    if not first_name:
        # No first name: this row does not describe a member.
        return None

    last_name = cell_text('Efternavn')
    title = cell_text('Titel')
    image_src = tr.css('img::attr(src)').extract_first()

    return {
        'name': f'{first_name} {last_name}',
        'title': title,
        'party': extract_party(title),
        # Only normalise the URL when the row actually has an image.
        'photo': extract_image_url(image_src) if image_src else None,
    }
class FTSpider(scrapy.Spider):
    """Spider that reads the member listing and yields one item per member."""

    name = 'ftspider'
    start_urls = ['https://www.ft.dk/da/kontakt/telefonbog?pageSize=200&level1=MEDLEM']

    def parse(self, response):
        """Yield the record of every table row that extract_member accepts."""
        rows = response.css('table>tr')
        # Rows for which extract_member returns a falsy value are dropped.
        yield from filter(None, map(extract_member, rows))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Public Domain CC0 license. https://creativecommons.org/publicdomain/zero/1.0/
# After running ft_extract.py,
# this script downloads the member photos
# and organizes them in folders by party affiliation
# and by left/right block, according to the layout during spring 2019.
import json | |
import os | |
import shutil | |
import requests | |
# Load the member records written by ft_extract.py. The encoding is given
# explicitly so the result does not depend on the platform default.
with open('members.json', encoding='utf-8') as f:
    members = json.load(f)

# Party codes whose members are skipped entirely.
excludes = {'T', 'IA', 'NQ', 'JF'}

# Party code -> political block (layout as of spring 2019).
blocks = {
    'S': 'left',
    'SF': 'left',
    'EL': 'left',
    'ALT': 'left',
    'RV': 'left',
    'LA': 'right',
    'V': 'right',
    'DF': 'right',
    'KF': 'right',
}

for member in members:
    party = member['party']
    if not party or party in excludes:
        continue

    photo_url = member['photo']
    if not photo_url:
        print(f'Skipping {member["name"]}: no photo URL')
        continue

    try:
        # A timeout keeps one stalled connection from hanging the run, and
        # raise_for_status() stops error pages from being saved as .jpg.
        resp = requests.get(photo_url, timeout=30)
        resp.raise_for_status()
    except requests.RequestException as err:
        print(f'Skipping {member["name"]}: download failed ({err})')
        continue

    name = member['name'].lower().replace(' ', '_')
    targets = [f'./images/parties/{party}/{name}.jpg']

    # A party missing from `blocks` used to raise KeyError here; keep the
    # per-party copy and leave out only the per-block copy.
    block = blocks.get(party)
    if block is None:
        print(f'No block known for party {party!r}; saving by party only')
    else:
        targets.append(f'./images/blocks/{block}/{name}.jpg')

    for filename in targets:
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        with open(filename, 'wb') as out:
            out.write(resp.content)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment