Skip to content

Instantly share code, notes, and snippets.

@dlebech
Last active June 8, 2019 07:11
Show Gist options
  • Save dlebech/1c17f80fdbbdbbf85a4a6b8c7f740376 to your computer and use it in GitHub Desktop.
Save dlebech/1c17f80fdbbdbbf85a4a6b8c7f740376 to your computer and use it in GitHub Desktop.
Extract photos and names of members of the Danish parliament
# Public Domain CC0 license. https://creativecommons.org/publicdomain/zero/1.0/
# Run this file first, e.g.:
# $ scrapy runspider ft_extract.py -o members.json
#
# It will probably stop working if ft.dk changes the URL of its contact list.
# Last verified working in spring 2019.
import scrapy
import re
from urllib.parse import urlparse, urlunparse
# Captures the text between the first "(" and the last ")" on a line (the
# ".*" is greedy). extract_party() uses it to pull the party out of a title.
title_matcher = re.compile(r'\((.*)\)')
def extract_image_url(image_src):
    """Return *image_src* with its query string and fragment removed.

    Args:
        image_src: The ``src`` attribute of an ``<img>`` element, or ``None``
            when the row had no image.

    Returns:
        The URL made of the scheme, netloc, path and params of *image_src*,
        or ``None`` when *image_src* is ``None``.
    """
    # A row without an <img> yields None; passing that to urlparse() does not
    # produce a usable string URL, so report the missing photo explicitly.
    if image_src is None:
        return None
    p = urlparse(image_src)
    # Keep scheme, netloc, path and params; blank out query and fragment.
    return urlunparse((*p[0:4], '', ''))
def extract_party(title):
    """Return the parenthesised part of *title*, used as the party.

    Args:
        title: Title text of a member; the party is expected to appear in
            parentheses.

    Returns:
        The text captured by ``title_matcher`` for the first match, or
        ``None`` when *title* contains no parenthesised part.
    """
    match = title_matcher.search(title)
    # Indexing findall()[0] raised IndexError for a title without
    # parentheses, which aborted the whole scrape; report "no party" instead.
    return match.group(1) if match else None
def extract_member(tr):
    """Build a member record from one row of the contact table.

    Args:
        tr: Selector for a single table row, as produced by
            ``response.css('table>tr')`` in the spider.

    Returns:
        A dict with the keys ``name``, ``title``, ``party`` and ``photo``,
        or ``None`` when the row has no first name.
    """
    def cell_text(column):
        # extract_first() returns None when the cell is missing; defaulting
        # to '' keeps .strip() from raising AttributeError on such rows.
        selector = f'td[data-title="{column}"] ::text'
        return tr.css(selector).extract_first(default='').strip()

    first_name = cell_text('Fornavn')  # "Fornavn" = first name
    if not first_name:
        # Early stopping of parsing: rows without a first name are skipped.
        return None

    last_name = cell_text('Efternavn')  # "Efternavn" = last name
    title = cell_text('Titel')  # "Titel" = title
    image_src = tr.css('img::attr(src)').extract_first()

    return {
        # strip() avoids a trailing space when the last-name cell is empty.
        'name': f'{first_name} {last_name}'.strip(),
        'title': title,
        'party': extract_party(title),
        'photo': extract_image_url(image_src),
    }
class FTSpider(scrapy.Spider):
    """Spider that yields one member record per usable table row.

    Each ``<tr>`` that is a direct child of a ``<table>`` on the start page
    is passed to ``extract_member``; rows for which it returns a falsy value
    are dropped.
    """

    name = 'ftspider'
    start_urls = ['https://www.ft.dk/da/kontakt/telefonbog?pageSize=200&level1=MEDLEM']

    def parse(self, response):
        """Yield the member dict for every row that produces one."""
        rows = response.css('table>tr')
        # map() and filter() are lazy, so rows are still processed one at a
        # time and falsy results (skipped rows) are never yielded.
        yield from filter(None, map(extract_member, rows))
# Public Domain CC0 license. https://creativecommons.org/publicdomain/zero/1.0/
# After running ft_extract.py,
# this script downloads the member photos
# and organizes them in folders according to party affiliation
# and left/right association, according to the layout during Spring 2019.
import json
import os
import shutil
import requests
# Load the member records written by ft_extract.py.
with open('members.json', encoding='utf-8') as f:
    members = json.load(f)

# Party abbreviations that are skipped entirely (none of them has an entry
# in ``blocks`` below).
excludes = {'T', 'IA', 'NQ', 'JF'}

# Party abbreviation -> block folder ('left' or 'right').
blocks = {
    'S': 'left',
    'SF': 'left',
    'EL': 'left',
    'ALT': 'left',
    'RV': 'left',
    'LA': 'right',
    'V': 'right',
    'DF': 'right',
    'KF': 'right',
}

for member in members:
    party = member['party']
    if party in excludes:
        continue

    photo_url = member['photo']
    if not photo_url:
        # Nothing to download for members scraped without a photo URL.
        print(f'Skipping {member["name"]}: no photo URL')
        continue

    # A timeout keeps one stalled connection from hanging the whole run.
    resp = requests.get(photo_url, timeout=30)
    if not resp.ok:
        # Do not save an HTTP error page under a .jpg name.
        print(f'Skipping {member["name"]}: HTTP {resp.status_code} for {photo_url}')
        continue

    name = member['name'].lower().replace(' ', '_')
    targets = [f'./images/parties/{party}/{name}.jpg']

    # A party missing from ``blocks`` used to raise KeyError here; such a
    # member is now stored by party only.
    block = blocks.get(party)
    if block is None:
        print(f'No block known for party {party!r}; storing {member["name"]} by party only')
    else:
        targets.append(f'./images/blocks/{block}/{name}.jpg')

    # Write the same image bytes to every target folder.
    for target in targets:
        os.makedirs(os.path.dirname(target), exist_ok=True)
        with open(target, 'wb') as out:
            out.write(resp.content)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment