dlebech · June 8, 2019 07:11
diff --git a/ft_extract.py b/ft_extract.py
 # Public Domain CC0 license. https://creativecommons.org/publicdomain/zero/1.0/

 # Run this file first, e.g.:
 # $ scrapy runspider ft_extract.py -o members.json
 #
 # It will probably stop working if they change their urls for the contact list of course.
 # Worked in Spring of 2019
 import scrapy
 import re
 from urllib.parse import urlparse, urlunparse

 # Matches a parenthesised group in a member title and captures its contents;
 # extract_party() returns the first capture as the party.
 # The `.*` is greedy, so a title with several groups on one line is captured
 # from the first "(" through the last ")".
 title_matcher = re.compile(r'\((.*)\)')

 def extract_image_url(image_src):
     """Return ``image_src`` with its query string and fragment removed."""
     scheme, netloc, path, params = urlparse(image_src)[:4]
     # Rebuild from the first four components only; the query and fragment
     # slots are left blank.
     return urlunparse((scheme, netloc, path, params, '', ''))


 def extract_party(title):
     """Return the text captured by the first ``title_matcher`` match in ``title``.

     Raises:
         IndexError: if ``title`` contains no parenthesised group.
     """
     captured = title_matcher.findall(title)
     return captured[0]


 def extract_member(tr):
     """Build a member record from one row of the contact table.

     Args:
         tr: Selector for a table row. Cells are looked up by their
             ``data-title`` attribute (``Fornavn``, ``Efternavn``, ``Titel``).

     Returns:
         A dict with ``name``, ``title``, ``party`` and ``photo`` keys, or
         ``None`` when the row has no first name. ``photo`` is ``None`` when
         the row contains no ``<img>``.
     """
     def cell_text(data_title):
         # default='' keeps a missing cell from raising AttributeError on
         # None.strip(); a row without a first name is then skipped by the
         # guard below instead of crashing the crawl.
         selected = tr.css(f'td[data-title="{data_title}"] ::text')
         return selected.extract_first(default='').strip()

     first_name = cell_text('Fornavn')
     if not first_name:
         # Early stopping of parsing
         return None

     last_name = cell_text('Efternavn')
     title = cell_text('Titel')
     image_src = tr.css('img::attr(src)').extract_first()

     return {
         'name': f'{first_name} {last_name}',
         'title': title,
         'party': extract_party(title),
         # Only parse a URL when the row actually has an image.
         'photo': extract_image_url(image_src) if image_src else None,
     }


 class FTSpider(scrapy.Spider):
     """Spider that scrapes member records from the ft.dk contact list page."""

     name = 'ftspider'
     start_urls = ['https://www.ft.dk/da/kontakt/telefonbog?pageSize=200&level1=MEDLEM']

     def parse(self, response):
         """Yield one member dict for every table row ``extract_member`` accepts."""
         for row in response.css('table>tr'):
             record = extract_member(row)
             if not record:
                 # extract_member returned nothing for this row; skip it.
                 continue
             yield record
diff --git a/ft_photos.py b/ft_photos.py
 # Public Domain CC0 license. https://creativecommons.org/publicdomain/zero/1.0/

 # After running ft_extract.py,
 # this script downloads the member photos
 # and organizes them in folders according to party affiliation
 # and left/right association, according to the layout during Spring 2019.
 import json
 import os
 import shutil

 import requests

 # Load the member records produced by ft_extract.py.
 with open('members.json', encoding='utf-8') as f:
     members = json.load(f)

 # Party codes that are skipped entirely.
 excludes = {'T', 'IA', 'NQ', 'JF'}

 # Party code -> left/right block.
 blocks = {
     'S': 'left',
     'SF': 'left',
     'EL': 'left',
     'ALT': 'left',
     'RV': 'left',
     'LA': 'right',
     'V': 'right',
     'DF': 'right',
     'KF': 'right'
 }

 for member in members:
     party = member['party']
     if party in excludes:
         continue

     block = blocks.get(party)
     if block is None:
         # A party that is neither excluded nor mapped used to abort the whole
         # run with KeyError; report it and carry on with the other members.
         print(f'Skipping {member["name"]}: no block mapping for party {party!r}')
         continue

     # The timeout keeps a stalled connection from hanging the script forever.
     resp = requests.get(member['photo'], timeout=30)
     if not resp.ok:
         # Do not save the body of an HTTP error response under a .jpg name.
         print(f'Skipping {member["name"]}: HTTP {resp.status_code} for {member["photo"]}')
         continue

     name = member['name'].lower().replace(' ', '_')
     filename1 = f'./images/parties/{party}/{name}.jpg'
     filename2 = f'./images/blocks/{block}/{name}.jpg'
     os.makedirs(os.path.dirname(filename1), exist_ok=True)
     os.makedirs(os.path.dirname(filename2), exist_ok=True)

     # The same image bytes are stored once per party and once per block.
     with open(filename1, 'wb') as f:
         f.write(resp.content)
     with open(filename2, 'wb') as f:
         f.write(resp.content)
	# Public Domain CC0 license. https://creativecommons.org/publicdomain/zero/1.0/

	# Run this file first, e.g.:
	# $ scrapy runspider ft_extract.py -o members.json
	#
	# It will probably stop working if they change their urls for the contact list of course.
	# Worked in Spring of 2019
	import scrapy
	import re
	from urllib.parse import urlparse, urlunparse

	# Matches a parenthesised group in a member title and captures its contents;
	# extract_party() returns the first capture as the party.
	# The `.*` is greedy, so a title with several groups on one line is captured
	# from the first "(" through the last ")".
	title_matcher = re.compile(r'\((.*)\)')

	def extract_image_url(image_src):
	    """Return ``image_src`` with its query string and fragment removed."""
	    p = urlparse(image_src)
	    # Keep scheme, netloc, path and params; blank the query and fragment.
	    return urlunparse((*p[0:4], '', ''))


	def extract_party(title):
	    """Return the text captured by the first ``title_matcher`` match in ``title``.

	    Raises:
	        IndexError: if ``title`` contains no parenthesised group.
	    """
	    return title_matcher.findall(title)[0]


	def extract_member(tr):
	    """Build a member record from one row of the contact table.

	    Args:
	        tr: Selector for a table row. Cells are looked up by their
	            ``data-title`` attribute (``Fornavn``, ``Efternavn``, ``Titel``).

	    Returns:
	        A dict with ``name``, ``title``, ``party`` and ``photo`` keys, or
	        ``None`` when the row has no first name. ``photo`` is ``None`` when
	        the row contains no ``<img>``.
	    """
	    def cell_text(data_title):
	        # default='' keeps a missing cell from raising AttributeError on
	        # None.strip(); a row without a first name is then skipped by the
	        # guard below instead of crashing the crawl.
	        selected = tr.css(f'td[data-title="{data_title}"] ::text')
	        return selected.extract_first(default='').strip()

	    first_name = cell_text('Fornavn')
	    if not first_name:
	        # Early stopping of parsing
	        return None

	    last_name = cell_text('Efternavn')
	    title = cell_text('Titel')
	    image_src = tr.css('img::attr(src)').extract_first()

	    return {
	        'name': f'{first_name} {last_name}',
	        'title': title,
	        'party': extract_party(title),
	        # Only parse a URL when the row actually has an image.
	        'photo': extract_image_url(image_src) if image_src else None,
	    }


	class FTSpider(scrapy.Spider):
	    """Spider that scrapes member records from the ft.dk contact list page."""

	    name = 'ftspider'
	    start_urls = ['https://www.ft.dk/da/kontakt/telefonbog?pageSize=200&level1=MEDLEM']

	    def parse(self, response):
	        """Yield one member dict for every table row ``extract_member`` accepts."""
	        for tr in response.css('table>tr'):
	            member = extract_member(tr)
	            if member:
	                yield member
	# Public Domain CC0 license. https://creativecommons.org/publicdomain/zero/1.0/

	# After running ft_extract.py,
	# this script downloads the member photos
	# and organizes them in folders according to party affiliation
	# and left/right association, according to the layout during Spring 2019.
	import json
	import os
	import shutil

	import requests

	# Load the member records produced by ft_extract.py.
	with open('members.json', encoding='utf-8') as f:
	    members = json.load(f)

	# Party codes that are skipped entirely.
	excludes = {'T', 'IA', 'NQ', 'JF'}

	# Party code -> left/right block.
	blocks = {
	    'S': 'left',
	    'SF': 'left',
	    'EL': 'left',
	    'ALT': 'left',
	    'RV': 'left',
	    'LA': 'right',
	    'V': 'right',
	    'DF': 'right',
	    'KF': 'right'
	}

	for member in members:
	    party = member['party']
	    if party in excludes:
	        continue

	    block = blocks.get(party)
	    if block is None:
	        # A party that is neither excluded nor mapped used to abort the whole
	        # run with KeyError; report it and carry on with the other members.
	        print(f'Skipping {member["name"]}: no block mapping for party {party!r}')
	        continue

	    # The timeout keeps a stalled connection from hanging the script forever.
	    resp = requests.get(member['photo'], timeout=30)
	    if not resp.ok:
	        # Do not save the body of an HTTP error response under a .jpg name.
	        print(f'Skipping {member["name"]}: HTTP {resp.status_code} for {member["photo"]}')
	        continue

	    name = member['name'].lower().replace(' ', '_')
	    filename1 = f'./images/parties/{party}/{name}.jpg'
	    filename2 = f'./images/blocks/{block}/{name}.jpg'
	    os.makedirs(os.path.dirname(filename1), exist_ok=True)
	    os.makedirs(os.path.dirname(filename2), exist_ok=True)

	    # The same image bytes are stored once per party and once per block.
	    with open(filename1, 'wb') as f:
	        f.write(resp.content)
	    with open(filename2, 'wb') as f:
	        f.write(resp.content)