Created
January 14, 2018 20:17
-
-
Save gsong/fbdce327c9644a32c30c6135cb0235ff to your computer and use it in GitHub Desktop.
Parse the NYT "52 Places to Go" list into a CSV file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import csv | |
import json | |
import re | |
from collections import OrderedDict | |
from urllib.parse import quote | |
import attr | |
import click | |
import requests | |
from bs4 import BeautifulSoup | |
# Page containing the NYT "Places to Go" interactive travel list.
SOURCE_URL = (
    'https://www.nytimes.com/interactive/2018/travel/places-to-visit.html'
)
# Base URLs used to build per-location Google Maps / Google Search links.
MAP_URL = 'http://maps.google.com/?q='
SEARCH_URL = 'https://www.google.com/search?q='
# CSV column header -> Location attribute name; insertion order defines
# the CSV column order.
FIELDS = OrderedDict([
    ('Location', 'label'),
    ('Summary', 'summary'),
    ('Blurb', 'blurb'),
    ('Facebook Discussion', 'fb_url'),
    ('Map', 'map_url'),
    ('Search', 'search_url'),
])
@click.command()
@click.argument('output-csv', type=click.Path())
def cli(output_csv):
    """Scrape the NYT places-to-visit page and write it to OUTPUT_CSV."""
    # Time out on network stalls and fail fast on HTTP errors instead of
    # silently parsing an error page into an empty/garbled CSV.
    response = requests.get(SOURCE_URL, timeout=30)
    response.raise_for_status()
    locations = parse_locations(response.text)
    write_locations(locations, output_csv)
def parse_locations(html_doc):
    """Parse the page HTML and return the extracted Location objects."""
    return get_location_list(BeautifulSoup(html_doc, 'html.parser'))
def get_location_list(soup):
    """Build one Location per menu entry carrying a data-slug.

    The final menu entry is deliberately dropped, matching the page's
    non-location trailing item.
    """
    menu = soup.find(id='g-menulist')
    entries = menu(lambda tag: tag.has_attr('data-slug'))
    records = parse_locations_data(soup)
    results = []
    for entry in entries[:-1]:
        results.append(build_location(entry, records))
    return results
def parse_locations_data(soup):
    """Extract the embedded JSON location records from the page's scripts.

    The NYT page inlines its location data inside a <script> tag as a
    ``return [...]`` expression.  The original implementation hard-coded
    script index 9 / line 8, which breaks whenever the page layout
    shifts; instead, scan every script for the pattern and decode the
    first one that parses as JSON.

    Returns the decoded list of per-location dicts.
    Raises ValueError if no script contains the expected data.
    """
    re_data = re.compile(r'return (?P<data>\[.*\])')
    for script in soup('script'):
        match = re_data.search(script.get_text())
        if not match:
            continue
        try:
            return json.loads(match.group('data'))
        except json.JSONDecodeError:
            # A different script matched the pattern with non-JSON
            # content; keep looking.
            continue
    raise ValueError('location data not found in any <script> tag')
def build_location(location, locations_data):
    """Pair a menu entry with its matching data record, keyed by slug."""
    slug = location['data-slug']
    display_name = location.find(class_='g-menulist-name').string
    record = next(d for d in locations_data if d['slug'] == slug)
    return Location(display_name, slug, record)
def write_locations(locations, output_file):
    """Write `locations` to `output_file` as a CSV, headed by FIELDS.

    Any existing file at `output_file` is overwritten.
    """
    # newline='' is required by the csv module; without it the writer
    # emits an extra blank line between rows on Windows.
    with open(output_file, 'w', encoding='utf-8', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=list(FIELDS.keys()))
        writer.writeheader()
        for location in locations:
            writer.writerow(location.as_dict())
@attr.s
class Location:
    """One entry from the NYT list, plus derived URLs for CSV export."""

    name = attr.ib()
    slug = attr.ib()
    data = attr.ib()

    @property
    def rank(self):
        """Numeric rank of this location within the list."""
        return self.data['rank']

    @property
    def summary(self):
        """Short summary text from the page data."""
        return self.data['summary']

    @property
    def blurb(self):
        """Longer blurb text from the page data."""
        return self.data['blurb']

    @property
    def label(self):
        """Primary name, with the secondary name appended when present."""
        names = (self.data['primary_name'], self.data['secondary_name'])
        return ', '.join(n for n in names if n)

    @property
    def fb_url(self):
        """Facebook discussion URL from the page data."""
        return self.data['facebook_url']

    @property
    def map_url(self):
        """Google Maps link for the labelled place."""
        return MAP_URL + quote(self.label)

    @property
    def search_url(self):
        """Google search link for travel info about the place."""
        return SEARCH_URL + quote(f"travel {self.label}")

    def as_dict(self):
        """Map each CSV column header to this location's field value."""
        return {column: getattr(self, attr_name)
                for column, attr_name in FIELDS.items()}
if __name__ == '__main__':
    cli()  # click entry point: parses CLI args, then scrapes and writes the CSV
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment