Skip to content

Instantly share code, notes, and snippets.

@gsong
Created January 14, 2018 20:17
Show Gist options
  • Save gsong/fbdce327c9644a32c30c6135cb0235ff to your computer and use it in GitHub Desktop.
Save gsong/fbdce327c9644a32c30c6135cb0235ff to your computer and use it in GitHub Desktop.
Parse NYT where to go list
#!/usr/bin/env python
import csv
import json
import re
from collections import OrderedDict
from urllib.parse import quote
import attr
import click
import requests
from bs4 import BeautifulSoup
# Page that is downloaded and scraped.
SOURCE_URL = (
    'https://www.nytimes.com/interactive/2018/travel/places-to-visit.html'
)

# Base URLs; a URL-quoted query string is appended to each.
MAP_URL = 'http://maps.google.com/?q='
SEARCH_URL = 'https://www.google.com/search?q='

# CSV column header -> name of the ``Location`` attribute that fills it.
# Insertion order is the column order of the output file.
FIELDS = OrderedDict((
    ('Location', 'label'),
    ('Summary', 'summary'),
    ('Blurb', 'blurb'),
    ('Facebook Discussion', 'fb_url'),
    ('Map', 'map_url'),
    ('Search', 'search_url'),
))
# Seconds to wait on the server before giving up; without a timeout
# ``requests`` can block indefinitely.
_REQUEST_TIMEOUT = 30


@click.command()
@click.argument('output-csv', type=click.Path())
def cli(output_csv):
    """Download the NYT places-to-visit page and write it to OUTPUT_CSV."""
    response = requests.get(SOURCE_URL, timeout=_REQUEST_TIMEOUT)
    # Stop on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    locations = parse_locations(response.text)
    write_locations(locations, output_csv)
def parse_locations(html_doc):
    """Parse the page markup and return the locations found in it.

    Args:
        html_doc: HTML text of the source page.

    Returns:
        Whatever ``get_location_list`` produces for the parsed document.
    """
    return get_location_list(BeautifulSoup(html_doc, 'html.parser'))
def get_location_list(soup):
    """Build one location object per slug-bearing entry of the menu list.

    Args:
        soup: Parsed document containing an element with id ``g-menulist``.

    Returns:
        A list of the objects returned by ``build_location``.
    """
    def has_slug(tag):
        return tag.has_attr('data-slug')

    menu = soup.find(id='g-menulist')
    # Calling a tag is BeautifulSoup shorthand for ``find_all``.
    entries = menu(has_slug)
    details = parse_locations_data(soup)
    # The last matching element is dropped; presumably it is not a real
    # location entry -- TODO confirm against the page markup.
    return [build_location(entry, details) for entry in entries[:-1]]
def parse_locations_data(soup):
    """Extract the location records embedded in the page's JavaScript.

    The records are read from a JSON array literal that follows a
    ``return`` keyword on one fixed line of one fixed ``<script>`` element,
    so this function is tightly coupled to the page layout.

    Args:
        soup: Parsed document; calling it with ``'script'`` must return the
            page's script elements in order.

    Returns:
        The decoded JSON array (a list).

    Raises:
        IndexError: If the page has fewer scripts, or the chosen script has
            fewer lines, than expected.
        ValueError: If the expected line holds no ``return [...]`` array or
            the array is not valid JSON.
    """
    script_index = 9  # which <script> element is read
    line_index = 8  # which line of that script (after stripping) is read

    script_text = soup('script')[script_index].get_text()
    line = script_text.strip().split('\n')[line_index]

    match = re.search(r'return (?P<data>\[.*\])', line)
    if match is None:
        # Fail with a clear message instead of an AttributeError on None
        # when the page no longer has the expected layout.
        raise ValueError(
            'no "return [...]" array found on line '
            f'{line_index} of script {script_index}; '
            'the page layout may have changed'
        )
    return json.loads(match.group('data'))
def build_location(location, locations_data):
    """Combine a menu entry with its matching data record.

    Args:
        location: Menu element; must provide a ``data-slug`` attribute and
            contain an element with class ``g-menulist-name``.
        locations_data: Iterable of dict records, each with a ``'slug'`` key.

    Returns:
        A ``Location`` built from the entry's name, its slug, and the first
        record whose ``'slug'`` equals that slug.

    Raises:
        LookupError: If no record carries the entry's slug.
    """
    slug = location['data-slug']
    name = location.find(class_='g-menulist-name').string
    # A default keeps next() from leaking a bare StopIteration, which says
    # nothing about which slug was missing.
    data = next((d for d in locations_data if d['slug'] == slug), None)
    if data is None:
        raise LookupError(f'no location data found for slug {slug!r}')
    return Location(name, slug, data)
def write_locations(locations, output_file):
    """Write a header row and then one CSV row per location.

    Args:
        locations: Iterable of objects whose ``as_dict()`` returns a mapping
            keyed by the ``FIELDS`` column names.
        output_file: Path of the file to create or overwrite.
    """
    # The csv module requires files opened with newline=''; otherwise
    # platforms that translate line endings write an extra carriage return
    # per row. Plain 'w' is enough because the file is never read back.
    with open(output_file, 'w', encoding='utf-8', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=list(FIELDS))
        writer.writeheader()
        writer.writerows(location.as_dict() for location in locations)
@attr.s
class Location:
    """One destination: its menu name, its slug, and its raw data record.

    The properties expose values from ``data`` and URLs derived from them;
    ``as_dict`` collects the ones listed in ``FIELDS``.
    """

    name = attr.ib()  # name taken from the menu entry
    slug = attr.ib()  # identifier shared by the menu entry and its record
    data = attr.ib()  # mapping that the properties below read from

    @property
    def rank(self):
        """Value stored under ``'rank'`` in the data record."""
        return self.data['rank']

    @property
    def summary(self):
        """Value stored under ``'summary'`` in the data record."""
        return self.data['summary']

    @property
    def blurb(self):
        """Value stored under ``'blurb'`` in the data record."""
        return self.data['blurb']

    @property
    def label(self):
        """Primary and secondary names joined by ``', '``, skipping empties."""
        parts = (self.data['primary_name'], self.data['secondary_name'])
        return ', '.join(part for part in parts if part)

    @property
    def fb_url(self):
        """Value stored under ``'facebook_url'`` in the data record."""
        return self.data['facebook_url']

    @property
    def map_url(self):
        """``MAP_URL`` followed by the URL-quoted label."""
        return MAP_URL + quote(self.label)

    @property
    def search_url(self):
        """``SEARCH_URL`` followed by the URL-quoted ``travel <label>``."""
        return SEARCH_URL + quote(f"travel {self.label}")

    def as_dict(self):
        """Return a dict mapping each ``FIELDS`` column to this row's value."""
        row = {}
        for column, attribute in FIELDS.items():
            row[column] = getattr(self, attribute)
        return row
# Run the command-line interface only when executed as a script.
if __name__ == '__main__':
    cli()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment