Skip to content

Instantly share code, notes, and snippets.

@simeonf
Last active May 26, 2019 16:23
Show Gist options
  • Save simeonf/b89c9a1a021973f7266d0107514f1638 to your computer and use it in GitHub Desktop.
"""
Open a .csv of article listings and print a .csv of article listings including url.
Sample .csv content:
"Slain with Ax and Pistol", San Francisco Call, May 28, 1896, 1.
Sample run:
$ python3 article.py article.csv
https://chroniclingamerica.loc.gov/lccn/sn85066387/1896-05-28/ed-1/seq-1/,Slain with Ax and Pistol,San Francisco Call,May 28,1896,1.
Code is plain python3 with no additional packages required.
"""
import csv
import ssl
import sys
import re
import urllib.request as r
from urllib import parse
# Base URL of the Library of Congress "Chronicling America" newspaper archive.
domain = "https://chroniclingamerica.loc.gov"
# Full-text search endpoint template: {year} fills both date bounds
# (a one-year range) and {title} is the URL-quoted article title.
search_url = "{domain}/search/pages/results/?state=&date1={year}&date2={year}&proxtext={title}&x=15&y=7&dateFilterType=yearRange&rows=20&searchType=basic"
def main():
    """Read a CSV of article citations (path given as the last argv) and
    write the same rows to stdout, each prefixed with the Chronicling
    America page URL found by a full-text search.

    Citation row format (see module docstring): the article title is the
    first field and the year is the second-to-last field. Rows whose
    search yields no result link are emitted unchanged, without a URL.
    """
    # Regexes instead of an HTML parser so no extra packages are needed;
    # compiled once, outside the per-row loop.
    anchor_pat = re.compile('<a [^>]+>')
    href_pat = re.compile('href="([^"]+)"')
    newcsv = []
    with open(sys.argv[-1]) as fp:
        # csv.reader iterates the file object directly; no need to
        # materialize all lines with readlines().
        for line in csv.reader(fp):
            line = [field.strip() for field in line]
            search = search_url.format(domain=domain,
                                       year=line[-2],
                                       title=parse.quote(line[0]))
            # Bare SSLContext() skips certificate verification on purpose,
            # in case the local trust store is broken. Use the response as
            # a context manager so it is closed every iteration (the
            # original leaked one open response per row).
            with r.urlopen(search, context=ssl.SSLContext()) as resp:
                html = resp.read().decode()
            # Keep only anchors that point at a newspaper page result.
            links = [a for a in anchor_pat.findall(html) if '/lccn/' in a]
            if links:
                match = href_pat.search(links[0])  # first result link
                if match:
                    # partition() tolerates hrefs without a '#' fragment,
                    # where split('#') would raise ValueError.
                    url, _, _fragment = match.group(1).partition("#")
                    line.insert(0, domain + url)
            newcsv.append(line)
    csv.writer(sys.stdout).writerows(newcsv)
# Run only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment