Last active
May 26, 2019 16:23
-
-
Save simeonf/b89c9a1a021973f7266d0107514f1638 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Read a .csv of article listings and print the same listings as .csv with each article's url prepended.

Sample .csv content:

    "Slain with Ax and Pistol", San Francisco Call, May 28, 1896, 1.

Sample run:

    $ python3 article.py article.csv
    https://chroniclingamerica.loc.gov/lccn/sn85066387/1896-05-28/ed-1/seq-1/,Slain with Ax and Pistol,San Francisco Call,May 28,1896,1.

Code is plain python3 with no additional packages required.
""" | |
import csv
import re
import ssl
import sys
import urllib.request as r
from urllib import parse

# Base URL of the chroniclingamerica.loc.gov site. Links scraped from the
# search results are site-relative and are prefixed with this value.
domain = "https://chroniclingamerica.loc.gov"

# Template for the page-search URL. ``{year}`` fills both ends of the date
# range and ``{title}`` is the URL-quoted text for the ``proxtext`` parameter.
search_url = (
    "{domain}/search/pages/results/"
    "?state=&date1={year}&date2={year}&proxtext={title}"
    "&x=15&y=7&dateFilterType=yearRange&rows=20&searchType=basic"
)


def main():
    """Prepend a search-result URL to each article listing in a CSV file.

    Reads the CSV file named by the last command-line argument
    (``sys.argv[-1]``). For each row, the first field is used as the article
    title and the second-to-last field as the year; both are substituted into
    ``search_url`` and the resulting search page is fetched. The first
    ``<a ...>`` tag on that page containing ``/lccn/`` is treated as the top
    result, and its ``href`` (with any ``#...`` fragment dropped) is prefixed
    with ``domain`` and inserted as the row's first column.

    Every input row is written to standard output as CSV. Rows for which no
    result link is found, and rows with fewer than two fields (nothing to
    search on), are written without a URL column.

    Raises:
        OSError: If the input file cannot be opened.
        urllib.error.URLError: If a search request fails.
    """
    # newline="" is what the csv module asks for so that quoted fields
    # containing line breaks are parsed correctly.
    with open(sys.argv[-1], newline="") as csv_file:
        rows = list(csv.reader(csv_file))

    # Using regexes to parse HTML to avoid installing lxml.
    anchor_pat = re.compile(r'<a [^>]+>')
    href_pat = re.compile(r'href="([^"]+)"')

    # Don't check the SSL cert in case that's messed up for you. The context
    # is built explicitly because calling ssl.SSLContext() without a protocol
    # is deprecated.
    context = ssl.create_default_context()
    context.check_hostname = False  # must be cleared before verify_mode
    context.verify_mode = ssl.CERT_NONE

    newcsv = []
    for row in rows:
        row = [field.strip() for field in row]
        # NOTE(review): the source's indentation was lost; this assumes every
        # row is echoed to the output, matched or not. The same list object
        # is extended in place below when a URL is found.
        newcsv.append(row)
        if len(row) < 2:
            # Blank or single-field row: no title + year pair to search on.
            continue

        search = search_url.format(
            domain=domain,
            # Quote the year too so a stray space cannot yield an invalid URL.
            year=parse.quote(row[-2]),
            title=parse.quote(row[0]),
        )
        # Close the HTTP response as soon as the page has been read.
        with r.urlopen(search, context=context) as response:
            page = response.read().decode(errors="replace")

        # Did we find a results link? Only the first candidate is considered.
        result_tag = next(
            (tag for tag in anchor_pat.findall(page) if '/lccn/' in tag),
            None,
        )
        match = href_pat.search(result_tag) if result_tag else None
        if match:
            # Drop the trailing "#..." fragment. partition() copes with hrefs
            # that contain no "#" (or several), where a two-way unpack of
            # split("#") would raise ValueError.
            path = match[1].partition("#")[0]
            row.insert(0, domain + path)

    csv.writer(sys.stdout).writerows(newcsv)


# Run the lookup only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment