simeonf · May 26, 2019 16:23
diff --git a/article.py b/article.py
 """
 Read a .csv of article listings and print them as .csv with each article's Chronicling America URL prepended.

 Sample .csv content:

   "Slain with Ax and Pistol", San Francisco Call, May 28, 1896, 1.

 Sample run:

 $ python3 article.py article.csv
 https://chroniclingamerica.loc.gov/lccn/sn85066387/1896-05-28/ed-1/seq-1/,Slain with Ax and Pistol,San Francisco Call,May 28,1896,1.

 Code is plain python3 with no additional packages required.
 """

 import csv
 import ssl
 import sys
 import re
 import urllib.request as r
 from  urllib import parse

 # Base URL of the site; fills the {domain} placeholder below and is prepended
 # to the relative result links extracted in main().
 domain = "https://chroniclingamerica.loc.gov"
 # Search URL template. {year} is used for both ends of the date range
 # (date1 and date2) and {title} is the already URL-quoted search text.
 search_url = "{domain}/search/pages/results/?state=&date1={year}&date2={year}&proxtext={title}&x=15&y=7&dateFilterType=yearRange&rows=20&searchType=basic"

 def main():
   """Look up every article in the input CSV and print the rows with a URL prepended.

   The CSV path is taken from the last command-line argument. For each row the
   first field is used as the search text and the second-to-last field as the
   year. Rows for which the search page yields no '/lccn/' link are left out of
   the output, as are rows with fewer than two fields.

   Raises:
     OSError: if the CSV file cannot be opened.
     urllib.error.URLError: if a search request fails.
   """
   # newline="" lets the csv module handle line endings inside quoted fields.
   with open(sys.argv[-1], newline="") as fp:
     rows = list(csv.reader(fp))

   # Using regexes to parse HTML to avoid installing lxml.
   anchor_pat = re.compile(r'<a [^>]+>')
   href_pat = re.compile(r'href="([^"]+)"')

   # Don't check the SSL cert in case that's messed up for you. Built once and
   # reused; check_hostname must be disabled before verify_mode is relaxed.
   context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
   context.check_hostname = False
   context.verify_mode = ssl.CERT_NONE

   newcsv = []
   for row in rows:
     row = [field.strip() for field in row]
     if len(row) < 2:
       # Blank or truncated row: there is no title/year pair to search with.
       continue
     search = search_url.format(domain=domain, year=row[-2], title=parse.quote(row[0]))
     # Close the HTTP response as soon as the body has been read.
     with r.urlopen(search, context=context) as response:
       html = response.read().decode()
     # Did we find a results link?
     links = [anchor for anchor in anchor_pat.findall(html) if '/lccn/' in anchor]
     if not links:
       continue
     match = href_pat.search(links[0])  # first link
     if not match:
       continue
     # Drop the "#..." fragment if present; an href without one is kept as is.
     url = match[1].split("#", 1)[0]
     row.insert(0, domain + url)
     newcsv.append(row)

   # Write once, after all lookups, so no row is printed more than once.
   csv.writer(sys.stdout).writerows(newcsv)


 # Run the lookup only when executed as a script, not when imported.
 if __name__ == '__main__':
  main()
	"""
	Open a .csv of article listings and print a .csv of article listings including url.

	Sample .csv content:

	"Slain with Ax and Pistol", San Francisco Call, May 28, 1896, 1.

	Sample run:

	$ python3 article.py article.csv
	https://chroniclingamerica.loc.gov/lccn/sn85066387/1896-05-28/ed-1/seq-1/,Slain with Ax and Pistol,San Francisco Call,May 28,1896,1.

	Code is plain python3 with no additional packages required.
	"""

	import csv
	import ssl
	import sys
	import re
	import urllib.request as r
	from urllib import parse

	# Base URL of the site; fills the {domain} placeholder below and is prepended
	# to the relative result links extracted in main().
	domain = "https://chroniclingamerica.loc.gov"
	# Search URL template. {year} is used for both ends of the date range
	# (date1 and date2) and {title} is the already URL-quoted search text.
	search_url = "{domain}/search/pages/results/?state=&date1={year}&date2={year}&proxtext={title}&x=15&y=7&dateFilterType=yearRange&rows=20&searchType=basic"

	def main():
	    """Look up every article in the input CSV and print the rows with a URL prepended.

	    The CSV path is taken from the last command-line argument. For each row the
	    first field is used as the search text and the second-to-last field as the
	    year. Rows for which the search page yields no '/lccn/' link are left out of
	    the output, as are rows with fewer than two fields.

	    Raises:
	        OSError: if the CSV file cannot be opened.
	        urllib.error.URLError: if a search request fails.
	    """
	    # newline="" lets the csv module handle line endings inside quoted fields.
	    with open(sys.argv[-1], newline="") as fp:
	        rows = list(csv.reader(fp))

	    # Using regexes to parse HTML to avoid installing lxml.
	    anchor_pat = re.compile(r'<a [^>]+>')
	    href_pat = re.compile(r'href="([^"]+)"')

	    # Don't check the SSL cert in case that's messed up for you. Built once and
	    # reused; check_hostname must be disabled before verify_mode is relaxed.
	    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
	    context.check_hostname = False
	    context.verify_mode = ssl.CERT_NONE

	    newcsv = []
	    for row in rows:
	        row = [field.strip() for field in row]
	        if len(row) < 2:
	            # Blank or truncated row: there is no title/year pair to search with.
	            continue
	        search = search_url.format(domain=domain, year=row[-2], title=parse.quote(row[0]))
	        # Close the HTTP response as soon as the body has been read.
	        with r.urlopen(search, context=context) as response:
	            html = response.read().decode()
	        # Did we find a results link?
	        links = [anchor for anchor in anchor_pat.findall(html) if '/lccn/' in anchor]
	        if not links:
	            continue
	        match = href_pat.search(links[0])  # first link
	        if not match:
	            continue
	        # Drop the "#..." fragment if present; an href without one is kept as is.
	        url = match[1].split("#", 1)[0]
	        row.insert(0, domain + url)
	        newcsv.append(row)

	    # Write once, after all lookups, so no row is printed more than once.
	    csv.writer(sys.stdout).writerows(newcsv)


	# Run the lookup only when executed as a script, not when imported.
	if __name__ == '__main__':
	    main()