Get Hathi URLs
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Just get a csv report from the db that's populated by check_urls.py
from 20150903
pmg
"""
import csv
import sqlite3 as lite

con = lite.connect('./db/sql_cache.db')
header = ['bib', 'link', 'response']
counter = 0

# start the report with a header row
with open('./out/cache_dump.csv', 'wb+') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(header)

with con:
    con.row_factory = lite.Row
    cur = con.cursor()
    cur.execute("SELECT * FROM items")
    rows = cur.fetchall()
    for row in rows:
        bib = row['bib_id']
        link = row['url']
        obj = row['obj_id']
        newitem = bib, link, obj
        #print(newitem)
        # append one row per cached item
        with open('./out/cache_dump.csv', 'ab+') as outfile:
            writer = csv.writer(outfile)
            writer.writerow(newitem)
        counter += 1

if con:
    con.close()
print(str(counter) + " records output.")
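
Both scripts assume that ./db/sql_cache.db already contains an items table with bib_id, url, and obj_id columns; neither script creates it. A minimal one-off setup sketch is below. The column names are taken from the two scripts; the column order is an assumption inferred from the positional INSERT in check_urls.py.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Hypothetical setup script: create the items cache table that
check_urls.py inserts into and the report script reads from.
Column order here is an assumption, not confirmed by the gist.
"""
import sqlite3 as lite

con = lite.connect('./db/sql_cache.db')
with con:
    con.execute("""
        CREATE TABLE IF NOT EXISTS items (
            bib_id TEXT,   -- local bib record id
            obj_id TEXT,   -- Hathi object id, error message, or HTTP status
            url    TEXT    -- Hathi item/record url scraped from the page
        )
    """)
con.close()
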
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
hathi/check_urls. Get a list of urls in the form of e.g.
http://hdl.handle.net/2027/njp.32101075715571
return Hathi record id, from the source code of the page, which are in the form of e.g.
http://catalog.hathitrust.org/Record/008881996
NOTE: http://babel.hathitrust.org/robots.txt has:
User-agent: *
Crawl-delay: 1
Disallow: /cgi/
from 20150108
pmg
"""
import csv
import requests
import sqlite3 as lite
import sys
from lxml import html
from time import sleep


def main():
    cached = ''
    con = lite.connect('./db/sql_cache.db')
    with open('./in/__HT_links_indiv.csv', "rb") as infile, open('./out/__HT_links_indiv_out.csv', 'ab+') as outfile:
        reader = csv.reader(infile)
        writer = csv.writer(outfile)
        next(reader, None)  # skip the column names
        counter = 0
        for line in reader:
            bibid = line[0]
            hdlurl = line[1]
            status = ''
            msg = ''
            with con:
                con.row_factory = lite.Row
                cur = con.cursor()
                # have we already looked this bib id up?
                cur.execute("SELECT bib_id FROM items WHERE bib_id=?", (bibid,))
                rows = cur.fetchall()
                if len(rows) == 0:
                    cached = None
                else:
                    cached = True
                if cached is None:  # and counter <= 1000:
                    try:
                        page = requests.get(hdlurl)
                    except:
                        etype, evalue, etraceback = sys.exc_info()
                        print(str(evalue) + ' will try again shortly...')
                        sleep(300)
                        page = requests.get(hdlurl)
                    if page.status_code == 200:
                        tree = html.fromstring(page.text)
                        itemid = tree.xpath('//span[@itemprop="url"]/text()')  # there's only one of these per page
                        objectid = tree.xpath('//a/@data-tracking-label')  # will just grab the first one
                        if objectid:  # xpath returns a list, so test for emptiness
                            iid = itemid[0]
                            oid = objectid[0]
                            bid = bibid
                            msg = 'ok'
                        else:
                            iid = itemid[0]
                            oid = 'Hathi id not found'
                            bid = bibid
                            msg = 'id not found'
                    else:
                        iid = hdlurl
                        oid = page.status_code
                        bid = bibid
                        msg = oid
                    ## write to csv
                    newline = iid, oid, bid
                    writer.writerow(newline)
                    ## write to cache
                    newitem = (bid, oid, iid)
                    try:
                        cur.executemany("INSERT INTO items VALUES(?, ?, ?);", (newitem,))
                        print("%s %s %s %s" % (counter, bid, hdlurl, oid))
                    except:
                        print('db error')
                    counter += 1
                    sleep(1.5)  # to be safe, given the crawl-delay in robots.txt
    if con:
        con.close()


if __name__ == "__main__":
    main()
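
check_urls.py expects ./in/__HT_links_indiv.csv to contain a header row followed by one bib id in the first column and one handle URL in the second; the header text itself is skipped, so only the column positions matter. A hypothetical two-line input (the bib id is made up, the URL is the example from the docstring) would look like:

bib,url
9912345,http://hdl.handle.net/2027/njp.32101075715571

Run check_urls.py first to populate the cache, then the report script above to dump it to ./out/cache_dump.csv.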