Get Hathi URLs
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Just get a csv report from the db that's populated by check_urls.py
from 20150903
pmg
"""
import csv
import sqlite3 as lite

con = lite.connect('./db/sql_cache.db')
header = ['bib', 'link', 'response']
counter = 0

# start the report with a header row
with open('./out/cache_dump.csv', 'wb+') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(header)

with con:
    con.row_factory = lite.Row
    cur = con.cursor()
    cur.execute("SELECT * FROM items")
    rows = cur.fetchall()
    for row in rows:
        bib = row['bib_id']
        link = row['url']
        obj = row['obj_id']
        newitem = bib, link, obj
        #print(newitem)
        # append one row per cached item
        with open('./out/cache_dump.csv', 'ab+') as outfile:
            writer = csv.writer(outfile)
            writer.writerow(newitem)
        counter += 1

if con:
    con.close()
print(str(counter) + " records output.")
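
Both scripts assume that ./db/sql_cache.db already contains an items table with bib_id, url, and obj_id columns; neither script creates it. A minimal one-off setup sketch is below. The column names are taken from the two scripts; the column order is an assumption inferred from the positional INSERT in check_urls.py.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Hypothetical setup script: create the items cache table that
check_urls.py inserts into and the report script reads from.
Column order here is an assumption, not confirmed by the gist.
"""
import sqlite3 as lite

con = lite.connect('./db/sql_cache.db')
with con:
    con.execute("""
        CREATE TABLE IF NOT EXISTS items (
            bib_id TEXT,   -- local bib record id
            obj_id TEXT,   -- Hathi object id, error message, or HTTP status
            url    TEXT    -- Hathi item/record url scraped from the page
        )
    """)
con.close()
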
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
hathi/check_urls. Get a list of urls in the form of e.g.
http://hdl.handle.net/2027/njp.32101075715571
return Hathi record id, from the source code of the page, which are in the form of e.g.
http://catalog.hathitrust.org/Record/008881996
NOTE: http://babel.hathitrust.org/robots.txt has:
User-agent: *
Crawl-delay: 1
Disallow: /cgi/
from 20150108
pmg
"""
import csv
import requests
import sqlite3 as lite
import sys
from lxml import html
from time import sleep


def main():
    cached = ''
    con = lite.connect('./db/sql_cache.db')
    with open('./in/__HT_links_indiv.csv', "rb") as infile, open('./out/__HT_links_indiv_out.csv', 'ab+') as outfile:
        reader = csv.reader(infile)
        writer = csv.writer(outfile)
        next(reader, None)  # skip the column names
        counter = 0
        for line in reader:
            bibid = line[0]
            hdlurl = line[1]
            status = ''
            msg = ''
            with con:
                con.row_factory = lite.Row
                cur = con.cursor()
                # have we already looked this bib id up?
                cur.execute("SELECT bib_id FROM items WHERE bib_id=?", (bibid,))
                rows = cur.fetchall()
                if len(rows) == 0:
                    cached = None
                else:
                    cached = True
                if cached is None:  # and counter <= 1000:
                    try:
                        page = requests.get(hdlurl)
                    except:
                        etype, evalue, etraceback = sys.exc_info()
                        print(str(evalue) + ' will try again shortly...')
                        sleep(300)
                        page = requests.get(hdlurl)
                    if page.status_code == 200:
                        tree = html.fromstring(page.text)
                        itemid = tree.xpath('//span[@itemprop="url"]/text()')  # there's only one of these per page
                        objectid = tree.xpath('//a/@data-tracking-label')  # will just grab the first one
                        if objectid:  # xpath returns a list, so test for emptiness
                            iid = itemid[0]
                            oid = objectid[0]
                            bid = bibid
                            msg = 'ok'
                        else:
                            iid = itemid[0]
                            oid = 'Hathi id not found'
                            bid = bibid
                            msg = 'id not found'
                    else:
                        iid = hdlurl
                        oid = page.status_code
                        bid = bibid
                        msg = oid
                    ## write to csv
                    newline = iid, oid, bid
                    writer.writerow(newline)
                    ## write to cache
                    newitem = (bid, oid, iid)
                    try:
                        cur.executemany("INSERT INTO items VALUES(?, ?, ?);", (newitem,))
                        print("%s %s %s %s" % (counter, bid, hdlurl, oid))
                    except:
                        print('db error')
                    counter += 1
                    sleep(1.5)  # to be safe, given the crawl-delay in robots.txt
    if con:
        con.close()


if __name__ == "__main__":
    main()
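
check_urls.py expects ./in/__HT_links_indiv.csv to contain a header row followed by one bib id in the first column and one handle URL in the second; the header text itself is skipped, so only the column positions matter. A hypothetical two-line input (the bib id is made up, the URL is the example from the docstring) would look like:

bib,url
9912345,http://hdl.handle.net/2027/njp.32101075715571

Run check_urls.py first to populate the cache, then the report script above to dump it to ./out/cache_dump.csv.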