Last active
September 3, 2015 19:03
-
-
Save pmgreen/97170a14f814ad116f58 to your computer and use it in GitHub Desktop.
Get Hathi URLs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| #-*- coding: utf-8 -*- | |
| """ | |
| Just get a csv report from the db that's populated by check_urls.py | |
| from 20150903 | |
| pmg | |
| """ | |
| import csv | |
| import sqlite3 as lite | |
# Column titles written as the first line of the CSV report.
HEADER = ['bib', 'link', 'response']


def dump_cache(db_path='./db/sql_cache.db', csv_path='./out/cache_dump.csv'):
    """Write every row of the ``items`` cache table to a CSV report.

    The report starts with HEADER, followed by one line per row holding the
    row's ``bib_id``, ``url`` and ``obj_id`` columns, in that order.

    Args:
        db_path: path of the SQLite cache database to read.
        csv_path: path of the CSV file to (over)write.

    Returns:
        The number of data rows written (the header is not counted).

    Raises:
        sqlite3.Error: if the database cannot be queried (e.g. no ``items``
            table).
        OSError: if the CSV file cannot be opened for writing.
    """
    con = lite.connect(db_path)
    counter = 0
    try:
        # Row objects allow access by column name rather than by position.
        con.row_factory = lite.Row
        # Text mode with newline='' is what the csv module expects on
        # Python 3; the file is opened once instead of once per row.
        with open(csv_path, 'w', newline='', encoding='utf-8') as outfile:
            writer = csv.writer(outfile)
            writer.writerow(HEADER)
            # Iterate the cursor directly so the whole table is never held
            # in memory at once.
            for row in con.execute("SELECT * FROM items"):
                writer.writerow((row['bib_id'], row['url'], row['obj_id']))
                counter += 1
    finally:
        # Close the connection even when the query or the write fails.
        con.close()
    return counter


if __name__ == "__main__":
    print(str(dump_cache()) + " records output.")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| #-*- coding: utf-8 -*- | |
| """ | |
| hathi/check_urls. Get a list of urls in the form of e.g. | |
| http://hdl.handle.net/2027/njp.32101075715571 | |
| return Hathi record id, from the source code of the page, which are in the form of e.g. | |
| http://catalog.hathitrust.org/Record/008881996 | |
| NOTE: http://babel.hathitrust.org/robots.txt has: | |
| User-agent: * | |
| Crawl-delay: 1 | |
| Disallow: /cgi/ | |
| from 20150108 | |
| pmg | |
| """ | |
| import csv | |
| import requests | |
| import sqlite3 as lite | |
| import sys | |
| from lxml import html | |
| from time import sleep | |
# Seconds to wait before the single retry of a failed request.
RETRY_WAIT_SECONDS = 300
# Pause between fetches; the site's robots.txt asks for a crawl delay of 1,
# so 1.5 leaves a margin.
CRAWL_DELAY_SECONDS = 1.5
# Upper bound on a single HTTP request so a stalled server cannot hang the run.
REQUEST_TIMEOUT_SECONDS = 60


def _is_cached(con, bibid):
    """Return True when ``bibid`` already has a row in the ``items`` table."""
    cur = con.execute("SELECT bib_id FROM items WHERE bib_id=?", (bibid,))
    return cur.fetchone() is not None


def _fetch(url):
    """GET ``url``, retrying once after a long pause if the request fails.

    A failure of the retry itself propagates to the caller.
    """
    try:
        return requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as err:
        # Only request failures are retried; Ctrl-C and programming errors
        # are no longer swallowed by a bare except.
        print(str(err) + ' will try again shortly...')
        sleep(RETRY_WAIT_SECONDS)
        return requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)


def _extract_ids(page, hdlurl):
    """Return ``(item url, object id)`` for a fetched page.

    For a non-200 response the handle url and the status code are returned
    instead, so the failure is still recorded.
    """
    if page.status_code != 200:
        return hdlurl, page.status_code
    tree = html.fromstring(page.text)
    # Original author's note: there's only one of these per page.
    item_urls = tree.xpath('//span[@itemprop="url"]/text()')
    # Original author's note: will just grab the first one.
    labels = tree.xpath('//a/@data-tracking-label')
    # xpath() returns a list (possibly empty), never None, so test for
    # emptiness before indexing.
    iid = item_urls[0] if item_urls else hdlurl
    oid = labels[0] if labels else 'Hathi id not found'
    return iid, oid


def main(in_path='./in/__HT_links_indiv.csv',
         out_path='./out/__HT_links_indiv_out.csv',
         db_path='./db/sql_cache.db'):
    """Resolve each uncached handle url and record the ids found on its page.

    Reads ``in_path`` (a CSV whose first line is a header and whose rows hold
    a bib id followed by a handle url). For every bib id that has no row in
    the ``items`` table of ``db_path``, the url is fetched, the item url and
    object id are extracted, and the result is both appended to ``out_path``
    as ``item url, object id, bib id`` and inserted into ``items`` as
    ``(bib id, object id, item url)``.

    Args:
        in_path: CSV of bib ids and handle urls to check.
        out_path: CSV report that results are appended to.
        db_path: SQLite database holding the ``items`` cache table.

    Raises:
        requests.RequestException: if a request fails again on its retry.
        sqlite3.Error: if the cache lookup fails (insert errors are only
            reported, not raised).
    """
    con = lite.connect(db_path)
    try:
        # Text mode with newline='' is what the csv module expects on
        # Python 3.
        with open(in_path, newline='', encoding='utf-8') as infile, \
                open(out_path, 'a', newline='', encoding='utf-8') as outfile:
            reader = csv.reader(infile)
            writer = csv.writer(outfile)
            next(reader, None)  # skip the column names
            counter = 0
            for line in reader:
                if len(line) < 2:
                    # Blank or truncated line: nothing to look up.
                    continue
                bibid, hdlurl = line[0], line[1]
                if _is_cached(con, bibid):
                    continue
                counter += 1  # counts fetched (uncached) records only

                page = _fetch(hdlurl)
                iid, oid = _extract_ids(page, hdlurl)

                ## write to csv
                writer.writerow((iid, oid, bibid))

                ## write to cache
                try:
                    # Commit per record so an interrupted run keeps its
                    # progress and does not re-fetch on restart.
                    with con:
                        con.execute("INSERT INTO items VALUES(?, ?, ?);",
                                    (bibid, oid, iid))
                    print("%s %s %s %s" % (counter, bibid, hdlurl, oid))
                except lite.Error as err:
                    # Best effort: report the failure and carry on.
                    print('db error: ' + str(err))

                sleep(CRAWL_DELAY_SECONDS)  # to be safe
    finally:
        con.close()


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment