Skip to content

Instantly share code, notes, and snippets.

@DominicBM
Last active August 29, 2015 14:21
Show Gist options
  • Save DominicBM/4ec12aade94aef91c41f to your computer and use it in GitHub Desktop.
Save DominicBM/4ec12aade94aef91c41f to your computer and use it in GitHub Desktop.
import requests, json, csv, re, string, argparse
logfile = 'fields.csv'

# Paging over the catalog API: offsets 0, 200, ... up to (not including) 6000.
API_URL = ('https://catalog.archives.gov/api/v1'
           '?type=topical-subject&rows={rows}&offset={offset}')
PAGE_SIZE = 200
MAX_OFFSET = 6000
REQUEST_TIMEOUT = 60  # seconds; without a timeout a stalled request hangs forever

# Indentation -> nesting level: level = (indent - BASE_INDENT) // INDENT_STEP,
# so a key indented by 8 spaces is level 1 and every 2 further spaces is one
# level deeper.  Lines whose level is below 1 are ignored.
BASE_INDENT = 6
INDENT_STEP = 2

# The log is a one-column, tab-delimited, fully quoted CSV file.
CSV_OPTIONS = {'delimiter': '\t', 'quoting': csv.QUOTE_ALL}

# A line holding a "@user..." entry immediately followed by a "$" key; the "$"
# key is moved to its own line (same indentation) so it is seen as a field.
_USER_TAG_RE = re.compile(r'( *)("@user.*?)"\$"')
# Leading spaces followed by a quoted key and a colon:   "key":
_KEY_RE = re.compile(r'( *)"(.*?)":')
# Leading spaces followed by an opening brace (an anonymous object).
_BRACE_RE = re.compile(r'( *)\{')


def iter_field_paths(text):
    """Yield dotted field paths found in indented JSON-like *text*.

    Each line containing ``"key":`` is assigned a nesting level from its
    leading spaces.  A level-1 key yields its own name; a deeper key yields
    ``<path of the latest key one level up>.<key>``.  A bare ``{`` line takes
    over its parent's path unchanged (so members of objects inside a list are
    reported under the list's key) and yields nothing itself.

    Paths are yielded in order of appearance and may repeat; callers that
    need uniqueness must de-duplicate.  Lines with no key/brace, with a level
    below 1, or whose parent level has not been seen yet are skipped.  Nesting
    depth is not limited.
    """
    text = _USER_TAG_RE.sub(r'\1\2\n\1"$"', text)
    latest = {}  # nesting level -> path most recently seen at that level
    for line in text.split('\n'):
        match = _KEY_RE.search(line)
        if match is not None:
            key = match.group(2)
        else:
            match = _BRACE_RE.search(line)
            if match is None:
                continue
            key = None
        # Floor division keeps the level an int (true division would make it
        # a float and no level would ever match).
        level = (len(match.group(1)) - BASE_INDENT) // INDENT_STEP
        if level < 1:
            continue
        if level == 1:
            if key is None:
                # A top-level anonymous object has no name to record.
                continue
            path = key
        else:
            parent = latest.get(level - 1)
            if parent is None:
                continue
            path = parent if key is None else parent + '.' + key
        latest[level] = path
        if key is not None:
            yield path


def load_known_fields(path):
    """Return the set of field paths already recorded in the log at *path*.

    A missing log file (first run) yields an empty set; blank rows are
    ignored.
    """
    try:
        with open(path, 'r', newline='', encoding='utf-8') as log:
            return {row[0] for row in csv.reader(log, **CSV_OPTIONS) if row}
    except FileNotFoundError:
        return set()


def fetch_page(offset, rows=PAGE_SIZE):
    """Download one page of results starting at *offset*; return the body text.

    Raises ``requests.HTTPError`` on a non-success status so an error page is
    never parsed as if it were catalog data.
    """
    response = requests.get(API_URL.format(rows=rows, offset=offset),
                            timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.text


def main():
    """Append every field path not yet in ``logfile``, page by page."""
    known = load_known_fields(logfile)
    for offset in range(0, MAX_OFFSET, PAGE_SIZE):
        text = fetch_page(offset)
        # Re-open per page so finished pages are on disk even if a later
        # request fails.
        with open(logfile, 'a', newline='', encoding='utf-8') as log:
            writer = csv.writer(log, **CSV_OPTIONS)
            for path in iter_field_paths(text):
                if path not in known:
                    known.add(path)
                    writer.writerow((path,))


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment