nad2000 · July 7, 2017 06:07
diff --git a/NZ items in ORCID's public data files.py b/NZ items in ORCID's public data files.py
 # -*- coding: utf-8 -*-
 """
 Created on Wed Jul 05 11:53:20 2017

 Pull the New Zealand records out of an ORCID public data file.

 Can json do the same tricks as xml, so the result can be pulled into mongodb
 for reporting?  ORCID public data files are tar.gz archives with all the json
 stored serially before the xml, so reading only the json speeds the process
 up a wee bit.  The real question is whether the json library is quicker with
 json than ETree is with xml.
 NB: version 2.0 of ORCID schema

 A record is copied to the write folder when the person's first address, or
 any education or employment affiliation, has country "NZ".  Records that
 cannot be processed are named in the error log and copied to the error
 folder.

 @author: Jason
 """
 import json
 import os
 import tarfile

 tarpath = 'D:/DATA/json_play.tar.gz'
 errorlogpath = 'D:/DATA/json_play.error.log'
 write_dir = 'D:/DATA/json_write'
 error_dir = 'D:/DATA/json_error'

 # What a lookup raises when an optional part of a record is absent, null or
 # an empty list.  Catching only these keeps genuine failures visible.
 missing_data = (KeyError, IndexError, TypeError)


 def _person_in_nz(data):
     """Return True when the person's first listed address has country "NZ"."""
     try:
         first_address = data['person']['addresses']['address'][0]
         return first_address['country']['value'] == "NZ"
     except missing_data:
         # person address doesn't exist
         return False


 def _affiliated_with_nz(data, group, summary):
     """Return True when any affiliation under data['activities-summary'][group][summary]
     belongs to an organization whose address country is "NZ".

     Each entry is examined on its own, so one entry with no organization
     address does not hide an NZ affiliation further down the list.
     """
     try:
         entries = data['activities-summary'][group][summary]
     except missing_data:
         # no affiliations of this kind
         return False
     for entry in entries or ():
         try:
             if entry['organization']['address']['country'] == "NZ":
                 return True
         except missing_data:
             continue
     return False


 # Both handles are closed by the with-statement even if the run is interrupted.
 with tarfile.open(tarpath, 'r:gz') as tar, open(errorlogpath, 'w') as errorlog:

     for i, tar_info in enumerate(tar, 1):

         # tarfile remembers every member it has read in tar.members; emptying
         # the list on each pass stops it growing with the size of the archive
         tar.members = []
         filename = tar_info.name

         # can stop when we hit the xml
         if filename.endswith('.xml'):
             break
         if not filename.endswith('.json'):
             continue

         # counter is just to monitor progress
         if i % 1000 == 0:
             print('%d %s' % (i, filename))

         filename = filename.split("/")[-1]
         tarjsonfile = tar.extractfile(tar_info).read()

         try:
             # parsing happens inside the try so that a malformed record is
             # logged and saved for inspection instead of ending the whole run
             data = json.loads(tarjsonfile)

             to_write = (
                 _person_in_nz(data)
                 or _affiliated_with_nz(data, 'educations', 'education-summary')
                 or _affiliated_with_nz(data, 'employments', 'employment-summary')
             )

             print(filename + ', ' + str(int(to_write)))

             if to_write:
                 # Somewhere an NZ affiliation has been found; binary mode
                 # copies the record exactly as it is stored in the archive
                 with open(os.path.join(write_dir, filename), 'wb') as outfile:
                     outfile.write(tarjsonfile)

         except Exception:
             # best effort: note the record and carry on with the next one
             # (Exception rather than a bare except, so Ctrl-C still stops us)
             errorlog.write('Problem with: ' + filename + '\n')
             with open(os.path.join(error_dir, filename), 'wb') as outfile:
                 outfile.write(tarjsonfile)
	# -*- coding: utf-8 -*-
	"""
	Created on Wed Jul 05 11:53:20 2017

	Pull the New Zealand records out of an ORCID public data file.

	Can json do the same tricks as xml, so the result can be pulled into mongodb
	for reporting?  ORCID public data files are tar.gz archives with all the json
	stored serially before the xml, so reading only the json speeds the process
	up a wee bit.  The real question is whether the json library is quicker with
	json than ETree is with xml.
	NB: version 2.0 of ORCID schema

	A record is copied to the write folder when the person's first address, or
	any education or employment affiliation, has country "NZ".  Records that
	cannot be processed are named in the error log and copied to the error
	folder.

	@author: Jason
	"""
	import json
	import os
	import tarfile

	tarpath = 'D:/DATA/json_play.tar.gz'
	errorlogpath = 'D:/DATA/json_play.error.log'
	write_dir = 'D:/DATA/json_write'
	error_dir = 'D:/DATA/json_error'

	# What a lookup raises when an optional part of a record is absent, null or
	# an empty list.  Catching only these keeps genuine failures visible.
	missing_data = (KeyError, IndexError, TypeError)


	def _person_in_nz(data):
	    """Return True when the person's first listed address has country "NZ"."""
	    try:
	        first_address = data['person']['addresses']['address'][0]
	        return first_address['country']['value'] == "NZ"
	    except missing_data:
	        # person address doesn't exist
	        return False


	def _affiliated_with_nz(data, group, summary):
	    """Return True when any affiliation under data['activities-summary'][group][summary]
	    belongs to an organization whose address country is "NZ".

	    Each entry is examined on its own, so one entry with no organization
	    address does not hide an NZ affiliation further down the list.
	    """
	    try:
	        entries = data['activities-summary'][group][summary]
	    except missing_data:
	        # no affiliations of this kind
	        return False
	    for entry in entries or ():
	        try:
	            if entry['organization']['address']['country'] == "NZ":
	                return True
	        except missing_data:
	            continue
	    return False


	# Both handles are closed by the with-statement even if the run is interrupted.
	with tarfile.open(tarpath, 'r:gz') as tar, open(errorlogpath, 'w') as errorlog:

	    for i, tar_info in enumerate(tar, 1):

	        # tarfile remembers every member it has read in tar.members; emptying
	        # the list on each pass stops it growing with the size of the archive
	        tar.members = []
	        filename = tar_info.name

	        # can stop when we hit the xml
	        if filename.endswith('.xml'):
	            break
	        if not filename.endswith('.json'):
	            continue

	        # counter is just to monitor progress
	        if i % 1000 == 0:
	            print('%d %s' % (i, filename))

	        filename = filename.split("/")[-1]
	        tarjsonfile = tar.extractfile(tar_info).read()

	        try:
	            # parsing happens inside the try so that a malformed record is
	            # logged and saved for inspection instead of ending the whole run
	            data = json.loads(tarjsonfile)

	            to_write = (
	                _person_in_nz(data)
	                or _affiliated_with_nz(data, 'educations', 'education-summary')
	                or _affiliated_with_nz(data, 'employments', 'employment-summary')
	            )

	            print(filename + ', ' + str(int(to_write)))

	            if to_write:
	                # Somewhere an NZ affiliation has been found; binary mode
	                # copies the record exactly as it is stored in the archive
	                with open(os.path.join(write_dir, filename), 'wb') as outfile:
	                    outfile.write(tarjsonfile)

	        except Exception:
	            # best effort: note the record and carry on with the next one
	            # (Exception rather than a bare except, so Ctrl-C still stops us)
	            errorlog.write('Problem with: ' + filename + '\n')
	            with open(os.path.join(error_dir, filename), 'wb') as outfile:
	                outfile.write(tarjsonfile)