Created
August 21, 2012 23:05
-
-
Save jackiekazil/3420186 to your computer and use it in GitHub Desktop.
Simple demo of map_xml feature of pymarc library & comparing country value in folder of marc xml to records in django db
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
Demo of an implementation of the pymarc library found here:
http://pypi.python.org/pypi/pymarc/
What does this script do?
Returns a list of country counts in a folder of MARC XML records and the
corresponding records that exist in the current database. This is used to help
understand how close the MARC XML records are to the existing ChronAm dataset.
To run this file: python extract_cntry_from_worldcatrecs.py $path_to_bib_folder
Example: python extract_cntry_from_worldcatrecs.py 'bib_recs/bib'
#TODO: Make this more generic to use with other fields.
'''
import glob | |
import operator | |
import os | |
import sys | |
from pymarc import map_xml | |
from django.db.models import Count | |
from chronam.core.models import Country | |
# FOLDER should be the location of the marcxml files. It is read from the
# first command-line argument at import time, so loading this module without
# that argument raises IndexError.
FOLDER = sys.argv[1]
# Running tally of country codes seen in the MARC XML records, keyed by the
# stripped code string and holding the number of records carrying that code.
marc_countries = {}


def get_record_cntry(record):
    """Count one record's country code in the module-level ``marc_countries``.

    The code is read from characters 15-17 of the record's ``008`` field
    data, with surrounding whitespace stripped. Records from which that value
    cannot be read are silently skipped so one bad record does not abort the
    whole run.

    Args:
        record: an object supporting ``record['008']`` whose result exposes a
            ``data`` string (a pymarc record when called through ``map_xml``).

    Returns:
        None. The result is the side effect on ``marc_countries``.
    """
    try:
        country_code = record['008'].data[15:18].strip()
    except (TypeError, AttributeError, KeyError):
        # TypeError: the record itself (or the field's data) is None.
        # AttributeError: the '008' lookup returned None, so there is no .data.
        # KeyError: the '008' lookup raised for a missing field.
        # NOTE(review): which of the last two applies depends on the pymarc
        # version in use; both are ignored here, like the None record.
        return

    marc_countries[country_code] = marc_countries.get(country_code, 0) + 1
if __name__ == '__main__':
    # Step 1: tally the country code of every record in every *.xml file in
    # FOLDER. The working directory is changed so the relative glob pattern
    # and the relative paths it returns both resolve inside FOLDER.
    os.chdir(FOLDER)
    for xml_path in glob.glob("*.xml"):
        # Use a context manager so each file handle is closed as soon as it
        # has been parsed, instead of leaking one handle per file.
        with open(xml_path, 'r') as xml_file:
            map_xml(get_record_cntry, xml_file)

    # Step 2: output and compare to existing records, as comma-separated
    # rows. print(...) with a single argument emits the same text whether it
    # is parsed as the Python 2 statement or the Python 3 function.
    print(', '.join(['Code', 'Name', '# of recs from OCLC', '# of titles in db', 'difference']))

    # One row per country in the database, annotated with its title count.
    db_countries = Country.objects.annotate(title_count=Count('title'))
    for cntry in db_countries:
        db_count = cntry.title_count
        if not db_count:
            # Countries with no titles in the database are not reported.
            continue

        # Codes never seen in the MARC XML files count as zero records.
        marcxml_count = marc_countries.get(cntry.code, 0)
        diff = marcxml_count - db_count
        output = [cntry.code, cntry.name, str(marcxml_count), str(db_count), str(diff)]
        print(', '.join(output))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment