Last active
November 9, 2015 19:46
-
-
Save lokal-profil/f52b92fac9b9badd3d77 to your computer and use it in GitHub Desktop.
LSH: Script for a one-time replacement of upper case connections in Materials, Keywords and ObjKeywords
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/python | |
| # -*- coding: utf-8 -*- | |
| # | |
| # Cleanup function for a one-time replacement of upper case connections | |
| # in Materials, Keywords and ObjKeywords | |
| # | |
| import helpers | |
| import codecs | |
| import os | |
def run(inPath=u'old_connections', outPath=u'old_connections'):
    """
    Lower-case and merge the entries of the three connection files.

    Each of commons-Keywords.csv, commons-Materials.csv and
    commons-ObjKeywords.csv is read from inPath, passed through cleanup()
    and written under the same filename in outPath, in the order returned
    by helpers.sortedBy(). With the default arguments the files are
    rewritten in place.

    :param inPath: directory containing the connection files to read
    :param outPath: directory to write the cleaned files to, created
        (including intermediate directories) if it does not exist
    :return: None
    """
    files = {'keywords': {'filename': 'commons-Keywords.csv', 'cols': 4},
             'material': {'filename': 'commons-Materials.csv', 'cols': 3},
             'objKeywords': {'filename': 'commons-ObjKeywords.csv', 'cols': 3}}
    if not os.path.isdir(outPath):
        os.makedirs(outPath)

    for k, v in files.items():
        print('Processing %s...' % k)

        # Read the whole file and close it before anything is written;
        # inPath and outPath may point at the same file.
        with codecs.open(os.path.join(inPath, v['filename']),
                         'r', 'utf-8') as f:
            lines = f.read().split('\n')
        # Only drop the empty string left behind by a trailing newline,
        # never a real record from a file lacking that final newline.
        if not lines[-1]:
            lines.pop()

        # Process (and possibly prompt the user) before the output file is
        # truncated, so that a failure here leaves the existing file intact.
        entries = cleanup(lines, v['cols'])

        with codecs.open(os.path.join(outPath, v['filename']),
                         'w', 'utf-8') as f:
            for key, val in helpers.sortedBy(entries):
                if v['cols'] == 3:
                    f.write(u'*%s|%s|%s\n' % (key, val['freq'],
                                              val['connection']))
                else:
                    f.write(u'*%s|%s|%s|%s\n' % (key, val['freq'],
                                                 ' '.join(val['extra']),
                                                 val['connection']))
def cleanup(lines, cols):
    """
    Given the lines of a connection file
    * convert the keys to lower case
    * merge any duplicates (frequencies are summed, extras are unioned)
    * resolve any discrepancies between connections via resolve()
    and return a dict

    Each line is expected to look like "*key|freq|connection" (3 columns)
    or "*key|freq|extra extra ...|connection" (4 columns). Blank lines
    are skipped.

    :param lines: the lines to process
    :param cols: the number of columns to expect (4 enables the extra
        column, any other value is treated as the 3 column layout)
    :return: dict mapping lower-cased key to a dict with the entries
        'freq' (int), 'connection' (str) and 'extra' (list of str for
        4 columns, empty string otherwise)
    """
    d = {}
    for line in lines:
        if not line.strip():
            # nothing to parse, e.g. a stray empty line in the file
            continue

        # the first character is the leading marker ("*") of each row
        parts = line[1:].split('|')
        key = parts[0].lower()
        freq = int(parts[1])
        if cols == 4:
            # drop empty tokens so that an empty column gives no extras
            # rather than a single empty-string extra
            extra = [e for e in parts[2].split(' ') if e]
            connection = parts[3]
        else:
            extra = ''
            connection = parts[2]

        if key not in d:
            d[key] = {'freq': freq, 'connection': connection, 'extra': extra}
            continue

        # duplicate key (after lower-casing): fold into the existing entry
        entry = d[key]
        entry['freq'] += freq
        if connection != entry['connection']:
            entry['connection'] = resolve(entry['connection'],
                                          connection, key)
        for e in extra:
            if e not in entry['extra']:
                entry['extra'].append(e)
    return d
def resolve(oldVal, newVal, key=None):
    """
    Decide which of two differing values for the same key to keep.

    An empty value always loses to the other one. Otherwise the user is
    prompted, repeatedly until a valid answer is given, to keep the old
    value, take the new value or type in a replacement.

    :param oldVal: the existing value
    :param newVal: the newer value
    :param key: optional key, shown above the prompt when truthy
    :return: str
    """
    # nothing to decide when either side is empty
    if not len(oldVal):
        return newVal
    if not len(newVal):
        return oldVal

    while True:
        if key:
            print('(%s)' % key)
        print(u"1. %s\n2. %s\n3. add new\n" % (oldVal, newVal))
        answer = helpers.convertFromCommandline(raw_input("Which?: "))
        if answer == '1':
            return oldVal
        if answer == '2':
            return newVal
        if answer == '3':
            replacement = raw_input(u"enter new value (be careful): ")
            return helpers.convertFromCommandline(replacement)
        # any other answer: ask again
def sortedBy(dDict, sortkey=u'freq'):
    """
    Given a dictionary of dictionaries this returns a list of its
    (key, value) pairs ordered by decreasing value of the sortkey,
    with ties ordered by decreasing key.

    :param dDict: dict whose values are dicts containing sortkey
    :param sortkey: the entry of each value to sort by
    :return: list of (key, value) tuples
    """
    # A plain one-argument lambda and items() behave the same as the
    # tuple-unpacking lambda and iteritems() but also run on Python 3.
    return sorted(dDict.items(),
                  key=lambda item: (item[1][sortkey], item[0]),
                  reverse=True)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment