Skip to content

Instantly share code, notes, and snippets.

@lokal-profil
Last active November 9, 2015 19:46
Show Gist options
  • Select an option

  • Save lokal-profil/f52b92fac9b9badd3d77 to your computer and use it in GitHub Desktop.

Select an option

Save lokal-profil/f52b92fac9b9badd3d77 to your computer and use it in GitHub Desktop.
LSH: Script for a one-time replacement of upper case connections in Materials, Keywords and ObjKeywords
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Cleanup function for a one-time replacement of upper case connections
# in Materials, Keywords and ObjKeywords
#
import helpers
import codecs
import os
def run(inPath=u'old_connections', outPath=u'old_connections'):
    """
    Rewrite the three connection files with lower-cased, merged keys.

    Each of commons-Keywords.csv, commons-Materials.csv and
    commons-ObjKeywords.csv is read from inPath, passed through cleanup()
    and written to outPath in the order given by helpers.sortedBy.
    :param inPath: directory containing the files to process
    :param outPath: directory to write the results to, created if it
        does not exist; may be the same directory as inPath
    :return: None
    """
    files = {'keywords': {'filename': 'commons-Keywords.csv', 'cols': 4},
             'material': {'filename': 'commons-Materials.csv', 'cols': 3},
             'objKeywords': {'filename': 'commons-ObjKeywords.csv',
                             'cols': 3}}
    if not os.path.isdir(outPath):
        # makedirs also copes with a nested output path
        os.makedirs(outPath)
    for k, v in files.items():
        print('Processing %s...' % k)
        with codecs.open(os.path.join(inPath, v['filename']),
                         'r', 'utf-8') as f:
            lines = f.read().split('\n')
        # Only drop the empty element left behind by a trailing newline;
        # a file without one would otherwise lose its last record.
        if lines and not lines[-1]:
            lines.pop()
        # Process BEFORE opening the output: with the default arguments
        # input and output are the same file, and opening it for writing
        # truncates it. Failing inside cleanup() must not destroy the data.
        entries = cleanup(lines, v['cols'])
        with codecs.open(os.path.join(outPath, v['filename']),
                         'w', 'utf-8') as f:
            for key, val in helpers.sortedBy(entries):
                if v['cols'] == 3:
                    f.write(u'*%s|%s|%s\n' % (key, val['freq'],
                                              val['connection']))
                else:
                    f.write(u'*%s|%s|%s|%s\n' % (key, val['freq'],
                                                 ' '.join(val['extra']),
                                                 val['connection']))
def cleanup(lines, cols):
    """
    Given the lines of a connection file
    * convert the keys to lower case
    * merge any duplicates (frequencies are summed, extras are unioned)
    * resolve any discrepancies between connections via resolve()
    and return a dict

    Each line is expected to look like ``*key|freq|connection`` (3 columns)
    or ``*key|freq|extra|connection`` (4 columns), where extra is a
    space-separated list. Blank lines are skipped.
    :param lines: the lines to process
    :param cols: the number of columns to expect (4, anything else is
        treated as 3)
    :return: dict mapping lower-cased key to a dict with the entries
        'freq' (int), 'connection' and 'extra' (a list for 4 columns,
        an empty string for 3 columns)
    """
    d = {}
    for line in lines:
        # tolerate blank lines instead of crashing on int(parts[1])
        if not line.strip():
            continue
        # the first character is the leading '*' marker
        parts = line[1:].split('|')
        key = parts[0].lower()
        freq = int(parts[1])
        extra = ''
        if cols == 4:
            # drop empty tokens so that an empty column does not turn
            # into a stray leading space once the extras are re-joined
            extra = [e for e in parts[2].split(' ') if e]
            connection = parts[3]
        else:
            connection = parts[2]

        # membership test on the dict itself; d.keys() builds a list on
        # every iteration under Python 2
        if key not in d:
            d[key] = {'freq': freq, 'connection': connection,
                      'extra': extra}
            continue

        entry = d[key]
        entry['freq'] += freq
        if connection != entry['connection']:
            entry['connection'] = resolve(entry['connection'],
                                          connection, key)
        for e in extra:
            if e not in entry['extra']:
                entry['extra'].append(e)
    return d
def resolve(oldVal, newVal, key=None):
    """
    Decide which value to keep when one key carries two different values.

    An empty value always loses to the other one without any question
    being asked. Otherwise the two candidates are printed and the user is
    prompted on the command line until option 1, 2 or 3 is entered.
    :param oldVal: the existing value
    :param newVal: the newer value
    :param key: optional key, printed above the options when truthy
    :return: oldVal, newVal or, for option 3, the newly entered value as
        returned by helpers.convertFromCommandline
    """
    # nothing to decide when either side is empty
    if not len(oldVal):
        return newVal
    if not len(newVal):
        return oldVal

    # keep asking until a recognised option is entered
    while True:
        if key:
            print('(%s)' % key)
        print(u"1. %s\n2. %s\n3. add new\n" % (oldVal, newVal))
        answer = helpers.convertFromCommandline(raw_input("Which?: "))
        if answer == '1':
            return oldVal
        if answer == '2':
            return newVal
        if answer == '3':
            typed = raw_input(u"enter new value (be careful): ")
            return helpers.convertFromCommandline(typed)
def sortedBy(dDict, sortkey=u'freq'):
    """
    Given a dictionary of dictionaries this returns a list of its
    (key, value) pairs ordered by decreasing value of the sortkey,
    with ties ordered by decreasing key
    :param dDict: the dictionary to sort, each value must itself be
        indexable by sortkey
    :param sortkey: the entry of each value to sort by
    :return: list of (key, value) tuples
    """
    # A plain one-argument lambda replaces the tuple-parameter form
    # (removed from the language by PEP 3113); items() is available on
    # both Python 2 and 3.
    return sorted(dDict.items(),
                  key=lambda item: (item[1][sortkey], item[0]),
                  reverse=True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment