Skip to content

Instantly share code, notes, and snippets.

@jl2
Created October 19, 2010 22:51
Show Gist options
  • Save jl2/635319 to your computer and use it in GitHub Desktop.
Save jl2/635319 to your computer and use it in GitHub Desktop.
Merge two or more wiki tables and merge duplicate entries based on 'key' columns in each table
#!/usr/bin/python3
# Script to merge two or more wiki tables and merge duplicate entries based on 'key' columns in each table
# How to use:
# wikimerge.py keys columns files...
# 'keys' is a comma-separated list of zero-based key column indices, one per table (in file order)
# This is the column from each table that will be joined on
# Do not put spaces in the list. Use '1,2,3', not '1, 2, 3'
# 'columns' is a semicolon-separated list of table,column pairs (both zero-based) specifying the output file columns
# Don't put spaces in this list either. Use '1,2;0,1;2,4', not '1,2; 0,1; 2,4'
# 'files' are two or more input files in "wiki table format". Just copy/paste all the lines from a wiki between {| and |}
import sys
def _store_row(rows, row, key_col):
    """File a finished row under its key value; rows sharing a key accumulate.

    Empty rows (e.g. the separator right after the header cells) are ignored.
    Raises KeyError when the row has no cell at index key_col.
    """
    if not row:
        return
    if key_col not in row:
        raise KeyError('row has no key column {}: {!r}'.format(key_col, row))
    rows.setdefault(row[key_col], []).append(row)


def _parse_table(lines, key_col):
    """Parse the lines of one wiki table.

    Returns (columns, rows): columns is the list of header texts in order of
    appearance; rows maps each key value to the list of rows carrying it,
    where a row is a dict of column index -> cell text.
    """
    columns = []
    rows = {}
    row = {}
    col = 0
    for raw in lines:
        # Strip only the line terminator, so a final line without a trailing
        # newline keeps its last character.
        line = raw.rstrip('\n')
        if line.startswith('{|'):
            continue
        if line.startswith(('|-', '|}')):
            # Row separator or end of table: the pending row is complete.
            _store_row(rows, row, key_col)
            row = {}
            col = 0
            continue
        if line.startswith('!'):
            # Header cell "! text". It also advances the column counter, so
            # data cells that follow it in the same row are offset by one.
            columns.append(line[2:])
            col += 1
            continue
        if line.startswith('|'):
            # Data cell "| text".
            row[col] = line[2:]
            col += 1
    # Input pasted without the closing "|}" (or without a trailing "|-")
    # still has one unfinished row pending.
    _store_row(rows, row, key_col)
    return columns, rows


def main(args):
    """Merge wiki tables on their key columns and print the merged table.

    args[0]  -- comma-separated key column index for each input table.
    args[1]  -- semicolon-separated "table,column" index pairs selecting the
                output columns.
    args[2:] -- paths of the input files in wiki table format.

    One output row is printed for every key found in the first table. Each
    output cell holds the comma-joined values of the selected column from all
    rows of the selected table that share that key (empty when there are none).

    Raises ValueError for malformed index arguments, IndexError when fewer keys
    than files are given or a table/header index is out of range, and KeyError
    when a row lacks its key column or a requested output column.
    """
    keys = [int(k) for k in args[0].split(',')]

    out_cols = []
    for spec in args[1].split(';'):
        tab, col = spec.split(',')
        out_cols.append((int(tab), int(col)))

    in_names = args[2:]
    if len(keys) < len(in_names):
        raise IndexError('{} input files but only {} key columns given'.format(
            len(in_names), len(keys)))

    table_columns = []
    tables = []
    for key_col, name in zip(keys, in_names):
        # Each file is parsed with fresh row state and closed even when
        # parsing fails.
        with open(name, "rt") as inf:
            columns, rows = _parse_table(inf, key_col)
        table_columns.append(columns)
        tables.append(rows)

    print('{| class="wikitable sortable"')
    for tab, col in out_cols:
        print('! {}'.format(table_columns[tab][col]))
    print('|-')
    for k in tables[0]:
        for tab, col in out_cols:
            vals = [v[col] for v in tables[tab].get(k, [])]
            print('| {}'.format(','.join(vals)))
        print('|-')
    print('|}')
# Script entry point: run the merge with the command-line arguments,
# leaving out the program name in sys.argv[0].
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    main(cli_args)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment