-
-
Save gh4dp/95cfd4d34a10df4e603c8ec6bb32340f to your computer and use it in GitHub Desktop.
xlsx2tsv: python command-line script to convert xlsx (Excel "OOXML") into tab-separated values
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
xlsx2tsv filename.xlsx [sheet number or name] | |
Parse a .xlsx (Excel OOXML, which is not OpenOffice) into tab-separated values. | |
If it has multiple sheets, need to give a sheet number or name. | |
Outputs honest-to-goodness tsv, no quoting or embedded \\n\\r\\t. | |
One reason I wrote this is because Mac Excel 2008 export to csv or tsv messes | |
up encodings, converting everything to something that's not utf8 (macroman | |
perhaps). This script seems to do better. | |
The spec for this format is 5220 pages. I did not use it. This was helpful: | |
http://blogs.msdn.com/excel/archive/2008/08/14/reading-excel-files-from-linux.aspx | |
But mostly I guessed how the format works. So bugs are guaranteed. | |
brendan o'connor - anyall.org - gist.github.com/22764 | |
""" | |
#from __future__ import print_function | |
import xml.etree.ElementTree as ET | |
import os,sys,zipfile,re,itertools | |
def myjoin(seq, sep=" "): | |
" because str.join() is annoying " | |
return sep.join(str(x) for x in seq) | |
args = sys.argv[:] | |
args.pop(0) | |
if args: | |
z = zipfile.ZipFile(args.pop(0)) | |
elif not sys.stdin.isatty(): | |
z = zipfile.ZipFile(sys.stdin) | |
else: | |
print __doc__.strip() | |
sys.exit(1) | |
n=lambda x: "{http://schemas.openxmlformats.org/spreadsheetml/2006/main}%s" % x | |
sheet_filenames = [f for f in z.namelist() if re.search("^xl/worksheets/sheet.*xml$", f)] | |
workbook_x = ET.XML(z.read("xl/workbook.xml")) | |
sheet_xs = workbook_x.find(n("sheets")).findall(n("sheet")) | |
def sheet_report(): | |
global sheet_xs | |
print>>sys.stderr, "Sheets in this file:" | |
for i,x in enumerate(sheet_xs): | |
print>>sys.stderr, "%3d: %s" % (i+1, x.get('name')) | |
sys.exit(1) | |
def sheet_error(msg): | |
print>>sys.stderr, msg | |
sheet_report() | |
if not args and len(sheet_filenames) > 1: | |
sheet_error("There are multiple sheets -- need to specify a sheet number or name.") | |
elif not args and len(sheet_filenames) == 1: | |
sheet_num = 1 | |
elif args: | |
sheet_num = args.pop(0) | |
if isinstance(sheet_num,str) and (not re.search('^[0-9]+$',sheet_num) or int(sheet_num) > len(sheet_filenames)): | |
name = sheet_num | |
inds = [i for i,x in enumerate(sheet_xs) if x.get('name')==name] | |
if not inds: sheet_error("Can't find sheet with name '%s'" % name) | |
if len(inds)>1: sheet_error("Multiple sheets with name '%s'" % name) | |
sheet_num = inds[0] + 1 | |
def letter2col_index(letter): | |
""" A -> 0, B -> 1, Z -> 25, AA -> 26, BA -> 52 """ | |
base26digits = [1+ord(x)-ord("A") for x in letter] | |
return sum([x*26**(len(base26digits) - k - 1) for k,x in enumerate(base26digits)]) - 1 | |
def flatten(iter): | |
return list(itertools.chain(*iter)) | |
def cell2text(cell): | |
if cell is None: | |
return "" | |
elif 't' in cell.attrib and cell.attrib['t'] == 's': | |
# shared string | |
idx = int(cell.find(n("v")).text) | |
si = ss_list[idx] | |
t_elt = si.find(n("t")) | |
if t_elt is not None: | |
return t_elt.text | |
t_elts = si.findall(n("r") + "/" + n("t")) | |
if t_elts: | |
text = "".join( (t.text) for t in t_elts ) | |
return text | |
raise Exception("COULDNT DECODE CELL: %s" % ET.tostring(si)) | |
#return si.find(n("t")).text | |
#return ET.tostring(si) | |
else: | |
v_elt = cell.find(n("v")) | |
if v_elt is None: return "" | |
return v_elt.text | |
ss_xml = z.read("xl/sharedStrings.xml") | |
ss_list = ET.XML(ss_xml).findall(n("si")) | |
xml = z.read("xl/worksheets/sheet%s.xml" % sheet_num) | |
s = ET.fromstring(xml) | |
rows = s.findall(n("sheetData")+"/"+n("row")) | |
all_cells = flatten( [[c for c in row.findall(n("c"))] for row in rows] ) | |
max_col = max(letter2col_index(re.search("^[A-Z]+",c.attrib['r']).group()) for c in all_cells) | |
def make_cells(): | |
return [None] * (max_col+1) | |
warning_count=0 | |
warning_max = 50 | |
def warning(s): | |
global warning_count | |
warning_count += 1 | |
if warning_count > warning_max: return | |
print>>sys.stderr, "WARNING: %s" % s | |
def cell_text_clean(text): | |
s = text.encode("utf-8") | |
if "\t" in s: warning("Clobbering embedded tab") | |
if "\n" in s: warning("Clobbering embedded newline") | |
if "\r" in s: warning("Clobbering embedded carriage return") | |
s = s.replace("\t"," ").replace("\n"," ").replace("\r"," ") | |
return s | |
for row in rows: | |
cells_elts = row.findall(n("c")) | |
inds = [] # parallel | |
for c in cells_elts: | |
letter = re.search("^[A-Z]+", c.attrib['r']).group() | |
inds.append(letter2col_index(letter) ) | |
cells = make_cells() | |
for c,j in zip(cells_elts,inds): | |
cells[j] = c | |
#print( *(cell2text( c ).encode("utf-8").replace("\t"," ") for c in cells), sep="\t") | |
print myjoin((cell_text_clean(cell2text( c )) for c in cells), sep="\t") | |
if warning_count > warning_max: | |
print>>sys.stderr, "%d total warnings, %d hidden" % (warning_count, warning_count-warning_max) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment