Created
October 1, 2017 21:58
-
-
Save alpiepho/64bdfaea76a5b872c23aa68e4df37fc9 to your computer and use it in GitHub Desktop.
Python - tool to parse THE ONE FILE and pull out data as a CSV file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
import getopt
import os
import sys
def Usage():
    """Print the command-line usage summary to standard output.

    The program name shown is taken from ``sys.argv[0]``.
    """
    print("Usage: %s -i <file> -o <file>" % sys.argv[0])
    print(" -i <file> Input HTM file")
    print(" -o <file> Output CSV file")
def getTDStr(contents, index, startTag):
    """Extract the raw text of the next table cell opened by ``startTag``.

    Searches ``contents`` from ``index`` for ``startTag``, skips to the
    ``TAG_CLOSE`` character that ends the opening tag, and takes everything
    up to the following ``TD_CLOSE``.

    Args:
        contents: The full HTML text being scanned.
        index: Offset in ``contents`` at which to start searching.
        startTag: Prefix of the opening ``<td ...`` tag to look for.

    Returns:
        A tuple ``(index, text)``: the offset of the closing ``TD_CLOSE`` and
        the text between the opening tag and that close.  When the opening
        tag or its terminating ``TAG_CLOSE`` cannot be found, the result is
        ``(len(contents), "")``.  When only the closing ``TD_CLOSE`` is
        missing, the rest of ``contents`` is returned as the text together
        with ``len(contents)``.  The returned index is therefore never
        negative, so a caller that keeps scanning from it cannot be sent
        back to the start of the input.
    """
    tagStart = contents.find(startTag, index)
    if tagStart < 0:
        return len(contents), ""

    tagEnd = contents.find(TAG_CLOSE, tagStart)
    if tagEnd < 0:
        # Opening tag is truncated: there is no cell body to extract.
        return len(contents), ""

    textStart = tagEnd + 1
    textEnd = contents.find(TD_CLOSE, textStart)
    if textEnd < 0:
        # Unterminated cell: treat the end of the input as the cell end.
        return len(contents), contents[textStart:]

    return textEnd, contents[textStart:textEnd]
def outputString(outFp, str):
    """Write ``str`` to ``outFp`` as one double-quoted CSV field on its own line.

    Args:
        outFp: Writable file-like object (anything with ``write``).
        str: The field text.  The parameter name shadows the builtin and is
            kept only so existing callers keep working.

    Embedded double quotes are doubled (``"`` -> ``""``) so the quoted field
    stays well-formed when the value itself contains a quote.
    """
    escaped = str.replace("\"", "\"\"")
    outFp.write("\"" + escaped + "\"" + "\n")
def outputList(outFp, list):
    """Write the items of ``list`` to ``outFp`` as one CSV row.

    Args:
        outFp: Writable file-like object (anything with ``write``).
        list: Sequence of strings, one per column.  The parameter name
            shadows the builtin and is kept only for caller compatibility.

    Every item is wrapped in double quotes, embedded double quotes are
    doubled, items are joined with commas and the row is terminated with a
    newline.  An empty sequence produces an empty line.  The caller's
    sequence is left unmodified.
    """
    quoted = ["\"" + value.replace("\"", "\"\"") + "\"" for value in list]
    outFp.write(",".join(quoted) + "\n")
# Markers used by getTableIndexes() to locate the report table and the
# data section inside it.
TABLE_OPENTAG = "<table id=\"isi-report\">"
TABLE_CLOSETAG = "</table>"
TR_DATA_HEADER = "<tr class=\"dataHeader"
# An empty single-cell row; it marks the end of the data section and is
# emitted as an empty field by getGroupInfo().
BLANK_TRTDTDTR = "<tr><td></td></tr>"
def getTableIndexes(contents):
    """Locate the report table and its data section inside ``contents``.

    Args:
        contents: The full HTML text being scanned.

    Returns:
        A tuple ``(tableOpenIndex, tableCloseIndex, dataOpenIndex,
        dataCloseIndex)`` of offsets into ``contents``:

        * ``tableOpenIndex``  - first ``TABLE_OPENTAG``
        * ``tableCloseIndex`` - first ``TABLE_CLOSETAG`` at or after the
          table open
        * ``dataOpenIndex``   - first ``TR_DATA_HEADER`` at or after the
          table open
        * ``dataCloseIndex``  - first ``BLANK_TRTDTDTR`` at or after the
          data open

        Any marker that cannot be found is reported as ``-1``.
    """
    tableOpenIndex = contents.find(TABLE_OPENTAG)

    # Search for the remaining markers from the table start so that markup
    # appearing before the report table cannot be matched by mistake.  A
    # missing table (-1) falls back to searching the whole text.
    searchFrom = max(tableOpenIndex, 0)
    tableCloseIndex = contents.find(TABLE_CLOSETAG, searchFrom)
    dataOpenIndex = contents.find(TR_DATA_HEADER, searchFrom)

    if dataOpenIndex < 0:
        # No data section, hence no end-of-data marker either.
        dataCloseIndex = -1
    else:
        dataCloseIndex = contents.find(BLANK_TRTDTDTR, dataOpenIndex)

    return tableOpenIndex, tableCloseIndex, dataOpenIndex, dataCloseIndex
# Opening-tag prefixes of the cells that getGroupInfo() extracts.
TD_ISI_GROUP = "<td isi-group="
TD_ISI_GROUP_MEMBER = "<td isi-group-member="
# Delimiters used by getTDStr() to find the end of an opening tag and the
# end of a cell.
TD_CLOSE = "</td>"
TAG_CLOSE = ">"
# Span markers; not referenced by the code visible in this file.
SPAN_LABEL = "<span isi-label=\"\">"
SPAN_VALUE = "<span isi-value=\"\">"
SPAN_CLOSE = "</span>"
def getGroupInfo(outFp, contents, startIndex, endIndex):
    """Emit the group cells found between two offsets, one CSV line each.

    Walks ``contents`` one character at a time from ``startIndex`` up to
    (not including) ``endIndex``:

    * a ``BLANK_TRTDTDTR`` row is written as an empty quoted field;
    * a cell opened by ``TD_ISI_GROUP`` or ``TD_ISI_GROUP_MEMBER`` has its
      text written as a quoted field, and scanning resumes after that
      cell's closing tag.

    Args:
        outFp: Writable file-like object receiving the CSV lines.
        contents: The full HTML text being scanned.
        startIndex: Offset at which scanning starts.
        endIndex: Offset at which scanning stops.

    Returns:
        The offset at which scanning stopped.
    """
    index = startIndex
    while index < endIndex:
        if contents.startswith(BLANK_TRTDTDTR, index):
            outputString(outFp, "")
        else:
            for cellTag in (TD_ISI_GROUP, TD_ISI_GROUP_MEMBER):
                if contents.startswith(cellTag, index):
                    cellEnd, text = getTDStr(contents, index, cellTag)
                    # Only accept a result that moves forward.  A cell with
                    # no closing tag would otherwise rewind the scan and
                    # loop forever; such a cell is skipped instead.
                    if cellEnd > index:
                        outputString(outFp, text)
                        index = cellEnd
                    break
        index += 1
    return index
# Opening-tag prefixes of the cells that getDataAndValues() collects, and
# the row terminator that flushes a collected row.
TD_DATA_HEADER = "<td isi-data-column-header="
# Data-row marker; not referenced by the code visible in this file.
TR_DATA = "<tr class=\"data\""
TR_CLOSETAG = "</tr>"
TD_DATA1 = "<td class="
TD_DATAN = "<td>"
def getDataAndValues(outFp, contents, startIndex, endIndex):
    """Emit the data section between two offsets as CSV rows.

    Walks ``contents`` one character at a time from ``startIndex`` up to
    (not including) ``endIndex``.  The text of every cell opened by
    ``TD_DATA_HEADER``, ``TD_DATA1`` or ``TD_DATAN`` is collected into the
    current row; each ``TR_CLOSETAG`` writes the collected row with
    ``outputList`` and starts a new one.  Cells collected after the last
    ``TR_CLOSETAG`` are not written.

    Args:
        outFp: Writable file-like object receiving the CSV rows.
        contents: The full HTML text being scanned.
        startIndex: Offset at which scanning starts.
        endIndex: Offset at which scanning stops.

    Returns:
        The offset at which scanning stopped (same convention as
        ``getGroupInfo``).
    """
    index = startIndex
    line = []
    while index < endIndex:
        if contents.startswith(TR_CLOSETAG, index):
            outputList(outFp, line)
            line = []
        else:
            for cellTag in (TD_DATA_HEADER, TD_DATA1, TD_DATAN):
                if contents.startswith(cellTag, index):
                    cellEnd, text = getTDStr(contents, index, cellTag)
                    # Only accept a result that moves forward.  A cell with
                    # no closing tag would otherwise rewind the scan and
                    # loop forever; such a cell is skipped instead.
                    if cellEnd > index:
                        line.append(text)
                        index = cellEnd
                    break
        index += 1
    return index
def main():
    """Parse the command line and convert the input HTM report to CSV.

    Options:
        -i <file>  input HTM file (required)
        -o <file>  output CSV file (required)
        -h         print usage and exit with status 0

    Exits with status 2 when the options cannot be parsed or a required
    option is missing.
    """
    inFilename = ''
    outFilename = ''

    try:
        # process command arguments
        ouropts, _ = getopt.getopt(sys.argv[1:], "i:o:h")
    except getopt.GetoptError as e:
        print(str(e))
        Usage()
        sys.exit(2)

    for o, a in ouropts:
        if o == '-i':
            inFilename = a
        elif o == '-o':
            outFilename = a
        elif o == '-h':
            Usage()
            sys.exit(0)

    # A missing required option is an error, so report a non-zero status.
    if not inFilename:
        print("please use -i for input HTM log file")
        Usage()
        sys.exit(2)
    if not outFilename:
        print("please use -o for output CSV log file")
        Usage()
        sys.exit(2)

    # Read the input before touching the output so that an unreadable input
    # file does not leave an empty output file behind.  Text mode keeps the
    # string-based helpers working on Python 3 as well as Python 2.
    with open(inFilename, 'r') as inFp:
        contents = inFp.read()

    tableOpenIndex, tableCloseIndex, dataOpenIndex, dataCloseIndex = getTableIndexes(contents)

    with open(outFilename, 'w') as outFp:
        # Group cells before the data, the data rows, then group cells after.
        getGroupInfo(outFp, contents, tableOpenIndex, dataOpenIndex)
        getDataAndValues(outFp, contents, dataOpenIndex, dataCloseIndex)
        getGroupInfo(outFp, contents, dataCloseIndex, tableCloseIndex)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment