Python - tool to expand THE ONE FILE columns and rows (with adjusted dates)
#!/usr/bin/env python3
# Expand an HTM log's columns and rows, adjusting the dates on duplicated rows.
from datetime import datetime
import getopt
import sys
# Markers used to locate header cells, data rows, and date cells in the HTM log
TD_HEADER = "<td isi-data-column-header"
TR_DATA = "<tr class=\"data\" isi-data-row"
TR_DATAMARKED = "<tr class=\"data-marked\" isi-data-row"
TD_DATE = "<td class=\"dateTime\">"
TD_OPENTAG = "<td>"
TD_CLOSETAG = "</td>"
TR_CLOSETAG = "</tr>"
def Usage():
    print("Usage: %s -i <file> -o <file> -c <num> -r <num> -m <num>" % sys.argv[0])
    print("  -i <file>   Input HTM file")
    print("  -o <file>   Output HTM file")
    print("  -c <num>    Duplicate columns N times")
    print("  -r <num>    Duplicate rows N times")
    print("  -m <num>    Max total data rows (duplication stops once this cap is reached)")
def DataRowToDate(line):
    # Pull the timestamp text out of the row's dateTime cell.
    dateStartIndex = line.find(TD_DATE) + len(TD_DATE)
    dateEndIndex = line.find(TD_CLOSETAG, dateStartIndex)
    dateStr = line[dateStartIndex:dateEndIndex]
    try:
        dto = datetime.strptime(dateStr, '%Y-%m-%d %H:%M:%S.%f')
    except ValueError:
        # no fractional part present; fall back to second resolution
        dto = datetime.strptime(dateStr, '%Y-%m-%d %H:%M:%S')
    return dto
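# For illustration, both of these dateTime cell values parse:
#   "2017-09-29 20:14:03.123" -> fractional branch (%f accepts 1-6 digits)
#   "2017-09-29 20:14:03"     -> fallback branch (second resolution)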
def DataRowReplaceDate(line, dto):
    # Rewrite the row's dateTime cell with the adjusted datetime.
    dateStartIndex = line.find(TD_DATE) + len(TD_DATE)
    dateEndIndex = line.find(TD_CLOSETAG, dateStartIndex)
    dateStr = line[dateStartIndex:dateEndIndex]
    try:
        datetime.strptime(dateStr, '%Y-%m-%d %H:%M:%S.%f')
        useMs = True
    except ValueError:
        useMs = False
    newLine = line[:dateStartIndex]
    if useMs:
        # strftime emits 6 fractional digits; trim the last 3 to keep milliseconds
        newLine += dto.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
    else:
        newLine += dto.strftime('%Y-%m-%d %H:%M:%S')
    # keep the remainder of the row and re-terminate the line
    newLine += line[dateEndIndex:-1]
    newLine += "\n"
    return newLine
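# For illustration: strftime's %f always emits 6 digits, so 20:14:03.123 formats
# as "20:14:03.123000" and the [:-3] slice trims it back to "20:14:03.123".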
inFilename = ''
outFilename = ''
dupColumns = 1
dupRows = 1
dupRowMax = 100000000000  # effectively no limit unless -m is given
try:
    # process command arguments
    ouropts, args = getopt.getopt(sys.argv[1:], "i:o:c:r:m:h")
    for o, a in ouropts:
        if o == '-i':
            inFilename = a
        elif o == '-o':
            outFilename = a
        elif o == '-c':
            dupColumns = int(a)
        elif o == '-r':
            dupRows = int(a)
        elif o == '-m':
            dupRowMax = int(a)
        elif o == '-h':
            Usage()
            sys.exit(0)
except getopt.GetoptError as e:
    print(str(e))
    Usage()
    sys.exit(2)
if not inFilename:
    print("please use -i for input HTM log file")
    Usage()
    sys.exit(2)
if not outFilename:
    print("please use -o for output HTM log file")
    Usage()
    sys.exit(2)
actualColumns = 0
actualRows = 0
with open(outFilename, 'w') as outFp:
    with open(inFilename, 'r') as inFp:
        contents = inFp.readlines()
        total = len(contents)
        newContents = []
        # adjust all data columns first
        row = 0
        while row < total:
            # duplicate data columns (log has 1 tr per full data row, with multiple tds)
            if contents[row].find(TR_DATA) >= 0 or contents[row].find(TR_DATAMARKED) >= 0:
                line = contents[row]
                # slice between the first plain <td> (the dateTime td is skipped) and </tr>
                rangeDataStart = line.find(TD_OPENTAG)
                rangeDataEnd = line.rfind(TR_CLOSETAG)
                newLine = line[:rangeDataEnd]
                for i in range(1, dupColumns):
                    newLine += line[rangeDataStart:rangeDataEnd]
                newLine += TR_CLOSETAG + "\n"
                contents[row] = newLine
            row += 1
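        # For illustration with -c 3 (hypothetical row): a data row like
        #   <tr class="data" isi-data-row ...><td class="dateTime">t0</td><td>a</td><td>b</td></tr>
        # keeps its single dateTime cell but carries the <td>a</td><td>b</td> run three times in total.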
        # adjust header columns and data rows in the same loop
        rangeHeaderStart = -1
        rangeHeaderEnd = -1
        rangeDataStart = -1
        rangeDataEnd = -1
        row = 0
        while row < total:
            # duplicate header columns (log has 1 td/column per row)
            # start header td range
            if contents[row].find(TD_HEADER) >= 0 and rangeHeaderStart == -1:
                rangeHeaderStart = row + 1  # skip Date/Time
            # track header td range end
            if contents[row].find(TD_HEADER) >= 0 and rangeHeaderStart != -1:
                rangeHeaderEnd = row
            # header range done, duplicate header columns
            if rangeHeaderStart != -1 and contents[row].find(TD_HEADER) == -1:
                for i in range(1, dupColumns):
                    for j in range(rangeHeaderStart, rangeHeaderEnd + 1):
                        newContents.append(contents[j])
                # each copy holds (end - start + 1) non-date columns, plus 1 for Date/Time
                actualColumns = dupColumns * (rangeHeaderEnd - rangeHeaderStart + 1) + 1
                rangeHeaderStart = -1
                rangeHeaderEnd = -1
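            # For illustration with -c 3 (hypothetical header): cells A, B after
            # Date/Time come out as Date/Time, A, B, A, B, A, B -- the original
            # tds pass through the append at the bottom of this loop, and the
            # extra copies are emitted by the block above when the range closes.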
            # start data tr range
            if (contents[row].find(TR_DATA) >= 0 or contents[row].find(TR_DATAMARKED) >= 0) and rangeDataStart == -1:
                rangeDataStart = row
            # track data tr range end
            if (contents[row].find(TR_DATA) >= 0 or contents[row].find(TR_DATAMARKED) >= 0) and rangeDataStart != -1:
                rangeDataEnd = row
            # data range done, duplicate data rows
            if rangeDataStart != -1 and contents[row].find(TR_DATA) == -1 and contents[row].find(TR_DATAMARKED) == -1:
                # delta covers the whole range (dto3 - dto1) plus one row interval
                # (dto2 - dto1), so each duplicated set lands after the previous one
                dto1 = DataRowToDate(contents[rangeDataStart])
                dto2 = DataRowToDate(contents[rangeDataStart + 1])
                dto3 = DataRowToDate(contents[rangeDataEnd])
                rangeDelta = (dto3 - dto1) + (dto2 - dto1)
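                # Illustrative numbers: rows stamped 10:00:00 .. 10:00:09 at 1s
                # intervals give rangeDelta = 9s + 1s = 10s, so copy i is shifted
                # by i*10s and duplicated timestamps never collide with the originals.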
                totalRows = rangeDataEnd - rangeDataStart + 1
                for i in range(1, dupRows):
                    for j in range(rangeDataStart, rangeDataEnd + 1):
                        line = contents[j]
                        # adjust date/time for repeated row sets
                        dto = DataRowToDate(line)
                        dto += i * rangeDelta
                        line = DataRowReplaceDate(line, dto)
                        # add the duplicated row
                        newContents.append(line)
                        totalRows += 1
                        if totalRows > dupRowMax:
                            break
                    if totalRows > dupRowMax:
                        break
                actualRows = totalRows
                rangeDataStart = -1
                rangeDataEnd = -1
            newContents.append(contents[row])
            row += 1
        outFp.write(''.join(newContents))
print("actual columns: {}".format(actualColumns))
print("actual rows: {}".format(actualRows))
if actualRows > 0:
    print("\n(hint: current max rows: {} -> row dup num {})".format(dupRowMax, dupRowMax // actualRows))