Skip to content

Instantly share code, notes, and snippets.

@alpiepho
Last active September 29, 2017 20:14
Show Gist options
  • Save alpiepho/f472d2ad9928a8cee9f70466486f5294 to your computer and use it in GitHub Desktop.
Save alpiepho/f472d2ad9928a8cee9f70466486f5294 to your computer and use it in GitHub Desktop.
Python - tool to expand THE ONE FILE columns and rows (with adjusted dates)
#!/usr/bin/python
from datetime import datetime
import getopt
import os
import sys
TD_HEADER = "<td isi-data-column-header"
TR_DATA = "<tr class=\"data\" isi-data-row"
TR_DATAMARKED = "<tr class=\"data-marked\" isi-data-row"
TD_DATE = "<td class=\"dateTime\">"
TD_OPENTAG = "<td>"
TD_CLOSETAG = "</td>"
TR_CLOSETAG = "</tr>"
def Usage():
print("Usage: %s -i <file> -o <file> -c <num> -r <num> -m <num>" % sys.argv[0])
print(" -i <file> Input HTM file")
print(" -o <file> Output HTM file")
print(" -c <num> Duplicate columns N times")
print(" -r <num> Duplicate rows N times")
print(" -m <num> max rows (total will be min of this number and original row count)")
def DataRowToDate(line):
dateStartIndex = line.find(TD_DATE) + len(TD_DATE)
dateEndIndex = line.find(TD_CLOSETAG, dateStartIndex)
dateStr = line[dateStartIndex:dateEndIndex]
try:
dto = datetime.strptime(dateStr, '%Y-%m-%d %H:%M:%S.%f')
except:
dto = datetime.strptime(dateStr, '%Y-%m-%d %H:%M:%S')
return dto
def DataRowReplaceDate(line, dto):
dateStartIndex = line.find(TD_DATE) + len(TD_DATE)
dateEndIndex = line.find(TD_CLOSETAG, dateStartIndex)
try:
temp = datetime.strptime(dateStr, '%Y-%m-%d %H:%M:%S.%f')
useMs = True
except:
useMs = False
newLine = line[:dateStartIndex]
if useMs:
newLine += dto.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
else:
newLine += dto.strftime('%Y-%m-%d %H:%M:%S')
newLine += line[dateEndIndex:-1]
newLine += "\n"
return newLine
inFilename = ''
outFilename = ''
dupColumns = 1
dupRows = 1
dupRowMax = 100000000000
try:
# process command arguments
ouropts, args = getopt.getopt(sys.argv[1:],"i:o:c:r:m:h")
for o, a in ouropts:
if o == '-i':
inFilename = a
elif o == '-o':
outFilename = a
elif o == '-c':
dupColumns = int(a)
elif o == '-r':
dupRows = int(a)
elif o == '-m':
dupRowMax = int(a)
elif o == '-h':
Usage()
sys.exit(0)
except getopt.GetoptError as e:
print(str(e))
Usage()
sys.exit(2)
if type(inFilename) != str or len(inFilename) <= 0:
print("please use -i for input HTM log file")
Usage()
sys.exit(0)
if type(outFilename) != str or len(outFilename) <= 0:
print("please use -o for output HTM log file")
Usage()
sys.exit(0)
actualColumns = 0
actualRows = 0
with open(outFilename, 'wb') as outFp:
with open(inFilename, 'rb') as inFp:
contents = inFp.readlines()
total = len(contents)
newContents = []
# adjust all data columns first
row = 0
while row < total:
# for duplicate data columns (log has 1 tr per full data row, with multiple tds)
if contents[row].find(TR_DATA) >= 0 or contents[row].find(TR_DATAMARKED) >= 0:
line = contents[row]
rangeDataStart = line.find(TD_OPENTAG)
rangeDataEnd = line.rfind(TR_CLOSETAG)
newLine = line[:rangeDataEnd]
for i in range(1, dupColumns):
newLine += line[rangeDataStart:rangeDataEnd]
newLine += TR_CLOSETAG + "\n"
contents[row] = newLine
row += 1
# adjust header columns and data rows in the same loop
rangeHeaderStart = -1
rangeHeaderEnd = -1
rangeDataStart = -1
rangeDataEnd = -1
row = 0
while row < total:
# for duplicate header columns (log has 1 td/column per row)
# start header td range
if contents[row].find(TD_HEADER) >= 0 and rangeHeaderStart == -1:
rangeHeaderStart = row + 1 # skip Date/Time
# track header td range end
if contents[row].find(TD_HEADER) >= 0 and rangeHeaderStart != -1:
rangeHeaderEnd = row
# header range done, duplicate header columns
if rangeHeaderStart != -1 and contents[row].find(TD_HEADER) == -1:
for i in range(1, dupColumns):
for j in range(rangeHeaderStart, rangeHeaderEnd+1):
newContents.append(contents[j])
actualColumns = dupColumns * (rangeHeaderEnd - rangeHeaderStart) + 1
rangeHeaderStart = -1
rangeHeaderEnd = -1
if (contents[row].find(TR_DATA) >= 0 or contents[row].find(TR_DATAMARKED) >= 0) and rangeDataStart == -1:
rangeDataStart = row
# track data td range end
if (contents[row].find(TR_DATA) >= 0 or contents[row].find(TR_DATAMARKED) >= 0) and rangeDataStart != -1:
rangeDataEnd = row
# data data done, duplicate data rows
# start data tr range
if rangeDataStart != -1 and contents[row].find(TR_DATA) == -1 and contents[row].find(TR_DATAMARKED) == -1:
# get date range delta
dto1 = DataRowToDate(contents[rangeDataStart])
dto2 = DataRowToDate(contents[rangeDataStart+1])
dto3 = DataRowToDate(contents[rangeDataEnd])
rangeDelta = (dto3 - dto1) + (dto2 - dto1)
totalRows = rangeDataEnd - rangeDataStart + 1
for i in range(1, dupRows):
for j in range(rangeDataStart, rangeDataEnd+1):
line = contents[j]
# adjust date/time for repeated row sets
dto = DataRowToDate(line)
dto += i * rangeDelta
line = DataRowReplaceDate(line, dto)
# add the duplicated row
newContents.append(line)
totalRows += 1
if totalRows > dupRowMax:
break
if totalRows > dupRowMax:
break
actualRows = totalRows
rangeDataStart = -1
rangeDataEnd = -1
newContents.append(contents[row])
row += 1
outFp.write(''.join(newContents))
print "actual columns: {}".format(actualColumns)
print "actual rows: {}".format(actualRows)
print "\n(hint: current max rows: {} -> row dup num {}\n)".format(dupRowMax, dupRowMax/actualRows)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment