@dixsonhuie
Last active October 7, 2025 20:34
python examples
import os
import sys
import csv
import re
import logging
import argparse
import datetime
col_li = ['filename', 'line_number', 'host', 'pid', 'comp', 'id', 'time', 'ms', 'category', 'level', 'logger', 'message']
dirlist = [r'E:\log']
start_date = None
end_date = None
# date format used to convert command line arguments into a datetime object
# example: 2021-09-14
filter_date_fmt = '%Y-%m-%d'
# adding hours, minutes and seconds
filter_datetime_fmt = filter_date_fmt + ' %H:%M:%S'
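# A quick sketch of the conversion above (hypothetical value):
#   datetime.datetime.strptime('2021-09-14 16:22:05', filter_datetime_fmt)
#   -> datetime.datetime(2021, 9, 14, 16, 22, 5)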
home_dir = os.path.expanduser('~')
filename_prefix = 'app_log_summary'
output_filename = ''
show_fullpath = False
# list of extensions to visit
extlist = [r'\.\d+', r'\.log', r'\.out', r'\.stdouterr', r'\.err']
# regex representing entire date time portion from a line in a log file
# example: 2021-09-14 16:22:05,124
datefmt = r'(\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d),(\d\d\d)'
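# Sketch: matched against a line containing '2021-09-14 16:22:05,124',
# group(1) is '2021-09-14 16:22:05' and group(2) is '124'.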
# search for the following strings that may indicate an error
error_li = [ 'warning', 'severe', 'exception', 'error', 'failure', 'Long GC collection']
# for setting log level
level_li = ['SEVERE', 'WARNING', 'INFO', 'CONFIG', 'FINE', 'FINER', 'FINEST']
host_li = []
def gettimestamp():
today = datetime.date.today()
return today.strftime("%Y%b%d")
# check if string matches any of the hostnames
def get_hostname(s):
for host in host_li:
pattern = '.*({0}).*'.format(host)
m = re.match(pattern, s)
if m:
return m.group(1)
return ''
# check if filename contains pid and component information
# only works if filename format has not changed
def get_pid(s):
comp = ''
id = ''
host = ''
pid = ''
patternstr = r'.*(gsc|manager|gsm|lus)_(\d+)-([\w\.]+)-(\d+).*'
m = re.match(patternstr, s)
if m:
comp = m.group(1)
id = m.group(2)
host = m.group(3)
pid = m.group(4)
else:
# other processes: gsa, GSWebUI, ui, service
patternstr = r'.*(gsa|GSWebUI|ui|service)-([\w\.]+)-(\d+).*'
m = re.match(patternstr, s)
if m:
comp = m.group(1)
host = m.group(2)
pid = m.group(3)
return (comp, id, host, pid)
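# Sketch of the expected parse (hypothetical filenames):
#   get_pid('gsc_1-host1.example.com-12345.log') -> ('gsc', '1', 'host1.example.com', '12345')
#   get_pid('gsa-host1.example.com-6789.log') -> ('gsa', '', 'host1.example.com', '6789')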
def process_file(fullpath):
line_number = 0
with open(fullpath, encoding="latin-1") as f:
sDate = ''
dtDate = None
millis = ''
for line in f:
found = False
line_number += 1
# skip lines beginning with white space
if re.match(r'\s', line):
continue
# save the timestamp for lines with no timestamp
patternstr = r'.*{}.*'.format(datefmt)
m = re.match(patternstr, line)
if m:
sDate = m.group(1)
dtDate = datetime.datetime.strptime(sDate, filter_datetime_fmt)
millis = m.group(2)
# filter out log lines by date
if start_date is not None and dtDate is not None and dtDate < start_date:
continue
if end_date is not None and dtDate is not None and dtDate > end_date:
continue
for error_pattern in error_li:
if re.search(error_pattern, line, re.IGNORECASE):
found = True
break
logging.debug("log date as string: %s, log date: %s", sDate, '' if dtDate is None else dtDate.strftime(filter_date_fmt))
if found == True:
# truncate the line
line = line[:300]
line = line.rstrip()
logging.debug("Line: %s", line)
process_line(line, fullpath, line_number, sDate, millis)
def process_line(s, fullpath, line_number, date, millis):
# example: 2017-01-05 14:11:21,821 LUS INFO [com.sun.jini.reggie] - Exception
# example: 2016-12-31 17:38:57,334 pmds.deployment-1.8.9-pu.18 [2] WARNING [com.gigaspaces.core.common] - Primary space is unavailable
patternstr = r'{}{}'.format(datefmt, r' ([\w \-\.]*)(\[\d\] )?([\w]*)? \[([\w\-\.]*)\] - (.*)$')
m = re.match(patternstr, s)
#m = re.match(r'(\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d),(\d\d\d) ([\w \-\.]*)(\[\d\] )?([\w]*)? \[([\w\-\.]*)\] - (.*)$', s)
'''
^ date ^ millis ^ category ^ optional^ level ^ logger ^ message
match 0 or 1 times
'''
if m:
# 1 date
# 2 millis
# 3 category
# 4 optional, '[2]' in comment above
# 5 level
# 6 logger
# 7 message
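        # Sketch, using the first example line above: group(3) is 'LUS INFO'
        # (category and level combined; split apart below), group(4) is None,
        # group(6) is 'com.sun.jini.reggie' and group(7) is 'Exception'.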
category = ''
level = ''
if m.group(4) == None:
category = m.group(3)
# extract level information
# eg., LUS INFO
for i in level_li:
index = category.find(i)
if index >= 0:
level = category[index:]
category = category[0:index]
break
else:
category = m.group(3) + m.group(4)
level = m.group(5)
# this group also grabs the space that may come after this optional string; need to strip it out
category = category.strip()
level = level.strip()
mywriter.writerow([fileinfo['path'], line_number, fileinfo['host'], fileinfo['pid'], fileinfo['comp'], fileinfo['id'], m.group(1), m.group(2), category, level, m.group(6), m.group(7)])
else:
# sometimes clients just provide output of the gs-agent process
# [gsc][1/10120] 2017-10-11 10:52:37,557 CommonClassLoader WARNING [net.jini.discovery.LookupLocatorDiscovery] - java.net.SocketTimeoutException: connect timed out - using unicast locator 10.10.10.117:4174 - delay next lookup by 1,000 ms
patternstr = r'{}{}{}'.format(r'\[(\w*)\]\[(\d*)/(\d*)\]\s*', datefmt, r' ([\w \-\.]*)(\[\d\] )?([\w]*)? \[([\w\-\.]*)\] - (.*)$')
m = re.match(patternstr, s)
#m = re.match(r'\[(\w*)\]\[(\d*)/(\d*)\]\s*(\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d),(\d\d\d) ([\w \-\.]*)(\[\d\] )?([\w]*)? \[([\w\-\.]*)\] - (.*)$', s)
'''
^ proc ^ id ^ pid - the rest is a repeat of the regex used above
'''
if m:
# 1 component
# 2 id
# 3 pid
# 4 date
# 5 millis
# 6 category
# 7 optional
# 8 level
# 9 logger
# 10 message
category = ''
level = ''
if m.group(7) == None:
category = m.group(6)
# extract level information
for i in level_li:
index = category.find(i)
if index >= 0:
level = category[index:]
category = category[0:index]
break
category = category.strip()
if category.upper() == m.group(1).upper():
category = ''
else:
category = m.group(6) + m.group(7)
level = m.group(8)
mywriter.writerow([fileinfo['path'], line_number, fileinfo['host'], m.group(3), m.group(1), m.group(2), m.group(4), m.group(5), category, level, m.group(9), m.group(10)])
else:
#[manager][1/13986] Caused by: com.gigaspaces.security.AuthenticationException: Authentication request is invalid - you are not logged in.
# log message pattern missing timestamp
patternstr = r'{}{}'.format(r'\[(\w*)\]\[(\d*)/(\d*)\]\s*', r'(.*)$')
# ^comp ^id ^pid ^message
m = re.match(patternstr, s)
if m:
# 1 component
# 2 id
# 3 pid
# 4 message
mywriter.writerow([fileinfo['path'], line_number, fileinfo['host'], m.group(3), m.group(1), m.group(2), date, millis, '', '', '', m.group(4)])
else:
mywriter.writerow([fileinfo['path'], line_number, fileinfo['host'], fileinfo['pid'], fileinfo['comp'], fileinfo['id'], date, millis, '', '', '', s])
def process_args():
global dirlist, start_date, end_date, filename_prefix, output_filename, host_li, show_fullpath
parser = argparse.ArgumentParser(add_help=False)
parser.add_argument("--start_dir", help="the root directory to begin processing.")
parser.add_argument("--output_dir", help="where the output file should be written to.")
parser.add_argument("--start_date", help="the date to begin processing errors. Log lines with dates before the start date will be filtered out. Example format: 2021-09-21")
parser.add_argument("--end_date", help="the date to end processing errors. Log lines with dates after the end date will be filtered out. Example format: 2021-09-21")
parser.add_argument("--log_level", choices=['CRITICAL', 'ERROR', 'WARNING', 'INFO', 'DEBUG'], help="logging level. Default is WARNING.")
parser.add_argument("--hosts", help="list of hosts, separated by commas.")
parser.add_argument("--filename_prefix", help="Output filename prefix.")
parser.add_argument("--show_fullpath", help="Output the full path. Default is false.")
parser.add_argument('-h', '--help', action='help', default=argparse.SUPPRESS, help="This program parses a set of XAP log files formatted with standard XAP out-of-the-box settings.")
args = parser.parse_args()
if args.filename_prefix:
filename_prefix = args.filename_prefix
if args.start_dir:
dirlist = [args.start_dir]
if args.start_date:
start_date = datetime.datetime.strptime(args.start_date, filter_date_fmt)
if args.end_date:
end_date = datetime.datetime.strptime(args.end_date, filter_date_fmt)
if args.output_dir:
output_filename = args.output_dir + os.path.sep + filename_prefix + "-" + gettimestamp() + ".csv"
else:
output_filename = home_dir + os.path.sep + filename_prefix + '-' + gettimestamp() + ".csv"
    if args.show_fullpath:
        show_fullpath = args.show_fullpath.lower() in ('true', 't')
    if args.log_level:
        # the choices above map directly onto logging's level names
        logging.basicConfig(level=getattr(logging, args.log_level))
    else:
        # set logging level. WARNING is default level
        logging.basicConfig(level=logging.WARNING)
if args.hosts:
host_li = args.hosts.split(',')
def myvisitor(extlist, dirname, names):
global fileinfo
logging.debug("Current directory: %s", dirname)
for f in names:
(b, ext) = os.path.splitext(f)
logging.debug("Filename base: %s Ext: %s", b, ext)
for x in extlist:
m = re.match(x, ext)
if m:
fullpath = os.path.join(dirname, f)
logging.debug("Fullpath: %s", fullpath)
try:
hostname = get_hostname(f)
fileinfo = {'host': hostname}
process_file(fullpath)
except OSError as err:
print("OS error: {0}".format(err))
#except OSError, detail:
# print detail
break
def myvisitor_2(fullpath, start_dir, filename):
global fileinfo
try:
relative_path = "{}{}".format('.', fullpath.replace(start_dir, '', 1))
hostname = get_hostname(relative_path)
if not show_fullpath:
path = relative_path
else:
path = fullpath
(comp, id, host, pid) = get_pid(filename)
if hostname == '':
hostname = host
fileinfo = {'host': hostname, 'path': path, 'comp': comp, 'id': id, 'pid': pid}
process_file(fullpath)
except OSError as err:
print("OS error: {0}".format(err))
def main():
if sys.version_info < (3,0,0):
print("Please use a version of Python > 3")
sys.exit(-1)
global mywriter
process_args()
# write output to csv file
with open(output_filename, 'w', newline='') as csvfile:
mywriter = csv.writer(csvfile)
mywriter.writerow(col_li)
for i in dirlist:
logging.debug("Processing: %s", i)
for root, dirs, files in os.walk(i):
for name in files:
logging.debug(os.path.join(root, name))
(b, ext) = os.path.splitext(name)
for x in extlist:
m = re.match(x, ext)
if m:
fullpath = os.path.join(root, name)
myvisitor_2(fullpath, i, name)
for name in dirs:
logging.debug(os.path.join(root, name))
#os.path.walk(i, myvisitor, extlist)
main()
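# Example invocation (sketch; the script filename is hypothetical):
#   python app_log_summary.py --start_dir /path/to/logs --start_date 2021-09-14 --hosts host1,host2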
import argparse
import logging
import os
import sys
start_dir = os.path.expanduser('~')
show_relpath = False
filter_li = []
class file_suffix_filter:
# heap dump files
# extlist = ['.hprof']
def __init__(self, li):
self.extlist = li
def hasFileMatch(self):
return True
def hasDirectoryMatch(self):
return False
def isFileMatch(self, path, filename):
(base, ext) = os.path.splitext(filename)
if ext in self.extlist:
return True
else:
return False
class named_dir_filter:
def __init__(self, named_dir_li):
self.dirname_li = named_dir_li
def hasFileMatch(self):
return False
def hasDirectoryMatch(self):
return True
def isDirectoryMatch(self, dirname):
logging.debug("dirname is:" + dirname)
if dirname in self.dirname_li:
return True
else:
return False
class large_file_filter:
def __init__(self, f_size):
self.file_size = f_size
def hasFileMatch(self):
return True
def hasDirectoryMatch(self):
return False
def isFileMatch(self, path, filename):
fname = os.path.join(path, filename)
if not os.path.islink(fname):
f_size = os.path.getsize(fname)
if f_size > self.file_size:
return True
else:
return False
else:
return False
# recursively visit directory and its children
def process():
for root, dirs, files in os.walk(start_dir):
rel_dir = os.path.relpath(root, start_dir)
for name in files:
for filter in filter_li:
if filter.hasFileMatch() and filter.isFileMatch(root, name):
if show_relpath == True:
filename = os.path.join('.', rel_dir, name)
print(filename)
else:
filename = os.path.join(root, name)
print(filename)
for dir in dirs:
for filter in filter_li:
if filter.hasDirectoryMatch() and filter.isDirectoryMatch(dir):
if show_relpath == True:
filename = os.path.join('.', rel_dir, dir)
print(filename)
else:
filename = os.path.join(root, dir)
print(filename)
def process_args():
    global start_dir, show_relpath, filter_li
is_file_suffix_filter = True
file_suffix = ['.hprof']
is_named_dir_filter = True
is_large_file_filter = True
large_file_filter_size = 1_000_000_000
named_dir = ['logs']
parser = argparse.ArgumentParser(add_help=False)
parser.add_argument("--start_dir", help="The root directory to begin processing. Default is the user's home directory.")
parser.add_argument("--show_relpath", help="Output the relative path, otherwise show full path. Default is False.")
parser.add_argument("--log_level", choices=['CRITICAL', 'ERROR', 'WARNING', 'INFO', 'DEBUG'], help="logging level. Default is WARNING.")
parser.add_argument("--file_suffix_filter", choices=['true', 'false'], help="Filter in files that match a suffix. Default is true.")
parser.add_argument("--file_suffix", help="A list of file suffixes to be used with --file_suffix_filter, separated by commas. Default suffixes: '.hprof'.")
parser.add_argument("--named_dir_filter", choices=['true', 'false'], help="Filter in directories based on a name. Default is true.")
parser.add_argument("--named_dir", help="A list of directories used with --named_dir_filter, separated by commas. Default directories: 'logs'. Other suggestions: target,work,deploy")
parser.add_argument("--large_file_filter", choices=['true', 'false'], help="Filter in files larger than a default size of {}. Default is true.".format(large_file_filter_size))
parser.add_argument("--large_file_filter_size", help="Large file filter size.")
parser.add_argument('-h', '--help', action='help', default=argparse.SUPPRESS, help="This program will recurse a directory and look for files to be cleaned up.")
# process arguments
args = parser.parse_args()
if args.start_dir:
start_dir = args.start_dir
if args.show_relpath:
if args.show_relpath.lower() == 'true' or args.show_relpath.lower() == 't':
show_relpath = True
if args.file_suffix_filter:
if args.file_suffix_filter.lower() == 'true' or args.file_suffix_filter.lower() == 't':
is_file_suffix_filter = True
else:
is_file_suffix_filter = False
if args.file_suffix:
file_suffix = args.file_suffix.split(',')
if args.named_dir_filter:
if args.named_dir_filter.lower() == 'true' or args.named_dir_filter.lower() == 't':
is_named_dir_filter = True
else:
is_named_dir_filter = False
if args.named_dir:
named_dir = args.named_dir.split(',')
if args.large_file_filter:
if args.large_file_filter.lower() == 'true' or args.large_file_filter.lower() == 't':
is_large_file_filter = True
else:
is_large_file_filter = False
if args.large_file_filter_size:
large_file_filter_size = int(args.large_file_filter_size)
# set values based on arguments
if is_file_suffix_filter == True:
filter_li.append(file_suffix_filter(file_suffix))
if is_named_dir_filter == True:
filter_li.append(named_dir_filter(named_dir))
if is_large_file_filter == True:
filter_li.append(large_file_filter(large_file_filter_size))
    if args.log_level:
        # the choices above map directly onto logging's level names
        logging.basicConfig(level=getattr(logging, args.log_level))
    else:
        # set logging level. WARNING is default level
        logging.basicConfig(level=logging.WARNING)
def main():
if sys.version_info < (3,0,0):
print("Please use a version of Python > 3")
sys.exit(-1)
process_args()
process()
main()
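# Example invocation (sketch; the script filename is hypothetical):
#   python find_cleanup_files.py --start_dir /data --named_dir logs,work --large_file_filter_size 500000000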
import argparse
import csv
from datetime import datetime
import logging
import sys
file = r'C:\Users\Dixson\tmp.csv'
before_dt = None
after_dt = None
col_no = 1
def process(fin):
with open('tmp.csv', 'w', newline='') as csvfile:
mywriter = csv.writer(csvfile)
with open(fin, newline='') as csvfile:
reader = csv.reader(csvfile)
for row in reader:
value = row[col_no]
dt = convert_dt(value)
logging.debug("Value: {}, date: {} on column {}".format(value, dt, col_no))
if dt == None:
mywriter.writerow(row)
#print(', '.join(row))
continue
if (before_dt == None or dt < before_dt):
if( after_dt == None or dt > after_dt):
mywriter.writerow(row)
#print(', '.join(row))
# example date: 2017-01-05 14:11:21
def convert_dt(s):
try:
return datetime.strptime(s, '%Y-%m-%d %H:%M:%S')
    except ValueError:
        return None
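# Sketch: convert_dt('2017-01-05 14:11:21') -> datetime(2017, 1, 5, 14, 11, 21).
# A value that does not parse (e.g. a header cell) returns None, and process()
# writes that row through unfiltered.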
def process_args():
global file, before_dt, after_dt, col_no
parser = argparse.ArgumentParser(add_help=False)
parser.add_argument("-f", "--file", help="the input file. If not provided, /dev/stdin is used.")
parser.add_argument("--before", help='include dates before provided date. E.g., --before "2017-01-05 14:11:21"')
parser.add_argument("--after", help="include dates after provided date.")
parser.add_argument("--columnNumber", help="the column number that has the date field, beginning at 0.")
parser.add_argument("--log_level", choices=['CRITICAL', 'ERROR', 'WARNING', 'INFO', 'DEBUG'], help="logging level")
parser.add_argument('-h', '--help', action='help', default=argparse.SUPPRESS, help="This program parses a csv file using the date filter criteria.")
args = parser.parse_args()
if args.file:
file = args.file
else:
# won't work on Windows
file = '/dev/stdin'
if args.before:
before_dt = convert_dt(args.before)
if args.after:
after_dt = convert_dt(args.after)
if args.columnNumber:
col_no = int(args.columnNumber)
    if args.log_level:
        # the choices above map directly onto logging's level names
        logging.basicConfig(level=getattr(logging, args.log_level))
    else:
        # set logging level. WARNING is default level
        logging.basicConfig(level=logging.WARNING)
def main():
if sys.version_info < (3,0,0):
print("Please use a version of Python > 3")
sys.exit(-1)
process_args()
process(file)
main()
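# Example invocation (sketch; the script filename is hypothetical):
#   python csv_date_filter.py -f input.csv --columnNumber 6 --after "2017-01-05 00:00:00" --before "2017-01-06 00:00:00"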
import argparse
import csv
import datetime
import logging
import os
import re
import sys
'''
This program parses a gc log file for stop the world phases and keywords to csv file.
PrintGCDateStamps has been enabled
PrintAdaptiveSizePolicy has been enabled
young gc types - G1 Evacuation Pause, G1 Humongous Allocation, Metadata GC Threshold
mixed types - G1 Evacuation Pause
full gc types - Allocation Failure, System.gc()
'''
# list of columns
col_li = ['file name', 'line no.', 'host', 'pid','date time', 'process time', 'gc type/keyword', 'time', 'comment']
# list of special extensions to visit
# versioned logs will be in the format .1, .2, etc. This is checked elsewhere.
extlist = ['.current']
#extlist = ['.log', '.current', '.1', '.2']
# special patterns to search for
search_li = ['to-space','humongous']
#search_li = ['to-space','humongous', r'System.gc\(\)']
datefmt = r'(\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\d\.\d\d\d[\+\-]\d\d\d\d): (\d+\.\d+): '
# ^ date time ^ timestamp
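# Sketch: for the prefix '2017-09-01T16:12:51.133+0000: 134.345: ', group(1) is the
# date-time stamp and group(2) is '134.345' (seconds since the JVM started).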
def gethomedir():
return os.path.expanduser('~')
def gettimestamp():
today = datetime.date.today()
return today.strftime("%Y%b%d")
# globals
# list of directories to visit
dirlist = [r'E:\log']
output_filename = '{}{}gc-summary-{}.csv'.format(gethomedir(), os.path.sep, gettimestamp())
show_relative_path = False
enable_humongous = False
# a list of possible host names
host_li = []
# log files were collected and put in directories by hostname, separated by '.'
def get_hostname(dirpath):
for h in host_li:
if dirpath.find(h) > -1:
return h
return ''
# get the pid from the log file name
# use Xloggc:/path/to/file/gc.%p.log, where %p tells the JVM to substitute the pid
def get_pid(filename):
li = filename.split('pid')
if( len(li) == 1 ):
return li[0]
else:
(pid, rest) = li[1].split('.', 1)
logging.debug("pid: %s", pid)
return pid
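# Sketch: with -Xloggc:/path/to/file/gc.%p.log the JVM expands %p to a token like
# 'pid12345', so get_pid('gc.pid12345.log.current') returns '12345'; a name with
# no 'pid' marker is returned unchanged.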
# not used, this is a deprecated version of the visitor
#def myvisitor(extlist, dirname, names):
# global fileinfo
# logging.debug("Current directory: %s", dirname)
# for f in names:
# (p, ext) = os.path.splitext(f)
# logging.debug("%s %s", f, ext)
# if ext in extlist:
# fullpath = os.path.join(dirname, f)
# logging.debug(fullpath)
# try:
# hostname = get_hostname(dirname)
# pid = get_pid(f)
# fileinfo = {'filename': f, 'host' : hostname, 'pid' : pid}
#
# process_file(fullpath)
# except OSError as err:
# print("OS error: {0}".format(err))
#
# #except OSError, detail:
# # print detail
def myvisitor_2(fullpath, f):
global fileinfo
try:
hostname = get_hostname(f)
pid = get_pid(f)
fileinfo = {'filename': f, 'host': hostname, 'pid': pid}
process_file(fullpath)
except OSError as err:
print("OS error: {0}".format(err))
def process_file(fullpath):
    # this section processes log messages that occupy a single line
linenum = 0
f = open(fullpath, 'r')
date_time = ''
process_time = ''
# process line by line to get basic information
for line in f:
linenum += 1
m = re.match(r'^' + datefmt, line)
if m:
# save current timestamp
date_time = m.group(1)
process_time = m.group(2)
# check for keywords of interest
process_search_pattern(line, linenum, date_time, process_time)
if line.startswith('Java HotSpot(TM)') or line.startswith('Memory:') or line.startswith('CommandLine flags:'):
process_jvminfo(line, linenum)
elif line.startswith(' ') == False:
# check for stw pauses that appear on one line
process_remark_cleanup(line, linenum)
# this section processes log messages that span multiple lines
# read file object to string. When -XX:+PrintAdaptiveSizePolicy is used,
# gc phases need a multi-line regex to handle
# check for stw pause that spans multiple lines
f.seek(0)
text = f.read()
f.close()
# we are interested in activity that causes a stop-the-world pause and the duration of the gc
# https://blogs.oracle.com/poonam/entry/understanding_g1_gc_logs
# https://www.oracle.com/technetwork/articles/java/g1gc-1984535.html
# process multi-line gc phases
process_young_mixed(text)
process_full(text)
###############################################################################
# methods that process multi-line messages
###############################################################################
def process_young_mixed(s):
'''
young generation and mixed collection share similar formats
These gc log statements show up on multiple lines.
Example:
2017-09-01T16:12:51.133+0000: 134.345: [GC pause (Metadata GC Threshold) (young) (initial-mark)
Desired survivor size 48234496 bytes, new threshold 15 (max 15)
134.346: [G1Ergonomics (CSet Construction) start choosing CSet, _pending_cards: 0, predicted base time: 10.00 ms, remaining time: 990.00 ms, target pause time: 1000.00 ms]
134.346: [G1Ergonomics (CSet Construction) add young regions to CSet, eden: 63 regions, survivors: 0 regions, predicted young region time: 4209.46 ms]
134.346: [G1Ergonomics (CSet Construction) finish choosing CSet, eden: 63 regions, survivors: 0 regions, old: 0 regions, predicted pause time: 4219.46 ms, target pause time: 1000.00 ms]
, 0.0325663 secs]
'''
date_time = ''
process_time = 0.0
young_mixed_type = '' # young or mixed
secondary_type = '' # Eg, G1 Evacuation Pause, G1 Humongous Allocation, Metadata GC Threshold
initial_mark = '' # tertiary type, associated with G1 Humongous Allocation and Metadata GC Threshold
gc_time = 0.0
patternstr = datefmt + r'\[GC pause \(([ \w]*)\) \((young|mixed)\)( \([\w-]+\))?.+?, (\d+\.\d+) secs\]$'
'''
^secondary ^young/mixed ^initial_mark ^ elapsed time
'''
pattern = re.compile(patternstr, re.MULTILINE | re.DOTALL)
#pattern = re.compile(r'^(\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\d\.\d\d\d\+\d\d\d\d): (\d*\.\d*): \[GC pause \(([ \w\.\(\)]*)\) \(young\).+?, (\d*\.\d*) secs\]$', re.MULTILINE | re.DOTALL)
for m in pattern.finditer(s):
date_time = m.group(1)
process_time = m.group(2)
young_mixed_type = m.group(4)
secondary_type = m.group(3)
if m.group(5) == None:
initial_mark = ''
else:
tmp = m.group(5)
tmp = tmp.strip('() ')
initial_mark = ' ' + tmp
gc_time = m.group(6)
mywriter.writerow([fileinfo['filename'], '', fileinfo['host'], fileinfo['pid'], date_time, process_time, 'GC pause - ' + young_mixed_type + ' ' + secondary_type + initial_mark, gc_time, ''])
def process_mixed(s):
'''
2017-09-01T17:53:24.732+0000: 6167.945: [GC pause (G1 Evacuation Pause) (mixed)
Desired survivor size 48234496 bytes, new threshold 1 (max 15)
- age 1: 303167832 bytes, 303167832 total
6167.945: [G1Ergonomics (CSet Construction) start choosing CSet, _pending_cards: 8728, predicted base time: 24.66 ms, remaining time: 975.34 ms, target pause time: 1000.00 ms]
6167.945: [G1Ergonomics (CSet Construction) add young regions to CSet, eden: 105 regions, survivors: 74 regions, predicted young region time: 305.85 ms]
6167.945: [G1Ergonomics (CSet Construction) finish adding old regions to CSet, reason: reclaimable percentage not over threshold, old: 19 regions, max: 359 regions, reclaimable: 751186712 bytes (5.00 %), threshold: 5.00 %]
6167.945: [G1Ergonomics (CSet Construction) finish choosing CSet, eden: 105 regions, survivors: 74 regions, old: 19 regions, predicted pause time: 362.13 ms, target pause time: 1000.00 ms]
6168.115: [G1Ergonomics (Mixed GCs) do not continue mixed GCs, reason: reclaimable percentage not over threshold, candidate old regions: 335 regions, reclaimable: 751186712 bytes (5.00 %), threshold: 5.00 %]
, 0.1695338 secs]
'''
process_time = 0.0
date_time = ''
mixed_type = ''
gc_time = 0.0
# output similar to GC pause (young)
patternstr = datefmt + r'\[GC pause \(([ \w]*)\) \(mixed\)( \([\w-]+\))?.+?, (\d+\.\d+) secs\]$'
# ^mixed_type
pattern = re.compile(patternstr, re.MULTILINE | re.DOTALL)
for m in pattern.finditer(s):
date_time = m.group(1)
process_time = m.group(2)
mixed_type = m.group(3)
gc_time = m.group(5)
mywriter.writerow([fileinfo['filename'], '', fileinfo['host'], fileinfo['pid'], date_time, process_time, 'Mixed generation collection - ' + mixed_type, gc_time, ''])
def process_full(s):
'''
Full GC statements are also output to multiple lines.
2018-07-30T11:39:47.643-0400: 174.007: [Full GC (Heap Inspection Initiated GC) 2018-07-30T11:39:47.643-0400: 174.007: [Class Histogram (before full gc):
2018-07-25T11:59:08.922+0000: 1098967.077: [Full GC (System.gc()) 2018-07-25T11:59:08.927+0000: 1098967.081: [Class Histogram (before full gc):
2018-07-21T12:11:41.060+0000: 387110.898: [Full GC (Allocation Failure) 2018-07-21T12:11:41.060+0000: 387110.898: [Class Histogram (before full gc):
...
..., real=6.79 secs]
'''
date_time = ''
process_time = 0.0
young_type = ''
gc_time = 0.0
gcfmt = r'\[Full GC \(([ \w\.\(\)]*)\) .+?, real=(\d+\.\d+) secs\]\s*$'
# ^ full gc type
patternstr = datefmt + gcfmt
pattern = re.compile(patternstr, re.MULTILINE | re.DOTALL)
for m in pattern.finditer(s):
date_time = m.group(1)
process_time = m.group(2)
full_gc_type = m.group(3)
gc_time = m.group(4)
mywriter.writerow([fileinfo['filename'], '', fileinfo['host'], fileinfo['pid'], date_time, process_time, 'Full GC - ' + full_gc_type , gc_time, ''])
###############################################################################
# end methods that process multi-line messages
###############################################################################
###############################################################################
# methods that process a single line
###############################################################################
def process_jvminfo(s, linenum):
s = s.strip()
mywriter.writerow([fileinfo['filename'], linenum, fileinfo['host'], fileinfo['pid'], '', '', 'jvm info', '', s])
def process_remark_cleanup(s, linenum):
'''
These gc log statements show up on a single line.
Example:
2017-09-01T16:12:51.175+0000: 134.388: [GC remark 2017-09-01T16:12:51.175+0000: 134.388: [Finalize Marking, 0.0058528 secs] 2017-09-01T16:12:51.181+0000: 134.394: [GC ref-proc, 0.0001349 secs] 2017-09-01T16:12:51.181+0000: 134.394: [Unloading, 0.0032643 secs], 0.0100601 secs]
44973.856: [GC cleanup 22G->22G(30G), 0.0100070 secs]
[Times: user=0.08 sys=0.00, real=0.01 secs]
'''
gc_type = ''
date_time = ''
process_time = 0.0
gc_time = 0.0
m = re.match(r'^' + datefmt + r'\[GC remark .*, (\d+\.\d+) secs\]$', s)
if m:
gc_type = 'GC remark'
date_time = m.group(1)
process_time = m.group(2)
gc_time = m.group(3)
else:
m = re.match(r'^' + datefmt + r'\[GC cleanup .+, (\d+\.\d+) secs\]$', s)
if m:
gc_type = 'GC cleanup'
date_time = m.group(1)
process_time = m.group(2)
gc_time = m.group(3)
if gc_type != '':
mywriter.writerow([fileinfo['filename'], linenum, fileinfo['host'], fileinfo['pid'], date_time, process_time, gc_type, gc_time])
def process_search_pattern(s, linenum, date_time, process_time):
'''
Look for search strings of interest. If found write to csv.
'''
patternstr = r'({})'.format('|'.join(search_li))
m = re.search(patternstr, s, re.IGNORECASE)
if m:
search_pattern = m.group(1).lower()
if search_pattern == 'humongous' and enable_humongous == False:
return
else:
s = s.strip()
mywriter.writerow([fileinfo['filename'], linenum, fileinfo['host'], fileinfo['pid'], date_time, process_time, search_pattern, '', s])
'''
for search_pattern in search_li:
if re.search(search_pattern, s, re.IGNORECASE):
s = s.strip()
mywriter.writerow([fileinfo['filename'], linenum, fileinfo['host'], fileinfo['pid'], date_time, process_time, search_pattern, '', s])
break
'''
###############################################################################
# end methods that process a single line
###############################################################################
def process_args():
global dirlist, output_filename, enable_humongous, show_relative_path, host_li
parser = argparse.ArgumentParser(add_help=False)
parser.add_argument("--start_dir", help="the root directory to begin processing.")
parser.add_argument("--output_dir", help="where the output file should be written to. By default the output file will be located in a user's home directory.")
parser.add_argument("--log_level", choices=['CRITICAL', 'ERROR', 'WARNING', 'INFO', 'DEBUG'], help="logging level. Default is WARNING.")
parser.add_argument("--enable_humongous", help='True enables inclusion of any log messages that have to do with humongous allocation. Default is False.')
parser.add_argument("--show_relative_path", help="show relative path in filename column. true or false. Default is false.")
parser.add_argument("--hosts", help="list of hosts, separated by commas.")
parser.add_argument('-h', '--help', action='help', default=argparse.SUPPRESS, help=r'This program parses a gc log file and provides a summary in csv format. The following JVM options should be used to generate the log file: -Xloggc:/path/to/file/gc_%%p.log -XX:+PrintCommandLineFlags -XX:+PrintGC -XX:+PrintGCCause -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps -XX:+PrintAdaptiveSizePolicy -XX:+PrintTenuringDistribution -XX:+PrintReferenceGC')
args = parser.parse_args()
if args.start_dir:
dirlist = [args.start_dir]
if args.output_dir:
output_filename = args.output_dir + os.path.sep + "gc_log_summary-" + gettimestamp() + ".csv"
if args.enable_humongous:
if args.enable_humongous.lower() == 'true' or args.enable_humongous.lower() == 't':
enable_humongous = True
    if args.log_level:
        # the choices above map directly onto logging's level names
        logging.basicConfig(level=getattr(logging, args.log_level))
    else:
        # set logging level. WARNING is default level
        logging.basicConfig(level=logging.WARNING)
if args.show_relative_path and args.show_relative_path.lower() == 'true':
show_relative_path = True
if args.hosts:
host_li = args.hosts.split(',')
def main():
global mywriter
if sys.version_info < (3,0,0):
print("Please use a version of Python > 3")
sys.exit(-1)
process_args()
# write output to csv file
with open(output_filename, 'w', newline='') as csvfile:
mywriter = csv.writer(csvfile)
# write column headings
mywriter.writerow(col_li)
for dir in dirlist:
logging.debug(dir)
for root, dirs, files in os.walk(dir):
for name in files:
logging.debug(os.path.join(root, name))
(b, extension) = os.path.splitext(name)
ext = extension.lstrip('.')
if extension in extlist or ext.isdigit():
fullpath = os.path.join(root, name)
if show_relative_path == True:
# add one for path separator
index = len(dir) + 1
fname = fullpath[index:]
myvisitor_2(fullpath, fname)
else:
myvisitor_2(fullpath, name)
for name in dirs:
logging.debug(os.path.join(root, name))
#os.path.walk(dir, myvisitor, extlist)
main()
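# Example invocation (sketch; the script filename is hypothetical):
#   python gc_log_summary.py --start_dir /path/to/gc/logs --enable_humongous true --hosts host1,host2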
import argparse
import csv
import datetime
import logging
import os
import re
import sys
'''
This program parses a gc log file for stop the world phases and keywords to csv file.
young gc types - G1 Evacuation Pause, G1 Humongous Allocation, Metadata GC Threshold
mixed types - G1 Evacuation Pause
full gc types - Allocation Failure, System.gc()
'''
# list of columns
col_li = ['file name', 'line no.', 'host', 'pid','date time', 'process time', 'gc type/keyword', 'time', 'comment']
# list of extensions to visit
extlist = ['.current', '.0', '.1', '.2', '.3', '.4', '.5']
#extlist = ['.log', '.current', '.1', '.2']
# special patterns to search for
search_li = ['to-space','humongous']
#search_li = ['to-space','humongous', r'System.gc\(\)']
datefmt = r'^(\d+\.\d+): '
# ^ timestamp
#datefmt = r'^(\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\d\.\d\d\d[\+\-]\d\d\d\d): (\d+\.\d+): '
# ^ date time ^ timestamp
def gethomedir():
return os.path.expanduser('~')
def gettimestamp():
today = datetime.date.today()
return today.strftime("%Y%b%d")
# globals
# list of directories to visit
dirlist = [r'/tmp']
output_filename = '{}{}gc-summary-{}.csv'.format(gethomedir(), os.path.sep, gettimestamp())
show_relative_path = False
host_li = []
# log files were collected and put in directories by hostname, separated by '.'
def get_hostname(dirpath):
for h in host_li:
if dirpath.find(h) > -1:
return h
return ''
# use Xloggc:/path/to/file/gc.%p.log, where %p tells the JVM to substitute the pid
def get_pid(filename):
li = filename.split('pid')
if( len(li) == 1 ):
return li[0]
else:
(pid, rest) = li[1].split('.', 1)
logging.debug("pid: %s", pid)
return pid
def myvisitor(extlist, dirname, names):
global fileinfo
logging.debug("Current directory: %s", dirname)
for f in names:
(p, ext) = os.path.splitext(f)
logging.debug("%s %s", f, ext)
if ext in extlist:
fullpath = os.path.join(dirname, f)
logging.debug(fullpath)
try:
hostname = get_hostname(dirname)
pid = get_pid(f)
fileinfo = {'filename': f, 'host' : hostname, 'pid' : pid}
process_file(fullpath)
except OSError as err:
print("OS error: {0}".format(err))
#except OSError, detail:
# print detail
def myvisitor_2(fullpath, f):
global fileinfo
try:
hostname = get_hostname(f)
pid = get_pid(f)
fileinfo = {'filename': f, 'host': hostname, 'pid': pid}
process_file(fullpath)
except OSError as err:
print("OS error: {0}".format(err))
def process_file(fullpath):
linenum = 0
f = open(fullpath, 'r')
date_time = ''
process_time = ''
# process line by line to get basic information
for line in f:
linenum += 1
m = re.match(datefmt, line)
if m:
# save current timestamp
date_time = m.group(0)
process_time = m.group(1)
# check for keywords of interest
#process_search_pattern(line, linenum, date_time, process_time)
if line.startswith('Java HotSpot(TM)') or line.startswith('Memory:') or line.startswith('CommandLine flags:'):
process_jvminfo(line, linenum)
elif line.startswith(' ') == False:
# check for stw pauses that appear on one line
process_remark_cleanup(line, linenum)
# read file object to string. When -XX:+PrintAdaptiveSizePolicy is used,
# gc phases need a multi-line regex to handle
# check for stw pause that spans multiple lines
f.seek(0)
text = f.read()
f.close()
# we are interested in activity that causes a stop-the-world pause and the duration of the gc
# https://blogs.oracle.com/poonam/entry/understanding_g1_gc_logs
# https://www.oracle.com/technetwork/articles/java/g1gc-1984535.html
# process multi-line gc phases
process_young(text)
process_mixed(text)
process_full(text)
def process_jvminfo(s, linenum):
s = s.strip()
mywriter.writerow([fileinfo['filename'], linenum, fileinfo['host'], fileinfo['pid'], '', '', 'jvm info', '', s])
def process_young(s):
'''
232610.071: [GC pause (G1 Evacuation Pause) (young)
Desired survivor size 1090519040 bytes, new threshold 15 (max 15)
- age 1: 2294896 bytes, 2294896 total
- age 2: 1768760 bytes, 4063656 total
- age 3: 2228888 bytes, 6292544 total
- age 4: 4939064 bytes, 11231608 total
- age 5: 4320224 bytes, 15551832 total
- age 6: 2211832 bytes, 17763664 total
- age 7: 594464 bytes, 18358128 total
- age 8: 1539128 bytes, 19897256 total
- age 9: 3044240 bytes, 22941496 total
- age 10: 2794640 bytes, 25736136 total
- age 11: 3209632 bytes, 28945768 total
- age 12: 2267952 bytes, 31213720 total
- age 13: 2402216 bytes, 33615936 total
- age 14: 2345184 bytes, 35961120 total
- age 15: 2231848 bytes, 38192968 total
232610.071: [G1Ergonomics (CSet Construction) start choosing CSet, _pending_cards: 13138, predicted base time: 78.16 ms, remaining time: 121.84 ms, target pause time: 200.00 ms]
232610.071: [G1Ergonomics (CSet Construction) add young regions to CSet, eden: 1035 regions, survivors: 4 regions, predicted young region time: 11.03 ms]
232610.071: [G1Ergonomics (CSet Construction) finish choosing CSet, eden: 1035 regions, survivors: 4 regions, old: 0 regions, predicted pause time: 89.19 ms, target pause time: 200.00 ms]
, 0.1156739 secs]
'''
logging.debug("In process_young")
date_time = ''
process_time = 0.0
young_type = ''
initial_mark = ''
gc_time = 0.0
patternstr = datefmt + r'\[GC pause \(([ \w]*)\) \(young\).+?, (\d+\.\d+) secs\]$'
'''
^type ^ elapsed time
'''
#patternstr = datefmt + r'\[GC pause \(([ \w]*)\) \(young\)( \([\w-]+\))?.+?, (\d+\.\d+) secs\]$'
'''
^type ^initial_mark ^ elapsed time
'''
pattern = re.compile(patternstr, re.MULTILINE | re.DOTALL)
#pattern = re.compile(r'^(\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\d\.\d\d\d\+\d\d\d\d): (\d*\.\d*): \[GC pause \(([ \w\.\(\)]*)\) \(young\).+?, (\d*\.\d*) secs\]$', re.MULTILINE | re.DOTALL)
for m in pattern.finditer(s):
process_time = m.group(1)
young_type = m.group(2)
gc_time = m.group(3)
mywriter.writerow([fileinfo['filename'], '', fileinfo['host'], fileinfo['pid'], date_time, process_time, 'Young generation collection - ' + young_type + initial_mark, gc_time, ''])
def process_mixed(s):
'''
257167.069: [GC pause (G1 Evacuation Pause) (mixed)
Desired survivor size 117440512 bytes, new threshold 15 (max 15)
- age 1: 169008 bytes, 169008 total
- age 2: 5032 bytes, 174040 total
- age 3: 2712288 bytes, 2886328 total
- age 4: 820208 bytes, 3706536 total
- age 5: 916704 bytes, 4623240 total
- age 6: 3246680 bytes, 7869920 total
- age 7: 852856 bytes, 8722776 total
- age 8: 605648 bytes, 9328424 total
- age 9: 983264 bytes, 10311688 total
- age 10: 1685120 bytes, 11996808 total
- age 11: 692152 bytes, 12688960 total
- age 12: 2147224 bytes, 14836184 total
- age 13: 1511072 bytes, 16347256 total
- age 14: 1832744 bytes, 18180000 total
- age 15: 1066168 bytes, 19246168 total
257167.069: [G1Ergonomics (CSet Construction) start choosing CSet, _pending_cards: 70042, predicted base time: 71.62 ms, remaining time: 128.38 ms, target pause time: 200.00 ms]
257167.069: [G1Ergonomics (CSet Construction) add young regions to CSet, eden: 109 regions, survivors: 3 regions, predicted young region time: 6.64 ms]
257167.069: [G1Ergonomics (CSet Construction) finish adding old regions to CSet, reason: predicted time is too high, predicted time: 3.29 ms, remaining time: 0.00 ms, old: 79 regions, min: 79 regions]
257167.069: [G1Ergonomics (CSet Construction) added expensive regions to CSet, reason: old CSet region num not reached min, old: 79 regions, expensive: 29 regions, min: 79 regions, remaining time: 0.00 ms]
257167.069: [G1Ergonomics (CSet Construction) finish choosing CSet, eden: 109 regions, survivors: 3 regions, old: 79 regions, predicted pause time: 285.70 ms, target pause time: 200.00 ms]
257167.236: [G1Ergonomics (Mixed GCs) continue mixed GCs, reason: candidate old regions available, candidate old regions: 344 regions, reclaimable: 2334497912 bytes (6.21 %), threshold: 5.00 %]
, 0.1677699 secs]
'''
process_time = 0.0
date_time = ''
mixed_type = ''
gc_time = 0.0
# output similar to GC pause (young)
patternstr = datefmt + r'\[GC pause \(([ \w]*)\) \(mixed\).+?, (\d+\.\d+) secs\]$'
# ^mixed_type
#patternstr = datefmt + r'\[GC pause \(([ \w]*)\) \(mixed\)( \([\w-]+\))?.+?, (\d+\.\d+) secs\]$'
pattern = re.compile(patternstr, re.MULTILINE | re.DOTALL)
for m in pattern.finditer(s):
process_time = m.group(1)
mixed_type = m.group(2)
gc_time = m.group(3)
mywriter.writerow([fileinfo['filename'], '', fileinfo['host'], fileinfo['pid'], date_time, process_time, 'Mixed generation collection - ' + mixed_type, gc_time, ''])
def process_full(s):
'''
422052.838: [Full GC (System.gc()) 16G->10G(35G), 34.1545090 secs]
'''
date_time = ''
process_time = 0.0
young_type = ''
gc_time = 0.0
gcfmt = r'\[Full GC \(([ \w\.\(\)]*)\) .+?, (\d+\.\d+) secs\]$'
# ^ full gc type
#gcfmt = r'\[Full GC \(([ \w\.\(\)]*)\) .+?, real=(\d+\.\d+) secs\]\s*$'
patternstr = datefmt + gcfmt
pattern = re.compile(patternstr, re.MULTILINE | re.DOTALL)
for m in pattern.finditer(s):
process_time = m.group(1)
full_gc_type = m.group(2)
gc_time = m.group(3)
mywriter.writerow([fileinfo['filename'], '', fileinfo['host'], fileinfo['pid'], date_time, process_time, 'Full GC - ' + full_gc_type , gc_time, ''])
def process_remark_cleanup(s, linenum):
'''
These gc log statements show up on a single line.
Example:
706.065: [GC cleanup 220M->218M(512M), 0.0021548 secs]
706.035: [GC remark, 0.0278976 secs]
108684.812: [GC remark 108684.812: [Finalize Marking, 0.0018014 secs] 108684.814: [GC ref-proc, 0.0089392 secs] 108684.823: [Unloading, 0.0317085 secs], 0.0672140 secs]
'''
gc_type = ''
date_time = ''
process_time = 0.0
gc_time = 0.0
m = re.match(datefmt + r'\[GC remark.+(\d+\.\d+) secs\]$', s)
if m:
gc_type = 'GC remark'
process_time = m.group(1)
gc_time = m.group(2)
else:
m = re.match(datefmt + r'\[GC cleanup .+, (\d+\.\d+) secs\]$', s)
if m:
gc_type = 'GC cleanup'
date_time = '0'
process_time = m.group(1)
gc_time = m.group(2)
if gc_type != '':
mywriter.writerow([fileinfo['filename'], linenum, fileinfo['host'], fileinfo['pid'], date_time, process_time, gc_type, gc_time])
def process_search_pattern(s, linenum, date_time, process_time):
'''
Look for search strings of interest. If found write to csv.
'''
patternstr = r'({})'.format('|'.join(search_li))
m = re.search(patternstr, s, re.IGNORECASE)
if m:
search_pattern = m.group(1).lower()
s = s.strip()
mywriter.writerow([fileinfo['filename'], linenum, fileinfo['host'], fileinfo['pid'], date_time, process_time, search_pattern, '', s])
'''
for search_pattern in search_li:
if re.search(search_pattern, s, re.IGNORECASE):
s = s.strip()
mywriter.writerow([fileinfo['filename'], linenum, fileinfo['host'], fileinfo['pid'], date_time, process_time, search_pattern, '', s])
break
'''
def process_args():
global dirlist, output_filename, show_relative_path, host_li
parser = argparse.ArgumentParser(add_help=False)
parser.add_argument("--start_dir", help="the root directory to begin processing.")
parser.add_argument("--output_dir", help="where the output file should be written to. By default the output file will be located in a user's home directory.")
parser.add_argument("--log_level", choices=['CRITICAL', 'ERROR', 'WARNING', 'INFO', 'DEBUG'], help="logging level. Default is WARNING.")
parser.add_argument("--show_relative_path", help="show relative path in filename column. true or false. Default is false.")
parser.add_argument("--hosts", help="list of hosts, separated by commas.")
parser.add_argument('-h', '--help', action='help', default=argparse.SUPPRESS, help=r'This program parses a gc log file and provides a summary in csv format. The following JVM options should be used to generate the log file: -Xloggc:/path/to/file/gc_%%p.log -XX:+PrintCommandLineFlags -XX:+PrintGC -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+PrintAdaptiveSizePolicy -XX:+PrintTenuringDistribution -XX:-PrintReferenceGC')
args = parser.parse_args()
if args.start_dir:
dirlist = [args.start_dir]
output_filename = args.start_dir + os.path.sep + "gc_log_summary-" + gettimestamp() + ".csv"
if args.output_dir:
output_filename = args.output_dir + os.path.sep + "gc_log_summary-" + gettimestamp() + ".csv"
    if args.log_level:
        # the choices above map directly onto logging's level names
        logging.basicConfig(level=getattr(logging, args.log_level))
    else:
        # set logging level. WARNING is default level
        logging.basicConfig(level=logging.WARNING)
if args.show_relative_path and args.show_relative_path.lower() == 'true':
show_relative_path = True
if args.hosts:
host_li = args.hosts.split(',')
def main():
global mywriter
if sys.version_info < (3,0,0):
print("Please use a version of Python > 3")
sys.exit(-1)
process_args()
# write output to csv file
with open(output_filename, 'w', newline='') as csvfile:
mywriter = csv.writer(csvfile)
# write column headings
mywriter.writerow(col_li)
for dir in dirlist:
logging.debug(dir)
for root, dirs, files in os.walk(dir):
for name in files:
logging.debug(os.path.join(root, name))
(b, ext) = os.path.splitext(name)
for x in extlist:
m = re.match(x, ext)
if m:
fullpath = os.path.join(root, name)
if show_relative_path == True:
# add one for path separator
index = len(dir) + 1
fname = fullpath[index:]
myvisitor_2(fullpath, fname)
else:
myvisitor_2(fullpath, name)
for name in dirs:
logging.debug(os.path.join(root, name))
#os.path.walk(dir, myvisitor, extlist)
main()
import os
import csv
import re
import logging
import argparse
import datetime
# list of columns
col_li = ['file name', 'line no.', 'host', 'pid', 'process time', 'gc type/keyword', 'time', 'size before gc', 'size after gc', 'total heap size']
# list of directories to visit
dirlist = [r'E:\log']
show_relative_path = False
def gethomedir():
return os.path.expanduser('~')
def gettimestamp():
today = datetime.date.today()
return today.strftime("%Y%b%d")
output_filename = '{}{}gc-summary-{}.csv'.format(gethomedir(), os.path.sep, gettimestamp())
# list of extensions to visit
extlist = ['.log']
# special patterns to search for
search_li = ['to-space','humongous', r'System.gc\(\)']
# log files were collected and put in directories by hostname, separated by '.'
def get_hostname(dirpath):
(head, tail) = os.path.split(dirpath)
if tail.find('.') > -1:
(hostname, rest) = tail.split('.', 1)
logging.debug("hostname: %s", hostname)
return hostname
else:
return ''
# use Xloggc:/path/to/file/gc.%p.log, where %p tells the JVM to substitute the pid
def get_pid(filename):
li = filename.split('pid')
if( len(li) == 1 ):
return li[0]
else:
(pid, rest) = li[1].split('.', 1)
logging.debug("pid: %s", pid)
return pid
def myvisitor(extlist, dirname, names):
global fileinfo
logging.debug("Current directory: %s", dirname)
for f in names:
(p, ext) = os.path.splitext(f)
logging.debug("%s %s", f, ext)
if ext in extlist:
fullpath = os.path.join(dirname, f)
logging.debug(fullpath)
try:
hostname = get_hostname(dirname)
pid = get_pid(f)
fileinfo = {'filename': f, 'host' : hostname, 'pid' : pid}
process_file(fullpath)
except OSError as err:
print("OS error: {0}".format(err))
#except OSError, detail:
# print detail
def process_jvminfo(s, linenum):
s = s.strip()
mywriter.writerow([fileinfo['filename'], linenum, fileinfo['host'], fileinfo['pid'], '', 'jvm info', '', '', '', '', s])
def process_file(fullpath):
linenum = 0
f = open(fullpath, 'r')
# process line by line to get basic information
for line in f:
linenum += 1
# check for keywords of interest
process_search_pattern(line, linenum)
if line.startswith('Java HotSpot(TM)') or line.startswith('Memory:') or line.startswith('CommandLine flags:'):
process_jvminfo(line, linenum)
elif line.startswith(' ') == False:
process_remark_cleanup_fullgc(line, linenum)
# read file object to string. When -XX:+PrintAdaptiveSizePolicy is used,
# gc phases need a multi-line regex to handle
# check for stw pause that spans multiple lines
f.seek(0)
text = f.read()
f.close()
# we are interested in activity that causes a stop-the-world pause and the duration of the gc
# https://blogs.oracle.com/poonam/entry/understanding_g1_gc_logs
# process multi-line gc phases
process_young(text)
process_mixed(text)
def process_young(s):
'''
These gc log statements show up on multiple lines.
Example:
54614.619: [GC pause (young)
Desired survivor size 109051904 bytes, new threshold 16 (max 25)
- age 1: 9991736 bytes, 9991736 total
54614.620: [G1Ergonomics (CSet Construction) start choosing CSet, _pending_cards: 4184, predicted base time: 28.58 ms, remaining time: 971.42 ms, target pause time: 1000.00 ms]
54614.620: [G1Ergonomics (CSet Construction) add young regions to CSet, eden: 199 regions, survivors: 4 regions, predicted young region time: 939.32 ms]
54614.620: [G1Ergonomics (CSet Construction) finish choosing CSet, eden: 199 regions, survivors: 4 regions, old: 0 regions, predicted pause time: 967.90 ms, target pause time: 1000.00 ms]
54614.644: [SoftReference, 878 refs, 0.0006080 secs]54614.645: [WeakReference, 1371 refs, 0.0003980 secs]54614.645: [FinalReference, 6591 refs, 0.0029020 secs]54614.648: [PhantomReference, 5 refs, 106 refs, 0.0019450 secs]54614.650: [JNI Weak Reference, 0.0090930 secs], 0.0433140 secs]
'''
process_time = 0.0
gc_time = 0.0
pattern = re.compile(r'^(\d*\.\d*): \[GC pause [ \w\(\)]* \(young\)(.+?), (\d*\.\d*) secs\]$', re.MULTILINE | re.DOTALL)
# multi-line search
for m in pattern.finditer(s):
process_time = m.group(1)
gc_time = m.group(3)
mywriter.writerow([fileinfo['filename'], '', fileinfo['host'], fileinfo['pid'], process_time, 'Young generation collection', gc_time, '', '', ''])
def process_mixed(s):
process_time = 0.0
gc_time = 0.0
# output similar to GC pause (young)
pattern = re.compile(r'^(\d*\.\d*): \[GC pause \(mixed\)(.+?), (\d*\.\d*) secs\]$', re.MULTILINE | re.DOTALL)
for m in pattern.finditer(s):
process_time = m.group(1)
gc_time = m.group(3)
mywriter.writerow([fileinfo['filename'], '', fileinfo['host'], fileinfo['pid'], process_time, 'Mixed generation collection', gc_time, '', '', ''])
def process_remark_cleanup_fullgc(s, linenum):
'''
These gc log statements show up on a single line.
Example:
44973.752: [GC remark 44973.753: [GC ref-proc44973.753: [SoftReference, 3741 refs, 0.0031090 secs]44973.756: [WeakReference, 6937 refs, 0.0069930 secs]44973.763: [FinalReference, 2459 refs, 0.0038880 secs]44973.767: [PhantomReference, 28 refs, 1275 refs, 0.0029950 secs]44973.770: [JNI Weak Reference, 0.0621620 secs], 0.0803160 secs], 0.1021600 secs]
[Times: user=0.30 sys=0.00, real=0.11 secs]
44973.856: [GC cleanup 22G->22G(30G), 0.0100070 secs]
[Times: user=0.08 sys=0.00, real=0.01 secs]
151413.747: [Full GC151419.349: [SoftReference, 490 refs, 0.0000980 secs]151419.349: [WeakReference, 5036 refs, 0.0004770 secs]151419.349: [FinalReference, 10 refs, 0.0000230 secs]151419.349: [PhantomReference, 129 refs, 346 refs, 0.0000520 secs]151419.349: [JNI Weak Reference, 0.0025470 secs] 19G->19G(30G), 14.2256960 secs]
'''
gc_type = ''
process_time = 0.0
gc_time = 0.0
gc_size_before = ''
gc_size_after = ''
total_heap_size = ''
m = re.match(r'^(\d*\.\d*): \[GC remark \d*\.\d*: (.+), (\d*\.\d*) secs\]$', s)
if m:
gc_type = 'GC remark'
process_time = m.group(1)
gc_time = m.group(3)
else:
m = re.match(r'^(\d*\.\d*): \[GC cleanup (.+), (\d*\.\d*) secs\]$', s)
if m:
gc_type = 'GC cleanup'
process_time = m.group(1)
gc_time = m.group(3)
else:
m = re.match(r'^(\d*\.\d*): \[Full GC(.+) (\d+[MG])->(\d*[MG])\((\d*[MG])\), (\d*\.\d*) secs\]$', s)
if m:
gc_type = 'Full GC'
process_time = m.group(1)
gc_size_before = m.group(3)
gc_size_after = m.group(4)
total_heap_size = m.group(5)
gc_time = m.group(6)
if gc_type != '':
mywriter.writerow([fileinfo['filename'], linenum, fileinfo['host'], fileinfo['pid'], process_time, gc_type, gc_time, gc_size_before, gc_size_after, total_heap_size])
def process_search_pattern(s, linenum):
'''
Look for search strings of interest. If found write to csv.
'''
for search_pattern in search_li:
if re.search(search_pattern, s, re.IGNORECASE):
s = s.strip()
mywriter.writerow([fileinfo['filename'], linenum, fileinfo['host'], fileinfo['pid'], '', search_pattern, '', '', '', '', s])
break
def process_args():
global dirlist, output_filename, host_li
parser = argparse.ArgumentParser(add_help=False)
parser.add_argument("--start_dir", help="the root directory to begin processing")
parser.add_argument("--output_dir", help="where the output file should be written to. By default the output file will be located in a user's home directory.")
parser.add_argument("--log_level", choices=['CRITICAL', 'ERROR', 'WARNING', 'INFO', 'DEBUG'], help="logging level")
parser.add_argument("--hosts", help="list of hosts, separated by commas")
parser.add_argument('-h', '--help', action='help', default=argparse.SUPPRESS, help="This program parses a gc log file and provides a summary in csv format. The following JVM options should be used to generate the log file: -Xloggc:/path/to/file/gc_%%p.log -XX:+PrintCommandLineFlags -XX:+PrintGC -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+PrintAdaptiveSizePolicy -XX:+PrintTenuringDistribution -XX:+PrintReferenceGC")
args = parser.parse_args()
if args.start_dir:
dirlist = [args.start_dir]
if args.output_dir:
output_filename = args.output_dir + os.path.sep + "gc_log_summary-" + gettimestamp() + ".csv"
    if args.log_level:
        # the choices above map directly onto logging's level names
        logging.basicConfig(level=getattr(logging, args.log_level))
    else:
        # set logging level. WARNING is default level
        logging.basicConfig(level=logging.WARNING)
if args.hosts:
host_li = args.hosts.split(',')
def myvisitor_2(fullpath, f):
global fileinfo
try:
hostname = get_hostname(f)
pid = get_pid(f)
fileinfo = {'filename': f, 'host': hostname, 'pid': pid}
process_file(fullpath)
except OSError as err:
print("OS error: {0}".format(err))
def main():
global mywriter
process_args()
# write output to csv file
with open(output_filename, 'w', newline='') as csvfile:
#with open(output_filename, 'wb') as csvfile:
mywriter = csv.writer(csvfile)
# write column headings
mywriter.writerow(col_li)
for dir in dirlist:
logging.debug(dir)
for root, dirs, files in os.walk(dir):
for name in files:
logging.debug(os.path.join(root, name))
(b, ext) = os.path.splitext(name)
for x in extlist:
m = re.match(x, ext)
if m:
fullpath = os.path.join(root, name)
if show_relative_path == True:
# add one for path separator
index = len(dir) + 1
fname = fullpath[index:]
myvisitor_2(fullpath, fname)
else:
myvisitor_2(fullpath, name)
for name in dirs:
logging.debug(os.path.join(root, name))
#os.path.walk(dir, myvisitor, extlist)
main()
import re
import sys
datefmt = r'^(\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\d\.\d\d\d[\+\-]\d\d\d\d): (\d+\.\d+): '
# ^ date
# ^ time
# ^ millis
# ^ time zone
# ^ timestamp
filename = ''
# check python version
def check_version():
if sys.version_info < (3,0,0):
print("Please use a version of Python > 3")
sys.exit(-1)
if len(sys.argv) < 2:
print("No filename specified.")
print("Usage: {} <filename>".format(sys.argv[0]))
sys.exit(-1)
def process_args():
global filename
filename = sys.argv[1]
def process():
with open(filename, encoding="latin-1") as f:
# the number of lines that have heap size information
count = 0
# the total number of lines processed
linecount = 1
# date and time stamp
date_time = ''
# number of seconds elapsed since the process started
process_time = ''
        print('count, filename, line_number, date_time, process_time, begin_eden, begin_max_eden, end_eden, end_max_eden, begin_survivor, end_survivor, begin_heap, begin_max_heap, end_heap, end_max_heap')
for line in f:
line = line.strip()
#print(line)
m = re.match(datefmt, line)
if m:
# save current timestamp
date_time = m.group(1)
process_time = m.group(2)
# match heap information in following formats
# [Eden: 9632.0M(9632.0M)->0.0B(9624.0M) Survivors: 192.0M->200.0M Heap: 11.4G(16.0G)->2074.8M(16.0G)]
# [Eden: 4704.0M(9624.0M)->0.0B(9824.0M) Survivors: 200.0M->0.0B Heap: 6786.9M(16.0G)->931.6M(16.0G)], [Metaspace: 61553K->61499K(1105920K)]
            edenstr = r'\s*\[Eden: (\d+\.\d[BKMG])\((\d+\.\d[BKMG])\)->(\d+\.\d[BKMG])\((\d+\.\d[BKMG])\) '
            survivorstr = r'Survivors: (\d+\.\d[BKMG])->(\d+\.\d[BKMG]) '
            heapstr = r'Heap: (\d+\.\d[BKMG])\((\d+\.\d[BKMG])\)->(\d+\.\d[BKMG])\((\d+\.\d[BKMG])\)\].*'
patternstr = edenstr + survivorstr + heapstr
m = re.match(patternstr, line)
if m:
begin_eden = m.group(1)
begin_max_eden = m.group(2)
end_eden = m.group(3)
end_max_eden = m.group(4)
begin_survivor = m.group(5)
end_survivor = m.group(6)
begin_heap = m.group(7)
begin_max_heap = m.group(8)
end_heap = m.group(9)
end_max_heap = m.group(10)
count += 1
print('{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}'.format(
count, filename, linecount, date_time, process_time, begin_eden,
begin_max_eden, end_eden, end_max_eden, begin_survivor, end_survivor,
begin_heap, begin_max_heap, end_heap, end_max_heap))
# match heap information in following formats
# 2022-02-22T14:22:29.770-0600: 3.287: [GC cleanup 18M->18M(3072M), 0.0059295 secs]
# 2022-02-22T14:22:34.301-0600: 7.817: [GC pause (Metadata GC Threshold) (young) (initial-mark) 123M->23M(3072M), 0.1070516 secs]
# 2022-02-22T15:27:01.100-0600: 3829.383: [GC pause (G1 Evacuation Pause) (young) 9827M->6775M(11G), 0.1417604 secs]
# 2022-02-22T16:42:59.750-0600: 8433.267: [GC pause (G1 Humongous Allocation) (young) (initial-mark) 1683M->1433M(3072M), 0.0867971 secs]
#young = r'\[GC pause \([\w ]+\) \(\w+\) (\d+[B|K|M|G])->(\d+[B|K|M|G])\((\d+[B|K|M|G])\), \d+\.\d+ secs\]'
            young = r'\[GC [\w \(\)-]+ (\d+[BKMG])->(\d+[BKMG])\((\d+[BKMG])\), \d+\.\d+ secs\]'
patternstr = datefmt + young
m = re.match(patternstr, line)
            if m:
                # the first two regex groups are consumed by datefmt
                begin_heap = m.group(3)
                end_heap = m.group(4)
                end_max_heap = m.group(5)
                count += 1
                print('{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}'.format(
                    count, filename, linecount, date_time, process_time, '',
                    '', '', '', '', '',
                    begin_heap, '', end_heap, end_max_heap))
linecount += 1
def main():
check_version()
process_args()
process()
main()
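# example invocation (hypothetical log file name; the script name matches the batch runner further below):
# python3 heap-summary.py gc_app_1234.log > gc_app_1234.log.csv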
pip install csv-ical
pip install vobject
# check online for examples
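# a minimal sketch using vobject (not taken from the library docs; the event values are made up):
# build a calendar with one event and print it in ics format
import datetime
import vobject

cal = vobject.iCalendar()
cal.add('vevent')
cal.vevent.add('summary').value = 'Team meeting'
cal.vevent.add('dtstart').value = datetime.datetime(2021, 9, 14, 16, 0)
print(cal.serialize())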
teams = {'NY': 'Giants', 'Dallas': 'Cowboys', 'Green Bay': 'Packers'}
for k, v in teams.items():
    print('{} => {}'.format(k, v))
# sort the keys, then print
for k in sorted(teams.keys()):
    print('{} => {}'.format(k, teams[k]))
# alternatively, iterate over the keys in sorted order
for key in sorted(teams):
    print('{} => {}'.format(key, teams[key]))
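# for comparison: sort by value instead of by key
for k, v in sorted(teams.items(), key=lambda kv: kv[1]):
    print('{} => {}'.format(k, v))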
import argparse
import csv
import datetime
import logging
import os
import subprocess
import sys
import traceback
# globals
dirlist = []
output_filename = ''
# only run jar tvf on extensions of .jar
ext_li = [ ".jar"]
# only process the following file types in jar tvf output
filetype_ext_li = [ '.class', '.jar']
filename_prefix = 'jar_checker_summary'
col_heading_li = ['artifact', 'size', 'content']
home_dir = os.path.expanduser('~')
def gettimestamp():
today = datetime.date.today()
return today.strftime("%Y%b%d")
def capture_process_output(filename):
cmd = 'jar tvf {0}'.format(filename)
result = subprocess.run(cmd, shell=True, check=True, capture_output=True, text=True)
output = result.stdout
logging.debug("result output: %s", output)
return output
def myvisitor(fullpath):
logging.debug("fullpath: %s", fullpath)
try:
filename, file_extension = os.path.splitext(fullpath)
if file_extension in ext_li:
logging.debug("fullpath: %s", fullpath)
jar_output = capture_process_output(fullpath)
for line in jar_output.split('\n'):
logging.debug(">>>>line: %s", line)
# only process lines with output
if line:
line_li = line.split()
logging.debug(r'........line_li: <%s>', ','.join(line_li))
size = line_li[0]
content = line_li[7]
content_filename, content_file_extension = os.path.splitext(content)
if content_file_extension in filetype_ext_li:
mywriter.writerow([fullpath, size, content])
except Exception as err:
logging.warning("Error caught while visiting {}".format(fullpath))
logging.warning("Error: {0}".format(err))
traceback.print_exc()
def process_args():
global dirlist, output_filename
parser = argparse.ArgumentParser(add_help=False)
parser.add_argument("--start_dir", help="the root directory to begin processing. Multiple paths should be separated with a comma ','")
parser.add_argument("--output_dir", help="where the output file should be written to.")
parser.add_argument("--log_level", choices=['CRITICAL', 'ERROR', 'WARNING', 'INFO', 'DEBUG'], help="logging level. Default is WARNING.")
parser.add_argument('-h', '--help', action='help', default=argparse.SUPPRESS, help="This program searches jar files and outputs information about the content in csv format.")
args = parser.parse_args()
if args.start_dir:
dirlist = args.start_dir.split(',')
if args.output_dir:
output_filename = args.output_dir + os.path.sep + filename_prefix + "-" + gettimestamp() + ".csv"
else:
output_filename = home_dir + os.path.sep + filename_prefix + '-' + gettimestamp() + ".csv"
    # set the logging level; WARNING is the default
    logging.basicConfig(level=getattr(logging, args.log_level) if args.log_level else logging.WARNING)
logging.debug("dirlist: %s", dirlist)
def main():
if sys.version_info < (3,7,0):
print("Please use a version of Python > 3.7")
sys.exit(-1)
process_args()
global mywriter
with open(output_filename, 'w', newline='') as csvfile:
mywriter = csv.writer(csvfile)
mywriter.writerow(col_heading_li)
for root in dirlist:
logging.debug("Processing: %s", root)
for currentpath, dirs, files in os.walk(root):
for name in files:
fullpath = os.path.join(currentpath, name)
logging.debug("root_dir: %s, currentpath: %s, fullpath: %s", root, currentpath, fullpath)
myvisitor(fullpath)
main()
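# example invocation (hypothetical script name and paths):
# python3 jar-checker.py --start_dir /opt/app/lib --output_dir /tmp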
import argparse
import csv
import logging
import os
import re
import sys
import urllib.error
import urllib.request
########## This program is used to search for urls in pdf files.
########## The pdf files should be downloaded to a local directory.
########## This program will test the urls for broken links.
########## global variables
start_dir = ''
output_dir = ''
ext_list = ['pdf']
# key: url, value: urlInfo
links = {}
########## end global variables
class UrlInfo:
def __init__(self, url, hostname, files, count, responseCode, valid):
self.url = url
self.hostname = hostname
self.files = files
self.count = count
self.responseCode = responseCode
self.valid = valid
def openFileHelper(filename):
s = ''
with open(filename, 'rb') as fopen:
bytes = fopen.read()
# workaround to handle pdf files as they are binary format
s = bytes.decode('latin-1')
return s
# takes the filename of the file to search
def searchInFile(filename):
pattern = r'(http|https)://([a-zA-Z0-9\.#/%=_?-]*)'
# special characters
# # anchor
# % escape
# ? query string
# other special characters (not used by us):
# &, ~ (home directory location), + (plus sign)
text = openFileHelper(filename)
li = re.findall(pattern, text)
for item in li:
logging.debug('item is: {0}'.format(item));
url = item[0] + '://' + item[1]
        # get the hostname
hostname = ''
m = re.match(r'^([a-zA-Z0-9\.-]*)', item[1])
if m:
hostname = m.group(1)
logging.debug('url is: {0}'.format(url));
        if url not in links:
            urlInfo = UrlInfo(url, hostname, [filename], 1, 0, False)
            links[url] = urlInfo
else:
urlInfo = links[url]
urlInfo.count += 1
if filename not in urlInfo.files:
urlInfo.files.append(filename)
def testLinks():
print("testing links...")
    for key, value in links.items():
try:
responseCode = urllib.request.urlopen(key).getcode()
value.valid = True
value.responseCode = responseCode
except Exception as err:
logging.warning("Url: {0}, Error: {1}".format(key, err))
#traceback.print_exc()
if isinstance(err, urllib.error.HTTPError):
#print('type is: ')
#print(type(err))
value.responseCode = err.code
def outputLinks():
keys = list(links.keys())
keys.sort()
#numlinks = len(keys)
#print('The number of links: {0}'.format(numlinks))
output_filename = os.path.sep.join([output_dir, 'linkchecker.csv'])
with open(output_filename, 'w', newline='') as csvfile:
mywriter = csv.writer(csvfile)
# header
mywriter.writerow(['url', 'hostname', 'in files', 'response code', 'valid', 'occurrences'])
for key in keys:
value = links[key]
mywriter.writerow([key, value.hostname, ','.join(value.files), value.responseCode, value.valid, value.count])
#if not value.valid == False:
            #    print('url: {}, occurrences: {}'.format(key, value.count))
#else:
# print('url: {}, in files: {}, occurrences: {}'.format(key, ','.join(value.files), value.count))
def process_args():
global start_dir, output_dir, ext_list
parser = argparse.ArgumentParser(add_help=False)
parser.add_argument("--start_dir", help="the root directory to begin processing.", required=True)
parser.add_argument("--output_dir", help="where the output file should be written to. If not specified it will be the same as start_dir.")
parser.add_argument("--ext_list", help="the list of file extensions to search in separated with commas. Default is pdf.")
parser.add_argument("--log_level", choices=['CRITICAL', 'ERROR', 'WARNING', 'INFO', 'DEBUG'], help="logging level. Default is WARNING.")
parser.add_argument('-h', '--help', action='help', default=argparse.SUPPRESS, help="This program is used to check files on disk for valid urls.")
args = parser.parse_args()
if args.start_dir:
start_dir = args.start_dir
if args.output_dir:
output_dir = args.output_dir
else:
output_dir = start_dir
if args.ext_list:
ext_list = args.ext_list.split(',')
    # set the logging level; WARNING is the default
    logging.basicConfig(level=getattr(logging, args.log_level) if args.log_level else logging.WARNING)
logging.debug("start_dir is: " + start_dir)
logging.debug("ext_list is: " + ",".join(ext_list))
def process():
logging.debug("in process(), start_dir is: " + start_dir)
for root, dirs, files in os.walk(start_dir):
for name in files:
(base, extension) = os.path.splitext(name)
logging.debug("file name is: " + name)
logging.debug("base file name is: " + base)
if extension.startswith('.'):
ext = extension.lstrip('.')
ext_match = False
if ext_list:
if ext in ext_list:
ext_match = True
else:
ext_match = True
if ext_match:
input_filename = os.path.join(root, name)
searchInFile(input_filename)
testLinks()
outputLinks()
def main():
if sys.version_info < (3,0,0):
print("Please use a version of Python > 3")
sys.exit(-1)
process_args()
process()
main()
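# example invocation (hypothetical script name and path):
# python3 linkchecker.py --start_dir ~/Downloads/pdfs --ext_list pdf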
import os
import subprocess
import sys
import argparse
start_dir = os.path.expanduser('~')
output_dir = start_dir
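# path to the heap-summary.py script shown earlier in this gist; adjust for your environment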
heap_summary_cmd = '/home/dixson/work/tools/py/heap-summary.py'
def process_args():
global start_dir, output_dir
parser = argparse.ArgumentParser(add_help=False)
parser.add_argument("--start_dir", help="the root directory to begin processing.")
parser.add_argument("--output_dir", help="where the output file should be written to. If this is not set, this defaults to the start_dir")
parser.add_argument('-h', '--help', action='help', default=argparse.SUPPRESS, help="This program will parse a set of gc log files in a configured directory.")
args = parser.parse_args()
if args.start_dir:
start_dir = args.start_dir
if args.output_dir:
output_dir = args.output_dir
else:
output_dir = start_dir
def process():
for root, dirs, files in os.walk(start_dir):
for name in files:
(base, extension) = os.path.splitext(name)
if extension.startswith('.'):
ext = extension.lstrip('.')
if ext.isdigit() or ext == 'current' or ext == 'log':
input_filename = os.path.join(root, name)
output_filename = input_filename + '.csv'
print(input_filename)
print(output_filename)
with open(output_filename, "w") as outfile:
subprocess.run(['python3', heap_summary_cmd, input_filename], stdout=outfile)
def main():
if sys.version_info < (3,0,0):
print("Please use a version of Python > 3")
sys.exit(-1)
process_args()
process()
main()
python3 -m json.tool your_file.json
or
cat your_file.json | python3 -m json.tool
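# the same pretty-printing from inside Python, using only the standard library:
import json
with open('your_file.json') as f:
    print(json.dumps(json.load(f), indent=4))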
import argparse
import logging
import re
import shutil
import sys
multiplespaceregex = r'[\s]+'
filename = ''
search_text_file = ''
replacement_text_file = ''
# replace any white space characters with a regular expression for white space
def replaceWhiteSpace(s):
whitespacefound = False
searchstr = ''
for ch in s:
logging.debug(ch)
m = re.match(r'[\s]', ch)
if m:
logging.debug('I found whitespace')
if whitespacefound == False:
whitespacefound = True
else:
if whitespacefound == True:
searchstr += multiplespaceregex
searchstr += ch
whitespacefound = False
if whitespacefound == True:
searchstr += multiplespaceregex
return searchstr
def searchInFile(filename, searchstr):
text = openFileHelper(filename)
pattern = re.compile(searchstr)
m = pattern.search(text)
if m:
return True
else:
return False
def process_args():
global filename, search_text_file, replacement_text_file
parser = argparse.ArgumentParser(add_help=False)
parser.add_argument("--filename", help="the file to search.")
parser.add_argument("--search_text_file", help="the text block to search and replace for.")
parser.add_argument("--replacement_text_file", help="the replacement text block.")
parser.add_argument("--log_level", choices=['CRITICAL', 'ERROR', 'WARNING', 'INFO', 'DEBUG'], help="logging level. Default is WARNING.")
parser.add_argument('-h', '--help', action='help', default=argparse.SUPPRESS, help="This program helps to replace search replace block text.")
args = parser.parse_args()
if args.filename:
filename = args.filename
if args.search_text_file:
search_text_file = args.search_text_file
if args.replacement_text_file:
replacement_text_file = args.replacement_text_file
    # set the logging level; WARNING is the default
    logging.basicConfig(level=getattr(logging, args.log_level) if args.log_level else logging.WARNING)
def openFileHelper(filename):
    with open(filename, 'r') as f:
        return f.read()
def main():
global filename, search_text_file, replacement_text_file
if sys.version_info < (3,0,0):
print("Please use a version of Python > 3")
sys.exit(-1)
process_args()
original_text = openFileHelper(filename)
search_text = openFileHelper(search_text_file)
replacement_text = openFileHelper(replacement_text_file)
#searchstr = replaceWhiteSpace(search_text)
searchstr = search_text
logging.debug("searchstr..........")
logging.debug(searchstr)
found = searchInFile(filename, searchstr)
if found:
# copy file
dst = filename + '~'
shutil.copy(filename, dst)
pattern = re.compile(searchstr)
logging.debug("replacement text..........")
logging.debug(replacement_text)
replaced_text = pattern.sub(replacement_text, original_text)
logging.debug("replaced text..........")
logging.debug(replaced_text)
f = open(filename, "w")
n = f.write(replaced_text)
f.close()
main()
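# example invocation (hypothetical script name and file names):
# python3 replace-block.py --filename config.xml --search_text_file old_block.txt --replacement_text_file new_block.txt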
import argparse
import logging
import os
import re
import shutil
import sys
########## This program replaces find . -exec sed 's/a/b/g' {} \; because certain characters like backslash were too difficult to handle using bash
########## global variables
start_dir = ''
search_regex_file = ''
replacement_text_file = ''
ext_list = []
########## end global variables
def openFileHelper(filename):
    with open(filename, 'r') as f:
        return f.read()
# takes the filename of the file to search
# pattern is the regex pattern to search for
def searchInFile(filename, pattern):
text = openFileHelper(filename)
#pattern = re.compile(searchregex)
m = pattern.search(text)
if m:
return True
else:
return False
def process_args():
global start_dir, search_regex_file, replacement_text_file, ext_list
parser = argparse.ArgumentParser(add_help=False)
parser.add_argument("--start_dir", help="the root directory to begin processing.", required=True)
parser.add_argument("--search_regex_file", help="the file containing the regex to search for. The file should contain a single line and trailing whitespace will be stripped.", required=True)
parser.add_argument("--replacement_text_file", help="the file containing the replacement string. The file should contain a single line and trailing whitespace will be stripped.", required=True)
parser.add_argument("--ext_list", help="the list of file extensions to search in separated with commas.")
parser.add_argument("--log_level", choices=['CRITICAL', 'ERROR', 'WARNING', 'INFO', 'DEBUG'], help="logging level. Default is WARNING.")
parser.add_argument('-h', '--help', action='help', default=argparse.SUPPRESS, help="This program helps to search and replace text.")
args = parser.parse_args()
if args.start_dir:
start_dir = args.start_dir
if args.search_regex_file:
search_regex_file = args.search_regex_file
if args.replacement_text_file:
replacement_text_file = args.replacement_text_file
if args.ext_list:
ext_list = args.ext_list.split(',')
    # set the logging level; WARNING is the default
    logging.basicConfig(level=getattr(logging, args.log_level) if args.log_level else logging.WARNING)
logging.debug("start_dir is: " + start_dir)
logging.debug("search_regex_file is: " + search_regex_file)
logging.debug("replacement_text_file is: " + replacement_text_file)
logging.debug("ext_list is: " + ",".join(ext_list))
def process(pattern):
for root, dirs, files in os.walk(start_dir):
for name in files:
(base, extension) = os.path.splitext(name)
if extension.startswith('.'):
ext = extension.lstrip('.')
ext_match = False
if ext_list:
if ext in ext_list:
ext_match = True
else:
ext_match = True
if ext_match:
input_filename = os.path.join(root, name)
found = searchInFile(input_filename, pattern)
if found:
# create backup copy
backup_filename = input_filename + '~'
shutil.copy(input_filename, backup_filename)
original_text = openFileHelper(input_filename)
# replace text
replaced_text = pattern.sub(replacement_text, original_text)
# save to original file
f = open(input_filename, "w")
n = f.write(replaced_text)
f.close()
def main():
global search_regex, replacement_text
if sys.version_info < (3,0,0):
print("Please use a version of Python > 3")
sys.exit(-1)
process_args()
search_regex = openFileHelper(search_regex_file).rstrip()
replacement_text = openFileHelper(replacement_text_file).rstrip()
logging.debug("search_regex is: " + search_regex)
logging.debug("replacement_text is: " + replacement_text)
pattern = re.compile(search_regex)
process(pattern)
main()
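# example invocation (hypothetical script name and file names):
# python3 search-replace.py --start_dir ./src --search_regex_file search.txt --replacement_text_file replace.txt --ext_list java,xml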
# very simple, no need to download web framework and deploy app, simply navigate to directory. directory serves as document root
python3 -m http.server 8888
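# bind to localhost only (Python 3.4+), or serve a different directory (Python 3.7+):
python3 -m http.server 8888 --bind 127.0.0.1
python3 -m http.server 8888 --directory /path/to/docroot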
import argparse
import csv
import logging
import os
import re
import sys
import traceback
'''
This code parses a Java thread dump txt file and outputs it to a csv file,
for easier analysis.
It may write multiple csv files, depending on command line arguments.
It takes the original thread dump file name, removes the .txt suffix and
appends a .csv or -summary.csv suffix.
Sometimes the thread dump was generated with a long listing and will contain
additional fields, so this program tries a long listing strategy and simpler
listing strategies in turn. It is normal to see some errors logged, as the
strategies that do not match the input will fail.
'''
# a comma is the main csv separator, so a different character is used to join multiple values within a single field
SUB_SEPARATOR = '|'
title = ''
jni_global_references = ''
heap = ''
start_dir = r'C:\Users\Dixson\Downloads\support\logs\test'
home_dir = os.path.expanduser('~')
output_dir = ''
print_runnable = True
print_thread_count = True
print_other_thread_summary = True
class Substate:
def __init__(self, msg, objectid, otherClassName):
self.msg = msg
self.objectid = objectid
self.otherClassName = otherClassName
# strategy
# Enhancement JDK-8200720 allows for additional fields
class EnhancedLongListingStrategy(object):
def __init__(self):
self.name = 'EnhancedLongListingStrategy'
self.col_li = ['name', 'number', 'type', 'priority', 'os_priority', 'cpu', 'elapsed', 'tid', 'nid', 'status', 'state', 'substate', 'address', 'stack']
def process_threadprop(self, s):
# replace with underscores for easier parsing
s = s.replace('waiting on condition', 'waiting_on_condition')
s = s.replace('in Object.wait()', 'in_Object.wait()')
s = s.replace('waiting for monitor entry', 'waiting_for_monitor_entry')
#logging.debug("s: {}".format(s))
thread_name = ''
threadprop = {}
# extract thread name
m = re.match(r'"(.*)"(.*)$', s)
if m:
thread_name = m.group(1)
substring = m.group(2)
if (s.find('daemon') > -1):
# general case, most threads 'labelled' daemon
li = substring.split()
thread_no = li[0].lstrip('#')
thread_type = li[1]
thread_priority = li[2]
thread_ospriority = li[3]
thread_cpu = li[4]
thread_elapsed = li[5]
thread_tid = li[6]
thread_nid = li[7]
thread_status = li[8]
# some thread dumps show status with address, eg. sleeping[0x00007f297b44f000]
                if len(li) > 9:
thread_address = li[9]
else:
thread_address = ''
threadprop = {'name': thread_name, 'number': thread_no, 'type': thread_type, 'priority': thread_priority, 'os_priority': thread_ospriority,'cpu': thread_cpu, 'elapsed': thread_elapsed, 'tid': thread_tid, 'nid': thread_nid, 'status': thread_status, 'address': thread_address}
else:
# threads not labelled 'daemon'
logging.debug('substring {}'.format(substring))
m = re.match(r' #(\d+) (.*)$', substring)
if m:
thread_no = m.group(1)
substring = m.group(2)
li = substring.split()
thread_priority = li[0]
thread_ospriority = li[1]
thread_cpu = li[2]
thread_elapsed = li[3]
thread_tid = li[4]
thread_nid = li[5]
thread_status = li[6]
# some thread dumps show status with address, eg. sleeping[0x00007f297b44f000]
                    if len(li) > 7:
thread_address = li[7]
else:
thread_address = ''
threadprop = {'name': thread_name, 'number': thread_no, 'priority': thread_priority, 'os_priority': thread_ospriority, 'cpu': thread_cpu, 'elapsed':thread_elapsed, 'tid': thread_tid, 'nid': thread_nid, 'status': thread_status, 'address': thread_address}
else:
# jvm threads - only display basic information
# "G1 Conc#0" os_prio=0 cpu=1453.41ms elapsed=52307.25s tid=0x00007f912406ded0 nid=0x1cafd5 runnable
li = substring.split()
thread_ospriority = li[0]
thread_cpu = li[1]
thread_elapsed = li[2]
thread_tid = li[3]
thread_nid = li[4]
thread_status = li[5]
threadprop = {'name' : thread_name, 'os_priority' : thread_ospriority, 'cpu': thread_cpu, 'elapsed' : thread_elapsed, 'tid' : thread_tid, 'nid' : thread_nid, 'status' : thread_status}
return threadprop
# generated with jstack -l
class LongListingStrategy(object):
def __init__(self):
self.name = 'LongListingStrategy'
self.col_li = ['name', 'number', 'type', 'priority', 'os_priority', 'tid', 'nid', 'status', 'state', 'substate', 'address', 'stack']
def process_threadprop(self, s):
# replace with underscores for easier parsing
s = s.replace('waiting on condition', 'waiting_on_condition')
s = s.replace('in Object.wait()', 'in_Object.wait()')
s = s.replace('waiting for monitor entry', 'waiting_for_monitor_entry')
#logging.debug("s: {}".format(s))
thread_name = ''
threadprop = {}
# extract thread name
m = re.match(r'"(.*)"(.*)$', s)
if m:
thread_name = m.group(1)
substring = m.group(2)
# general case, most threads 'labelled' daemon
if (s.find('daemon') > -1):
li = substring.split()
thread_no = li[0].lstrip('#')
thread_type = li[1]
thread_priority = li[2]
thread_ospriority = li[3]
thread_tid = li[4]
thread_nid = li[5]
thread_status = li[6]
# some thread dumps show status with address, eg. sleeping[0x00007f297b44f000]
if len(li) >= 8:
thread_address = li[7]
else:
thread_address = ''
threadprop = {'name': thread_name, 'number': thread_no, 'type': thread_type, 'priority': thread_priority, 'os_priority': thread_ospriority, 'tid': thread_tid, 'nid': thread_nid, 'status': thread_status, 'address': thread_address}
else:
#"RMI Reaper" #14 prio=5 os_prio=0 tid=0x00007f2bd1d3f800 nid=0x2161 in Object.wait() [0x00007f2106550000]
#"main" #1 prio=5 os_prio=0 tid=0x00007f2bd000b800 nid=0x20ab waiting on condition [0x00007f2bd5f79000]
#"main" #1 prio=5 os_prio=0 tid=0x00007f79c000d800 nid=0x9091 sleeping[0x00007f79c8305000]
#"GS-swiftJmsSenderContainer-1" #205 prio=5 os_prio=0 tid=0x00007f684645a000 nid=0x6156 sleeping[0x00007f6735dea000]
m = re.match(r' #(\d+) (.*)$', substring)
if m:
thread_no = m.group(1)
substring = m.group(2)
li = substring.split()
thread_priority = li[0]
thread_ospriority = li[1]
thread_tid = li[2]
thread_nid = li[3]
thread_status = li[4]
                    # some thread dumps show status with address, eg. sleeping[0x00007f297b44f000]
                    if len(li) >= 6:
                        thread_address = li[5]
                    else:
                        thread_address = ''
threadprop = {'name': thread_name, 'number': thread_no, 'priority': thread_priority, 'os_priority': thread_ospriority, 'tid': thread_tid, 'nid': thread_nid, 'status': thread_status, 'address': thread_address}
else:
# jvm threads only display basic information
li = substring.split()
thread_ospriority = li[0]
thread_tid = li[1]
thread_nid = li[2]
thread_status = li[3]
threadprop = {'name' : thread_name, 'os_priority' : thread_ospriority, 'tid' : thread_tid, 'nid' : thread_nid, 'status' : thread_status}
return threadprop
# generated with jstack; missing thread number and os_priority
class SimpleListingStrategy(object):
def __init__(self):
self.name = 'SimpleListingStrategy'
self.col_li = ['name', 'type', 'priority', 'tid', 'nid', 'status', 'state', 'substate', 'address', 'stack', 'locked_ownable_synchronizers']
def process_threadprop(self, s):
# replace with underscores for easier parsing
s = s.replace('waiting on condition', 'waiting_on_condition')
s = s.replace('in Object.wait()', 'in_Object.wait()')
s = s.replace('waiting for monitor entry', 'waiting_for_monitor_entry')
thread_name = ''
threadprop = {}
# extract thread name
m = re.match(r'"(.*)"(.*)$', s)
if m:
thread_name = m.group(1)
substring = m.group(2)
# general case, most threads 'labelled' daemon
if (s.find('daemon') > -1):
li = substring.split()
thread_type = li[0]
thread_priority = li[1]
thread_tid = li[2]
thread_nid = li[3]
thread_status = li[4]
# some thread dumps show status with address, eg. sleeping[0x00007f297b44f000]
if len(li) >= 6:
thread_address = li[5]
else:
thread_address = ''
threadprop = {'name': thread_name, 'type': thread_type, 'priority': thread_priority, 'tid': thread_tid, 'nid': thread_nid, 'status': thread_status, 'address': thread_address}
else:
li = substring.split()
                if len(li) > 3:
thread_priority = li[0]
thread_tid = li[1]
thread_nid = li[2]
thread_status = li[3]
# some thread dumps show status with address, eg. sleeping[0x00007f297b44f000]
if len(li) >= 5:
thread_address = li[4]
else:
thread_address = ''
threadprop = {'name': thread_name, 'priority': thread_priority, 'tid': thread_tid, 'nid': thread_nid, 'status': thread_status, 'address': thread_address}
else:
#"GS-GSPingManager:com.gigaspaces.internal.lrmi.stubs.LRMISpaceImpl:1632991357520" Id=721 TIMED_WAITING
thread_id = li[0]
thread_status = li[1]
threadprop = {'name': thread_name, 'tid' : thread_id, 'status' : thread_status}
return threadprop
# end strategy
# parses dumps in the "name" Id=N STATE format (e.g. from java.lang.management ThreadInfo)
class SuperSimpleListingStrategy(object):
def __init__(self):
self.name = 'SuperSimpleListingStrategy'
self.col_li = ['name', 'thread_id', 'status', 'in_native', 'waiting_on_obj', 'other_thread', 'other_thread_id']
def process_threadprop(self, s):
threadprop = {}
thread_name = ''
thread_id = ''
thread_status = ''
waiting_on_object = ''
other_thread = ''
other_thread_id = ''
in_native = False
# extract thread name and id
m = re.match(r'"(.*)" Id=([\d]+) (.*)$', s)
if m:
thread_name = m.group(1)
thread_id = m.group(2)
substring = m.group(3)
in_native_match = re.match( r'(.*) \(in native\)', substring)
if in_native_match:
substring = in_native_match.group(1)
in_native = True
# most complicated
#"GS-LRMI-Connection-pool-1-thread-89" Id=263 WAITING on java.util.concurrent.locks.ReentrantLock$NonfairSync@66416e31 owned by "GS-LRMI-Connection-pool-1-thread-240" Id=413
owned_by_match = re.match( r'(WAITING|BLOCKED) on (.*) owned by "(.*)" Id=([\d]+)', substring)
if owned_by_match:
thread_status = owned_by_match.group(1)
waiting_on_object = owned_by_match.group(2)
other_thread = owned_by_match.group(3)
other_thread_id = owned_by_match.group(4)
# "Timer-4" Id=56 TIMED_WAITING on java.util.TaskQueue@1c7d49f1
else:
waiting_on_match = re.match(r'(TIMED_WAITING|WAITING) on (.*)', substring)
if waiting_on_match:
thread_status = waiting_on_match.group(1)
waiting_on_object = waiting_on_match.group(2)
else:
thread_status = substring.strip()
threadprop = {'name': thread_name, 'thread_id': thread_id,
'status': thread_status, 'in_native' : 'T' if in_native else '',
'waiting_on_obj': waiting_on_object, 'other_thread': other_thread, 'other_thread_id': other_thread_id}
return threadprop
# end strategy
# an indented line containing java.lang.Thread.State is usually the first line of the block
def process_state(li):
if( len(li) > 0):
#logging.debug(li[0])
m = re.match(r'^\s+java\.lang\.Thread\.State: (.*)$', li[0])
if m:
return m.group(1)
#if( block_li[0].find('java.lang.Thread.State:') > -1):
else:
return ''
else:
return ''
# a stack trace may have additional information I call substate
def process_substate(li):
#logging.debug("In process substate")
#logging.debug("li is: " + ''.join(li))
    substate_li = []
    substateObj_li = []
for s in li:
s = s.strip()
logging.debug("s is: '" + s + "'")
if( s.startswith('-')):
substate_li.append(s)
m = re.match(r'-(.*)<(.*)> \(a (.*)\)', s)
if m:
msg = m.group(1).strip()
objectid = m.group(2)
classname = m.group(3)
logging.debug("match found")
                substateObj = Substate(msg, objectid, classname)
                substateObj_li.append(substateObj)
return (SUB_SEPARATOR.join(substate_li), substateObj_li)
def process_stack(li):
stack_li = []
for s in li:
s = s.strip()
stack_li.append(s)
#logging.debug("begin>>>>> %s" % SUB_SEPARATOR.join(stack_li))
#logging.debug("end>>>>>>>")
return SUB_SEPARATOR.join(stack_li)
def process_heap(li):
heap_li = []
for s in li:
s = s.strip()
heap_li.append(s)
s = SUB_SEPARATOR.join(heap_li)
return s.replace(',', '\'')
# the information in this block occurs below the stack trace
def process_locked_ownable_sync(block_li):
#logging.debug("block_li in locked_ownable_synchronizers: {}".format(block_li))
if not block_li:
return ''
length = len(block_li)
for n in range(0, length):
s = block_li[n]
if s.find('Locked ownable synchronizers:') > -1 :
# return value in next line
if n + 1 < length:
return block_li[n + 1].strip().lstrip('-')
return ''
def process_block(strategy, block_li, nextblock_li, threadprop_by_name):
global title, jni_global_references, heap
logging.debug("BEGIN BLOCK")
logging.debug(block_li)
logging.debug("END BLOCK")
s = block_li[0]
if (s.startswith('"')):
# thread name found
threadprop = strategy.process_threadprop(s)
threadprop['state'] = process_state(block_li[1:])
threadprop['block'] = block_li[1:]
# there can be more than 1 thread referenced
(substate, substateObj) = process_substate(block_li[1:])
threadprop['substate'] = substate
threadprop['substateObj'] = substateObj
threadprop['stack'] = process_stack(block_li[1:])
threadprop['locked_ownable_synchronizers'] = process_locked_ownable_sync(nextblock_li)
threadprop_by_name[threadprop['name']] = threadprop
elif (s.startswith('Full thread dump')):
title = s
elif (s.startswith('JNI global references') or s.startswith('JNI global refs')):
jni_global_references = s
elif (s == 'Heap'):
heap = process_heap(block_li[1:])
else:
logging.debug('Skipping block that starts with line: {}'.format(s))
return threadprop_by_name
# print substate in another format for easy viewing
# print thread name, id, status, object id, classname
def print_substate(threadprop_by_name, mywriter):
mywriter.writerow(['substate (redux)', 'thread', 'tid', 'msg', 'other_oid', 'other_classname (e.g, locked/waiting on)'])
for k in threadprop_by_name.keys():
thread_name = k
#logging.debug(threadprop_by_name[k])
        threadprop = threadprop_by_name[k]
        # most strategies store tid=0x...; SuperSimpleListingStrategy stores a plain thread_id instead
        tid = threadprop.get('tid', threadprop.get('thread_id', ''))
        if '=' in tid:
            tid = tid.split('=')[1]
if 'substateObj' in threadprop:
substatusObj_li = threadprop_by_name[k]['substateObj']
for substatusObj in substatusObj_li:
mywriter.writerow(['',thread_name, tid, substatusObj.msg, substatusObj.objectid, substatusObj.otherClassName ])
def print_runnable_stack(threadprop_by_name, mywriter):
mywriter.writerow(['runnable', 'thread (in state RUNNABLE)', 'stack'])
for k in threadprop_by_name.keys():
thread_name = k
threadprop = threadprop_by_name[k]
state = threadprop['state']
if state == 'RUNNABLE':
# re-format original stack trace
block = [line.strip() for line in threadprop['block'][1:]]
block_s = "\n".join(block)
mywriter.writerow(['',thread_name,block_s])
def count_occurrences(threadprop_by_name, field, mywriter, column_name):
logging.debug("field is: " + field);
values = []
count_dict = {}
for k in threadprop_by_name.keys():
value = threadprop_by_name[k][field]
values.append(value)
for item in values:
if item in count_dict:
count = count_dict[item]
count += 1
count_dict[item] = count
else:
count_dict[item] = 1
mywriter.writerow([column_name, 'value', 'count'])
    '''
    alternative: sort by count descending (Python 3)
    for key, value in sorted(count_dict.items(), key=lambda kv: (kv[1], kv[0]), reverse=True):
        mywriter.writerow(['', key[:160], value])
    '''
sorted_keys = sorted(count_dict.keys())
for key in sorted_keys:
value = count_dict[key]
s = key[:160]
if not s:
s = "EMPTY"
mywriter.writerow(['', s, value])
def print_threads(strategy, threadprop_by_name, mywriter):
mywriter.writerow(strategy.col_li)
mywriter.writerow(['Title', title])
mywriter.writerow(['JNI global references', jni_global_references])
if heap:
mywriter.writerow(['Heap', heap])
mywriter.writerow([])
mywriter.writerow(['** Begin threads **'])
keys = sorted(threadprop_by_name.keys())
#keys.sort()
for k in keys:
#logging.debug('%s => %s' % (k, threadprop_by_name[k]))
li = []
threadprop_dict = threadprop_by_name[k]
for col in strategy.col_li:
if col in threadprop_dict:
s = threadprop_dict[col]
else:
s = ''
            s = s if s is not None else ''
li.append(s)
mywriter.writerow(li)
def write_csv(strategy, threadprop_by_name, filename):
# write output to csv file
# output compilation of thread properties
filename_woext, file_extension = os.path.splitext(filename)
output_filename = output_dir + os.path.sep + filename_woext + '.csv'
with open(output_filename, 'w', newline='') as csvfile:
mywriter = csv.writer(csvfile)
print_threads(strategy, threadprop_by_name, mywriter)
# output summary
    if print_other_thread_summary or print_thread_count or print_runnable:
output_filename = output_dir + os.path.sep + filename_woext + '-summary.csv'
with open(output_filename, 'w', newline='') as csvfile:
mywriter = csv.writer(csvfile)
if print_thread_count == True:
count_occurrences(threadprop_by_name, 'status', mywriter, 'status')
count_occurrences(threadprop_by_name, 'state', mywriter, 'state')
count_occurrences(threadprop_by_name, 'substate', mywriter, 'linked to')
# an application with many threads in a certain section of code may indicate a problem
count_occurrences(threadprop_by_name, 'stack', mywriter, 'stack (first few lines of)')
if print_other_thread_summary == True:
print_substate(threadprop_by_name, mywriter)
if print_runnable == True:
print_runnable_stack(threadprop_by_name, mywriter)
def process_file(fullpathname, filename):
line_number = 0
f = open(fullpathname)
# allblock_li is all the thread text sections saved to a list
allblock_li = []
# current_block_li is a text section containing information for a single thread
current_block_li = []
# k thread name -> v dictionary with key (column heading or property name), value pairs for that thread
threadprop_by_name = {}
firsttime = True
for line in f:
line_number += 1
s = line.rstrip()
#logging.debug(">> %d: %s" % (line_number, s))
# lines beginning with white space
m = re.match(r'^(\s)+(.*)$', s)
# separate lines in file into sections, ie, block
# save for future processing
# need to be able to look ahead into block and next block
if( not m):
# new block found
if( firsttime == False ):
#threadprop_by_name = process_block(strategy, current_block_li, threadprop_by_name)
allblock_li.append(current_block_li)
else:
firsttime = False
# reset current_block_li
current_block_li = [s]
else:
current_block_li.append(s)
    allblock_li.append(current_block_li)
    f.close()
#threadprop_by_name = process_block(strategy, current_block_li, threadprop_by_name)
# initialize strategies
strategy_li = [EnhancedLongListingStrategy(), LongListingStrategy(), SimpleListingStrategy(), SuperSimpleListingStrategy()]
# try each strategy until one processes cleanly
for strategy in strategy_li:
try:
threadprop_by_name = {}
length = len(allblock_li)
for n in range(0, length):
#for block in allblock_li:
block = allblock_li[n]
if (n+1 >= length):
nextblock = None
else:
nextblock = allblock_li[n+1]
threadprop_by_name = process_block(strategy, block, nextblock, threadprop_by_name)
write_csv(strategy, threadprop_by_name, filename)
# if this succeeds, no need to try next strategy
break
except Exception as err:
logging.warning("Error caught while parsing {} using strategy {}".format(filename, strategy.name))
logging.warning("Error: {0}".format(err))
traceback.print_exc()
def process_args():
global start_dir, output_dir, print_runnable, print_thread_count, print_other_thread_summary
parser = argparse.ArgumentParser(add_help=False)
parser.add_argument("--start_dir", help="the root directory to begin processing")
parser.add_argument("--output_dir", help="where the output file should be written to")
parser.add_argument("--print_runnable", help="print the stack traces of the runnable threads. Default is true")
parser.add_argument("--print_thread_count", help="print a summary of thread counts by class. Default is true")
parser.add_argument("--print_other_thread_summary", help="print a summary of the referenced threads. Default is true")
parser.add_argument("--log_level", choices=['CRITICAL', 'ERROR', 'WARNING', 'INFO', 'DEBUG'], help="logging level")
parser.add_argument('-h', '--help', action='help', default=argparse.SUPPRESS, help="This program parses a set of thread dump files generated with jstack or kill -3.")
args = parser.parse_args()
if args.start_dir:
start_dir = args.start_dir
    if args.output_dir:
        output_dir = args.output_dir
    else:
        output_dir = home_dir
if args.print_runnable:
if args.print_runnable.lower() == 'false' or args.print_runnable.lower() == 'f':
print_runnable = False
if args.print_thread_count:
if args.print_thread_count.lower() == 'false' or args.print_thread_count.lower() == 'f':
print_thread_count = False
if args.print_other_thread_summary:
if args.print_other_thread_summary.lower() == 'false' or args.print_other_thread_summary.lower() == 'f':
print_other_thread_summary = False
    # set the logging level; WARNING is the default
    logging.basicConfig(level=getattr(logging, args.log_level) if args.log_level else logging.WARNING)
def main():
if sys.version_info < (3,0,0):
print("Please use a version of Python > 3")
sys.exit(-1)
process_args()
for start, dirs, files in os.walk(start_dir):
for name in files:
            if name.endswith('.txt') or name.endswith('.tdump'):
process_file(os.path.join(start, name), name)
main()
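# example invocation (hypothetical script name and paths):
# python3 thread-dump-to-csv.py --start_dir /path/to/dumps --output_dir /tmp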