Skip to content

Instantly share code, notes, and snippets.

@hvgab
Created January 6, 2020 01:23
Show Gist options
  • Save hvgab/119713b096da2f3a0a5f4b8a228339e6 to your computer and use it in GitHub Desktop.
Save hvgab/119713b096da2f3a0a5f4b8a228339e6 to your computer and use it in GitHub Desktop.
"""
Comment from today, script from years ago.
Not sure if I ever fixed this.
This script was suppose to loop through a shit-ton of excel like files;
Get all the headers, make a list of all the headers it encountered, and then report back with how many times a header was encountered.
Everything gets written to files at the end.
- Heads with counter
- Errors
- Files it could not read (not excel/csv
I think it would silently fail on the different types of csv-dialects it could encounter.
"""
""" Get headers from all files """
import fnmatch
import os
import shutil
from collections import Counter
import pandas as pd
import xlrd
import json
from unidecode import unidecode
from time import sleep
from time import perf_counter
from datetime import timedelta
from datetime import datetime
rootPath = 'X:\\'
destDir = './headers/'
heads = []
heads_and_path =[]
not_csv = []
errorfile = []
counter = 0
for_file_times = []
#c = {}
status = {
'root':'',
'filename':'',
'filenumber':0,
'filenumber_total':7500,
'filetime':0.0,
'filestatus':'',
'success':0,
'error':0,
'not_csv':0,
'time_so_far':0.0,
'fileheads':0
}
def read_file(file):
if file_extension == ".xlsx" or file_extension == '.xls':
df = pd.read_excel(file)
status['success'] = status['success'] + 1
return df
elif file_extension == ".csv":
df = pd.read_csv(file, encoding='latin1', sep=';')
status['success'] = status['success'] + 1
return df
elif file_extension == ".txt":
df = pd.read_table(file, encoding='latin1', sep='\t')
status['success'] = status['success'] + 1
return df
else:
print ("Error?! File should be excel or csv: \n\t {}".format(file).encode('utf-8', 'replace'))
sleep(10)
def get_heads(df):
getheads = []
for head in df.columns:
getheads.append( head )
status['fileheads']=len(getheads)
return getheads
def write_to_file(contents):
f = open('heads.txt', 'w')
f.write( contents )
f.close()
def count_files(dir):
filecount_start = perf_counter()
filecount = 0
for root, dirnames, filenames in os.walk(rootPath):
for file in filenames:
filecount += 1
print ('count files in {}'.format(perf_counter()-filecount_start))
print (filecount)
sleep(10)
return filecount
"""
START OF SCRIPT
"""
count_files = count_files(rootPath)
main_start = perf_counter()
for root, dirnames, filenames in os.walk(rootPath):
status['root']=root
for file in filenames:
file_start = perf_counter()
status['filename']=file
status['filenumber']= status['filenumber']+1
filename, file_extension = os.path.splitext(file)
pathname = os.path.join(root, file)
if file_extension == ".xlsx" \
or file_extension == '.xls' \
or file_extension == '.csv':
try:
file_df = read_file(pathname)
file_df_heads = get_heads(file_df)
heads.extend(file_df_heads)
heads_and_path.append(['file', 'file_df_heads'])
except xlrd.biffh.XLRDError as e:
errorfile.append([file, str(e)])
status['error']=status['error']+1
continue
except UnicodeEncodeError as e:
errorfile.append([file, str(e)])
status['error']=status['error']+1
continue
except Exception as e:
errorfile.append([file, str(e)])
status['error']=status['error']+1
continue
except BaseException as e:
errorfile.append([file, str(e)])
status['error']=status['error']+1
continue
else:
not_csv.append(file)
status['not_csv'] = status['not_csv']+1
counter += 1
file_end = perf_counter()
for_file_times.append(file_end-file_start)
status['filetime']=file_end-file_start
status['time_so_far']=file_end-main_start
for head in heads:
if isinstance(head, datetime):
print ('datetime: {}'.format(head))
head = head.isoformat()
print ('isoformat: {}'.format(head))
#sleep(10)
json.dumps(head, indent=2)
"""
STATUS PRINT
"""
print (
'--------------------------------------\n\
root : "{root}"\n\
filename : "{filename}"\n\
succes: {success}\terror: {error}\tnot_csv: {not_csv}\n\
fileheads#: {fileheads} totalheads: {totalheads}\n\
file#: {filenumber} totalfile#: {filecount} {percent:.2f}%\n\
filetime: {filetime} totaltime: {time_so_far} est left: {esttime}\
\ntest: time/filenumber: {time_div_filenum}\
'.format(
root=unidecode(str(status['root'])),
filename=unidecode(str(status['filename'])),
filenumber=status['filenumber'],
success=status['success'],
error=status['error'],
not_csv=status['not_csv'],
fileheads=status['fileheads'],
totalheads=len(heads),
filecount=count_files,
percent=float(status['filenumber'])/float(count_files)*100,
filetime=unidecode(str(timedelta(seconds=status['filetime']))),
time_so_far=unidecode(str(timedelta(seconds=status['time_so_far']))),
esttime=unidecode( str(timedelta(seconds = (status['time_so_far']/status['filenumber']*count_files)))),
time_div_filenum=unidecode(str(timedelta(seconds = (status['time_so_far']/status['filenumber']))))
) )
print ("status['root']\t{}".format(type(status['root'])))
print ("status['filename']\t{}".format(type(status['filename'])))
print ("status['filenumber']\t{}".format(type(status['filenumber'])))
print ("status['success']\t{}".format(type(status['success'])))
print ("status['error']\t{}".format(type(status['error'])))
print ("status['not_csv']\t{}".format(type(status['not_csv'])))
print ("status['fileheads']\t{}".format(type(status['fileheads'])))
print ("len(heads)\t{}".format(type(len(heads))))
print ("count_files\t{}".format(type(count_files)))
print ("percent\t{}".format(type(float(status['filenumber'])/float(count_files)*100)))
#print ("filetime\t{}".format(type(str(timedelta(seconds=status['filetime'])))))
#print ("time_so_far\t{}".format(type(str(timedelta(seconds=status['time_so_far'])))))
#print ("esttime\t{}".format(type(unidecode( str(timedelta(seconds = (status['time_so_far']/status['filenumber']*count_files)))))))
#print ("time_div_filenum\t{}".format(type(unidecode(str(timedelta(seconds = (status['time_so_far']/status['filenumber'])))))))
sleep(10)
"""
FOR LOOP ER FERDIG, FINISHING SCRIPT
"""
"""
print heads, Counter heads, print json
"""
print( 'total time = {} seconds'.format(perf_counter()-main_start) )
print( 'total time = {} mins'.format( (perf_counter()-main_start)/60 ) )
print('finishing:')
# HEADS TO FILE
for file_head in heads_and_path:
if isinstance(file_head[1], datetime):
serial = file_head[1].isoformat()
print ('serial: {}'.format(serial))
sleep(5)
print('datehead: \n\t{}\n\t{}'.format(head[0], head[1]))
for head in heads:
if isinstance(head, datetime):
print ('datetime: {}'.format(head))
head = head.isoformat()
print ('isoformat: {}'.format(head))
print('starting writing heads to file')
with open('heads.json', 'w') as outfile:
json.dump(heads, outfile, indent=4)
#print ('heads: {}'.format(unidecode(heads)))
print('finished writing heads to file')
# NOT CSV TO FILE
with open('not_csvxls.json', 'w') as outfile:
json.dump(not_csv, outfile, indent=4)
# ERRORS TO FILE
with open('errorfile.json', 'w') as outfile:
json.dump(errorfile, outfile, indent=4)
# HEADS_COUNTER TO FILE
print('starting writing heads_counter to file')
c = Counter(unidecode(heads)).most_common()
with open('heads_counter.json', 'w') as outfile:
json.dump(c, outfile, indent=4)
print ('json: \n\t{}'.format(unidecode(json.dumps(c))))
print('finished writing heads_counter to file')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment