Created
January 6, 2020 01:23
-
-
Save hvgab/119713b096da2f3a0a5f4b8a228339e6 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Comment from today, script from years ago. | |
Not sure if I ever fixed this. | |
This script was suppose to loop through a shit-ton of excel like files; | |
Get all the headers, make a list of all the headers it encountered, and then report back with how many times a header was encountered. | |
Everything gets written to files at the end. | |
- Heads with counter | |
- Errors | |
- Files it could not read (not excel/csv | |
I think it would silently fail on the different types of csv-dialects it could encounter. | |
""" | |
""" Get headers from all files """ | |
import fnmatch | |
import os | |
import shutil | |
from collections import Counter | |
import pandas as pd | |
import xlrd | |
import json | |
from unidecode import unidecode | |
from time import sleep | |
from time import perf_counter | |
from datetime import timedelta | |
from datetime import datetime | |
rootPath = 'X:\\' | |
destDir = './headers/' | |
heads = [] | |
heads_and_path =[] | |
not_csv = [] | |
errorfile = [] | |
counter = 0 | |
for_file_times = [] | |
#c = {} | |
status = { | |
'root':'', | |
'filename':'', | |
'filenumber':0, | |
'filenumber_total':7500, | |
'filetime':0.0, | |
'filestatus':'', | |
'success':0, | |
'error':0, | |
'not_csv':0, | |
'time_so_far':0.0, | |
'fileheads':0 | |
} | |
def read_file(file): | |
if file_extension == ".xlsx" or file_extension == '.xls': | |
df = pd.read_excel(file) | |
status['success'] = status['success'] + 1 | |
return df | |
elif file_extension == ".csv": | |
df = pd.read_csv(file, encoding='latin1', sep=';') | |
status['success'] = status['success'] + 1 | |
return df | |
elif file_extension == ".txt": | |
df = pd.read_table(file, encoding='latin1', sep='\t') | |
status['success'] = status['success'] + 1 | |
return df | |
else: | |
print ("Error?! File should be excel or csv: \n\t {}".format(file).encode('utf-8', 'replace')) | |
sleep(10) | |
def get_heads(df): | |
getheads = [] | |
for head in df.columns: | |
getheads.append( head ) | |
status['fileheads']=len(getheads) | |
return getheads | |
def write_to_file(contents): | |
f = open('heads.txt', 'w') | |
f.write( contents ) | |
f.close() | |
def count_files(dir): | |
filecount_start = perf_counter() | |
filecount = 0 | |
for root, dirnames, filenames in os.walk(rootPath): | |
for file in filenames: | |
filecount += 1 | |
print ('count files in {}'.format(perf_counter()-filecount_start)) | |
print (filecount) | |
sleep(10) | |
return filecount | |
""" | |
START OF SCRIPT | |
""" | |
count_files = count_files(rootPath) | |
main_start = perf_counter() | |
for root, dirnames, filenames in os.walk(rootPath): | |
status['root']=root | |
for file in filenames: | |
file_start = perf_counter() | |
status['filename']=file | |
status['filenumber']= status['filenumber']+1 | |
filename, file_extension = os.path.splitext(file) | |
pathname = os.path.join(root, file) | |
if file_extension == ".xlsx" \ | |
or file_extension == '.xls' \ | |
or file_extension == '.csv': | |
try: | |
file_df = read_file(pathname) | |
file_df_heads = get_heads(file_df) | |
heads.extend(file_df_heads) | |
heads_and_path.append(['file', 'file_df_heads']) | |
except xlrd.biffh.XLRDError as e: | |
errorfile.append([file, str(e)]) | |
status['error']=status['error']+1 | |
continue | |
except UnicodeEncodeError as e: | |
errorfile.append([file, str(e)]) | |
status['error']=status['error']+1 | |
continue | |
except Exception as e: | |
errorfile.append([file, str(e)]) | |
status['error']=status['error']+1 | |
continue | |
except BaseException as e: | |
errorfile.append([file, str(e)]) | |
status['error']=status['error']+1 | |
continue | |
else: | |
not_csv.append(file) | |
status['not_csv'] = status['not_csv']+1 | |
counter += 1 | |
file_end = perf_counter() | |
for_file_times.append(file_end-file_start) | |
status['filetime']=file_end-file_start | |
status['time_so_far']=file_end-main_start | |
for head in heads: | |
if isinstance(head, datetime): | |
print ('datetime: {}'.format(head)) | |
head = head.isoformat() | |
print ('isoformat: {}'.format(head)) | |
#sleep(10) | |
json.dumps(head, indent=2) | |
""" | |
STATUS PRINT | |
""" | |
print ( | |
'--------------------------------------\n\ | |
root : "{root}"\n\ | |
filename : "{filename}"\n\ | |
succes: {success}\terror: {error}\tnot_csv: {not_csv}\n\ | |
fileheads#: {fileheads} totalheads: {totalheads}\n\ | |
file#: {filenumber} totalfile#: {filecount} {percent:.2f}%\n\ | |
filetime: {filetime} totaltime: {time_so_far} est left: {esttime}\ | |
\ntest: time/filenumber: {time_div_filenum}\ | |
'.format( | |
root=unidecode(str(status['root'])), | |
filename=unidecode(str(status['filename'])), | |
filenumber=status['filenumber'], | |
success=status['success'], | |
error=status['error'], | |
not_csv=status['not_csv'], | |
fileheads=status['fileheads'], | |
totalheads=len(heads), | |
filecount=count_files, | |
percent=float(status['filenumber'])/float(count_files)*100, | |
filetime=unidecode(str(timedelta(seconds=status['filetime']))), | |
time_so_far=unidecode(str(timedelta(seconds=status['time_so_far']))), | |
esttime=unidecode( str(timedelta(seconds = (status['time_so_far']/status['filenumber']*count_files)))), | |
time_div_filenum=unidecode(str(timedelta(seconds = (status['time_so_far']/status['filenumber'])))) | |
) ) | |
print ("status['root']\t{}".format(type(status['root']))) | |
print ("status['filename']\t{}".format(type(status['filename']))) | |
print ("status['filenumber']\t{}".format(type(status['filenumber']))) | |
print ("status['success']\t{}".format(type(status['success']))) | |
print ("status['error']\t{}".format(type(status['error']))) | |
print ("status['not_csv']\t{}".format(type(status['not_csv']))) | |
print ("status['fileheads']\t{}".format(type(status['fileheads']))) | |
print ("len(heads)\t{}".format(type(len(heads)))) | |
print ("count_files\t{}".format(type(count_files))) | |
print ("percent\t{}".format(type(float(status['filenumber'])/float(count_files)*100))) | |
#print ("filetime\t{}".format(type(str(timedelta(seconds=status['filetime']))))) | |
#print ("time_so_far\t{}".format(type(str(timedelta(seconds=status['time_so_far']))))) | |
#print ("esttime\t{}".format(type(unidecode( str(timedelta(seconds = (status['time_so_far']/status['filenumber']*count_files))))))) | |
#print ("time_div_filenum\t{}".format(type(unidecode(str(timedelta(seconds = (status['time_so_far']/status['filenumber']))))))) | |
sleep(10) | |
""" | |
FOR LOOP ER FERDIG, FINISHING SCRIPT | |
""" | |
""" | |
print heads, Counter heads, print json | |
""" | |
print( 'total time = {} seconds'.format(perf_counter()-main_start) ) | |
print( 'total time = {} mins'.format( (perf_counter()-main_start)/60 ) ) | |
print('finishing:') | |
# HEADS TO FILE | |
for file_head in heads_and_path: | |
if isinstance(file_head[1], datetime): | |
serial = file_head[1].isoformat() | |
print ('serial: {}'.format(serial)) | |
sleep(5) | |
print('datehead: \n\t{}\n\t{}'.format(head[0], head[1])) | |
for head in heads: | |
if isinstance(head, datetime): | |
print ('datetime: {}'.format(head)) | |
head = head.isoformat() | |
print ('isoformat: {}'.format(head)) | |
print('starting writing heads to file') | |
with open('heads.json', 'w') as outfile: | |
json.dump(heads, outfile, indent=4) | |
#print ('heads: {}'.format(unidecode(heads))) | |
print('finished writing heads to file') | |
# NOT CSV TO FILE | |
with open('not_csvxls.json', 'w') as outfile: | |
json.dump(not_csv, outfile, indent=4) | |
# ERRORS TO FILE | |
with open('errorfile.json', 'w') as outfile: | |
json.dump(errorfile, outfile, indent=4) | |
# HEADS_COUNTER TO FILE | |
print('starting writing heads_counter to file') | |
c = Counter(unidecode(heads)).most_common() | |
with open('heads_counter.json', 'w') as outfile: | |
json.dump(c, outfile, indent=4) | |
print ('json: \n\t{}'.format(unidecode(json.dumps(c)))) | |
print('finished writing heads_counter to file') | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment