Last active
December 17, 2016 15:32
-
-
Save belsander/df134e4464d75c78c7b1be65e3556595 to your computer and use it in GitHub Desktop.
Convert XLS files from http://football-data.co.uk to a more workable CSV format
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
"""football-data.co.uk_convert.py: Convert xslx files to one or more workable csv file(s) | |
Copyright (C) 2016 Sander Bel | |
This program is free software: you can redistribute it and/or modify it under the terms of the | |
GNU General Public License as published by the Free Software Foundation, either version 3 of the | |
License, or (at your option) any later version. | |
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without | |
even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
General Public License for more details. | |
You should have received a copy of the GNU General Public License along with this program. If not, | |
see <http://www.gnu.org/licenses/>. | |
""" | |
__author__ = "Sander Bel" | |
__copyright__ = "Copyright (C) 2016 Sander Bel" | |
__credits__ = ["Sander Bel"] | |
__licence__ = "GPLv3" | |
__version__ = "1.0.0" | |
__maintainer__ = "Sander Bel" | |
__email__ = "[email protected]" | |
__status__ = "Production" | |
from argparse import ArgumentParser, RawTextHelpFormatter | |
from glob import glob | |
from os.path import splitext | |
from xlrd import open_workbook | |
from csv import writer, QUOTE_NONE | |
from zipfile import ZipFile | |
# GLOBALS | |
ALL_COMBINED = False | |
SORT_HEADERS = False | |
EXTRACT_ZIP = False | |
TRANSPOSE = False | |
EXT_CSV = ".csv" | |
EXT_XSL = ".xls" | |
EXT_ZIP = ".zip" | |
CONVERT_CSV_EXT = "_converted%s" % (EXT_CSV) | |
COMMA_SIGN = "," | |
DOT_SIGN = "." | |
CSV_DELIMITER = COMMA_SIGN | |
OUTPUT_DECIMAL_SIGN = DOT_SIGN | |
NULL_VALUE = "0" | |
ENCODING = "UTF-8" | |
HEADERS = ["Div", "Date", "HomeTeam", "AwayTeam", "FTR", "B365A", "B365D", "B365H", "BSA", "BSD", "BSH", \ | |
"BWA", "BWD", "BWH", "GBA", "GBD", "GBH", "IWA", "IWD", "IWH", "LBA", "LBD", "LBH", "PSA", "PSD", \ | |
"PSH", "SBA", "SBD", "SBH", "SJA", "SJD", "SJH", "SOA", "SOD", "SOH", "SYA", "SYD", "SYH", "VCA", \ | |
"VCD", "VCH", "WHA", "WHD", "WHH"] | |
arg_parser = ArgumentParser(prog=__file__, description="Convert xls files to csv.", epilog="%s Copyright (C) 2016 Sander Bel\nThis program comes with ABSOLUTELY NO WARRANTY; for details type '%s --license'.\nThis is free software, and you are welcome to redistribute it under certain conditions; type '%s --license' for details." % ((__file__,) * 3), formatter_class=RawTextHelpFormatter) | |
arg_parser.add_argument("-l", "--license", action="store_true", help="show license information and exit") | |
arg_parser.add_argument("-v", "--version", action="version", help="show version information and exit", version="%(prog)s v" + __version__) | |
arg_parser.add_argument("-a", "--all_combined", default=ALL_COMBINED, help="combine all the xls files into one csv file, instead of a seperate csv output file for each xls input file") | |
arg_parser.add_argument("-s", "--sort_headers", default=SORT_HEADERS, help="sort headers no longer based upon bookmaker, but based upon results (what is the name of A,D,H?)") | |
arg_parser.add_argument("-t", "--transpose", default=TRANSPOSE, help="transpose columns to rows and visa versa") | |
arg_parser.add_argument("-e", "--extract_zip", default=EXTRACT_ZIP, help="before importing the xls files, extract all zip files within the current directory (not including subdirectories)") | |
arg_parser.add_argument("-c", "--csv_delimiter", default=CSV_DELIMITER, help="delimiter used for the output csv file(s) (default: '%s')" % (CSV_DELIMITER)) | |
arg_parser.add_argument("-o", "--output_decimal_sign", default=OUTPUT_DECIMAL_SIGN, help="decimal ('%s' or '%s') sign for the output csv file(s) (default: '%s')" % (COMMA_SIGN, DOT_SIGN, OUTPUT_DECIMAL_SIGN)) | |
arguments = arg_parser.parse_args() | |
if arguments.license: | |
print("For more information regarding the %s license, see <https://blog.b-e-l.be/gnu-gplv3-license>" % (__licence__)) | |
exit(0) | |
# Parse arguments and set global variables accordingly | |
global_vars = list(i for i in globals().iterkeys() if all(w.isupper() for w in i if w.isalpha())) | |
for global_var in global_vars: | |
try: | |
exec("if arguments.%s: %s = arguments.%s" % (global_var.lower(), global_var, global_var.lower())) | |
except AttributeError: | |
pass | |
# ACTUAL SCRIPT -- LEAVE ALONE | |
if OUTPUT_DECIMAL_SIGN == ".": | |
INPUT_DECIMAL_SIGN = "," | |
elif OUTPUT_DECIMAL_SIGN == ",": | |
INPUT_DECIMAL_SIGN = "." | |
else: | |
arg_parser.print_help() | |
exit(1) | |
if SORT_HEADERS: | |
EXTRA, A, D, H = list(), list(), list(), list(); | |
for header in HEADERS: | |
if header.endswith("A"): | |
A.append(header) | |
elif header.endswith("D"): | |
D.append(header) | |
elif header.endswith("H"): | |
H.append(header) | |
else: | |
EXTRA.append(header) | |
A.sort() | |
D.sort() | |
H.sort() | |
HEADERS = EXTRA + A + D + H | |
def generate_dict_out(): | |
return_dict = dict() | |
for header in HEADERS: | |
return_dict[header] = list() | |
return return_dict | |
def set_decimal_seperator(row): | |
return list(str(row_value).replace(INPUT_DECIMAL_SIGN, OUTPUT_DECIMAL_SIGN) if isinstance(row_value, float) else str(row_value.encode(ENCODING)) for row_value in row) | |
def dump_to_csv(csv_file_name, dict_data): | |
with open(csv_file_name, "w") as csv_out: | |
for header in HEADERS: | |
csv_writer = writer(csv_out, delimiter=CSV_DELIMITER, quoting=QUOTE_NONE) | |
csv_writer.writerow([header] + dict_data[header]) | |
dict_data.clear() | |
print("ALL EXISING FILES WILL BE AUTOMATICALLY OVERWRITTEN! Are you sure you want to continue?") | |
print("START") | |
if EXTRACT_ZIP: | |
for zip_file in glob("*%s" % (EXT_ZIP)): | |
with ZipFile(zip_file, "r") as zip_file_fd: | |
print("Extracting files out of archive (%s): %s" % (zip_file, zip_file_fd.namelist())) | |
zip_file_fd.extractall() | |
dict_out = generate_dict_out() | |
xls_files = glob("*%s" % (EXT_XSL)) | |
xls_files.sort() | |
for xls_file in xls_files: | |
print("Opening xls file: %s" % (xls_file)) | |
if not(ALL_COMBINED) and not(dict_out): | |
dict_out = generate_dict_out() | |
xls_workbook = open_workbook(xls_file) | |
for sheet_name in xls_workbook.sheet_names(): | |
xls_workbook_sheet = xls_workbook.sheet_by_name(sheet_name) | |
dict_translate = dict() | |
list_not_present = list() | |
first_row = True | |
for row_num in xrange(xls_workbook_sheet.nrows): | |
row_values = set_decimal_seperator(xls_workbook_sheet.row_values(row_num)) | |
if first_row: | |
first_row = False | |
for header in HEADERS: | |
if header in row_values: | |
dict_translate[row_values.index(header)] = header | |
else: | |
list_not_present.append(header) | |
else: | |
for value_column in dict_translate.keys(): | |
row_value = row_values[value_column] | |
if row_value.strip() == "": | |
row_value = NULL_VALUE | |
dict_out[dict_translate[value_column]].append(row_value) | |
for header_not_present in list_not_present: | |
dict_out[header_not_present].append(NULL_VALUE) | |
dict_translate.clear() | |
del list_not_present[:] | |
if not(ALL_COMBINED): | |
xls_file_name, xls_file_ext = splitext(xls_file) | |
print("Writing csv data to: %s%s" % (xls_file_name, CONVERT_CSV_EXT)) | |
dump_to_csv("%s%s" % (xls_file_name, CONVERT_CSV_EXT), dict_out) | |
if ALL_COMBINED: | |
print("Writing all csv data to combined file: %s%s" % ("all-combined", CONVERT_CSV_EXT)) | |
dump_to_csv("%s%s" % ("all-combined", CONVERT_CSV_EXT), dict_out) | |
print("DONE") | |
exit(0) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment