Skip to content

Instantly share code, notes, and snippets.

@belsander
Last active December 17, 2016 15:32
Show Gist options
  • Save belsander/df134e4464d75c78c7b1be65e3556595 to your computer and use it in GitHub Desktop.
Save belsander/df134e4464d75c78c7b1be65e3556595 to your computer and use it in GitHub Desktop.
Convert XLS files from http://football-data.co.uk to a more workable CSV format
#!/usr/bin/python
"""football-data.co.uk_convert.py: Convert xslx files to one or more workable csv file(s)
Copyright (C) 2016 Sander Bel
This program is free software: you can redistribute it and/or modify it under the terms of the
GNU General Public License as published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License along with this program. If not,
see <http://www.gnu.org/licenses/>.
"""
__author__ = "Sander Bel"
__copyright__ = "Copyright (C) 2016 Sander Bel"
__credits__ = ["Sander Bel"]
__licence__ = "GPLv3"
__version__ = "1.0.0"
__maintainer__ = "Sander Bel"
__email__ = "[email protected]"
__status__ = "Production"
from argparse import ArgumentParser, RawTextHelpFormatter
from glob import glob
from os.path import splitext
from xlrd import open_workbook
from csv import writer, QUOTE_NONE
from zipfile import ZipFile
# GLOBALS
ALL_COMBINED = False
SORT_HEADERS = False
EXTRACT_ZIP = False
TRANSPOSE = False
EXT_CSV = ".csv"
EXT_XSL = ".xls"
EXT_ZIP = ".zip"
CONVERT_CSV_EXT = "_converted%s" % (EXT_CSV)
COMMA_SIGN = ","
DOT_SIGN = "."
CSV_DELIMITER = COMMA_SIGN
OUTPUT_DECIMAL_SIGN = DOT_SIGN
NULL_VALUE = "0"
ENCODING = "UTF-8"
HEADERS = ["Div", "Date", "HomeTeam", "AwayTeam", "FTR", "B365A", "B365D", "B365H", "BSA", "BSD", "BSH", \
"BWA", "BWD", "BWH", "GBA", "GBD", "GBH", "IWA", "IWD", "IWH", "LBA", "LBD", "LBH", "PSA", "PSD", \
"PSH", "SBA", "SBD", "SBH", "SJA", "SJD", "SJH", "SOA", "SOD", "SOH", "SYA", "SYD", "SYH", "VCA", \
"VCD", "VCH", "WHA", "WHD", "WHH"]
arg_parser = ArgumentParser(prog=__file__, description="Convert xls files to csv.", epilog="%s Copyright (C) 2016 Sander Bel\nThis program comes with ABSOLUTELY NO WARRANTY; for details type '%s --license'.\nThis is free software, and you are welcome to redistribute it under certain conditions; type '%s --license' for details." % ((__file__,) * 3), formatter_class=RawTextHelpFormatter)
arg_parser.add_argument("-l", "--license", action="store_true", help="show license information and exit")
arg_parser.add_argument("-v", "--version", action="version", help="show version information and exit", version="%(prog)s v" + __version__)
arg_parser.add_argument("-a", "--all_combined", default=ALL_COMBINED, help="combine all the xls files into one csv file, instead of a seperate csv output file for each xls input file")
arg_parser.add_argument("-s", "--sort_headers", default=SORT_HEADERS, help="sort headers no longer based upon bookmaker, but based upon results (what is the name of A,D,H?)")
arg_parser.add_argument("-t", "--transpose", default=TRANSPOSE, help="transpose columns to rows and visa versa")
arg_parser.add_argument("-e", "--extract_zip", default=EXTRACT_ZIP, help="before importing the xls files, extract all zip files within the current directory (not including subdirectories)")
arg_parser.add_argument("-c", "--csv_delimiter", default=CSV_DELIMITER, help="delimiter used for the output csv file(s) (default: '%s')" % (CSV_DELIMITER))
arg_parser.add_argument("-o", "--output_decimal_sign", default=OUTPUT_DECIMAL_SIGN, help="decimal ('%s' or '%s') sign for the output csv file(s) (default: '%s')" % (COMMA_SIGN, DOT_SIGN, OUTPUT_DECIMAL_SIGN))
arguments = arg_parser.parse_args()
if arguments.license:
print("For more information regarding the %s license, see <https://blog.b-e-l.be/gnu-gplv3-license>" % (__licence__))
exit(0)
# Parse arguments and set global variables accordingly
global_vars = list(i for i in globals().iterkeys() if all(w.isupper() for w in i if w.isalpha()))
for global_var in global_vars:
try:
exec("if arguments.%s: %s = arguments.%s" % (global_var.lower(), global_var, global_var.lower()))
except AttributeError:
pass
# ACTUAL SCRIPT -- LEAVE ALONE
if OUTPUT_DECIMAL_SIGN == ".":
INPUT_DECIMAL_SIGN = ","
elif OUTPUT_DECIMAL_SIGN == ",":
INPUT_DECIMAL_SIGN = "."
else:
arg_parser.print_help()
exit(1)
if SORT_HEADERS:
EXTRA, A, D, H = list(), list(), list(), list();
for header in HEADERS:
if header.endswith("A"):
A.append(header)
elif header.endswith("D"):
D.append(header)
elif header.endswith("H"):
H.append(header)
else:
EXTRA.append(header)
A.sort()
D.sort()
H.sort()
HEADERS = EXTRA + A + D + H
def generate_dict_out():
return_dict = dict()
for header in HEADERS:
return_dict[header] = list()
return return_dict
def set_decimal_seperator(row):
return list(str(row_value).replace(INPUT_DECIMAL_SIGN, OUTPUT_DECIMAL_SIGN) if isinstance(row_value, float) else str(row_value.encode(ENCODING)) for row_value in row)
def dump_to_csv(csv_file_name, dict_data):
with open(csv_file_name, "w") as csv_out:
for header in HEADERS:
csv_writer = writer(csv_out, delimiter=CSV_DELIMITER, quoting=QUOTE_NONE)
csv_writer.writerow([header] + dict_data[header])
dict_data.clear()
print("ALL EXISING FILES WILL BE AUTOMATICALLY OVERWRITTEN! Are you sure you want to continue?")
print("START")
if EXTRACT_ZIP:
for zip_file in glob("*%s" % (EXT_ZIP)):
with ZipFile(zip_file, "r") as zip_file_fd:
print("Extracting files out of archive (%s): %s" % (zip_file, zip_file_fd.namelist()))
zip_file_fd.extractall()
dict_out = generate_dict_out()
xls_files = glob("*%s" % (EXT_XSL))
xls_files.sort()
for xls_file in xls_files:
print("Opening xls file: %s" % (xls_file))
if not(ALL_COMBINED) and not(dict_out):
dict_out = generate_dict_out()
xls_workbook = open_workbook(xls_file)
for sheet_name in xls_workbook.sheet_names():
xls_workbook_sheet = xls_workbook.sheet_by_name(sheet_name)
dict_translate = dict()
list_not_present = list()
first_row = True
for row_num in xrange(xls_workbook_sheet.nrows):
row_values = set_decimal_seperator(xls_workbook_sheet.row_values(row_num))
if first_row:
first_row = False
for header in HEADERS:
if header in row_values:
dict_translate[row_values.index(header)] = header
else:
list_not_present.append(header)
else:
for value_column in dict_translate.keys():
row_value = row_values[value_column]
if row_value.strip() == "":
row_value = NULL_VALUE
dict_out[dict_translate[value_column]].append(row_value)
for header_not_present in list_not_present:
dict_out[header_not_present].append(NULL_VALUE)
dict_translate.clear()
del list_not_present[:]
if not(ALL_COMBINED):
xls_file_name, xls_file_ext = splitext(xls_file)
print("Writing csv data to: %s%s" % (xls_file_name, CONVERT_CSV_EXT))
dump_to_csv("%s%s" % (xls_file_name, CONVERT_CSV_EXT), dict_out)
if ALL_COMBINED:
print("Writing all csv data to combined file: %s%s" % ("all-combined", CONVERT_CSV_EXT))
dump_to_csv("%s%s" % ("all-combined", CONVERT_CSV_EXT), dict_out)
print("DONE")
exit(0)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment