Last active
November 9, 2018 22:26
-
-
Save robinhowlett/688a6265d6d4dc62b1dfe65441affa8a to your computer and use it in GitHub Desktop.
Java code I used to parse the BCBC's PDF of all bets placed during the tournament and convert the data into a tsv/spreadsheet. Takes about 12 seconds to run. Source: https://twitter.com/robinhowlett/status/1060427312603119618
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// didn't bother with the import statements
public class U_BCBCAnalysis extends PDFTextStripper { | |
// regex to match bets e.g. EX, TRI, DD, WIN | |
static final Pattern BET_TYPE = Pattern.compile("^([A-Z-])+$"); | |
static FileOutputStream fileOutputStream; | |
// keeps track of where lines of text related to each player start and end | |
static List<BCBCEntry> players = new ArrayList<>(); | |
// keeps track of the lines of text for some context | |
static List<String> playerText = new ArrayList<>(); | |
static int index = 0; | |
static int start = 0; | |
static int end = 0; | |
// handle the first player edge case | |
static boolean firstPlayerFound = false; | |
/**
 * Creates a stripper with the superclass defaults; main() builds a fresh one for every page.
 *
 * @throws IOException declared because the superclass constructor is invoked here
 */
public U_BCBCAnalysis() throws IOException {
    super();
}
public static void main(String[] args) throws IOException { | |
// apache pdfbox | |
PDDocument document = null; | |
try { | |
// load, split, and read the text of the PDF, aggregating the pages' text | |
document = PDDocument.load( | |
Paths.get("/Users/rhowlett/Downloads/2018 BCBC Final.pdf").toFile()); | |
// i'm trying to remember why i bother to split, but since it all works, moving on... | |
Splitter splitter = new Splitter(); | |
List<PDDocument> raceCharts = splitter.split(document); | |
// for each pdf page, parse its text | |
for (PDDocument raceChart : raceCharts) { | |
PDFTextStripper stripper = new U_BCBCAnalysis(); | |
stripper.setSortByPosition(true); | |
try { | |
Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream(), | |
StandardCharsets.UTF_8); | |
// this will end up calling #writeString() below with the text of each line | |
// each line of text can then be examined in the context of the text already | |
// parsed, so can figure out if the text is related to the player or the bet etc | |
// the line of text itself will be pulled apart to extract the actual bets | |
stripper.writeText(raceChart, dummy); | |
// at this point, all the players' start-end line indexes have been saved | |
} finally { | |
raceChart.close(); | |
} | |
} | |
} catch (Exception e) { | |
e.printStackTrace(); // meh | |
} finally { | |
if (document != null) { | |
document.close(); | |
} | |
} | |
try { | |
// start writing a csv file and provide some headers | |
// use tab-seprated as commas as commonly used in the data e.g. bet selections | |
fileOutputStream = new FileOutputStream("/Users/rhowlett/tmp/bcbc.tsv", false); | |
fileOutputStream.write(("last\tfirst\tuuid\tadw\tdate\trace\ttype\tbets\trefunds" + | |
"\twinnings\trunners").getBytes(UTF_8)); | |
// for each BCBC entry, parse and write their bets as rows into the csv file | |
for (int i = 0; i < players.size(); i++) { | |
BCBCEntry player = players.get(i); | |
writeCsvRows(i, player, playerText.subList(player.start, player.end)); | |
} | |
} finally { | |
IOUtils.closeQuietly(fileOutputStream); | |
} | |
} | |
// triggered during stripper.writeText() execution above | |
// and called for each line of text processed when parsing the PDF | |
@Override | |
protected void writeString(String string, List<TextPosition> textPositions) throws IOException { | |
playerText.add(string); | |
// if Friday's date was found, then we are on to a new player (unless this is the first | |
// player found) | |
boolean lastPlayerFound = (index == 85802); // lazy solution for last page | |
if (string.equals("Date: 20181102") || lastPlayerFound) { | |
if (firstPlayerFound) { | |
end = (lastPlayerFound ? index : index - 5); | |
players.add(new BCBCEntry(playerText.get(start + 1).trim(), // first | |
playerText.get(start).trim(), // last | |
playerText.get(start + 4).trim(), // full name | |
playerText.get(start + 2).trim(), // uuid | |
playerText.get(start + 3).trim(), // adw | |
start, end)); // mark where this players lines of text start and end | |
start = end; | |
} | |
firstPlayerFound = true; | |
} | |
index++; | |
} | |
private static void writeCsvRows(int playerIndex, BCBCEntry player, List<String> strings) | |
throws IOException { | |
List<Bet> bets = new ArrayList<>(); | |
boolean day2Found = false; | |
boolean typeFound = false; | |
boolean betFound = false; | |
int currentRace = 0; | |
Bet bet = new Bet(playerIndex + 1); // make it 1-based to also rep. fin position | |
// plan: | |
// separate days | |
// separate races | |
// separate bets | |
// bother with penalty? | |
for (String string : strings) { | |
// if it's Saturday's date, then Friday bets must be done | |
if (string.equals("Date: 20181103")) { | |
day2Found = true; | |
continue; | |
} else if (string.startsWith("Race: ")) { | |
// track the race these bets were for | |
currentRace = Integer.parseInt(string.substring(string.lastIndexOf(' ') + 1)); | |
continue; | |
} | |
// only bet types are all in caps around the bet data - identify them | |
Matcher matcher = BET_TYPE.matcher(string); | |
if (matcher.find()) { | |
// check we haven't messed up - markers should already have been found | |
if (typeFound && betFound) { | |
bet.first = player.first; | |
bet.last = player.last; | |
bet.uuid = player.uuid; | |
bet.adw = player.adw; | |
bet.date = (day2Found ? "20181103" : "20181102"); | |
bet.race = currentRace; | |
bets.add(bet); | |
// start clean for the next bet | |
bet = new Bet(playerIndex + 1); | |
betFound = false; | |
} | |
// learned from analysis via Excel | |
// the BCBC used inconsistent labels for the same bet types (hand entered?) | |
// this standardizes them | |
if (string.trim().equals("DB") || string.trim().equals("DBL") || | |
string.trim().equals("DOUBLE")) { | |
string = "DD"; | |
} else if (string.trim().equals("EXA") || string.trim().equals("EXACTA")) { | |
string = "EX"; | |
} else if (string.trim().equals("PLACE") || string.trim().equals("PL")) { | |
string = "PLC"; | |
} else if (string.trim().equals("TRIFECTA") || string.trim().equals("TR") || | |
string.trim().equals("SF")) { // really odd? a superfecta? not allowed | |
string = "TRI"; | |
} else if (string.trim().equals("SH") || string.trim().equals("SHOW")) { | |
string = "SHW"; | |
} else if (string.trim().equals("WN")) { | |
string = "WIN"; | |
} | |
bet.type = string; | |
typeFound = true; | |
continue; | |
} | |
// this is messy | |
if (typeFound) { | |
if (string.split(" ").length == 3) { | |
bet.first = player.first; | |
bet.last = player.last; | |
bet.uuid = player.uuid; | |
bet.adw = player.adw; | |
bet.date = (day2Found ? "20181103" : "20181102"); | |
bet.race = currentRace; | |
bets.add(bet); | |
bet = new Bet(playerIndex + 1); | |
typeFound = false; | |
betFound = false; | |
} else { | |
// day 2 seemed to use the pipe character | instead of the traditional | |
// comma character , to separate different selections within the same race. | |
// standardizing on comma | |
// | |
// this part also handles when the bets details were so long they bled over | |
// to the next line | |
if (bet.all != null) { | |
bet.setAll(bet.all.concat(string.trim()).replaceAll("\\|", ",")); | |
betFound = true; | |
} else { | |
bet.setAll(string.trim().replaceAll("\\|", ",")); | |
betFound = true; | |
} | |
} | |
} | |
} | |
// for each bet, write a row to the csv file | |
for (Bet b : bets) { | |
String row = "\n" + b.last + "\t" + | |
b.first + "\t" + | |
b.uuid + "\t" + | |
b.adw + "\t" + | |
b.date + "\t" + | |
b.race + "\t" + | |
b.type + "\t" + | |
b.bets + "\t" + | |
b.refunds + "\t" + | |
b.winnings + "\t" + | |
b.runners; | |
fileOutputStream.write(row.getBytes(UTF_8)); | |
} | |
} | |
// pojo to store a single bet's details for this player - will be used as the row data | |
static class Bet { | |
// matches bets, refunds, winnings, and runners data that the pdfs spits out combined | |
// e.g. $2,000.00 $0.00 $50,420.00 2/5/11 | |
// can they be separated out using a regex | |
public static final Pattern ALL = | |
Pattern.compile("\\$([\\d,\\.]+)\\s\\$([\\d,\\.]+)\\s\\$([\\d,\\.]+)\\s(.+)"); | |
int finPos; | |
String first; | |
String last; | |
String uuid; | |
String adw; | |
String date; | |
int race; | |
String type; | |
String all; | |
Double bets; | |
Double refunds; | |
Double winnings; | |
String runners; | |
public Bet(int finPos) { | |
this.finPos = finPos; | |
} | |
// convert "all" temp placeholder into its respective components | |
public void setAll(String all) { | |
this.all = all; | |
Matcher matcher = ALL.matcher(all); | |
if (matcher.find()) { | |
bets = Double.valueOf(matcher.group(1).replaceAll(",", "")); | |
refunds = Double.valueOf(matcher.group(2).replaceAll(",", "")); | |
winnings = Double.valueOf(matcher.group(3).replaceAll(",", "")); | |
runners = matcher.group(4); | |
} | |
} | |
} | |
// temp object to boundary details and locations of lines of text relating to a player | |
class BCBCEntry { | |
String first; | |
String last; | |
String name; | |
String uuid; | |
String adw; | |
int start; | |
int end; | |
public BCBCEntry(String first, String last, String name, String uuid, String adw, | |
int start, int end) { | |
this.first = first; | |
this.last = last; | |
this.name = name; | |
this.uuid = uuid; | |
this.adw = adw; | |
this.start = start; | |
this.end = end; | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment