Skip to content

Instantly share code, notes, and snippets.

@robinhowlett
Last active November 9, 2018 22:26
Show Gist options
  • Save robinhowlett/688a6265d6d4dc62b1dfe65441affa8a to your computer and use it in GitHub Desktop.
Save robinhowlett/688a6265d6d4dc62b1dfe65441affa8a to your computer and use it in GitHub Desktop.
Java code I used to parse the BCBC's PDF of all bets placed during the tournament and convert the data into a tsv/spreadsheet. Takes about 12 seconds to run. Source: https://twitter.com/robinhowlett/status/1060427312603119618
// didn't bother with the import statements
public class U_BCBCAnalysis extends PDFTextStripper {
// Matches a line consisting solely of capital letters and hyphens. Such lines are
// treated as bet-type markers, e.g. EX, TRI, DD, WIN.
static final Pattern BET_TYPE = Pattern.compile("^([A-Z-])+$");
// Output stream for the TSV file: assigned in main() and written to by writeCsvRows().
static FileOutputStream fileOutputStream;
// NOTE: all parsing state below is static, so it is shared by every stripper instance
// (main() creates a new stripper for each PDF page).
// One entry per player, recording where that player's lines start and end in playerText.
static List<BCBCEntry> players = new ArrayList<>();
// Every line of text handed to writeString(), in arrival order.
static List<String> playerText = new ArrayList<>();
// Position in playerText of the line currently being processed by writeString().
static int index = 0;
// Position in playerText where the current (not yet completed) player's lines begin.
static int start = 0;
// Exclusive end position of the most recently completed player's lines.
static int end = 0;
// False until the first Friday date marker is seen; handles the first-player edge case,
// where there is no previous player to close off.
static boolean firstPlayerFound = false;
// Empty constructor; presumably declares IOException because the superclass
// constructor does - TODO confirm against the PDFBox version in use.
public U_BCBCAnalysis() throws IOException {
}
/**
 * Parses the BCBC bets PDF and writes one tab-separated row per bet.
 *
 * <p>The PDF is split into single-page documents and each page is run through a
 * {@code U_BCBCAnalysis} stripper, whose {@link #writeString} callback records every
 * line of text and the line range belonging to each player. Once parsing is finished,
 * each player's lines are converted to rows by {@code writeCsvRows}.
 *
 * <p>Failures while reading the PDF are printed and otherwise ignored, so the TSV is
 * still written with whatever players were collected before the failure.
 *
 * <p>NOTE: {@code writeString} detects the final player through a fixed line index, so
 * a PDF other than the original one may lose its last player.
 *
 * @param args optional; {@code args[0]} is the PDF to read and {@code args[1]} the TSV
 *             file to write. Either may be omitted, in which case the original
 *             hard-coded locations are used.
 * @throws IOException if closing the PDF or writing the TSV file fails
 */
public static void main(String[] args) throws IOException {
    // Paths used to be hard-coded; they remain the defaults so existing usage is unchanged.
    String pdfPath = (args != null && args.length > 0)
            ? args[0]
            : "/Users/rhowlett/Downloads/2018 BCBC Final.pdf";
    String tsvPath = (args != null && args.length > 1)
            ? args[1]
            : "/Users/rhowlett/tmp/bcbc.tsv";

    // apache pdfbox
    PDDocument document = null;
    try {
        // load, split, and read the text of the PDF, aggregating the pages' text
        document = PDDocument.load(Paths.get(pdfPath).toFile());
        Splitter splitter = new Splitter();
        List<PDDocument> raceCharts = splitter.split(document);

        int next = 0;
        try {
            // for each pdf page, parse its text
            while (next < raceCharts.size()) {
                PDDocument raceChart = raceCharts.get(next++);
                try {
                    PDFTextStripper stripper = new U_BCBCAnalysis();
                    stripper.setSortByPosition(true);
                    // The stripper's own output is discarded; the useful work happens in
                    // the writeString() callback, which is invoked for each line of text.
                    Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream(),
                            StandardCharsets.UTF_8);
                    stripper.writeText(raceChart, dummy);
                } finally {
                    raceChart.close();
                }
            }
        } finally {
            // Only does anything when a page failed part-way: the pages that were never
            // reached would otherwise stay open.
            for (; next < raceCharts.size(); next++) {
                IOUtils.closeQuietly(raceCharts.get(next));
            }
        }
        // at this point, all the players' start-end line indexes have been saved
    } catch (Exception e) {
        // Deliberately best-effort: report the problem and carry on with what was parsed.
        e.printStackTrace();
    } finally {
        if (document != null) {
            document.close();
        }
    }

    try {
        // Tab-separated rather than comma-separated, because commas are commonly used
        // inside the data itself (e.g. bet selections).
        fileOutputStream = new FileOutputStream(tsvPath, false);
        fileOutputStream.write(("last\tfirst\tuuid\tadw\tdate\trace\ttype\tbets\trefunds" +
                "\twinnings\trunners").getBytes(UTF_8));
        // for each BCBC entry, parse and write their bets as rows into the file
        for (int i = 0; i < players.size(); i++) {
            BCBCEntry player = players.get(i);
            writeCsvRows(i, player, playerText.subList(player.start, player.end));
        }
    } finally {
        IOUtils.closeQuietly(fileOutputStream);
    }
}
/**
 * Callback invoked by {@code writeText()} for each line of text found while parsing a
 * page. Records the line and, when a player boundary is reached, stores the previous
 * player's details together with the range of lines that belong to them.
 *
 * <p>A boundary is either Friday's date line (which follows a player's header lines) or
 * the hard-coded final line index.
 *
 * @param string        the text of the line
 * @param textPositions the positions of the line's characters (unused)
 * @throws IOException declared by the overridden method; not thrown here
 */
@Override
protected void writeString(String string, List<TextPosition> textPositions) throws IOException {
    playerText.add(string);

    // NOTE(review): 85802 looks like the index of the last line of the original PDF;
    // it will not hold for any other input.
    final boolean atFinalLine = (index == 85802);
    final boolean atBoundary = string.equals("Date: 20181102") || atFinalLine;

    if (atBoundary) {
        // The very first boundary has no previous player to close off.
        if (firstPlayerFound) {
            // A date marker comes 5 lines after the start of the next player's header,
            // so the previous player's lines stop 5 lines back.
            if (atFinalLine) {
                end = index;
            } else {
                end = index - 5;
            }

            // Header layout relative to start: last, first, uuid, adw, full name.
            String firstName = playerText.get(start + 1).trim();
            String lastName = playerText.get(start).trim();
            String fullName = playerText.get(start + 4).trim();
            String uuid = playerText.get(start + 2).trim();
            String adw = playerText.get(start + 3).trim();

            players.add(new BCBCEntry(firstName, lastName, fullName, uuid, adw, start, end));

            // The next player's lines begin where this one's ended.
            start = end;
        }
        firstPlayerFound = true;
    }

    index++;
}
/**
 * Extracts the bets from one player's lines of text and appends a tab-separated row for
 * each of them to {@code fileOutputStream}.
 *
 * <p>The lines are scanned in order. Date and race lines update the current context; a
 * line matching {@code BET_TYPE} starts a new bet; the lines that follow are the bet's
 * details, which may spill over several lines. A bet is finished when the next bet type
 * is seen, when a three-token line is reached, or when the lines run out. (Previously a
 * bet that was still pending when the lines ran out was silently dropped.)
 *
 * @param playerIndex zero-based position of the player in the PDF; stored 1-based on
 *                    each bet so that it also represents the finishing position
 * @param player      the player whose details are copied onto every row
 * @param strings     the player's lines of text
 * @throws IOException if writing to the output stream fails
 */
private static void writeCsvRows(int playerIndex, BCBCEntry player, List<String> strings)
        throws IOException {
    List<Bet> bets = new ArrayList<>();
    boolean day2Found = false;
    boolean typeFound = false;
    boolean betFound = false;
    int currentRace = 0;
    Bet bet = new Bet(playerIndex + 1); // make it 1-based to also rep. fin position

    for (String string : strings) {
        // if it's Saturday's date, then Friday bets must be done
        if (string.equals("Date: 20181103")) {
            day2Found = true;
            continue;
        }
        if (string.startsWith("Race: ")) {
            // track the race these bets were for
            currentRace = Integer.parseInt(string.substring(string.lastIndexOf(' ') + 1));
            continue;
        }

        // only bet types are all in caps around the bet data - identify them
        if (BET_TYPE.matcher(string).find()) {
            // a new type marker finishes the previous bet, provided it had any details
            if (typeFound && betFound) {
                bets.add(completeBet(bet, player, day2Found, currentRace));
                bet = new Bet(playerIndex + 1);
                betFound = false;
            }
            bet.type = normalizeBetType(string);
            typeFound = true;
            continue;
        }

        // anything else only matters once a bet type has been seen
        if (!typeFound) {
            continue;
        }

        if (string.split(" ").length == 3) {
            // A three-token line ends the current bet.
            // NOTE(review): presumably this is a totals line - confirm against the PDF.
            bets.add(completeBet(bet, player, day2Found, currentRace));
            bet = new Bet(playerIndex + 1);
            typeFound = false;
            betFound = false;
        } else {
            // Day 2 used the pipe character | instead of the traditional comma to
            // separate selections within the same race; standardize on the comma.
            String details = string.trim().replaceAll("\\|", ",");
            // Details that were too long bleed onto the next line, so keep appending.
            bet.setAll(bet.all != null ? bet.all.concat(details) : details);
            betFound = true;
        }
    }

    // Flush a bet that was still in progress when the lines ran out.
    if (typeFound && betFound) {
        bets.add(completeBet(bet, player, day2Found, currentRace));
    }

    // for each bet, write a row to the file
    for (Bet b : bets) {
        fileOutputStream.write(toRow(b).getBytes(UTF_8));
    }
}

/** Copies the player details and the current date/race context onto a finished bet. */
private static Bet completeBet(Bet bet, BCBCEntry player, boolean day2Found, int currentRace) {
    bet.first = player.first;
    bet.last = player.last;
    bet.uuid = player.uuid;
    bet.adw = player.adw;
    bet.date = (day2Found ? "20181103" : "20181102");
    bet.race = currentRace;
    return bet;
}

/**
 * Maps the inconsistent bet-type labels found in the PDF onto one label per bet type.
 * Labels that are not recognized are returned unchanged.
 */
private static String normalizeBetType(String label) {
    switch (label.trim()) {
        case "DB":
        case "DBL":
        case "DOUBLE":
            return "DD";
        case "EXA":
        case "EXACTA":
            return "EX";
        case "PLACE":
        case "PL":
            return "PLC";
        case "TRIFECTA":
        case "TR":
        case "SF": // really odd? a superfecta? not allowed
            return "TRI";
        case "SH":
        case "SHOW":
            return "SHW";
        case "WN":
            return "WIN";
        default:
            return label;
    }
}

/** Formats a bet as a single tab-separated row, preceded by a line break. */
private static String toRow(Bet b) {
    return "\n" + b.last + "\t" +
            b.first + "\t" +
            b.uuid + "\t" +
            b.adw + "\t" +
            b.date + "\t" +
            b.race + "\t" +
            b.type + "\t" +
            b.bets + "\t" +
            b.refunds + "\t" +
            b.winnings + "\t" +
            b.runners;
}
/**
 * Holds the details of a single bet placed by a player; each instance becomes one row
 * of the output file.
 */
static class Bet {
    /**
     * Splits the combined text the PDF produces for a bet into its bets, refunds,
     * winnings and runners parts, e.g. {@code $2,000.00 $0.00 $50,420.00 2/5/11}.
     */
    public static final Pattern ALL =
            Pattern.compile("\\$([\\d,\\.]+)\\s\\$([\\d,\\.]+)\\s\\$([\\d,\\.]+)\\s(.+)");

    int finPos;
    String first;
    String last;
    String uuid;
    String adw;
    String date;
    int race;
    String type;
    String all;
    Double bets;
    Double refunds;
    Double winnings;
    String runners;

    public Bet(int finPos) {
        this.finPos = finPos;
    }

    /**
     * Stores the raw combined text and, when it contains the expected pattern, breaks
     * it up into the individual amounts and the runners. When the pattern is not found,
     * only the raw text is updated.
     *
     * @param all the combined bets/refunds/winnings/runners text
     */
    public void setAll(String all) {
        this.all = all;

        Matcher parts = ALL.matcher(all);
        if (!parts.find()) {
            return;
        }

        bets = toAmount(parts.group(1));
        refunds = toAmount(parts.group(2));
        winnings = toAmount(parts.group(3));
        runners = parts.group(4);
    }

    /** Parses an amount such as {@code 2,000.00}, ignoring the thousands separators. */
    private static Double toAmount(String text) {
        return Double.valueOf(text.replace(",", ""));
    }
}
// temp object to boundary details and locations of lines of text relating to a player
class BCBCEntry {
String first;
String last;
String name;
String uuid;
String adw;
int start;
int end;
public BCBCEntry(String first, String last, String name, String uuid, String adw,
int start, int end) {
this.first = first;
this.last = last;
this.name = name;
this.uuid = uuid;
this.adw = adw;
this.start = start;
this.end = end;
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment