robinhowlett · November 9, 2018 22:26
diff --git a/U_BCBCAnalysis.java b/U_BCBCAnalysis.java
 // didn't bother with the import statements
 public class U_BCBCAnalysis extends PDFTextStripper {
    // Regex for a line consisting only of capital letters and hyphens; used to spot bet-type
    // labels, e.g. EX, TRI, DD, WIN.
    static final Pattern BET_TYPE = Pattern.compile("^([A-Z-])+$");
    // Output stream for the TSV file: opened and closed in main(), written to by writeCsvRows().
    static FileOutputStream fileOutputStream;
    // One entry per player, recording where that player's lines start and end in playerText.
    static List<BCBCEntry> players = new ArrayList<>();
    // Every line of text handed to writeString(), in the order received; shared by all stripper
    // instances because it is static.
    static List<String> playerText = new ArrayList<>();
    // Position in playerText of the line writeString() is currently handling.
    static int index = 0;
    // Start (inclusive) and end (exclusive, see the subList() call in main) of the player whose
    // lines are currently being delimited.
    static int start = 0;
    static int end = 0;
    // False until the first boundary line is seen in writeString(); handles the first-player
    // edge case, where there is no previous player to close off.
    static boolean firstPlayerFound = false;

    // Empty constructor. NOTE(review): presumably declared only because the PDFTextStripper
    // constructor can throw IOException - confirm against the PDFBox version in use.
    public U_BCBCAnalysis() throws IOException {
    }

    /**
     * Entry point: first collects every player's lines of text from the PDF, then writes each
     * player's bets to a tab-separated file.
     *
     * @param args unused
     * @throws IOException if closing the PDF or writing the output file fails
     */
    public static void main(String[] args) throws IOException {
        readPlayerLines();
        writeBetRows();
    }

    // Loads the PDF with Apache PDFBox, splits it, and runs a fresh U_BCBCAnalysis stripper over
    // each split document. Stripping calls writeString() for each line of text, which is what
    // fills the static playerText and players lists.
    private static void readPlayerLines() throws IOException {
        PDDocument pdf = null;
        try {
            pdf = PDDocument.load(
                    Paths.get("/Users/rhowlett/Downloads/2018 BCBC Final.pdf").toFile());

            for (PDDocument page : new Splitter().split(pdf)) {
                PDFTextStripper pageStripper = new U_BCBCAnalysis();
                pageStripper.setSortByPosition(true);
                try {
                    // the written text is thrown away; only the writeString() side effects
                    // on the static lists are of interest
                    Writer sink = new OutputStreamWriter(new ByteArrayOutputStream(),
                            StandardCharsets.UTF_8);
                    pageStripper.writeText(page, sink);
                } finally {
                    page.close();
                }
            }
        } catch (Exception e) {
            // best effort: report the problem and continue with whatever was collected
            e.printStackTrace();
        } finally {
            if (pdf != null) {
                pdf.close();
            }
        }
    }

    // Writes the header line and then one row per bet for every player found. Tabs are used as
    // the separator because commas are common in the data, e.g. in bet selections.
    private static void writeBetRows() throws IOException {
        final String header = "last\tfirst\tuuid\tadw\tdate\trace\ttype\tbets\trefunds"
                + "\twinnings\trunners";
        try {
            fileOutputStream = new FileOutputStream("/Users/rhowlett/tmp/bcbc.tsv", false);
            fileOutputStream.write(header.getBytes(UTF_8));

            int position = 0;
            for (BCBCEntry player : players) {
                writeCsvRows(position, player, playerText.subList(player.start, player.end));
                position++;
            }
        } finally {
            IOUtils.closeQuietly(fileOutputStream);
        }
    }

    /**
     * Callback invoked while stripper.writeText() runs in main, once per piece of text handed
     * over by the stripper. Every string is appended to playerText; a boundary line additionally
     * closes off the previous player's range of lines.
     *
     * @param string        the text being written
     * @param textPositions positions of the text; not used here
     * @throws IOException declared by the overridden method; not thrown by this code
     */
    @Override
    protected void writeString(String string, List<TextPosition> textPositions) throws IOException {
        playerText.add(string);

        // hard-coded line index standing in for "end of the last player" (lazy solution for the
        // last page, which has no following Friday date line)
        final boolean atFinalLine = (index == 85802);
        final boolean atBoundary = string.equals("Date: 20181102") || atFinalLine;

        if (atBoundary) {
            // the very first Friday date only marks that a player has begun; there is nothing
            // to close off yet
            if (firstPlayerFound) {
                // a Friday date line sits 5 lines into the NEXT player's text, so step back
                end = atFinalLine ? index : index - 5;

                String firstName = playerText.get(start + 1).trim();
                String lastName = playerText.get(start).trim();
                String fullName = playerText.get(start + 4).trim();
                String uuid = playerText.get(start + 2).trim();
                String adw = playerText.get(start + 3).trim();

                players.add(new BCBCEntry(firstName, lastName, fullName, uuid, adw, start, end));
                start = end;
            }
            firstPlayerFound = true;
        }
        index++;
    }

    /**
     * Parses one player's lines of text into bets and writes one tab-separated row per bet to
     * fileOutputStream.
     *
     * <p>A bet is only recorded when a later bet-type line or a three-token line is seen, so a
     * bet still pending when the lines run out is not written.
     *
     * @param playerIndex zero-based position of the player in the players list
     * @param player      identity details copied onto every bet
     * @param strings     the player's lines of text
     * @throws IOException if writing a row fails
     */
    private static void writeCsvRows(int playerIndex, BCBCEntry player, List<String> strings)
            throws IOException {
        // 1-based so it can also represent the finishing position
        final int finishPosition = playerIndex + 1;

        List<Bet> parsed = new ArrayList<>();
        boolean saturday = false;
        boolean haveType = false;
        boolean haveDetails = false;
        int raceNumber = 0;
        Bet current = new Bet(finishPosition);

        for (String line : strings) {
            // Saturday's date means the Friday bets are done
            if (line.equals("Date: 20181103")) {
                saturday = true;
                continue;
            }
            // remember which race the following bets belong to
            if (line.startsWith("Race: ")) {
                raceNumber = Integer.parseInt(line.substring(line.lastIndexOf(' ') + 1));
                continue;
            }

            // only bet types are written entirely in capitals around the bet data
            if (BET_TYPE.matcher(line).find()) {
                // a new label after a complete bet closes that bet off
                if (haveType && haveDetails) {
                    parsed.add(tagBet(current, player, saturday, raceNumber));
                    current = new Bet(finishPosition);
                    haveDetails = false;
                }
                current.type = canonicalBetType(line);
                haveType = true;
                continue;
            }

            if (!haveType) {
                continue;
            }

            if (line.split(" ").length == 3) {
                // NOTE(review): a three-token line presumably is the totals line that ends a
                // group of bets - confirm against the PDF
                parsed.add(tagBet(current, player, saturday, raceNumber));
                current = new Bet(finishPosition);
                haveType = false;
                haveDetails = false;
            } else {
                // bet details; when they were long enough to bleed onto the next line the
                // continuation is appended to what was already captured
                String details = line.trim();
                if (current.all != null) {
                    details = current.all.concat(details);
                }
                // day 2 separated selections with '|' rather than ','; standardize on comma
                current.setAll(details.replaceAll("\\|", ","));
                haveDetails = true;
            }
        }

        for (Bet b : parsed) {
            StringBuilder row = new StringBuilder("\n");
            row.append(b.last).append("\t")
                    .append(b.first).append("\t")
                    .append(b.uuid).append("\t")
                    .append(b.adw).append("\t")
                    .append(b.date).append("\t")
                    .append(b.race).append("\t")
                    .append(b.type).append("\t")
                    .append(b.bets).append("\t")
                    .append(b.refunds).append("\t")
                    .append(b.winnings).append("\t")
                    .append(b.runners);
            fileOutputStream.write(row.toString().getBytes(UTF_8));
        }
    }

    // Copies the player's identity plus the current day and race onto a bet and returns it.
    private static Bet tagBet(Bet bet, BCBCEntry player, boolean saturday, int race) {
        bet.first = player.first;
        bet.last = player.last;
        bet.uuid = player.uuid;
        bet.adw = player.adw;
        bet.date = saturday ? "20181103" : "20181102";
        bet.race = race;
        return bet;
    }

    // The source used inconsistent labels for the same bet types; this maps the known variants
    // onto one label each and returns any other label unchanged (untrimmed).
    private static String canonicalBetType(String label) {
        switch (label.trim()) {
            case "DB":
            case "DBL":
            case "DOUBLE":
                return "DD";
            case "EXA":
            case "EXACTA":
                return "EX";
            case "PLACE":
            case "PL":
                return "PLC";
            case "TRIFECTA":
            case "TR":
            case "SF": // really odd? a superfecta? not allowed
                return "TRI";
            case "SH":
            case "SHOW":
                return "SHW";
            case "WN":
                return "WIN";
            default:
                return label;
        }
    }

    /**
     * Plain holder for a single bet of one player; its fields become the columns of a row.
     */
    static class Bet {
        /**
         * Splits the combined text the PDF produces into three dollar amounts (bets, refunds,
         * winnings) followed by the remaining text (runners),
         * e.g. {@code $2,000.00 $0.00 $50,420.00 2/5/11}.
         */
        public static final Pattern ALL =
                Pattern.compile("\\$([\\d,\\.]+)\\s\\$([\\d,\\.]+)\\s\\$([\\d,\\.]+)\\s(.+)");

        int finPos;
        String first;
        String last;
        String uuid;
        String adw;
        String date;
        int race;
        String type;
        // raw combined text as last passed to setAll()
        String all;
        Double bets;
        Double refunds;
        Double winnings;
        String runners;

        public Bet(int finPos) {
            this.finPos = finPos;
        }

        /**
         * Stores the raw combined text and, when it matches {@link #ALL}, breaks it into bets,
         * refunds, winnings and runners. Text that does not match leaves those fields untouched.
         *
         * @param all combined bet text
         */
        public void setAll(String all) {
            this.all = all;

            Matcher parts = ALL.matcher(all);
            if (!parts.find()) {
                return;
            }
            bets = amount(parts.group(1));
            refunds = amount(parts.group(2));
            winnings = amount(parts.group(3));
            runners = parts.group(4);
        }

        // Drops the thousands separators and parses what is left as a number.
        private static Double amount(String text) {
            return Double.valueOf(text.replace(",", ""));
        }
    }

    /**
     * Identity details of one player together with the range of lines of text that belong to
     * that player. All fields are assigned once, in the constructor, and never change.
     *
     * NOTE(review): this is a non-static inner class, so each entry keeps a reference to the
     * enclosing instance that created it; it uses nothing from that instance and could
     * probably be declared static.
     */
    class BCBCEntry {
        final String first;
        final String last;
        // full name
        final String name;
        final String uuid;
        final String adw;
        // index of the player's first line of text
        final int start;
        // index marking where the player's lines of text end
        final int end;

        /**
         * @param first first name
         * @param last  last name
         * @param name  full name
         * @param uuid  player uuid
         * @param adw   player adw value
         * @param start index where this player's lines of text start
         * @param end   index where this player's lines of text end
         */
        public BCBCEntry(String first, String last, String name, String uuid, String adw,
                int start, int end) {
            this.first = first;
            this.last = last;
            this.name = name;
            this.uuid = uuid;
            this.adw = adw;
            this.start = start;
            this.end = end;
        }
    }
 }
	// didn't bother with the import statements
	public class U_BCBCAnalysis extends PDFTextStripper {
	// Regex for a line consisting only of capital letters and hyphens; used to spot bet-type
	// labels, e.g. EX, TRI, DD, WIN.
	static final Pattern BET_TYPE = Pattern.compile("^([A-Z-])+$");
	// Output stream for the TSV file: opened and closed in main(), written to by writeCsvRows().
	static FileOutputStream fileOutputStream;
	// One entry per player, recording where that player's lines start and end in playerText.
	static List<BCBCEntry> players = new ArrayList<>();
	// Every line of text handed to writeString(), in the order received; shared by all stripper
	// instances because it is static.
	static List<String> playerText = new ArrayList<>();
	// Position in playerText of the line writeString() is currently handling.
	static int index = 0;
	// Start (inclusive) and end (exclusive, see the subList() call in main) of the player whose
	// lines are currently being delimited.
	static int start = 0;
	static int end = 0;
	// False until the first boundary line is seen in writeString(); handles the first-player
	// edge case, where there is no previous player to close off.
	static boolean firstPlayerFound = false;

	// Empty constructor. NOTE(review): presumably declared only because the PDFTextStripper
	// constructor can throw IOException - confirm against the PDFBox version in use.
	public U_BCBCAnalysis() throws IOException {
	}

	/**
	 * Entry point: first collects every player's lines of text from the PDF, then writes each
	 * player's bets to a tab-separated file.
	 *
	 * @param args unused
	 * @throws IOException if closing the PDF or writing the output file fails
	 */
	public static void main(String[] args) throws IOException {
		readPlayerLines();
		writeBetRows();
	}

	// Loads the PDF with Apache PDFBox, splits it, and runs a fresh U_BCBCAnalysis stripper over
	// each split document. Stripping calls writeString() for each line of text, which is what
	// fills the static playerText and players lists.
	private static void readPlayerLines() throws IOException {
		PDDocument pdf = null;
		try {
			pdf = PDDocument.load(
					Paths.get("/Users/rhowlett/Downloads/2018 BCBC Final.pdf").toFile());

			for (PDDocument page : new Splitter().split(pdf)) {
				PDFTextStripper pageStripper = new U_BCBCAnalysis();
				pageStripper.setSortByPosition(true);
				try {
					// the written text is thrown away; only the writeString() side effects
					// on the static lists are of interest
					Writer sink = new OutputStreamWriter(new ByteArrayOutputStream(),
							StandardCharsets.UTF_8);
					pageStripper.writeText(page, sink);
				} finally {
					page.close();
				}
			}
		} catch (Exception e) {
			// best effort: report the problem and continue with whatever was collected
			e.printStackTrace();
		} finally {
			if (pdf != null) {
				pdf.close();
			}
		}
	}

	// Writes the header line and then one row per bet for every player found. Tabs are used as
	// the separator because commas are common in the data, e.g. in bet selections.
	private static void writeBetRows() throws IOException {
		final String header = "last\tfirst\tuuid\tadw\tdate\trace\ttype\tbets\trefunds"
				+ "\twinnings\trunners";
		try {
			fileOutputStream = new FileOutputStream("/Users/rhowlett/tmp/bcbc.tsv", false);
			fileOutputStream.write(header.getBytes(UTF_8));

			int position = 0;
			for (BCBCEntry player : players) {
				writeCsvRows(position, player, playerText.subList(player.start, player.end));
				position++;
			}
		} finally {
			IOUtils.closeQuietly(fileOutputStream);
		}
	}

	// triggered during stripper.writeText() execution above
	// and called for each line of text processed when parsing the PDF
	@Override
	protected void writeString(String string, List<TextPosition> textPositions) throws IOException {
	playerText.add(string);
	// if Friday's date was found, then we are on to a new player (unless this is the first
	// player found)
	boolean lastPlayerFound = (index == 85802); // lazy solution for last page
	if (string.equals("Date: 20181102") \|\| lastPlayerFound) {
	if (firstPlayerFound) {
	end = (lastPlayerFound ? index : index - 5);
	players.add(new BCBCEntry(playerText.get(start + 1).trim(), // first
	playerText.get(start).trim(), // last
	playerText.get(start + 4).trim(), // full name
	playerText.get(start + 2).trim(), // uuid
	playerText.get(start + 3).trim(), // adw
	start, end)); // mark where this players lines of text start and end
	start = end;
	}
	firstPlayerFound = true;
	}
	index++;
	}

	private static void writeCsvRows(int playerIndex, BCBCEntry player, List<String> strings)
	throws IOException {
	List<Bet> bets = new ArrayList<>();
	boolean day2Found = false;
	boolean typeFound = false;
	boolean betFound = false;
	int currentRace = 0;

	Bet bet = new Bet(playerIndex + 1); // make it 1-based to also rep. fin position

	// plan:
	// separate days
	// separate races
	// separate bets
	// bother with penalty?
	for (String string : strings) {
	// if it's Saturday's date, then Friday bets must be done
	if (string.equals("Date: 20181103")) {
	day2Found = true;
	continue;
	} else if (string.startsWith("Race: ")) {
	// track the race these bets were for
	currentRace = Integer.parseInt(string.substring(string.lastIndexOf(' ') + 1));
	continue;
	}

	// only bet types are all in caps around the bet data - identify them
	Matcher matcher = BET_TYPE.matcher(string);
	if (matcher.find()) {
	// check we haven't messed up - markers should already have been found
	if (typeFound && betFound) {
	bet.first = player.first;
	bet.last = player.last;
	bet.uuid = player.uuid;
	bet.adw = player.adw;
	bet.date = (day2Found ? "20181103" : "20181102");
	bet.race = currentRace;
	bets.add(bet);
	// start clean for the next bet
	bet = new Bet(playerIndex + 1);
	betFound = false;
	}

	// learned from analysis via Excel
	// the BCBC used inconsistent labels for the same bet types (hand entered?)
	// this standardizes them
	if (string.trim().equals("DB") \|\| string.trim().equals("DBL") \|\|
	string.trim().equals("DOUBLE")) {
	string = "DD";
	} else if (string.trim().equals("EXA") \|\| string.trim().equals("EXACTA")) {
	string = "EX";
	} else if (string.trim().equals("PLACE") \|\| string.trim().equals("PL")) {
	string = "PLC";
	} else if (string.trim().equals("TRIFECTA") \|\| string.trim().equals("TR") \|\|
	string.trim().equals("SF")) { // really odd? a superfecta? not allowed
	string = "TRI";
	} else if (string.trim().equals("SH") \|\| string.trim().equals("SHOW")) {
	string = "SHW";
	} else if (string.trim().equals("WN")) {
	string = "WIN";
	}

	bet.type = string;
	typeFound = true;
	continue;
	}

	// this is messy
	if (typeFound) {
	if (string.split(" ").length == 3) {
	bet.first = player.first;
	bet.last = player.last;
	bet.uuid = player.uuid;
	bet.adw = player.adw;
	bet.date = (day2Found ? "20181103" : "20181102");
	bet.race = currentRace;
	bets.add(bet);
	bet = new Bet(playerIndex + 1);
	typeFound = false;
	betFound = false;
	} else {
	// day 2 seemed to use the pipe character \| instead of the traditional
	// comma character , to separate different selections within the same race.
	// standardizing on comma
	//
	// this part also handles when the bets details were so long they bled over
	// to the next line
	if (bet.all != null) {
	bet.setAll(bet.all.concat(string.trim()).replaceAll("\\\|", ","));
	betFound = true;
	} else {
	bet.setAll(string.trim().replaceAll("\\\|", ","));
	betFound = true;
	}
	}
	}
	}

	// for each bet, write a row to the csv file
	for (Bet b : bets) {
	String row = "\n" + b.last + "\t" +
	b.first + "\t" +
	b.uuid + "\t" +
	b.adw + "\t" +
	b.date + "\t" +
	b.race + "\t" +
	b.type + "\t" +
	b.bets + "\t" +
	b.refunds + "\t" +
	b.winnings + "\t" +
	b.runners;

	fileOutputStream.write(row.getBytes(UTF_8));
	}

	}

	/**
	 * Plain holder for a single bet of one player; its fields become the columns of a row.
	 */
	static class Bet {
		/**
		 * Splits the combined text the PDF produces into three dollar amounts (bets, refunds,
		 * winnings) followed by the remaining text (runners),
		 * e.g. {@code $2,000.00 $0.00 $50,420.00 2/5/11}.
		 */
		public static final Pattern ALL =
				Pattern.compile("\\$([\\d,\\.]+)\\s\\$([\\d,\\.]+)\\s\\$([\\d,\\.]+)\\s(.+)");

		int finPos;
		String first;
		String last;
		String uuid;
		String adw;
		String date;
		int race;
		String type;
		// raw combined text as last passed to setAll()
		String all;
		Double bets;
		Double refunds;
		Double winnings;
		String runners;

		public Bet(int finPos) {
			this.finPos = finPos;
		}

		/**
		 * Stores the raw combined text and, when it matches {@link #ALL}, breaks it into bets,
		 * refunds, winnings and runners. Text that does not match leaves those fields untouched.
		 *
		 * @param all combined bet text
		 */
		public void setAll(String all) {
			this.all = all;

			Matcher parts = ALL.matcher(all);
			if (!parts.find()) {
				return;
			}
			bets = amount(parts.group(1));
			refunds = amount(parts.group(2));
			winnings = amount(parts.group(3));
			runners = parts.group(4);
		}

		// Drops the thousands separators and parses what is left as a number.
		private static Double amount(String text) {
			return Double.valueOf(text.replace(",", ""));
		}
	}

	/**
	 * Identity details of one player together with the range of lines of text that belong to
	 * that player. All fields are assigned once, in the constructor, and never change.
	 *
	 * NOTE(review): this is a non-static inner class, so each entry keeps a reference to the
	 * enclosing instance that created it; it uses nothing from that instance and could
	 * probably be declared static.
	 */
	class BCBCEntry {
		final String first;
		final String last;
		// full name
		final String name;
		final String uuid;
		final String adw;
		// index of the player's first line of text
		final int start;
		// index marking where the player's lines of text end
		final int end;

		/**
		 * @param first first name
		 * @param last  last name
		 * @param name  full name
		 * @param uuid  player uuid
		 * @param adw   player adw value
		 * @param start index where this player's lines of text start
		 * @param end   index where this player's lines of text end
		 */
		public BCBCEntry(String first, String last, String name, String uuid, String adw,
				int start, int end) {
			this.first = first;
			this.last = last;
			this.name = name;
			this.uuid = uuid;
			this.adw = adw;
			this.start = start;
			this.end = end;
		}
	}
	}