@anselm
Created January 19, 2022 02:04
parser for brandy
const fs = require("fs")
const pdf_table_extractor = require("pdf-table-extractor")
const fetch = require("node-fetch")
const excel = require("write-excel-file/node")
///////////////////////////////////////////////////////////////////////////////////////////////
// fetch documents from the BPPE site
const urlbase = "https://www.bppe.ca.gov"
const url = "https://www.bppe.ca.gov/webapps/summary_2018.php"
// helper: pull query parameters out of a url
// (the [?&] anchor is required here; with an optional anchor the first key would
// swallow everything up to the first "=", including the whole path)
function getParams(uri) {
  var params = {}, tokens, re = /[?&]([^=]+)=([^&]*)/g
  while (tokens = re.exec(uri)) {
    params[decodeURIComponent(tokens[1])] = decodeURIComponent(tokens[2])
  }
  return params
}
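// minimal usage sketch - the docName value is hypothetical, purely illustrative:
//   getParams("https://www.bppe.ca.gov/webapps/file_download.php?docName=Example%2012345.pdf")
//   -> { docName: "Example 12345.pdf" }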
// fetch a pdf from the site and cache it locally as files/<docid>.pdf
async function savetodoc(url) {
  let time = Date.now()
  let path = getParams(url)["docName"]
  docid = path.match(/\d+/)[0] // first run of digits in the doc name; sets the module-level docid below
  path = "files/" + docid + ".pdf"
  if(fs.existsSync(path)) {
    console.log("already found as " + path)
    return [path,0]
  }
  console.log("saving as " + path)
  let response = await fetch(url)
  let blob = await response.arrayBuffer()
  fs.writeFileSync(path,Buffer.from(blob))
  console.log("fetch took " + (Date.now() - time) + "ms")
  return [path,blob]
}
// pull all useful urls from the master manifest
async function geturls() {
  // fetch the manifest - in practice it was fetched once by hand and saved locally:
  //let response = await fetch(url)
  //let text = await response.text()
  //console.log(text)
  // just load the saved file
  let text = fs.readFileSync("summary_2018.html") + ""
  // find hrefs that look like file downloads (each match runs up to the closing quote)
  let urls = text.match(/\bwebapps\/file_download.*?"/gi)
  let novel = {}
  let values = []
  console.log("total is " + urls.length)
  for(let url of urls) {
    // only keep fact sheets - the document name itself contains this phrase
    let musthave = "School Performance Fact Sheet"
    if(!url.includes(musthave)) continue
    //url = url.substring(0,url.length-musthave.length-2)
    url = url.substring(0,url.length-1) // strip the trailing quote
    url = urlbase + "/" + url.trim().replace(/\s/g,"%20")
    if(novel[url]) continue // dedupe
    novel[url]=url
    values.push(url)
  }
  return values
}
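// the urls returned look like this (the docName shown is illustrative, not a real document):
//   https://www.bppe.ca.gov/webapps/file_download.php?docName=Example%20School%20Performance%20Fact%20Sheet.pdf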
///////////////////////////////////////////////////////////////////////////////////////////////
// sometimes rows are messed up
let badrow = 0
// convenience getter for the last element of an array (defined but not actually used below)
Object.defineProperty(Array.prototype, 'last', {
  enumerable: false,
  configurable: true,
  get: function() {
    return this[this.length - 1]
  },
  set: undefined
})
let notstartwords = [ "Year","Employed","Field","Concurrent","Aggregated","Positions" ]
// re-split a header whose cells were mashed together into one string per column
function workharder(row) {
  let out = []
  for(let i = 0; i < row.length; i++) {
    let term = row[i].trim().replace(/\s+/g,' ').trim()
    // "Calendar Year 2018" style cells become two terms... ugh
    if(term.startsWith("Calendar Year ")) {
      out.push("Calendar Year")
      term = term.substring("Calendar Year ".length)
    }
    // terms that start lower case are continuations of the previous term, not start words
    if(out.length && term.charAt(0) != term.charAt(0).toUpperCase()) {
      out[out.length-1] = out[out.length-1] + " " + term
      continue
    }
    // terms that begin with a stop word are continuations too
    let stopped = 0
    for(let bad of notstartwords) {
      if(out.length && term.startsWith(bad)) {
        out[out.length-1] = out[out.length-1] + " " + term
        stopped = 1
        break
      }
    }
    if(stopped) continue
    out.push(term)
  }
  return out
}
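// e.g. a mashed header (hypothetical values) such as
//   ["Calendar Year 2018", "graduates employed in the", "Field"]
// comes back as
//   ["Calendar Year", "2018 graduates employed in the Field"]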
function fixuprow(row,numeric=0) {
  // sometimes everything is crammed into the first field - keep only non-empty cells and examine
  let rows2 = []
  for(let field of row) {
    field = field.trim()
    if(!field.length) continue
    rows2.push(field)
  }
  // more than one valid entry means the row split cleanly;
  // exactly one entry probably means the whole row landed in a single cell
  if(rows2.length > 1) { badrow = 0; return rows2 }
  if(rows2.length == 0) throw new Error("no data")
  badrow = 1
  // header rows split on newlines and get reassembled; numeric rows just split on spaces
  if(numeric==0)
    { rows2 = rows2[0].split(/\r?\n/); rows2 = workharder(rows2) }
  else
    rows2 = rows2[0].split(' ')
  // console.log("*** had to fix row")
  // console.log(row)
  // console.log(rows2)
  return rows2
}
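// e.g. fixuprow(["12 34 56","",""],1) marks the row bad and returns ["12","34","56"]
// (values hypothetical)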
// a header cell sometimes holds two "<start> to <end>" date ranges that the extractor
// merged; split such a cell back into two cells (the splice puts the second range first)
function speciallycheckfordupeto(row) {
  if(row.length < 2) return
  for(let i = 0; i < row.length ; i++) {
    let field = row[i]
    let members = field.match(/to /g) || []
    if(members.length > 1) {
      let parts = field.split("\n")
      let first = parts[0] + " " + parts[1] + " " + parts[2]
      let second = parts[3] + " " + parts[4] + " " + parts[5]
      row[i] = first
      row.splice(i,0,second)
      //console.log("**** fixed bad two")
      //console.log(row)
      //console.log(parts)
    }
  }
}
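// e.g. a merged cell "7/1/2017\nto\n6/30/2018\n7/1/2018\nto\n6/30/2019" (hypothetical
// dates) becomes the two cells "7/1/2018 to 6/30/2019" and "7/1/2017 to 6/30/2018"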
// numeric counterpart: a data cell sometimes holds two space-separated values;
// split it back into two cells (again the second value lands ahead of the first)
function speciallycheckfordupeto2(row) {
  if(row.length < 2) return
  for(let i = 0; i < row.length ; i++) {
    let field = row[i].trim()
    let parts = field.split(' ')
    if(parts.length > 1) {
      //console.log("**** bad num")
      //console.log(row)
      //console.log(parts)
      //console.log(field)
      row[i] = parts[0]
      row.splice(i,0,parts[1])
      //console.log(row)
    }
  }
}
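// e.g. ["12 34","56"] (hypothetical values) becomes ["34","12","56"]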
let expectedsize = 3
let header = 0
// fix up a header row and remember its width so data rows can be validated against it
function fixuprowheader(row) {
  row = fixuprow(row,0)
  header = row
  expectedsize = row.length
  speciallycheckfordupeto(row)
  return row
}
// fix up a data row and insist that it matches the header width
function fixuprow2(row) {
  row = fixuprow(row,1)
  if(expectedsize != row.length) {
    //console.error("**** bad row ***")
    //console.error(header)
    //console.error(row)
    throw Error("Row length error")
  }
  speciallycheckfordupeto2(row)
  return row
}
///////////////////////////////////////////////////////////////////////////////////////////////
// write excel output
let MINCOLS = 16
let docid = 0
// https://www.npmjs.com/package/write-excel-file
let excel_blob = []
function excel_new_doc() {
  excel_blob = []
}
// write a bold banner row naming the school and table the following rows belong to
function excel_write_related(row) {
  let str = "SchoolID: " + docid + " Info: " + row.join(" ").replace(/[\n\r]+/g, '')
  let data = []
  data.push({value:str,fontWeight:"bold"})
  while(data.length < MINCOLS) data.push({value:""})
  excel_blob.push(data)
}
// write a bold header row, fixed up and prefixed with a SchoolID column
function excel_write_header(row) {
  row = fixuprowheader(row)
  let data = []
  data.push({value:"SchoolID",fontWeight:"bold"})
  for(let field of row) {
    //field = field.replace(/[^a-zA-Z ]/g, "")
    field = field.replace(/[\n\r]+/g, '')
    if(!field.length) continue
    data.push({ value:field.trim(), fontWeight:'bold' })
  }
  while(data.length < MINCOLS) data.push({value:""})
  excel_blob.push(data)
}
// write a data row, fixed up and prefixed with the current docid
function excel_write_row(row) {
  row = fixuprow2(row)
  let data = []
  data.push({value:docid})
  for(let field of row) {
    if(!field.length) continue
    data.push({ value:field.trim() })
  }
  while(data.length < MINCOLS) data.push({value:""})
  excel_blob.push(data)
}
async function excel_save_doc(filename) {
  let stream = await excel(excel_blob)
  const output = fs.createWriteStream(filename)
  stream.pipe(output)
}
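// note: per https://www.npmjs.com/package/write-excel-file, calling the node build without
// a filePath option is expected to return a readable stream, which excel_save_doc pipes to
// disk; passing { filePath: filename } instead would have the library write the file itself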
///////////////////////////////////////////////////////////////////////////////////////////////
// a map of tables keyed by their unique header rows...
let uniques = {}
// chew through the extracted pdf tables and emit excel rows
async function chew(data) {
  let bucket = 0
  // visit every page
  for(let page of data.pageTables) {
    // visit every "row" aka "table" (the extractor calls rows 'tables')
    let rows = page.tables
    for(let r = 0; r < rows.length; r++) {
      // only look at this row if it has anything fun in it
      let row = rows[r]
      if(!row.length) continue
      let isNumber = row[0].match(/^\d/)
      // first time we've seen a number? back up and write the two header rows
      // (this assumes a numeric row is always preceded by an info row and a header row)
      if(isNumber && bucket == 0) {
        bucket = rows[r-2].join(" ") + rows[r-1].join(" ")
        if(!uniques[bucket]) uniques[bucket] = []
        excel_write_related(rows[r-2])
        excel_write_header(rows[r-1])
      }
      // add numeric rows to the current bucket
      if(isNumber) {
        uniques[bucket].push(row)
        excel_write_row(row)
      }
      // else flush the bucket
      else bucket = 0
    }
  }
}
///////////////////////////////////////////////////////////////////////////
// wrap pdf-table-extractor's (path, success, error) callback api in a promise
function pdf_promise(path) {
  return new Promise((resolve,reject)=>{ pdf_table_extractor(path,resolve,reject) })
}
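// for reference, the promise resolves with a structure roughly like the following
// (a sketch based on the pdf-table-extractor readme, not verified against these pdfs):
//   { pageTables: [ { page: 1, tables: [ ["cell","cell",...], ... ] }, ... ], numPages: N }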
///////////////////////////////////////////////////////////////////////////
// run
async function boot() {
  let urls = await geturls()
  for(let url of urls) {
    // get the raw pdf
    console.log("loading " + url)
    let [path] = await savetodoc(url)
    // start a fresh spreadsheet
    excel_new_doc()
    // chew on it
    let data = await pdf_promise(path)
    await chew(data)
    console.log("**** done")
    // save it - "out" + "files/<docid>.pdf" + ".xlsx" lands in outfiles/, which must exist
    await excel_save_doc("out"+path+".xlsx")
    // only handle the first document for now
    break
  }
}
boot()
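// to run this sketch (assumptions: node-fetch v2, since v3 is esm-only and cannot be
// require()d; summary_2018.html saved next to the script; files/ and outfiles/ directories
// created beforehand; "parser.js" is whatever name this file is saved under):
//   npm install pdf-table-extractor node-fetch@2 write-excel-file
//   node parser.js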