Created
January 19, 2022 02:04
-
-
Save anselm/72906b2796c6d16621659fab00e5b25c to your computer and use it in GitHub Desktop.
parser for brandy
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const fs = require("fs") | |
var pdf_table_extractor = require("pdf-table-extractor"); | |
var fetch = require( 'node-fetch' ); | |
var excel = require( 'write-excel-file/node' ); | |
/////////////////////////////////////////////////////////////////////////////////////////////// | |
// fetch things from net | |
// base site and master manifest page; the manifest itself was fetched once
// by hand and saved locally as summary_2018.html (see geturls)
let urlbase = "https://www.bppe.ca.gov"
let url = "https://www.bppe.ca.gov/webapps/summary_2018.php"
// helper | |
// Parse the query-string parameters of a URI into a plain object of
// decoded key -> value pairs.
// Fix: the old pattern /[?&]?([^=]+)=.../ made the '?'/'&' separator
// OPTIONAL, so on a full URL the first captured key absorbed the entire
// scheme/host/path prefix (e.g. "https://host/path?docName") and lookups
// like getParams(url)["docName"] returned undefined. The separator is now
// required, so only real query parameters are captured.
function getParams(uri) {
  const params = {};
  const re = /[?&]([^=&#]+)=([^&#]*)/g;
  let tokens;
  while ((tokens = re.exec(uri)) !== null) {
    params[decodeURIComponent(tokens[1])] = decodeURIComponent(tokens[2]);
  }
  return params;
}
// fetch a doc from the site and save | |
// Download the pdf named by the url's "docName" query parameter into
// files/<docid>.pdf, skipping the fetch when a cached copy already exists.
// Returns [path, body] where body is 0 on a cache hit, otherwise the
// fetched ArrayBuffer.
// Side effect: records the numeric doc id in the module-level `docid`,
// which the excel writers read when tagging rows.
async function savetodoc(url) {
  let time = Date.now()
  let name = getParams(url)["docName"]
  // fail loudly instead of the old TypeError on undefined.match
  if (!name) throw new Error("url has no docName parameter: " + url)
  let match = name.match(/\d+/)
  if (!match) throw new Error("docName has no numeric id: " + name)
  // keep the plain digit string (the original stored the whole match
  // object, which stringifies fine in paths but is not a valid excel cell)
  docid = match[0]
  let path = "files/" + docid + ".pdf"
  if (fs.existsSync(path)) {
    console.log("already found as " + path)
    return [path, 0]
  }
  console.log("saving as " + path)
  let response = await fetch(url)
  // surface HTTP failures instead of silently saving an error page
  if (!response.ok) throw new Error("fetch failed (" + response.status + ") for " + url)
  let blob = await response.arrayBuffer()
  fs.writeFileSync(path, Buffer.from(blob))
  console.log("time was " + (Date.now() - time))
  return [path, blob]
}
// pull all useful urls from master manifest | |
// Pull all useful "School Performance Fact Sheet" download urls from the
// locally-saved master manifest (summary_2018.html), deduplicated and
// resolved against urlbase. Returns an array of absolute url strings.
async function geturls() {
  // the manifest was fetched once by hand and saved to disk; read it here
  const text = fs.readFileSync("summary_2018.html", "utf8")
  // candidate links look like: webapps/file_download?...Fact Sheet..."
  // guard against a null match (old code crashed on urls.length)
  const matches = text.match(/\bwebapps\/file_download.*?"/gi) || []
  console.log("total is " + matches.length)
  const musthave = "School Performance Fact Sheet"
  const seen = {}
  const values = []
  for (let url of matches) {
    if (!url.includes(musthave)) continue
    // drop the trailing quote captured by the regex
    url = url.substring(0, url.length - 1)
    url = urlbase + "/" + url.trim().replace(/\s/g, "%20")
    if (seen[url]) continue
    seen[url] = true
    values.push(url)
  }
  return values
}
/////////////////////////////////////////////////////////////////////////////////////////////// | |
// sometimes rows are messed up | |
// set to 1 by fixuprow whenever a row had to be re-split by hand, reset to
// 0 when a row parses cleanly (diagnostic flag; not read elsewhere here)
let badrow = 0
// NOTE(review): this extends the native Array prototype with a read-only
// `last` getter — an anti-pattern (risks clashing with future engine
// features such as Array.prototype.at), and it appears unused in this
// file; consider removing it
Object.defineProperty(Array.prototype, 'last', {
  enumerable: false,
  configurable: true,
  get: function() {
    return this[this.length - 1];
  },
  set: undefined
});
// Tokens that never begin a new header cell; a fragment starting with one
// of these is glued onto the previous cell instead.
let notstartwords = [ "Year","Employed","Field","Concurrent","Aggregated","Positions" ]

// Re-split a header row whose cells were merged by the pdf extractor.
// Heuristics: a fragment starting with a lower-case letter, or with one of
// the known continuation words above, belongs to the previous cell.
// Fixes: the old substring(13) was off by one ("Calendar Year " is 14
// characters), leaving a leading space on the remainder; and when the very
// first fragment was a continuation, the old code wrote to out[-1].
function workharder(row) {
  const out = []
  for (const raw of row) {
    // collapse internal whitespace to single spaces
    let term = raw.trim().replace(/\s+/g, ' ')
    // "Calendar Year 2018" is really two cells; split the prefix off
    if (term.startsWith("Calendar Year ")) {
      out.push("Calendar Year")
      term = term.substring("Calendar Year ".length)
    }
    // lower-case leading character => continuation of the previous cell;
    // so is a fragment starting with a known continuation word
    const continues =
      term.charAt(0) !== term.charAt(0).toUpperCase() ||
      notstartwords.some((bad) => term.startsWith(bad))
    if (continues && out.length > 0) {
      out[out.length - 1] += " " + term
      continue
    }
    out.push(term)
  }
  return out
}
// Normalise an extracted row. Cells are trimmed and empty ones dropped;
// when more than one cell survives the row was extracted correctly. When
// everything landed in a single cell the row is re-split: header rows
// (numeric=0) split on newlines and go through workharder, data rows
// (numeric=1) split on spaces. Updates the module-level `badrow` flag.
// Throws when the row contains no data at all.
function fixuprow(row, numeric = 0) {
  const cells = row.map((f) => f.trim()).filter((f) => f.length > 0)
  if (cells.length === 0) throw new Error("no data")
  if (cells.length > 1) {
    badrow = 0
    return cells
  }
  // single surviving cell: the extractor merged the whole row
  badrow = 1
  if (numeric == 0) {
    return workharder(cells[0].split(/\r?\n/))
  }
  return cells[0].split(' ')
}
// Header cells sometimes contain two "<a> to <b>" date ranges crammed into
// one field as six newline-separated fragments. Split such a field into
// two cells, mutating `row` in place. Rows shorter than two cells are
// left untouched.
function speciallycheckfordupeto(row) {
  if (row.length < 2) return
  for (let i = 0; i < row.length; i++) {
    const hits = (row[i].match(/to /g) || []).length
    if (hits <= 1) continue
    const frag = row[i].split("\n")
    // first three fragments form one range, the next three the other
    const a = frag[0] + " " + frag[1] + " " + frag[2]
    const b = frag[3] + " " + frag[4] + " " + frag[5]
    // NOTE: insertion order preserved from the original — the second
    // range ends up before the first in the row
    row[i] = a
    row.splice(i, 0, b)
  }
}
// Numeric cells sometimes hold two space-separated values; split the first
// two apart, mutating `row` in place (insertion order mirrors the header
// fixer: the second value is inserted before the first). Rows shorter
// than two cells are left untouched.
function speciallycheckfordupeto2(row) {
  if (row.length < 2) return
  for (let i = 0; i < row.length; i++) {
    const pieces = row[i].trim().split(' ')
    if (pieces.length < 2) continue
    row[i] = pieces[0]
    row.splice(i, 0, pieces[1])
  }
}
// shared parser state, written by fixuprowheader and read by fixuprow2:
// the column count every data row must match...
let expectedsize = 3
// ...and the most recent header row (only referenced by commented-out
// debugging output)
let header = 0
// Normalise a header row: remember it (and its column count) in module
// state so fixuprow2 can validate the data rows that follow, then repair
// any doubled-up date-range cell. Returns the repaired row.
function fixuprowheader(row) {
  const cleaned = fixuprow(row, 0)
  header = cleaned
  // the width is captured BEFORE the dupe fix, which may splice in a cell
  expectedsize = cleaned.length
  speciallycheckfordupeto(cleaned)
  return cleaned
}
// Normalise a numeric data row and verify it matches the header width
// recorded by fixuprowheader; throws when the counts disagree, then
// repairs any cell holding two space-separated values. Returns the row.
function fixuprow2(row) {
  const cleaned = fixuprow(row, 1)
  if (cleaned.length !== expectedsize) {
    throw Error("Row length error")
  }
  speciallycheckfordupeto2(cleaned)
  return cleaned
}
/////////////////////////////////////////////////////////////////////////////////////////////// | |
//write stuff | |
// every spreadsheet row is padded out to at least this many cells
let MINCOLS = 16
// numeric id of the document currently being processed (set by savetodoc,
// read by the excel writers)
let docid = 0
//https://www.npmjs.com/package/write-excel-file
// accumulated spreadsheet rows, in write-excel-file's row/cell format
let excel_blob = []
// discard any buffered rows and start a fresh spreadsheet for the next doc
function excel_new_doc() {
  excel_blob = []
}
// Emit a bold one-cell banner row identifying the school/table that the
// following header + data rows belong to, padded to MINCOLS cells.
function excel_write_related(row) {
  const info = row.join(" ").replace(/[\n\r]+/g, '')
  const cells = [{ value: "SchooldID: " + docid + " Info: " + info, fontWeight: "bold" }]
  while (cells.length < MINCOLS) cells.push({ value: "" })
  excel_blob.push(cells)
}
// Repair the extracted header row (via fixuprowheader), then emit it as a
// bold spreadsheet row prefixed with a "SchoolID" column and padded to
// MINCOLS cells. Empty cells are skipped.
function excel_write_header(row) {
  const fixed = fixuprowheader(row)
  const cells = [{ value: "SchoolID", fontWeight: "bold" }]
  for (const raw of fixed) {
    const label = raw.replace(/[\n\r]+/g, '')
    if (label.length === 0) continue
    cells.push({ value: label.trim(), fontWeight: 'bold' })
  }
  while (cells.length < MINCOLS) cells.push({ value: "" })
  excel_blob.push(cells)
}
// Validate and repair one numeric data row (via fixuprow2), then emit it
// prefixed with the current doc id and padded to MINCOLS cells. Empty
// cells are skipped.
function excel_write_row(row) {
  const fixed = fixuprow2(row)
  const cells = [{ value: docid }]
  for (const field of fixed) {
    if (field.length === 0) continue
    cells.push({ value: field.trim() })
  }
  while (cells.length < MINCOLS) cells.push({ value: "" })
  excel_blob.push(cells)
}
// Write the accumulated rows out as an .xlsx file.
// Fix: the original returned as soon as the pipe was set up, so callers
// could move on (or the process could exit) before the file was flushed,
// and stream errors were silently dropped. The promise now settles when
// the write stream finishes, and rejects on an error from either stream.
async function excel_save_doc(filename) {
  const stream = await excel(excel_blob)
  const output = fs.createWriteStream(filename)
  await new Promise((resolve, reject) => {
    output.on('finish', resolve)
    output.on('error', reject)
    stream.on('error', reject)
    stream.pipe(output)
  })
}
/////////////////////////////////////////////////////////////////////////////////////////////// | |
// this is a list of tables that have unique header rows...
// keyed by the concatenated description+header text of a table; each value
// collects that table's numeric data rows (populated by chew)
let uniques = {}
// chew through the pdf and think about it | |
// Walk every extracted page/row of a pdf. When the first numeric row of a
// table is seen, the two rows immediately above it are taken to be the
// table description and the column header; subsequent numeric rows are
// data. Populates the module-level `uniques` map and the excel row buffer.
// Fix: a numeric row within the first two rows of a page used to crash on
// rows[r-2] (undefined.join); such rows are now skipped. Also removed the
// unused `counter` local.
async function chew(data) {
  let bucket = 0
  // visit every page
  for (const page of data.pageTables) {
    // the extractor mistakenly calls each ROW a "table"
    const rows = page.tables
    for (let r = 0; r < rows.length; r++) {
      const row = rows[r]
      if (!row.length) continue
      // a row is "data" when its first cell starts with a digit
      const isNumber = row[0].match(/^\d/)
      // first number after a flush? back up and write the header rows
      if (isNumber && bucket == 0) {
        // need two preceding rows for description + header
        if (r < 2) continue
        bucket = rows[r-2].join(" ") + rows[r-1].join(" ")
        if (!uniques[bucket]) uniques[bucket] = []
        excel_write_related(rows[r-2])
        excel_write_header(rows[r-1])
      }
      if (isNumber) {
        // add the data row to the current bucket
        uniques[bucket].push(row)
        excel_write_row(row)
      } else {
        // non-numeric row flushes the bucket
        bucket = 0
      }
    }
  }
}
/////////////////////////////////////////////////////////////////////////////////////////// | |
// Promisify pdf-table-extractor's callback API.
// Fix: the original never wired the reject handle (and took an unused
// `callback` parameter), so an extraction failure left the awaiting caller
// hung forever; errors now reject the promise via the library's error
// callback (pdf_table_extractor(path, success, error)).
function pdf_promise(path) {
  return new Promise((resolve, reject) => {
    pdf_table_extractor(path, resolve, reject)
  })
}
/////////////////////////////////////////////////////////////////////////////////////////// | |
// run | |
// Top-level driver: collect the manifest urls, then for each one download
// the pdf, extract its tables, and write them out as an .xlsx file.
// Fixes: chew() and excel_save_doc() were fire-and-forget (the spreadsheet
// could be truncated if the process exited first), and the boot() promise
// floated with no rejection handler, so failures were silent.
async function boot() {
  const urls = await geturls()
  for (const url of urls) {
    console.log("loading " + url)
    // download (or reuse the cached) pdf
    const [path] = await savetodoc(url)
    // start a fresh spreadsheet
    excel_new_doc()
    // extract the tables and buffer them as rows
    const data = await pdf_promise(path)
    await chew(data)
    console.log("**** done")
    // flush the spreadsheet to disk before moving on
    await excel_save_doc("out" + path + ".xlsx")
    // NOTE(review): deliberate single-document stop, presumably a debugging
    // leftover — remove this break to process the whole manifest
    break
  }
}
boot().catch((err) => { console.error(err); process.exitCode = 1 })
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment