Created
January 19, 2022 02:04
-
-
Save anselm/72906b2796c6d16621659fab00e5b25c to your computer and use it in GitHub Desktop.
parser for brandy
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const fs = require("fs") | |
var pdf_table_extractor = require("pdf-table-extractor"); | |
var fetch = require( 'node-fetch' ); | |
var excel = require( 'write-excel-file/node' ); | |
/////////////////////////////////////////////////////////////////////////////////////////////// | |
// fetch things from net | |
// base site and master manifest page; the manifest itself was fetched once
// by hand and saved locally as summary_2018.html (see geturls)
let urlbase = "https://www.bppe.ca.gov"
let url = "https://www.bppe.ca.gov/webapps/summary_2018.php"
// helper | |
// Parse the query-string parameters of a URI into a plain object of
// decoded key -> value pairs.
// Fix: the old pattern /[?&]?([^=]+)=.../ made the '?'/'&' separator
// OPTIONAL, so on a full URL the first captured key absorbed the entire
// scheme/host/path prefix (e.g. "https://host/path?docName") and lookups
// like getParams(url)["docName"] returned undefined. The separator is now
// required, so only real query parameters are captured.
function getParams(uri) {
  const params = {};
  const re = /[?&]([^=&#]+)=([^&#]*)/g;
  let tokens;
  while ((tokens = re.exec(uri)) !== null) {
    params[decodeURIComponent(tokens[1])] = decodeURIComponent(tokens[2]);
  }
  return params;
}
// fetch a doc from the site and save | |
// Download the pdf named by the url's "docName" query parameter into
// files/<docid>.pdf, skipping the fetch when a cached copy already exists.
// Returns [path, body] where body is 0 on a cache hit, otherwise the
// fetched ArrayBuffer.
// Side effect: records the numeric doc id in the module-level `docid`,
// which the excel writers read when tagging rows.
async function savetodoc(url) {
  let time = Date.now()
  let name = getParams(url)["docName"]
  // fail loudly instead of the old TypeError on undefined.match
  if (!name) throw new Error("url has no docName parameter: " + url)
  let match = name.match(/\d+/)
  if (!match) throw new Error("docName has no numeric id: " + name)
  // keep the plain digit string (the original stored the whole match
  // object, which stringifies fine in paths but is not a valid excel cell)
  docid = match[0]
  let path = "files/" + docid + ".pdf"
  if (fs.existsSync(path)) {
    console.log("already found as " + path)
    return [path, 0]
  }
  console.log("saving as " + path)
  let response = await fetch(url)
  // surface HTTP failures instead of silently saving an error page
  if (!response.ok) throw new Error("fetch failed (" + response.status + ") for " + url)
  let blob = await response.arrayBuffer()
  fs.writeFileSync(path, Buffer.from(blob))
  console.log("time was " + (Date.now() - time))
  return [path, blob]
}
// pull all useful urls from master manifest | |
// Pull all useful "School Performance Fact Sheet" download urls from the
// locally-saved master manifest (summary_2018.html), deduplicated and
// resolved against urlbase. Returns an array of absolute url strings.
async function geturls() {
  // the manifest was fetched once by hand and saved to disk; read it here
  const text = fs.readFileSync("summary_2018.html", "utf8")
  // candidate links look like: webapps/file_download?...Fact Sheet..."
  // guard against a null match (old code crashed on urls.length)
  const matches = text.match(/\bwebapps\/file_download.*?"/gi) || []
  console.log("total is " + matches.length)
  const musthave = "School Performance Fact Sheet"
  const seen = {}
  const values = []
  for (let url of matches) {
    if (!url.includes(musthave)) continue
    // drop the trailing quote captured by the regex
    url = url.substring(0, url.length - 1)
    url = urlbase + "/" + url.trim().replace(/\s/g, "%20")
    if (seen[url]) continue
    seen[url] = true
    values.push(url)
  }
  return values
}
/////////////////////////////////////////////////////////////////////////////////////////////// | |
// sometimes rows are messed up | |
// set to 1 by fixuprow whenever a row had to be re-split by hand, reset to
// 0 when a row parses cleanly (diagnostic flag; not read elsewhere here)
let badrow = 0
// NOTE(review): this extends the native Array prototype with a read-only
// `last` getter — an anti-pattern (risks clashing with future engine
// features such as Array.prototype.at), and it appears unused in this
// file; consider removing it
Object.defineProperty(Array.prototype, 'last', {
  enumerable: false,
  configurable: true,
  get: function() {
    return this[this.length - 1];
  },
  set: undefined
});
// Tokens that never begin a new header cell; a fragment starting with one
// of these is glued onto the previous cell instead.
let notstartwords = [ "Year","Employed","Field","Concurrent","Aggregated","Positions" ]

// Re-split a header row whose cells were merged by the pdf extractor.
// Heuristics: a fragment starting with a lower-case letter, or with one of
// the known continuation words above, belongs to the previous cell.
// Fixes: the old substring(13) was off by one ("Calendar Year " is 14
// characters), leaving a leading space on the remainder; and when the very
// first fragment was a continuation, the old code wrote to out[-1].
function workharder(row) {
  const out = []
  for (const raw of row) {
    // collapse internal whitespace to single spaces
    let term = raw.trim().replace(/\s+/g, ' ')
    // "Calendar Year 2018" is really two cells; split the prefix off
    if (term.startsWith("Calendar Year ")) {
      out.push("Calendar Year")
      term = term.substring("Calendar Year ".length)
    }
    // lower-case leading character => continuation of the previous cell;
    // so is a fragment starting with a known continuation word
    const continues =
      term.charAt(0) !== term.charAt(0).toUpperCase() ||
      notstartwords.some((bad) => term.startsWith(bad))
    if (continues && out.length > 0) {
      out[out.length - 1] += " " + term
      continue
    }
    out.push(term)
  }
  return out
}
// Normalise an extracted row. Cells are trimmed and empty ones dropped;
// when more than one cell survives the row was extracted correctly. When
// everything landed in a single cell the row is re-split: header rows
// (numeric=0) split on newlines and go through workharder, data rows
// (numeric=1) split on spaces. Updates the module-level `badrow` flag.
// Throws when the row contains no data at all.
function fixuprow(row, numeric = 0) {
  const cells = row.map((f) => f.trim()).filter((f) => f.length > 0)
  if (cells.length === 0) throw new Error("no data")
  if (cells.length > 1) {
    badrow = 0
    return cells
  }
  // single surviving cell: the extractor merged the whole row
  badrow = 1
  if (numeric == 0) {
    return workharder(cells[0].split(/\r?\n/))
  }
  return cells[0].split(' ')
}
// Header cells sometimes contain two "<a> to <b>" date ranges crammed into
// one field as six newline-separated fragments. Split such a field into
// two cells, mutating `row` in place. Rows shorter than two cells are
// left untouched.
function speciallycheckfordupeto(row) {
  if (row.length < 2) return
  for (let i = 0; i < row.length; i++) {
    const hits = (row[i].match(/to /g) || []).length
    if (hits <= 1) continue
    const frag = row[i].split("\n")
    // first three fragments form one range, the next three the other
    const a = frag[0] + " " + frag[1] + " " + frag[2]
    const b = frag[3] + " " + frag[4] + " " + frag[5]
    // NOTE: insertion order preserved from the original — the second
    // range ends up before the first in the row
    row[i] = a
    row.splice(i, 0, b)
  }
}
// Numeric cells sometimes hold two space-separated values; split the first
// two apart, mutating `row` in place (insertion order mirrors the header
// fixer: the second value is inserted before the first). Rows shorter
// than two cells are left untouched.
function speciallycheckfordupeto2(row) {
  if (row.length < 2) return
  for (let i = 0; i < row.length; i++) {
    const pieces = row[i].trim().split(' ')
    if (pieces.length < 2) continue
    row[i] = pieces[0]
    row.splice(i, 0, pieces[1])
  }
}
// shared parser state, written by fixuprowheader and read by fixuprow2:
// the column count every data row must match...
let expectedsize = 3
// ...and the most recent header row (only referenced by commented-out
// debugging output)
let header = 0
// Normalise a header row: remember it (and its column count) in module
// state so fixuprow2 can validate the data rows that follow, then repair
// any doubled-up date-range cell. Returns the repaired row.
function fixuprowheader(row) {
  const cleaned = fixuprow(row, 0)
  header = cleaned
  // the width is captured BEFORE the dupe fix, which may splice in a cell
  expectedsize = cleaned.length
  speciallycheckfordupeto(cleaned)
  return cleaned
}
// Normalise a numeric data row and verify it matches the header width
// recorded by fixuprowheader; throws when the counts disagree, then
// repairs any cell holding two space-separated values. Returns the row.
function fixuprow2(row) {
  const cleaned = fixuprow(row, 1)
  if (cleaned.length !== expectedsize) {
    throw Error("Row length error")
  }
  speciallycheckfordupeto2(cleaned)
  return cleaned
}
/////////////////////////////////////////////////////////////////////////////////////////////// | |
//write stuff | |
// every spreadsheet row is padded out to at least this many cells
let MINCOLS = 16
// numeric id of the document currently being processed (set by savetodoc,
// read by the excel writers)
let docid = 0
//https://www.npmjs.com/package/write-excel-file
// accumulated spreadsheet rows, in write-excel-file's row/cell format
let excel_blob = []
// discard any buffered rows and start a fresh spreadsheet for the next doc
function excel_new_doc() {
  excel_blob = []
}
// Emit a bold one-cell banner row identifying the school/table that the
// following header + data rows belong to, padded to MINCOLS cells.
function excel_write_related(row) {
  const info = row.join(" ").replace(/[\n\r]+/g, '')
  const cells = [{ value: "SchooldID: " + docid + " Info: " + info, fontWeight: "bold" }]
  while (cells.length < MINCOLS) cells.push({ value: "" })
  excel_blob.push(cells)
}
// Repair the extracted header row (via fixuprowheader), then emit it as a
// bold spreadsheet row prefixed with a "SchoolID" column and padded to
// MINCOLS cells. Empty cells are skipped.
function excel_write_header(row) {
  const fixed = fixuprowheader(row)
  const cells = [{ value: "SchoolID", fontWeight: "bold" }]
  for (const raw of fixed) {
    const label = raw.replace(/[\n\r]+/g, '')
    if (label.length === 0) continue
    cells.push({ value: label.trim(), fontWeight: 'bold' })
  }
  while (cells.length < MINCOLS) cells.push({ value: "" })
  excel_blob.push(cells)
}
// Validate and repair one numeric data row (via fixuprow2), then emit it
// prefixed with the current doc id and padded to MINCOLS cells. Empty
// cells are skipped.
function excel_write_row(row) {
  const fixed = fixuprow2(row)
  const cells = [{ value: docid }]
  for (const field of fixed) {
    if (field.length === 0) continue
    cells.push({ value: field.trim() })
  }
  while (cells.length < MINCOLS) cells.push({ value: "" })
  excel_blob.push(cells)
}
// Write the accumulated rows out as an .xlsx file.
// Fix: the original returned as soon as the pipe was set up, so callers
// could move on (or the process could exit) before the file was flushed,
// and stream errors were silently dropped. The promise now settles when
// the write stream finishes, and rejects on an error from either stream.
async function excel_save_doc(filename) {
  const stream = await excel(excel_blob)
  const output = fs.createWriteStream(filename)
  await new Promise((resolve, reject) => {
    output.on('finish', resolve)
    output.on('error', reject)
    stream.on('error', reject)
    stream.pipe(output)
  })
}
/////////////////////////////////////////////////////////////////////////////////////////////// | |
// this is a list of tables that have unique header rows...
// keyed by the concatenated description+header text of a table; each value
// collects that table's numeric data rows (populated by chew)
let uniques = {}
// chew through the pdf and think about it | |
// Walk every extracted page/row of a pdf. When the first numeric row of a
// table is seen, the two rows immediately above it are taken to be the
// table description and the column header; subsequent numeric rows are
// data. Populates the module-level `uniques` map and the excel row buffer.
// Fix: a numeric row within the first two rows of a page used to crash on
// rows[r-2] (undefined.join); such rows are now skipped. Also removed the
// unused `counter` local.
async function chew(data) {
  let bucket = 0
  // visit every page
  for (const page of data.pageTables) {
    // the extractor mistakenly calls each ROW a "table"
    const rows = page.tables
    for (let r = 0; r < rows.length; r++) {
      const row = rows[r]
      if (!row.length) continue
      // a row is "data" when its first cell starts with a digit
      const isNumber = row[0].match(/^\d/)
      // first number after a flush? back up and write the header rows
      if (isNumber && bucket == 0) {
        // need two preceding rows for description + header
        if (r < 2) continue
        bucket = rows[r-2].join(" ") + rows[r-1].join(" ")
        if (!uniques[bucket]) uniques[bucket] = []
        excel_write_related(rows[r-2])
        excel_write_header(rows[r-1])
      }
      if (isNumber) {
        // add the data row to the current bucket
        uniques[bucket].push(row)
        excel_write_row(row)
      } else {
        // non-numeric row flushes the bucket
        bucket = 0
      }
    }
  }
}
/////////////////////////////////////////////////////////////////////////////////////////// | |
// Promisify pdf-table-extractor's callback API.
// Fix: the original never wired the reject handle (and took an unused
// `callback` parameter), so an extraction failure left the awaiting caller
// hung forever; errors now reject the promise via the library's error
// callback (pdf_table_extractor(path, success, error)).
function pdf_promise(path) {
  return new Promise((resolve, reject) => {
    pdf_table_extractor(path, resolve, reject)
  })
}
/////////////////////////////////////////////////////////////////////////////////////////// | |
// run | |
// Top-level driver: collect the manifest urls, then for each one download
// the pdf, extract its tables, and write them out as an .xlsx file.
// Fixes: chew() and excel_save_doc() were fire-and-forget (the spreadsheet
// could be truncated if the process exited first), and the boot() promise
// floated with no rejection handler, so failures were silent.
async function boot() {
  const urls = await geturls()
  for (const url of urls) {
    console.log("loading " + url)
    // download (or reuse the cached) pdf
    const [path] = await savetodoc(url)
    // start a fresh spreadsheet
    excel_new_doc()
    // extract the tables and buffer them as rows
    const data = await pdf_promise(path)
    await chew(data)
    console.log("**** done")
    // flush the spreadsheet to disk before moving on
    await excel_save_doc("out" + path + ".xlsx")
    // NOTE(review): deliberate single-document stop, presumably a debugging
    // leftover — remove this break to process the whole manifest
    break
  }
}
boot().catch((err) => { console.error(err); process.exitCode = 1 })
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment