Created
May 11, 2018 16:58
-
-
Save derickfay/13799d101abe6b4348f4f5167715b880 to your computer and use it in GitHub Desktop.
Clean scraped results from the National Archives of South Africa
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// scraper for http://www.national.archsrch.gov.za/sm300cv/smws/sm300dl
//
// usage: select results in search results, enter multiple documents view,
// then copy contents of the results frame to a text file called archives.txt
// in the current working directory. Running this script writes the cleaned,
// comma-separated result to out.txt in the same directory.
'use strict';

const fs = require('fs');
const path = require('path');

// Field labels found in each scraped record, in output-column order.
// Each entry is used as a regular-expression source, so the final
// "REMARKS.*" matches the REMARKS label and everything after it.
const fieldnames = ["Document","DEPOT","SOURCE","TYPE","VOLUME_NO","SYSTEM","REFERENCE","PART","DESCRIPTION","STARTING","ENDING","REMARKS.*"];
// replacements[i] is substituted for every match of fieldnames[i]:
// the leading "Document" label and the trailing remarks are dropped,
// every other label becomes a column separator.
const replacements = ["",",",",",",",",",",",",",",",",",",",",",""];

/**
 * Convert the raw text copied from the results frame into CSV text.
 *
 * @param {string} text - full contents of archives.txt
 * @returns {string} a heading line followed by one line per record
 */
function cleanArchiveText(text) {
  const records = text
    // Remove commas from the data so the only commas in the output are
    // the separators inserted below.
    .replace(/,/g, " ")
    // Each record starts on a line beginning with "Document ".
    .split("\nDocument ")
    .map((record) => {
      // Collapse the record onto a single line.
      let row = record.replace(/\s*\n/g, "");
      // NOTE(review): labels are matched anywhere in the text, not only at
      // the start of a field, so e.g. "PART" also matches inside
      // "DEPARTMENT". Anchoring them requires knowing the exact input
      // layout - confirm against a real archives.txt before changing.
      fieldnames.forEach((name, i) => {
        row = row.replace(new RegExp(name, "g"), replacements[i]);
      });
      return row
        .replace(/,\s*/g, ",") // no whitespace after a separator
        .replace(/ +/g, " "); // squeeze runs of spaces
    });

  // The REMARKS column is discarded, so it gets no heading.
  const headings = fieldnames.slice(0, -1).join(",") + "\n";
  return headings + records.join("\n");
}

const workDir = process.cwd();
const input = fs.readFileSync(path.join(workDir, "archives.txt"), "utf8");
fs.writeFileSync(path.join(workDir, "out.txt"), cleanArchiveText(input));
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment