Skip to content

Instantly share code, notes, and snippets.

@bijection
Last active September 17, 2020 05:14
Show Gist options
  • Save bijection/50de8e5dbd8cc2ffd3c396aab3d0256d to your computer and use it in GitHub Desktop.
Save bijection/50de8e5dbd8cc2ffd3c396aab3d0256d to your computer and use it in GitHub Desktop.
GRAIN Data Cleaning
import './system_config.js'
var xlsx = require('xlsx') /// Let's start by defining a function that can get us an xls file remotely.
export default function getXls(url){
var f = new FileReader()
fetch(url)
.then(x => x.blob())
.then(x => f.readAsBinaryString(x))
return new Promise((resolve, reject) => {
f.onloadend = () => resolve(xlsx.read(f.result, {type:"binary"}))
})
}
import getXls from './cell0.js';
window.sheet; /// Now, we grab the file and stick it in a global variable 'sheet'
var res = await getXls('https://commondatastorage.googleapis.com/ckannet-storage/2012-08-14T085537/GRAIN---Land-grab-deals---Jan-2012.xls')
window.sheet = res.Sheets.Sheet1
window.cols = {} /// 'sheet' is a giant object right now, but we'd really like it as a json list. to get it there, let's start by extracting the names of the columns.
Object.keys(window.sheet)
.filter(key => key.slice(1) === '1')
.forEach(key => cols[key[0]] = sheet[key].v)
window.cols
window.json = [] /// Now that we have the column names, let's go ahead and fill in the rest of our json list.
Object.keys(window.sheet).forEach(key => {
var row = Number(key.slice(1))
if(!row || row < 2) return;
var rowIndex = row - 2
var colIndex = key[0]
json[rowIndex] = json[rowIndex] || {}
json[rowIndex][window.cols[colIndex]] = window.sheet[key].v
})
json
var investmentRules = [] /// Now let's normalize the invenstments!
window.normalizeInvestment = function normalizeInvestment(investment){
for (var rule of investmentRules){
if(rule(investment)) return rule(investment)
}
return false
}
function normalized(){
return json.map(row => row['Projected investment'])
.filter(x=>x)
.map(normalizeInvestment)
}
function stillAbnormal(){
return json.map(row => row['Projected investment'])
.filter(x=>x)
.filter(x => !normalizeInvestment(x))
}
stillAbnormal()
investmentRules.push( investment => {
var match = investment.match(/^US\$(\d+(\.\d+)?)\s+million/)
return match ? Number(match[1]) : false
})
stillAbnormal()
investmentRules.push( investment => {
var match = investment.match(/^US\$(\d)+,(\d+)\s+million/)
return match ? Number(match[1] + match[2]) : false
})
stillAbnormal()
investmentRules.push( investment => {
var match = investment.match(/^US\$(\d+(\.\d+)?)\s+billion/)
return match ? Number(match[1])*1000 : false
})
stillAbnormal().join('\n')
json.filter(row => row['Projected investment'] && row['Projected investment'].match(/\/yr/))
investmentRules.push( investment => { /// Let's do some special cases!! Woohoo!!
switch(investment){
case "US$30-35 million":
return 32
case "US$4/ha/yr (lease)":
return 7 /// $35 yrs \*50000 ha
case "US$57,600 (US$0.80/ha)":
return .0576
case "US$125,000/yr (land lease)":
return 3 /// $125,000 a year for 25 years
case "US$8/ha/yr (lease)":
return 5.4
case "US$1.2/ha/yr (after first 7 years) in Gambela and US$8/ha/yr (after first 6 years) in Bako":
return 10 /// I am not patient enough to figure this one out lol. Hopefully it's around 10
}
return false
})
stillAbnormal()
window.cleaned = []
window.json.forEach(row => {
var investment = row['Projected investment']
var cleaned_row = {...row}
delete cleaned_row['Projected investment']
cleaned_row['Projected Investment'] = 'unknown'
if(investment) {
cleaned_row['Projected Investment'] = normalizeInvestment(investment)
}
cleaned.push(cleaned_row)
})
window.cleaned
import {say} from 'cowsay'
say({text: "hello there!\n \nWelcome to the Carbide Alpha\nRelease Data Cleaning\nExample Notebook!\n \nJoin us for an interactive\njourney through an excel file!\n\nRun each of the following\ncells in order!\n \nSend bugs to\[email protected]!\n \nSend hellos to \[email protected]! "})
System.config({
"meta": {
'https://cdnjs.cloudflare.com/ajax/libs/xlsx/0.8.0/xlsx.full.min.js': {
exports: 'XLSX',
format: 'global'
}
},
"map": {
"xlsx": 'https://cdnjs.cloudflare.com/ajax/libs/xlsx/0.8.0/xlsx.full.min.js'
}
})
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment