Last active
August 29, 2015 14:00
-
-
Save prabhasp/11275405 to your computer and use it in GitHub Desktop.
Indicator Dependencies: R metaprogramming
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!DOCTYPE html> | |
<!-- saved from url=(0014)about:internet --> | |
<html> | |
<head> | |
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/> | |
<meta http-equiv="x-ua-compatible" content="IE=9" > | |
<title>Finding indicator dependencies</title> | |
<style type="text/css"> | |
body, td { | |
font-family: sans-serif; | |
background-color: white; | |
font-size: 12px; | |
margin: 8px; | |
} | |
tt, code, pre { | |
font-family: 'DejaVu Sans Mono', 'Droid Sans Mono', 'Lucida Console', Consolas, Monaco, monospace; | |
} | |
h1 { | |
font-size:2.2em; | |
} | |
h2 { | |
font-size:1.8em; | |
} | |
h3 { | |
font-size:1.4em; | |
} | |
h4 { | |
font-size:1.0em; | |
} | |
h5 { | |
font-size:0.9em; | |
} | |
h6 { | |
font-size:0.8em; | |
} | |
a:visited { | |
color: rgb(50%, 0%, 50%); | |
} | |
pre { | |
margin-top: 0; | |
max-width: 95%; | |
border: 1px solid #ccc; | |
white-space: pre-wrap; | |
} | |
pre code { | |
display: block; padding: 0.5em; | |
} | |
code.r, code.cpp { | |
background-color: #F8F8F8; | |
} | |
table, td, th { | |
border: none; | |
} | |
blockquote { | |
color:#666666; | |
margin:0; | |
padding-left: 1em; | |
border-left: 0.5em #EEE solid; | |
} | |
hr { | |
height: 0px; | |
border-bottom: none; | |
border-top-width: thin; | |
border-top-style: dotted; | |
border-top-color: #999999; | |
} | |
@media print { | |
* { | |
background: transparent !important; | |
color: black !important; | |
filter:none !important; | |
-ms-filter: none !important; | |
} | |
body { | |
font-size:12pt; | |
max-width:100%; | |
} | |
a, a:visited { | |
text-decoration: underline; | |
} | |
hr { | |
visibility: hidden; | |
page-break-before: always; | |
} | |
pre, blockquote { | |
padding-right: 1em; | |
page-break-inside: avoid; | |
} | |
tr, img { | |
page-break-inside: avoid; | |
} | |
img { | |
max-width: 100% !important; | |
} | |
@page :left { | |
margin: 15mm 20mm 15mm 10mm; | |
} | |
@page :right { | |
margin: 15mm 10mm 15mm 20mm; | |
} | |
p, h2, h3 { | |
orphans: 3; widows: 3; | |
} | |
h2, h3 { | |
page-break-after: avoid; | |
} | |
} | |
</style> | |
<!-- Styles for R syntax highlighter --> | |
<style type="text/css"> | |
pre .operator, | |
pre .paren { | |
color: rgb(104, 118, 135) | |
} | |
pre .literal { | |
color: rgb(88, 72, 246) | |
} | |
pre .number { | |
color: rgb(0, 0, 205); | |
} | |
pre .comment { | |
color: rgb(76, 136, 107); | |
} | |
pre .keyword { | |
color: rgb(0, 0, 255); | |
} | |
pre .identifier { | |
color: rgb(0, 0, 0); | |
} | |
pre .string { | |
color: rgb(3, 106, 7); | |
} | |
</style> | |
<!-- R syntax highlighter --> | |
<script type="text/javascript"> | |
var hljs=new function(){function m(p){return p.replace(/&/gm,"&").replace(/</gm,"<")}function f(r,q,p){return RegExp(q,"m"+(r.cI?"i":"")+(p?"g":""))}function b(r){for(var p=0;p<r.childNodes.length;p++){var q=r.childNodes[p];if(q.nodeName=="CODE"){return q}if(!(q.nodeType==3&&q.nodeValue.match(/\s+/))){break}}}function h(t,s){var p="";for(var r=0;r<t.childNodes.length;r++){if(t.childNodes[r].nodeType==3){var q=t.childNodes[r].nodeValue;if(s){q=q.replace(/\n/g,"")}p+=q}else{if(t.childNodes[r].nodeName=="BR"){p+="\n"}else{p+=h(t.childNodes[r])}}}if(/MSIE [678]/.test(navigator.userAgent)){p=p.replace(/\r/g,"\n")}return p}function a(s){var r=s.className.split(/\s+/);r=r.concat(s.parentNode.className.split(/\s+/));for(var q=0;q<r.length;q++){var p=r[q].replace(/^language-/,"");if(e[p]){return p}}}function c(q){var p=[];(function(s,t){for(var r=0;r<s.childNodes.length;r++){if(s.childNodes[r].nodeType==3){t+=s.childNodes[r].nodeValue.length}else{if(s.childNodes[r].nodeName=="BR"){t+=1}else{if(s.childNodes[r].nodeType==1){p.push({event:"start",offset:t,node:s.childNodes[r]});t=arguments.callee(s.childNodes[r],t);p.push({event:"stop",offset:t,node:s.childNodes[r]})}}}}return t})(q,0);return p}function k(y,w,x){var q=0;var z="";var s=[];function u(){if(y.length&&w.length){if(y[0].offset!=w[0].offset){return(y[0].offset<w[0].offset)?y:w}else{return w[0].event=="start"?y:w}}else{return y.length?y:w}}function t(D){var A="<"+D.nodeName.toLowerCase();for(var B=0;B<D.attributes.length;B++){var C=D.attributes[B];A+=" "+C.nodeName.toLowerCase();if(C.value!==undefined&&C.value!==false&&C.value!==null){A+='="'+m(C.value)+'"'}}return A+">"}while(y.length||w.length){var v=u().splice(0,1)[0];z+=m(x.substr(q,v.offset-q));q=v.offset;if(v.event=="start"){z+=t(v.node);s.push(v.node)}else{if(v.event=="stop"){var p,r=s.length;do{r--;p=s[r];z+=("</"+p.nodeName.toLowerCase()+">")}while(p!=v.node);s.splice(r,1);while(r<s.length){z+=t(s[r]);r++}}}}return z+m(x.substr(q))}function j(){function q(x,y,v){if(x.compiled){return}var u;var s=[];if(x.k){x.lR=f(y,x.l||hljs.IR,true);for(var w in x.k){if(!x.k.hasOwnProperty(w)){continue}if(x.k[w] instanceof Object){u=x.k[w]}else{u=x.k;w="keyword"}for(var r in u){if(!u.hasOwnProperty(r)){continue}x.k[r]=[w,u[r]];s.push(r)}}}if(!v){if(x.bWK){x.b="\\b("+s.join("|")+")\\s"}x.bR=f(y,x.b?x.b:"\\B|\\b");if(!x.e&&!x.eW){x.e="\\B|\\b"}if(x.e){x.eR=f(y,x.e)}}if(x.i){x.iR=f(y,x.i)}if(x.r===undefined){x.r=1}if(!x.c){x.c=[]}x.compiled=true;for(var t=0;t<x.c.length;t++){if(x.c[t]=="self"){x.c[t]=x}q(x.c[t],y,false)}if(x.starts){q(x.starts,y,false)}}for(var p in e){if(!e.hasOwnProperty(p)){continue}q(e[p].dM,e[p],true)}}function d(B,C){if(!j.called){j();j.called=true}function q(r,M){for(var L=0;L<M.c.length;L++){if((M.c[L].bR.exec(r)||[null])[0]==r){return M.c[L]}}}function v(L,r){if(D[L].e&&D[L].eR.test(r)){return 1}if(D[L].eW){var M=v(L-1,r);return M?M+1:0}return 0}function w(r,L){return L.i&&L.iR.test(r)}function K(N,O){var M=[];for(var L=0;L<N.c.length;L++){M.push(N.c[L].b)}var r=D.length-1;do{if(D[r].e){M.push(D[r].e)}r--}while(D[r+1].eW);if(N.i){M.push(N.i)}return f(O,M.join("|"),true)}function p(M,L){var N=D[D.length-1];if(!N.t){N.t=K(N,E)}N.t.lastIndex=L;var r=N.t.exec(M);return r?[M.substr(L,r.index-L),r[0],false]:[M.substr(L),"",true]}function z(N,r){var L=E.cI?r[0].toLowerCase():r[0];var M=N.k[L];if(M&&M instanceof Array){return M}return false}function F(L,P){L=m(L);if(!P.k){return L}var r="";var O=0;P.lR.lastIndex=0;var M=P.lR.exec(L);while(M){r+=L.substr(O,M.index-O);var N=z(P,M);if(N){x+=N[1];r+='<span class="'+N[0]+'">'+M[0]+"</span>"}else{r+=M[0]}O=P.lR.lastIndex;M=P.lR.exec(L)}return r+L.substr(O,L.length-O)}function J(L,M){if(M.sL&&e[M.sL]){var r=d(M.sL,L);x+=r.keyword_count;return r.value}else{return F(L,M)}}function I(M,r){var L=M.cN?'<span class="'+M.cN+'">':"";if(M.rB){y+=L;M.buffer=""}else{if(M.eB){y+=m(r)+L;M.buffer=""}else{y+=L;M.buffer=r}}D.push(M);A+=M.r}function G(N,M,Q){var R=D[D.length-1];if(Q){y+=J(R.buffer+N,R);return false}var P=q(M,R);if(P){y+=J(R.buffer+N,R);I(P,M);return P.rB}var L=v(D.length-1,M);if(L){var O=R.cN?"</span>":"";if(R.rE){y+=J(R.buffer+N,R)+O}else{if(R.eE){y+=J(R.buffer+N,R)+O+m(M)}else{y+=J(R.buffer+N+M,R)+O}}while(L>1){O=D[D.length-2].cN?"</span>":"";y+=O;L--;D.length--}var r=D[D.length-1];D.length--;D[D.length-1].buffer="";if(r.starts){I(r.starts,"")}return R.rE}if(w(M,R)){throw"Illegal"}}var E=e[B];var D=[E.dM];var A=0;var x=0;var y="";try{var s,u=0;E.dM.buffer="";do{s=p(C,u);var t=G(s[0],s[1],s[2]);u+=s[0].length;if(!t){u+=s[1].length}}while(!s[2]);if(D.length>1){throw"Illegal"}return{r:A,keyword_count:x,value:y}}catch(H){if(H=="Illegal"){return{r:0,keyword_count:0,value:m(C)}}else{throw H}}}function g(t){var p={keyword_count:0,r:0,value:m(t)};var r=p;for(var q in e){if(!e.hasOwnProperty(q)){continue}var s=d(q,t);s.language=q;if(s.keyword_count+s.r>r.keyword_count+r.r){r=s}if(s.keyword_count+s.r>p.keyword_count+p.r){r=p;p=s}}if(r.language){p.second_best=r}return p}function i(r,q,p){if(q){r=r.replace(/^((<[^>]+>|\t)+)/gm,function(t,w,v,u){return w.replace(/\t/g,q)})}if(p){r=r.replace(/\n/g,"<br>")}return r}function n(t,w,r){var x=h(t,r);var v=a(t);var y,s;if(v){y=d(v,x)}else{return}var q=c(t);if(q.length){s=document.createElement("pre");s.innerHTML=y.value;y.value=k(q,c(s),x)}y.value=i(y.value,w,r);var u=t.className;if(!u.match("(\\s|^)(language-)?"+v+"(\\s|$)")){u=u?(u+" "+v):v}if(/MSIE [678]/.test(navigator.userAgent)&&t.tagName=="CODE"&&t.parentNode.tagName=="PRE"){s=t.parentNode;var p=document.createElement("div");p.innerHTML="<pre><code>"+y.value+"</code></pre>";t=p.firstChild.firstChild;p.firstChild.cN=s.cN;s.parentNode.replaceChild(p.firstChild,s)}else{t.innerHTML=y.value}t.className=u;t.result={language:v,kw:y.keyword_count,re:y.r};if(y.second_best){t.second_best={language:y.second_best.language,kw:y.second_best.keyword_count,re:y.second_best.r}}}function o(){if(o.called){return}o.called=true;var r=document.getElementsByTagName("pre");for(var p=0;p<r.length;p++){var q=b(r[p]);if(q){n(q,hljs.tabReplace)}}}function l(){if(window.addEventListener){window.addEventListener("DOMContentLoaded",o,false);window.addEventListener("load",o,false)}else{if(window.attachEvent){window.attachEvent("onload",o)}else{window.onload=o}}}var e={};this.LANGUAGES=e;this.highlight=d;this.highlightAuto=g;this.fixMarkup=i;this.highlightBlock=n;this.initHighlighting=o;this.initHighlightingOnLoad=l;this.IR="[a-zA-Z][a-zA-Z0-9_]*";this.UIR="[a-zA-Z_][a-zA-Z0-9_]*";this.NR="\\b\\d+(\\.\\d+)?";this.CNR="\\b(0[xX][a-fA-F0-9]+|(\\d+(\\.\\d*)?|\\.\\d+)([eE][-+]?\\d+)?)";this.BNR="\\b(0b[01]+)";this.RSR="!|!=|!==|%|%=|&|&&|&=|\\*|\\*=|\\+|\\+=|,|\\.|-|-=|/|/=|:|;|<|<<|<<=|<=|=|==|===|>|>=|>>|>>=|>>>|>>>=|\\?|\\[|\\{|\\(|\\^|\\^=|\\||\\|=|\\|\\||~";this.ER="(?![\\s\\S])";this.BE={b:"\\\\.",r:0};this.ASM={cN:"string",b:"'",e:"'",i:"\\n",c:[this.BE],r:0};this.QSM={cN:"string",b:'"',e:'"',i:"\\n",c:[this.BE],r:0};this.CLCM={cN:"comment",b:"//",e:"$"};this.CBLCLM={cN:"comment",b:"/\\*",e:"\\*/"};this.HCM={cN:"comment",b:"#",e:"$"};this.NM={cN:"number",b:this.NR,r:0};this.CNM={cN:"number",b:this.CNR,r:0};this.BNM={cN:"number",b:this.BNR,r:0};this.inherit=function(r,s){var p={};for(var q in r){p[q]=r[q]}if(s){for(var q in s){p[q]=s[q]}}return p}}();hljs.LANGUAGES.cpp=function(){var a={keyword:{"false":1,"int":1,"float":1,"while":1,"private":1,"char":1,"catch":1,"export":1,virtual:1,operator:2,sizeof:2,dynamic_cast:2,typedef:2,const_cast:2,"const":1,struct:1,"for":1,static_cast:2,union:1,namespace:1,unsigned:1,"long":1,"throw":1,"volatile":2,"static":1,"protected":1,bool:1,template:1,mutable:1,"if":1,"public":1,friend:2,"do":1,"return":1,"goto":1,auto:1,"void":2,"enum":1,"else":1,"break":1,"new":1,extern:1,using:1,"true":1,"class":1,asm:1,"case":1,typeid:1,"short":1,reinterpret_cast:2,"default":1,"double":1,register:1,explicit:1,signed:1,typename:1,"try":1,"this":1,"switch":1,"continue":1,wchar_t:1,inline:1,"delete":1,alignof:1,char16_t:1,char32_t:1,constexpr:1,decltype:1,noexcept:1,nullptr:1,static_assert:1,thread_local:1,restrict:1,_Bool:1,complex:1},built_in:{std:1,string:1,cin:1,cout:1,cerr:1,clog:1,stringstream:1,istringstream:1,ostringstream:1,auto_ptr:1,deque:1,list:1,queue:1,stack:1,vector:1,map:1,set:1,bitset:1,multiset:1,multimap:1,unordered_set:1,unordered_map:1,unordered_multiset:1,unordered_multimap:1,array:1,shared_ptr:1}};return{dM:{k:a,i:"</",c:[hljs.CLCM,hljs.CBLCLM,hljs.QSM,{cN:"string",b:"'\\\\?.",e:"'",i:"."},{cN:"number",b:"\\b(\\d+(\\.\\d*)?|\\.\\d+)(u|U|l|L|ul|UL|f|F)"},hljs.CNM,{cN:"preprocessor",b:"#",e:"$"},{cN:"stl_container",b:"\\b(deque|list|queue|stack|vector|map|set|bitset|multiset|multimap|unordered_map|unordered_set|unordered_multiset|unordered_multimap|array)\\s*<",e:">",k:a,r:10,c:["self"]}]}}}();hljs.LANGUAGES.r={dM:{c:[hljs.HCM,{cN:"number",b:"\\b0[xX][0-9a-fA-F]+[Li]?\\b",e:hljs.IMMEDIATE_RE,r:0},{cN:"number",b:"\\b\\d+(?:[eE][+\\-]?\\d*)?L\\b",e:hljs.IMMEDIATE_RE,r:0},{cN:"number",b:"\\b\\d+\\.(?!\\d)(?:i\\b)?",e:hljs.IMMEDIATE_RE,r:1},{cN:"number",b:"\\b\\d+(?:\\.\\d*)?(?:[eE][+\\-]?\\d*)?i?\\b",e:hljs.IMMEDIATE_RE,r:0},{cN:"number",b:"\\.\\d+(?:[eE][+\\-]?\\d*)?i?\\b",e:hljs.IMMEDIATE_RE,r:1},{cN:"keyword",b:"(?:tryCatch|library|setGeneric|setGroupGeneric)\\b",e:hljs.IMMEDIATE_RE,r:10},{cN:"keyword",b:"\\.\\.\\.",e:hljs.IMMEDIATE_RE,r:10},{cN:"keyword",b:"\\.\\.\\d+(?![\\w.])",e:hljs.IMMEDIATE_RE,r:10},{cN:"keyword",b:"\\b(?:function)",e:hljs.IMMEDIATE_RE,r:2},{cN:"keyword",b:"(?:if|in|break|next|repeat|else|for|return|switch|while|try|stop|warning|require|attach|detach|source|setMethod|setClass)\\b",e:hljs.IMMEDIATE_RE,r:1},{cN:"literal",b:"(?:NA|NA_integer_|NA_real_|NA_character_|NA_complex_)\\b",e:hljs.IMMEDIATE_RE,r:10},{cN:"literal",b:"(?:NULL|TRUE|FALSE|T|F|Inf|NaN)\\b",e:hljs.IMMEDIATE_RE,r:1},{cN:"identifier",b:"[a-zA-Z.][a-zA-Z0-9._]*\\b",e:hljs.IMMEDIATE_RE,r:0},{cN:"operator",b:"<\\-(?!\\s*\\d)",e:hljs.IMMEDIATE_RE,r:2},{cN:"operator",b:"\\->|<\\-",e:hljs.IMMEDIATE_RE,r:1},{cN:"operator",b:"%%|~",e:hljs.IMMEDIATE_RE},{cN:"operator",b:">=|<=|==|!=|\\|\\||&&|=|\\+|\\-|\\*|/|\\^|>|<|!|&|\\||\\$|:",e:hljs.IMMEDIATE_RE,r:0},{cN:"operator",b:"%",e:"%",i:"\\n",r:1},{cN:"identifier",b:"`",e:"`",r:0},{cN:"string",b:'"',e:'"',c:[hljs.BE],r:0},{cN:"string",b:"'",e:"'",c:[hljs.BE],r:0},{cN:"paren",b:"[[({\\])}]",e:hljs.IMMEDIATE_RE,r:0}]}}; | |
hljs.initHighlightingOnLoad(); | |
</script> | |
</head> | |
<body> | |
<p><link href="http://kevinburke.bitbucket.org/markdowncss/markdown.css" rel="stylesheet"></p> | |
<h1>Finding indicator dependencies</h1> | |
<p>I had been meaning to look into R's metaprogramming features for a while now. I finally had a chance today (thanks <a href="http://adv-r.had.co.nz/">Hadley</a>!), and I used it to experiment towards a problem that had been in the back of my mind for a while: finding depencies within indicator definitions.</p> | |
<p>Below, I implement a find dependencies function, which takes a set of indicators, and finds dependencies within it. Indicators are fields within a dataset, some of which are already there, and some of which are newly created. The dependency finding problem is investigating which new indicators derive from which existing ones. We think of these relationships as dependencies: for an indicator such as pupil-to-teacher-ratio (defined as the number-of-pupils divided by the number-of-teachers), pupil-to-teacher-ratio is dependent on number-of-pupils and number-of-teachers. Below, we'll see the dependency lists for a whole set of indicators.</p> | |
<p>This is useful in all sorts of contexts, from missing data analysis to many optimizations and streaming tasks. Creating a way to write indicators that allows for dependency analysis is usually tricky; you often end up introducing a lot of overhead for dependency analysis that detracts from the domain-specific indicator definitions. R's metaprogramming features show a pretty neat approach, at least at an experimental level. Note that most of the knowledge needed here is thanks to Hadley Wickam's amazing Advanced R programming book, particularly the chapters on <a href="http://adv-r.had.co.nz/Environments.html">environments</a>, <a href="http://adv-r.had.co.nz/Computing-on-the-language.html">non-standard evaluation</a>, and <a href="http://adv-r.had.co.nz/Expressions.html">metaprogramming</a>.</p> | |
<h2>First, The Indicators</h2> | |
<p>First, lets define the indicators. You can skim these, perhaps noticing that we aren't writing much more than the pure definitions of each indicator. I left the <code>infinite_as_NA</code> function in, but its possible that could be pulled out of here into another layer.</p> | |
<pre><code class="r"># Quick function for converting infinite values to NA | |
infinite_as_NA <- function(x) { | |
ifelse(is.infinite(x), NA, x) | |
} | |
require(stringr) # for string detect | |
</code></pre> | |
<pre><code>## Loading required package: stringr | |
</code></pre> | |
<pre><code class="r"># Example indicators (these are from a real example) | |
line_by_line_indicators = quote({ | |
is_primary = str_detect(facility_type, "primary") | |
is_junior_secondary = str_detect(facility_type, "junior") | |
pj = is_primary | is_junior_secondary | |
src = "mopup" | |
date_of_survey = as.character(as.Date(start)) | |
# INFRASTRUCTURE | |
improved_water_supply = improved_water_supply.tap | improved_water_supply.protected_well | | |
improved_water_supply.rainwater | improved_water_supply.handpump | |
improved_water_supply = improved_water_supply.tap | improved_water_supply.protected_well | | |
improved_water_supply.rainwater | improved_water_supply.handpump | |
improved_sanitation = improved_sanitation.vip_latrine | improved_sanitation.pit_latrine_with_slab | | |
improved_sanitation.flush | |
# PARTICIPATION | |
male_to_female_student_ratio = num_students_male/num_students_female | |
# INFRASTRUCTURE:WATER&SAN | |
pupil_toilet_ratio_facility = infinite_as_NA(num_students_total/num_toilets_total) | |
# INFRASTRUCTURE:BUILDING STRUCTURE | |
power_access = (power_sources.generator | power_sources.solar_system | power_sources.grid) | |
# INFRASTRUCTURE:LEARNING ENVIRONMENT | |
pupil_classrm_ratio = num_students_total/num_classrms_total | |
# ADEQUACY OF STAFFING | |
pupil_tchr_ratio = num_students_total/num_tchrs_total | |
}) | |
</code></pre> | |
<h2>Find Dependencies</h2> | |
<p>Here is the function to find dependencies (note: built off of one day's learning about meta-programming, if you notice a bug, do point it out).</p> | |
<pre><code class="r">## Find 'dependencies' in lhs = rhs type equations, where all names in rhs | |
## are dependences of lhs. Outputs in a named vector, where names are from | |
## lhs, and the values from rhs. Example, pupil_teacher_ratio = num_pupils / | |
## num_tchrs will produce (in json-represented named vector): | |
## {'pupil_teacher_ratio': 'num_pupils', 'pupil_teacher_ratio' : 'num_tchrs'} | |
find_deps <- function(x) { | |
if (is.atomic(x)) { | |
# these are evaluated values; we don't want these | |
character() | |
} else if (is.name(x)) { | |
# these are our unevaluated values; this is what we want | |
as.character(x) | |
} else if (is.call(x)) { | |
if (identical(x[[1]], quote(`=`))) { | |
# x looks like = lhs rhs; we recurse first on rhs (x[-1][-1]) | |
rhs <- unlist(lapply(x[-1][-1], find_deps)) | |
# and take the second element (x[[2]]) as lhs | |
lhs <- as.character(x[[2]]) | |
if (length(rhs) == 0) | |
setNames(NA, lhs) # no dependencies | |
else setNames(rhs, rep(lhs, length(rhs))) | |
} else { | |
# for other function calls, we recurse only on everything but function name | |
unlist(lapply(x[-1], find_deps)) | |
} | |
} | |
} | |
</code></pre> | |
<h3>The magic</h3> | |
<p>And finally, here is the magic. A dictionary of dependencies. The <em>dependent</em> indicator is listed as the key, once per indicator that it <em>depends on</em> (which is the value). Notice <code>src</code>, which doesn't have any dependencies.</p> | |
<pre><code class="r">dependencies <- find_deps(line_by_line_indicators) | |
cat(RJSONIO::toJSON(dependencies, pretty = TRUE)) | |
</code></pre> | |
<pre><code>## { | |
## "is_primary" : "facility_type", | |
## "is_junior_secondary" : "facility_type", | |
## "pj" : "is_primary", | |
## "pj" : "is_junior_secondary", | |
## "src" : null, | |
## "date_of_survey" : "start", | |
## "improved_water_supply" : "improved_water_supply.tap", | |
## "improved_water_supply" : "improved_water_supply.protected_well", | |
## "improved_water_supply" : "improved_water_supply.rainwater", | |
## "improved_water_supply" : "improved_water_supply.handpump", | |
## "improved_water_supply" : "improved_water_supply.tap", | |
## "improved_water_supply" : "improved_water_supply.protected_well", | |
## "improved_water_supply" : "improved_water_supply.rainwater", | |
## "improved_water_supply" : "improved_water_supply.handpump", | |
## "improved_sanitation" : "improved_sanitation.vip_latrine", | |
## "improved_sanitation" : "improved_sanitation.pit_latrine_with_slab", | |
## "improved_sanitation" : "improved_sanitation.flush", | |
## "male_to_female_student_ratio" : "num_students_male", | |
## "male_to_female_student_ratio" : "num_students_female", | |
## "pupil_toilet_ratio_facility" : "num_students_total", | |
## "pupil_toilet_ratio_facility" : "num_toilets_total", | |
## "power_access" : "power_sources.generator", | |
## "power_access" : "power_sources.solar_system", | |
## "power_access" : "power_sources.grid", | |
## "pupil_classrm_ratio" : "num_students_total", | |
## "pupil_classrm_ratio" : "num_classrms_total", | |
## "pupil_tchr_ratio" : "num_students_total", | |
## "pupil_tchr_ratio" : "num_tchrs_total" | |
## } | |
</code></pre> | |
<h3>Evaluation</h3> | |
<p>Evaluation isn't super tricky either. Here it is, being performed on a real dataset:</p> | |
<pre><code class="r">e <- readRDS("~/Code/mop_up/data/in_process_data/education_mopup_outliercleaned.rds") | |
e2 <- within(e, eval(line_by_line_indicators)) | |
## Some examples to show you the output was right: | |
sample_columns <- unique(c(na.omit(dependencies)[1:4], names(dependencies)[1:5])) | |
e2[1:10, sample_columns] | |
</code></pre> | |
<pre><code>## facility_type is_primary is_junior_secondary pj src | |
## 1 junior_sec_only FALSE TRUE TRUE mopup | |
## 2 primary_only TRUE FALSE TRUE mopup | |
## 3 <NA> NA NA NA mopup | |
## 4 primary_only TRUE FALSE TRUE mopup | |
## 5 primary_only TRUE FALSE TRUE mopup | |
## 6 primary_only TRUE FALSE TRUE mopup | |
## 7 junior_sec_only FALSE TRUE TRUE mopup | |
## 8 primary_only TRUE FALSE TRUE mopup | |
## 9 primary_only TRUE FALSE TRUE mopup | |
## 10 primary_only TRUE FALSE TRUE mopup | |
</code></pre> | |
</body> | |
</html> | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<link href="http://kevinburke.bitbucket.org/markdowncss/markdown.css" rel="stylesheet"> | |
Finding indicator dependencies | |
======================================================== | |
I had been meaning to look into R's metaprogramming features for a while now. I finally had a chance today (thanks [Hadley](http://adv-r.had.co.nz/)!), and I used it to experiment towards a problem that had been in the back of my mind for a while: finding depencies within indicator definitions. | |
Below, I implement a find dependencies function, which takes a set of indicators, and finds dependencies within it. Indicators are fields within a dataset, some of which are already there, and some of which are newly created. The dependency finding problem is investigating which new indicators derive from which existing ones. We think of these relationships as dependencies: for an indicator such as pupil-to-teacher-ratio (defined as the number-of-pupils divided by the number-of-teachers), pupil-to-teacher-ratio is dependent on number-of-pupils and number-of-teachers. Below, we'll see the dependency lists for a whole set of indicators. | |
This is useful in all sorts of contexts, from missing data analysis to many optimizations and streaming tasks. Creating a way to write indicators that allows for dependency analysis is usually tricky; you often end up introducing a lot of overhead for dependency analysis that detracts from the domain-specific indicator definitions. R's metaprogramming features show a pretty neat approach, at least at an experimental level. Note that most of the knowledge needed here is thanks to Hadley Wickam's amazing Advanced R programming book, particularly the chapters on [environments](http://adv-r.had.co.nz/Environments.html), [non-standard evaluation](http://adv-r.had.co.nz/Computing-on-the-language.html), and [metaprogramming](http://adv-r.had.co.nz/Expressions.html). | |
## First, The Indicators | |
First, lets define the indicators. You can skim these, perhaps noticing that we aren't writing much more than the pure definitions of each indicator. I left the `infinite_as_NA` function in, but its possible that could be pulled out of here into another layer. | |
```{r} | |
# Quick function for converting infinite values to NA | |
infinite_as_NA <- function(x) { ifelse(is.infinite(x), NA, x)} | |
require(stringr) # for string detect | |
# Example indicators (these are from a real example) | |
line_by_line_indicators = quote({ | |
is_primary = str_detect(facility_type, 'primary') | |
is_junior_secondary = str_detect(facility_type, 'junior') | |
pj = is_primary | is_junior_secondary | |
src = "mopup" | |
date_of_survey = as.character(as.Date(start)) | |
#INFRASTRUCTURE | |
improved_water_supply = improved_water_supply.tap | improved_water_supply.protected_well | | |
improved_water_supply.rainwater | improved_water_supply.handpump | |
improved_water_supply = improved_water_supply.tap | improved_water_supply.protected_well | | |
improved_water_supply.rainwater | improved_water_supply.handpump | |
improved_sanitation = improved_sanitation.vip_latrine | | |
improved_sanitation.pit_latrine_with_slab | improved_sanitation.flush | |
#PARTICIPATION | |
male_to_female_student_ratio = num_students_male / num_students_female | |
#INFRASTRUCTURE:WATER&SAN | |
pupil_toilet_ratio_facility = infinite_as_NA(num_students_total / num_toilets_total) | |
#INFRASTRUCTURE:BUILDING STRUCTURE | |
power_access = (power_sources.generator | power_sources.solar_system | power_sources.grid) | |
#INFRASTRUCTURE:LEARNING ENVIRONMENT | |
pupil_classrm_ratio = num_students_total / num_classrms_total | |
#ADEQUACY OF STAFFING | |
pupil_tchr_ratio = num_students_total / num_tchrs_total | |
}) | |
``` | |
## Find Dependencies | |
Here is the function to find dependencies (note: built off of one day's learning about meta-programming, if you notice a bug, do point it out). | |
```{r fig.width=7, fig.height=6} | |
## Find "dependencies" in lhs = rhs type equations, where all names in rhs | |
### are dependences of lhs. Outputs in a named vector, where names are from | |
### lhs, and the values from rhs. | |
### Example, pupil_teacher_ratio = num_pupils / num_tchrs | |
### will produce (in json-represented named vector): | |
### {"pupil_teacher_ratio": "num_pupils", "pupil_teacher_ratio" : "num_tchrs"} | |
find_deps <- function(x) { | |
if (is.atomic(x)) { | |
# these are evaluated values; we don't want these | |
character() | |
} else if (is.name(x)) { | |
# these are our unevaluated values; this is what we want | |
as.character(x) | |
} else if (is.call(x)) { | |
if (identical(x[[1]], quote(`=`))) { | |
# x looks like = lhs rhs; we recurse first on rhs (x[-1][-1]) | |
rhs <- unlist(lapply(x[-1][-1], find_deps)) | |
# and take the second element (x[[2]]) as lhs | |
lhs <- as.character(x[[2]]) | |
if(length(rhs) == 0) setNames(NA, lhs) # no dependencies | |
else setNames(rhs, rep(lhs, length(rhs))) | |
} else { | |
# for other function calls, we recurse only on everything but function name | |
unlist(lapply(x[-1], find_deps)) | |
} | |
} | |
} | |
``` | |
### The magic | |
And finally, here is the magic. A dictionary of dependencies. The _dependent_ indicator is listed as the key, once per indicator that it _depends on_ (which is the value). Notice `src`, which doesn't have any dependencies. | |
```{r} | |
dependencies <- find_deps(line_by_line_indicators) | |
cat(RJSONIO::toJSON(dependencies, pretty=TRUE)) | |
``` | |
### Evaluation | |
Evaluation isn't super tricky either. Here it is, being performed on a real dataset: | |
```{r} | |
e <- readRDS("~/Code/mop_up/data/in_process_data/education_mopup_outliercleaned.rds") | |
e2 <- within(e, eval(line_by_line_indicators)) | |
## Some examples to show you the output was right: | |
sample_columns <- unique(c(na.omit(dependencies)[1:4], names(dependencies)[1:5])) | |
e2[1:10, sample_columns] | |
``` |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment