Skip to content

Instantly share code, notes, and snippets.

@albertmeronyo
Last active August 29, 2015 14:15
Show Gist options
  • Select an option

  • Save albertmeronyo/dec56707a35ecfb70d69 to your computer and use it in GitHub Desktop.

Select an option

Save albertmeronyo/dec56707a35ecfb70d69 to your computer and use it in GitHub Desktop.
# OEML dataset stats
# hisco_collect
#   Fill the global array hisco_files with the regular files in the current
#   directory whose names match *hisco.nt.bz2, in glob order.
#   A glob is used instead of parsing `ls` output so that unusual file names
#   (spaces, newlines) or a directory that happens to match the pattern
#   cannot skew the snapshot count. A pattern that matches nothing yields an
#   empty array.
hisco_collect() {
  local candidate
  hisco_files=()
  for candidate in *hisco.nt.bz2; do
    [[ -f $candidate ]] && hisco_files+=("$candidate")
  done
  return 0
}

# hisco_total_size [FILE...]
#   Print the total number of lines in the given bzip2-compressed files after
#   decompression. Prints 0 when no file is given: bzcat must not be started
#   without operands, because it would then wait on standard input.
hisco_total_size() {
  if (($# == 0)); then
    echo 0
    return 0
  fi
  bzcat -- "$@" | wc -l
}

hisco_collect
# totalSize
hisco_total_size "${hisco_files[@]}"
# nSnapshots
echo "${#hisco_files[@]}"
# Change statistics between consecutive *hisco.nt.bz2 snapshots.
#
# Every snapshot is decompressed and sorted, and `comm` classifies each line
# as only-in-old (delete), only-in-new (insert) or common. Counting the
# output of `diff -u` with grep '^+' / '^-' is avoided on purpose: whenever
# two inputs differ, diff also prints a "---" and a "+++" file header line,
# and those would be counted as one extra delete and one extra insert.

# hisco_sorted FILE
#   Write the decompressed lines of FILE in byte order. LC_ALL=C keeps the
#   ordering identical to the one comm is run with below.
hisco_sorted() {
  bzcat -- "$1" | LC_ALL=C sort
}

# hisco_pair_count MODE OLD NEW
#   Print how many lines of the selected class exist between two snapshots.
#   MODE is the comm column selection:
#     -13  lines only in NEW   (inserts)
#     -23  lines only in OLD   (deletes)
#     -12  lines in both files (common)
hisco_pair_count() {
  LC_ALL=C comm "$1" <(hisco_sorted "$2") <(hisco_sorted "$3") | wc -l
}

# hisco_each_pair MODE FILE...
#   Print one hisco_pair_count result per consecutive pair of FILEs, in the
#   order given. Prints nothing when fewer than two FILEs are passed.
hisco_each_pair() {
  local mode=$1
  shift
  local -a snaps=("$@")
  local k
  for ((k = 1; k < ${#snaps[@]}; k++)); do
    hisco_pair_count "$mode" "${snaps[k - 1]}" "${snaps[k]}"
  done
}

# hisco_sum
#   Add up the first field of every line on stdin. "s + 0" makes the result
#   print as 0 (not as an empty line) when there is nothing to add.
hisco_sum() {
  awk '{ s += $1 } END { print s + 0 }'
}

# Snapshot list, gathered once in glob order instead of re-running
# `ls | head | tail` for every pair.
hisco_snaps=()
for hisco_snap in *hisco.nt.bz2; do
  [[ -f $hisco_snap ]] && hisco_snaps+=("$hisco_snap")
done

# The sample pair is the 5th and 6th snapshot, or the last two snapshots
# when fewer than six exist.
hisco_pair_end=${#hisco_snaps[@]}
((hisco_pair_end > 6)) && hisco_pair_end=6

if ((hisco_pair_end >= 2)); then
  # nInserts (pair)
  hisco_pair_count -13 "${hisco_snaps[hisco_pair_end - 2]}" "${hisco_snaps[hisco_pair_end - 1]}"
  # nDeletes (pair)
  hisco_pair_count -23 "${hisco_snaps[hisco_pair_end - 2]}" "${hisco_snaps[hisco_pair_end - 1]}"
else
  # No pair to compare: report zero inserts and zero deletes.
  echo 0
  echo 0
fi

# Per-pair results are computed once and reused for the sums further down,
# so each pair is decompressed and sorted once per statistic, not twice.
hisco_inserts=$(hisco_each_pair -13 "${hisco_snaps[@]}")
hisco_deletes=$(hisco_each_pair -23 "${hisco_snaps[@]}")
hisco_common=$(hisco_each_pair -12 "${hisco_snaps[@]}")

# nInserts (dataset): one line per consecutive pair
[[ -z $hisco_inserts ]] || printf '%s\n' "$hisco_inserts"
# nDeletes (dataset)
[[ -z $hisco_deletes ]] || printf '%s\n' "$hisco_deletes"
# nComm (dataset)
[[ -z $hisco_common ]] || printf '%s\n' "$hisco_common"

# Same previous 3, summing intermediate results
printf '%s\n' "$hisco_inserts" | hisco_sum
printf '%s\n' "$hisco_deletes" | hisco_sum
printf '%s\n' "$hisco_common" | hisco_sum
# hisco_count_lines_with TEXT
#   Print the number of stdin lines that contain TEXT as a literal substring.
#   -F matters here: the URIs below contain dots, which would otherwise act
#   as "any character" wildcards in a regular expression. -c lets grep do the
#   counting itself instead of piping matches into `wc -l`.
hisco_count_lines_with() {
  grep -cF -- "$1"
}

# totalInstances: lines mentioning the ns#occupation URI, over all snapshots
bzcat -- *hisco.nt.bz2 | hisco_count_lines_with "http://cedar.example.org/ns#occupation"
# totalStructural: lines mentioning the skos/core#broader URI, over all snapshots
bzcat -- *hisco.nt.bz2 | hisco_count_lines_with "http://www.w3.org/2004/02/skos/core#broader"
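# Generic version of the statistics above: one result per entry of the current directory, each entry being treated as a directory of uncompressed snapshot files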
# dataset_collect DIR
#   Fill the global array dataset_files with the regular files directly
#   inside DIR, in glob order. Sub-directories and other non-regular entries
#   are left out, and an empty or missing DIR yields an empty array.
#   Globbing replaces `ls $d/*`, whose output changes shape (headers, blank
#   lines, nested listings) as soon as DIR contains a sub-directory.
dataset_collect() {
  local dir=${1%/} entry
  dataset_files=()
  for entry in "$dir"/*; do
    [[ -f $entry ]] && dataset_files+=("$entry")
  done
  return 0
}

# dataset_total_size DIR
#   Print the total number of lines over all snapshot files in DIR.
#   Prints 0 for a directory without files; cat is not started without
#   operands because it would then read standard input.
dataset_total_size() {
  dataset_collect "$1"
  if ((${#dataset_files[@]} == 0)); then
    echo 0
    return 0
  fi
  cat -- "${dataset_files[@]}" | wc -l
}

# dataset_n_snapshots DIR
#   Print the number of snapshot files in DIR.
dataset_n_snapshots() {
  dataset_collect "$1"
  echo "${#dataset_files[@]}"
}

# totalSize: one line per dataset directory. "*/" only matches directories,
# so plain files lying next to the datasets no longer produce bogus rows,
# and names containing spaces are kept intact.
for dataset_dir in */; do
  [[ -d $dataset_dir ]] || continue
  dataset_total_size "$dataset_dir"
done
# nSnapshots: one line per dataset directory
for dataset_dir in */; do
  [[ -d $dataset_dir ]] || continue
  dataset_n_snapshots "$dataset_dir"
done
# Change statistics between consecutive snapshot files of each dataset
# directory.
#
# The files of a pair are sorted and classified line by line with `comm`.
# Counting `diff -u` output with grep '^+' / '^-' is avoided on purpose:
# whenever two inputs differ, diff also prints a "---" and a "+++" header
# line, which would add one bogus delete and one bogus insert per pair.

# dataset_collect DIR
#   Fill the global array dataset_files with the regular files directly
#   inside DIR, in glob order. Sub-directories and other non-regular entries
#   are left out, and an empty or missing DIR yields an empty array.
dataset_collect() {
  local dir=${1%/} entry
  dataset_files=()
  for entry in "$dir"/*; do
    [[ -f $entry ]] && dataset_files+=("$entry")
  done
  return 0
}

# dataset_pair_count MODE OLD NEW
#   Print how many lines of the selected class exist between two files.
#   MODE is the comm column selection:
#     -13  lines only in NEW   (inserts)
#     -23  lines only in OLD   (deletes)
#     -12  lines in both files (common)
#   sort and comm both run with LC_ALL=C so they agree on the ordering.
dataset_pair_count() {
  LC_ALL=C comm "$1" <(LC_ALL=C sort -- "$2") <(LC_ALL=C sort -- "$3") | wc -l
}

# dataset_delta_total MODE DIR
#   Print the sum of dataset_pair_count MODE over every consecutive pair of
#   snapshot files in DIR. Prints 0 (via "s + 0") when DIR holds fewer than
#   two files, where a plain "print s" would emit an empty line.
dataset_delta_total() {
  local mode=$1 k
  dataset_collect "$2"
  for ((k = 1; k < ${#dataset_files[@]}; k++)); do
    dataset_pair_count "$mode" "${dataset_files[k - 1]}" "${dataset_files[k]}"
  done | awk '{ s += $1 } END { print s + 0 }'
}

# nInserts: one total per dataset directory
for dataset_dir in */; do
  [[ -d $dataset_dir ]] || continue
  dataset_delta_total -13 "$dataset_dir"
done
# nDeletes
for dataset_dir in */; do
  [[ -d $dataset_dir ]] || continue
  dataset_delta_total -23 "$dataset_dir"
done
# nComm
for dataset_dir in */; do
  [[ -d $dataset_dir ]] || continue
  dataset_delta_total -12 "$dataset_dir"
done
# dataset_collect DIR
#   Fill the global array dataset_files with the regular files directly
#   inside DIR, in glob order. Sub-directories and other non-regular entries
#   are left out, and an empty or missing DIR yields an empty array.
dataset_collect() {
  local dir=${1%/} entry
  dataset_files=()
  for entry in "$dir"/*; do
    [[ -f $entry ]] && dataset_files+=("$entry")
  done
  return 0
}

# dataset_count_lines_with TEXT DIR
#   Print the number of lines, over all snapshot files in DIR, that contain
#   TEXT as a literal substring. -F keeps the dots of a URI from acting as
#   regex wildcards. The files are concatenated first so that grep -c
#   reports a single total rather than one count per file.
#   Prints 0 for a directory without files.
dataset_count_lines_with() {
  local text=$1
  dataset_collect "$2"
  if ((${#dataset_files[@]} == 0)); then
    echo 0
    return 0
  fi
  cat -- "${dataset_files[@]}" | grep -cF -- "$text"
}

# totalInstances: lines mentioning the rdf-syntax-ns#type URI, per dataset directory
for dataset_dir in */; do
  [[ -d $dataset_dir ]] || continue
  dataset_count_lines_with "http://www.w3.org/1999/02/22-rdf-syntax-ns#type" "$dataset_dir"
done
# totalStructural: lines mentioning the rdf-schema#subClassOf URI, per dataset directory
for dataset_dir in */; do
  [[ -d $dataset_dir ]] || continue
  dataset_count_lines_with "http://www.w3.org/2000/01/rdf-schema#subClassOf" "$dataset_dir"
done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment