TomConlin · February 24, 2018 04:38
diff --git a/v2file_survey b/v2file_survey
 head -1 DECIPHER-18.tab | tr '\t' '\n' | grep -n .
 1:#DiseaseID
 2:DiseaseName
 3:HpoId
 4:HpoName
 5:ageOfOnsetId
 6:ageOfOnsetName
 7:evidenceCode
 8:frequencyModifier
 9:sex
 10:negation
 11:modifier
 12:description
 13:publication
 14:assignedBy
 15:dateCreated
 ########################

 # easier to survey in one pile
 grep -h -v "^#" *.tab > v2.tab

 # how many rows
 wc -l < v2.tab
 92923

 # disease identifier types 
 cut -f1  -d':' v2.tab | sort | uniq -c | sort -nr
  92626 OMIM
    297 DECIPHER

 # files with the most rows
 cut -f1  v2.tab |  uniq -c | sort -nr| head 
    131 OMIM:312870
    128 OMIM:180849
    108 OMIM:607872
    108 OMIM:194050
     99 OMIM:613406
     95 OMIM:270400
     94 OMIM:194190
     87 OMIM:601803
     87 OMIM:122470
     85 OMIM:305600

 #  check if any disease identifiers in more than one file?
 cut -f1  v2.tab |  uniq -c | sort -nr| cut -c9- | uniq -c | sort -nr | head  
      1 OMIM:617537
      1 OMIM:617526
      1 OMIM:617506
      1 OMIM:617478
      1 OMIM:617468
      1 OMIM:617466
      1 OMIM:617460
      1 OMIM:617452
      1 OMIM:617450
      1 OMIM:617442
 # no

 # summary stats on rows per file
 cut -f1  v2.tab |  uniq -c | sort -nr| cut -c1-8 | ./sumstat.r
       V1        
 Min.   :  1.00  
 1st Qu.:  4.00  
 Median :  8.00  
 Mean   : 12.61  
 3rd Qu.: 17.00  
 Max.   :131.00  
 [1] "sd :12.46"

 ###########################################
 # sumstat.r 
 #! /usr/bin/Rscript --vanilla
 	x <- read.csv('stdin', header = F); 
 	summary(x);
 	sprintf("sd :%.02f", sd(x[,1]));
 ###########################################





 ###########################################
 # disease_ID
 # how many distinct disease identifiers
 cut -f1 v2.tab | uniq | wc -l
 7370

 #####################################
 # disease_terms
 # how many distinct disease ... "word thingies"(tm)
 cut -f2 v2.tab | uniq | wc -l
 12389
 # not 1:1

 # sometimes appears to be a ';;' separated list
 # how many are lists?
 cut -f2 v2.tab | sort -u | grep -c  ";;" 
 2442

 ##########################################
 # HPO_ID
 # how many?
 cut -f3 v2.tab | wc -l
 92923
 # unique?
 cut -f3 v2.tab | sort -u | wc -l
 6990
 # have correct curie prefix?
 cut -f3 v2.tab | cut -f1 -d':' | uniq -c
  92923 HP

 ################################
 # HPO_label
 cut -f4 v2.tab  | wc -l
 92923
 cut -f4 v2.tab  | sort -u | wc -l
 6990

 # anything not simple words? any punctuation indicating lists?
 cut -f4 v2.tab  | grep -v "[a-z A-Z]*"
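 # nothing
 # (caveat: the pattern above can match the empty string, so every line
 #  matches and -v can never print anything; this check is vacuous.
 #  An anchored pattern such as '^[a-zA-Z ]*$' would be needed to really test it.)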
 ################################
 # HPO_onset_ID
 cut -f5 v2.tab | sort | uniq -c | sort -nr
  92390 
    134 HP:0003577
     92 HP:0003593
     87 HP:0011463
     74 HP:0003623
     47 HP:0003581
     43 HP:0003621
     33 HP:0003584
      8 HP:0011462
      5 HP:0003576
      3 HP:0003627
      3 HP:0003596
      2 HP:0003578
      1 HP:0011461
      1 HP:0003590
 #####################################
 # HPO_onset_label
 cut -f6 v2.tab | sort | uniq -c | sort -nr
  92390 
    134 Congenital onset
     92 Infantile onset
     87 Childhood onset
     74 Neonatal onset
     47 Adult onset
     43 Juvenile onset
     33 Late onset
      8 Young adult onset
      5 Onset in infancy
      3 Onset in early adulthood
      3 Middle age onset
      2 Onset in childhood
      1 Onset in adolescence
      1 Fetal onset 

 ###############################
 # GO_evidence_code  
 # Note: web search on this header returns correct code descriptions as top hit 

 cut -f7 v2.tab | sort | uniq -c | sort -nr
  43974 TAS
  42536 IEA
   6388 PCS
     25 ICE

 ################################
 # phenotype_frequency  
 # sparse hodgepodge of:
 #   nothing 
 #	identifiers
 #	rationals
 #	pointlessly precise percentages (including ranges of percentages)

 # easy:   drop trailing zeros after a decimal point
 # medium: drop more than a place or so of suspect precision  
 #         e.g. 10.9756%  involved over 100k people or partial people.
 # hard: recover the proper rational a percentage was derived from (curating pubs)
 #
 # mixing identifiers and rationals; 
 # I guess one sparse column is better than two
 # but what would really make it worthwhile is if the identifiers
 # referenced a value (back in the ontology) which allowed them to be comparable
 # (even approximately) to the proper rationals (and percentages)

 # are all rationals proper?
 cut -f8 v2.tab | awk -F'/' '$2>0{if($1>$2)print}'
 18/2
 # of course not,  that would be silly!

 ##################################
 # sex
 cut -f9 v2.tab | sort | uniq -c | sort -nr
  92843 
     58 Male
     22 Female

 ################################
 # phenotype_negation
 cut -f10 v2.tab | sort | uniq -c | sort -nr
  92136 
    787 NOT

 #################################
 # HPO_modifier_ID
 cut -f11 v2.tab | sort | uniq -c | sort -nr
  92119 
    307 HP:0012825
    194 HP:0012828
     78 HP:0025303
     61 HP:0003676
     35 HP:0012829
     27 HP:0012832
     26 HP:0012833
     16 HP:0031375
     15 HP:0012826
     12 HP:0012840
     11 HP:0012839
      8 HP:0012837
      5 HP:0011010
      3 HP:0025153
      2 HP:0011009
      1 HP:0030650
      1 HP:0012827
      1 HP:0003831
      1 HP:0003680

 ########################################
 # phenotype_comments
 # 
 # 60509  empty
 # OMIM screaming caps (mostly descriptive) 
 # and some other more random hint-like statements

 ########################################
 # publications
 # hmmm... more like
 # citations

 # can be lists (with different separator than previous list)
 # can be missing curie suffix  `OMIM:`  (seventy like this) 
 # can be url
 # can be ISBN
 # can mix curie case 	`PMID:17918734;pmid:12687501`
 # can be spaced out   	`PMID:    17223397`
 # can be bare integer  	`12089525` 
 # can be folks   		`HPO:sdoelken`
 # there can be space after list separators (or not)

 # not convinced they are publication identifiers


 ###############################################################
 # curators
  42171 HPO:skoehler
  37065 HPO:iea
  13534 HPO:probinson
     51 HPO:lccarmody
     49 HPO:sdoelken
     34 ZFIN:bruef; HPO:sdoelken
     13 HPO:curators
      6 PATOC:GVG; PATOC:PS

 # Is there a good reason not to insist on ORCIDs?  
 # surely these people should be amongst the most capable of understanding why.


 ##################################################
 # date_created

 the great thing about this format is how easy it is to spot outliers

 cut -f15 v2.tab | sort -u | head
 0017-04-03     ohoh
 2009-02-17
 2009-07-24
 2009-07-31
 2009-08-31
 2009-09-17
 2009-10-01
 2009-10-02
 2009-10-09
 2009-10-15
 
 cut -f15 v2.tab | sort -u | tail
 2017-12-03
 2017-12-10
 2017-12-11
 2017-12-12
 2017-12-13
 2017-12-17
 2017-12-22
 2018-01-25
 2018-01-28
 2018-15-20    soon-ish maybe if our god emperor needs more time on the links   

 # check the rest

 for date in $(cut -f15 v2.tab | sort -u); do 
 	date --date=${date}; 
 done | grep invalid
 date: invalid date ‘2018-15-20’

 apparently there was a retroactive April third in the year 17... it was a Monday

 other than that the dates look good.
	head -1 DECIPHER-18.tab \| tr '\t' '\n' \| grep -n .
	1:#DiseaseID
	2:DiseaseName
	3:HpoId
	4:HpoName
	5:ageOfOnsetId
	6:ageOfOnsetName
	7:evidenceCode
	8:frequencyModifier
	9:sex
	10:negation
	11:modifier
	12:description
	13:publication
	14:assignedBy
	15:dateCreated
	########################

	# easier to survey in one pile
	grep -h -v "^#" *.tab > v2.tab

	# how many rows
	wc -l < v2.tab
	92923

	# disease identifier types
	cut -f1 -d':' v2.tab \| sort \| uniq -c \| sort -nr
	92626 OMIM
	297 DECIPHER

	# files with the most rows
	cut -f1 v2.tab \| uniq -c \| sort -nr\| head
	131 OMIM:312870
	128 OMIM:180849
	108 OMIM:607872
	108 OMIM:194050
	99 OMIM:613406
	95 OMIM:270400
	94 OMIM:194190
	87 OMIM:601803
	87 OMIM:122470
	85 OMIM:305600

	# check if any disease identifiers in more than one file?
	cut -f1 v2.tab \| uniq -c \| sort -nr\| cut -c9- \| uniq -c \| sort -nr \| head
	1 OMIM:617537
	1 OMIM:617526
	1 OMIM:617506
	1 OMIM:617478
	1 OMIM:617468
	1 OMIM:617466
	1 OMIM:617460
	1 OMIM:617452
	1 OMIM:617450
	1 OMIM:617442
	# no

	# summary stats on rows per file
	cut -f1 v2.tab \| uniq -c \| sort -nr\| cut -c1-8 \| ./sumstat.r
	V1
	Min. : 1.00
	1st Qu.: 4.00
	Median : 8.00
	Mean : 12.61
	3rd Qu.: 17.00
	Max. :131.00
	[1] "sd :12.46"

	###########################################
	# sumstat.r
	#! /usr/bin/Rscript --vanilla
	x <- read.csv('stdin', header = F);
	summary(x);
	sprintf("sd :%.02f", sd(x[,1]));
	###########################################





	###########################################
	# disease_ID
	# how many distinct disease identifiers
	cut -f1 v2.tab \| uniq \| wc -l
	7370

	#####################################
	# disease_terms
	# how many distinct disease ... "word thingies"(tm)
	cut -f2 v2.tab \| uniq \| wc -l
	12389
	# not 1:1

	# sometimes appears to be a ';;' separated list
	# how many are lists?
	cut -f2 v2.tab \| sort -u \| grep -c ";;"
	2442

	##########################################
	# HPO_ID
	# how many?
	cut -f3 v2.tab \| wc -l
	92923
	# unique?
	cut -f3 v2.tab \| sort -u \| wc -l
	6990
	# have correct curie prefix?
	cut -f3 v2.tab \| cut -f1 -d':' \| uniq -c
	92923 HP

	################################
	# HPO_label
	cut -f4 v2.tab \| wc -l
	92923
	cut -f4 v2.tab \| sort -u \| wc -l
	6990

	# anything not simple words? any punctuation indicating lists?
	cut -f4 v2.tab \| grep -v "[a-z A-Z]*"
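	# nothing
	# (caveat: the pattern above can match the empty string, so every line
	#  matches and -v can never print anything; this check is vacuous.
	#  An anchored pattern such as '^[a-zA-Z ]*$' would be needed to really test it.)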
	################################
	# HPO_onset_ID
	cut -f5 v2.tab \| sort \| uniq -c \| sort -nr
	92390
	134 HP:0003577
	92 HP:0003593
	87 HP:0011463
	74 HP:0003623
	47 HP:0003581
	43 HP:0003621
	33 HP:0003584
	8 HP:0011462
	5 HP:0003576
	3 HP:0003627
	3 HP:0003596
	2 HP:0003578
	1 HP:0011461
	1 HP:0003590
	#####################################
	# HPO_onset_label
	cut -f6 v2.tab \| sort \| uniq -c \| sort -nr
	92390
	134 Congenital onset
	92 Infantile onset
	87 Childhood onset
	74 Neonatal onset
	47 Adult onset
	43 Juvenile onset
	33 Late onset
	8 Young adult onset
	5 Onset in infancy
	3 Onset in early adulthood
	3 Middle age onset
	2 Onset in childhood
	1 Onset in adolescence
	1 Fetal onset

	###############################
	# GO_evidence_code
	# Note: web search on this header returns correct code descriptions as top hit

	cut -f7 v2.tab \| sort \| uniq -c \| sort -nr
	43974 TAS
	42536 IEA
	6388 PCS
	25 ICE

	################################
	# phenotype_frequency
	# sparse hodgepodge of:
	# nothing
	# identifiers
	# rationals
	# pointlessly precise percentages (including ranges of percentages)

	# easy: drop trailing zeros after a decimal point
	# medium: drop more than a place or so of suspect precision
	# e.g. 10.9756% involved over 100k people or partial people.
	# hard: recover the proper rational a percentage was derived from (curating pubs)
	#
	# mixing identifiers and rationals;
	# I guess one sparse column is better than two
	# but what would really make it worthwhile is if the identifiers
	# referenced a value (back in the ontology) which allowed them to be comparable
	# (even approximately) to the proper rationals (and percentages)

	# are all rationals proper?
	cut -f8 v2.tab \| awk -F'/' '$2>0{if($1>$2)print}'
	18/2
	# of course not, that would be silly!

	##################################
	# sex
	cut -f9 v2.tab \| sort \| uniq -c \| sort -nr
	92843
	58 Male
	22 Female

	################################
	# phenotype_negation
	cut -f10 v2.tab \| sort \| uniq -c \| sort -nr
	92136
	787 NOT

	#################################
	# HPO_modifier_ID
	cut -f11 v2.tab \| sort \| uniq -c \| sort -nr
	92119
	307 HP:0012825
	194 HP:0012828
	78 HP:0025303
	61 HP:0003676
	35 HP:0012829
	27 HP:0012832
	26 HP:0012833
	16 HP:0031375
	15 HP:0012826
	12 HP:0012840
	11 HP:0012839
	8 HP:0012837
	5 HP:0011010
	3 HP:0025153
	2 HP:0011009
	1 HP:0030650
	1 HP:0012827
	1 HP:0003831
	1 HP:0003680

	########################################
	# phenotype_comments
	#
	# 60509 empty
	# OMIM screaming caps (mostly descriptive)
	# and some other more random hint-like statements

	########################################
	# publications
	# hmmm... more like
	# citations

	# can be lists (with different separator than previous list)
	# can be missing curie suffix `OMIM:` (seventy like this)
	# can be url
	# can be ISBN
	# can mix curie case `PMID:17918734;pmid:12687501`
	# can be spaced out `PMID: 17223397`
	# can be bare integer `12089525`
	# can be folks `HPO:sdoelken`
	# there can be space after list separators (or not)

	# not convinced they are publication identifiers


	###############################################################
	# curators
	42171 HPO:skoehler
	37065 HPO:iea
	13534 HPO:probinson
	51 HPO:lccarmody
	49 HPO:sdoelken
	34 ZFIN:bruef; HPO:sdoelken
	13 HPO:curators
	6 PATOC:GVG; PATOC:PS

	# Is there a good reason not to insist on ORCIDs?
	# surely these people should be amongst the most capable of understanding why.


	##################################################
	# date_created

	the great thing about this format is how easy it is to spot outliers

	cut -f15 v2.tab \| sort -u \| head
	0017-04-03 ohoh
	2009-02-17
	2009-07-24
	2009-07-31
	2009-08-31
	2009-09-17
	2009-10-01
	2009-10-02
	2009-10-09
	2009-10-15

	cut -f15 v2.tab \| sort -u \| tail
	2017-12-03
	2017-12-10
	2017-12-11
	2017-12-12
	2017-12-13
	2017-12-17
	2017-12-22
	2018-01-25
	2018-01-28
	2018-15-20 soon-ish maybe if our god emperor needs more time on the links

	# check the rest

	for date in $(cut -f15 v2.tab \| sort -u); do
	date --date=${date};
	done \| grep invalid
	date: invalid date ‘2018-15-20’

	apparently there was a retroactive April third in the year 17... it was a Monday

	other than that the dates look good.