Created
May 11, 2018 17:33
-
-
Save raprasad/906b3068cc99d448665f2c1383611a0c to your computer and use it in GitHub Desktop.
schema draft 2
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"type":"object", | |
"properties":{ | |
"dataset":{ | |
"type":"object", | |
"properties":{ | |
"description":{ | |
"type":"string" | |
}, | |
"unitOfAnalysis":{ | |
"type":"string" | |
}, | |
"structure":{ | |
"type":"string", | |
"enum":[ | |
"wide", | |
"long", | |
"unknown" | |
] | |
}, | |
"rowCount":{ | |
"type":"integer", | |
"description":"Number of rows or observations in the dataset" | |
}, | |
"variableCount":{ | |
"type":"integer", | |
"description":"Number of variables or features or columns in the dataset" | |
}, | |
"dataSource":{ | |
"description":"Object containing descriptions of original source of dataset", | |
"type":[ | |
"object", | |
"null" | |
], | |
"oneOf":[ | |
{ | |
"type":"object", | |
"properties":{ | |
"name":{ | |
"description":"Name such as a file name or database table name", | |
"type":"string" | |
}, | |
"type":{ | |
"description":"Type of data source", | |
"type":"string", | |
"enum":[ | |
"File", | |
"SQL database" | |
] | |
}, | |
"format":{ | |
"description":"Pertains to the type. For example, a file that is \"text/csv\" or a SQL database that is \"Postgres version 9.1\"", | |
"type":"string" | |
}, | |
"filesize":{ | |
"description":"Filesize in bytes. (optional)", | |
"type":"integer" | |
} | |
}, | |
"required":[ | |
"name", | |
"type" | |
] | |
}, | |
{ | |
"type":"null" | |
} | |
] | |
}, | |
"citation":{ | |
"description":"Object containing a schema.org Dataset description", | |
"type":"object", | |
"properties":{ | |
"name":{ | |
"description":"Name of the dataset", | |
"type":"string" | |
} | |
} | |
} | |
}, | |
"required":[ | |
"description", | |
"unitOfAnalysis", | |
"structure", | |
"rowCount", | |
"variableCount" | |
] | |
}, | |
"variables":{ | |
"type":"object", | |
"patternProperties":{ | |
"^[_a-zA-Z0-9]+$":{ | |
"type":"object", | |
"properties":{ | |
"variableName":{ | |
"type":"string" | |
}, | |
"description":{ | |
"type":"string", | |
"description":"Brief explanation of the variable" | |
}, | |
"numchar":{ | |
"type":"string", | |
"description":"Describes the variable as numeric or character valued", | |
"enum":[ | |
"character", | |
"numeric" | |
] | |
}, | |
"nature":{ | |
"type":"string", | |
"description":"Describes the classification of data as nominal, ordinal, ratio, interval, percent, or other.", | |
"enum":[ | |
"interval", | |
"nominal", | |
"ordinal", | |
"percent", | |
"ratio", | |
"other" | |
] | |
}, | |
"binary":{ | |
"type":"boolean", | |
"description":"Signifies that the data can only take two values" | |
}, | |
"interval":{ | |
"type":"string", | |
"description":"Describes numeric variables as either continuously valued, or discrete valued", | |
"enum":[ | |
"continuous", | |
"discrete" | |
] | |
}, | |
"time":{ | |
"description":"Signifies that the variable describes points in time", | |
"oneOf":[ | |
{ | |
"type":"boolean" | |
}, | |
{ | |
"type":"string", | |
"enum":[ | |
"unknown" | |
] | |
} | |
] | |
}, | |
"invalidCount":{ | |
"type":"integer", | |
"description":"Counts the number of invalid observations, including missing values, nulls, NA's and any observation with a value enumerated in invalidSpecialCodes" | |
}, | |
"invalidSpecialCodes":{ | |
"type":"array", | |
"items":{ | |
"type":"number" | |
}, | |
"description":"Any numbers that represent invalid observations" | |
}, | |
"validCount":{ | |
"type":"integer", | |
"description":"Counts the number of valid observations" | |
}, | |
"uniqueCount":{ | |
"type":"integer", | |
"description":"Count of unique values, including invalid signifiers" | |
}, | |
"median":{ | |
"type":[ | |
"number", | |
"string" | |
], | |
"description":"A central value in the distribution such that there are as many values equal to or above it as there are equal to or below it.", | |
"oneOf":[ | |
{ | |
"type":"number" | |
}, | |
{ | |
"type":"string", | |
"enum":[ | |
"NA" | |
] | |
} | |
] | |
}, | |
"mean":{ | |
"type":[ | |
"number", | |
"string" | |
], | |
"description":"Average of all numeric values that are not contained in invalidSpecialCodes", | |
"oneOf":[ | |
{ | |
"type":"number" | |
}, | |
{ | |
"type":"string", | |
"enum":[ | |
"NA" | |
] | |
} | |
] | |
}, | |
"max":{ | |
"type":[ | |
"number", | |
"string" | |
], | |
"description":"Largest numeric value observed in the dataset that is not contained in invalidSpecialCodes", | |
"oneOf":[ | |
{ | |
"type":"number" | |
}, | |
{ | |
"type":"string", | |
"enum":[ | |
"NA" | |
] | |
} | |
] | |
}, | |
"min":{ | |
"type":[ | |
"number", | |
"string" | |
], | |
"description":"Smallest numeric value observed in the dataset that is not contained in invalidSpecialCodes", | |
"oneOf":[ | |
{ | |
"type":"number" | |
}, | |
{ | |
"type":"string", | |
"enum":[ | |
"NA" | |
] | |
} | |
] | |
}, | |
"mode":{ | |
"type":[ | |
"array", | |
"string" | |
], | |
"description":"Value that occurs most frequently. Multiple values in the case of ties.", | |
"oneOf":[ | |
{ | |
"type":"array", | |
"items":{ | |
"type":"number" | |
} | |
}, | |
{ | |
"type":"array", | |
"items":{ | |
"type":"string" | |
} | |
}, | |
{ | |
"type":"string", | |
"enum":[ | |
"NA" | |
] | |
} | |
] | |
}, | |
"modeFreq":{ | |
"type":[ | |
"integer", | |
"string" | |
], | |
"description":"Number of times value of mode is observed in variable", | |
"oneOf":[ | |
{ | |
"type":"integer" | |
}, | |
{ | |
"type":"string", | |
"enum":[ | |
"NA" | |
] | |
} | |
] | |
}, | |
"fewestValues":{ | |
"type":[ | |
"array", | |
"string" | |
], | |
"description":"Value that occurs least frequently. Multiple values in the case of ties.", | |
"oneOf":[ | |
{ | |
"type":"array", | |
"items":{ | |
"type":"number" | |
} | |
}, | |
{ | |
"type":"array", | |
"items":{ | |
"type":"string" | |
} | |
}, | |
{ | |
"type":"string", | |
"enum":[ | |
"NA" | |
] | |
} | |
] | |
}, | |
"fewestFreq":{ | |
"type":[ | |
"integer", | |
"string" | |
], | |
"description":"Number of times value of fewestValues is observed in variable", | |
"oneOf":[ | |
{ | |
"type":"integer" | |
}, | |
{ | |
"type":"string", | |
"enum":[ | |
"NA" | |
] | |
} | |
] | |
}, | |
"midpoint":{ | |
"type":[ | |
"number", | |
"string" | |
], | |
"description":"The value equidistant from the reported min and max values", | |
"oneOf":[ | |
{ | |
"type":"number" | |
}, | |
{ | |
"type":"string", | |
"enum":[ | |
"NA" | |
] | |
} | |
] | |
}, | |
"midpointFreq":{ | |
"type":[ | |
"integer", | |
"string" | |
], | |
"description":"Number of observations with value equal to midpoint", | |
"oneOf":[ | |
{ | |
"type":"integer" | |
}, | |
{ | |
"type":"string", | |
"enum":[ | |
"NA" | |
] | |
} | |
] | |
}, | |
"stdDev":{ | |
"type":[ | |
"number", | |
"string" | |
], | |
"description":"Standard deviation of the values, measuring the spread between values, specifically using population formula", | |
"oneOf":[ | |
{ | |
"type":"number" | |
}, | |
{ | |
"type":"string", | |
"enum":[ | |
"NA" | |
] | |
} | |
] | |
}, | |
"herfindahlIndex":{ | |
"type":[ | |
"number", | |
"string" | |
], | |
"description":"Measure of heterogeneity of a categorical variable which gives the probability that any two randomly sampled observations have the same value", | |
"oneOf":[ | |
{ | |
"type":"number" | |
}, | |
{ | |
"type":"string", | |
"enum":[ | |
"NA" | |
] | |
} | |
] | |
}, | |
"plotValues":{ | |
"type":[ | |
"object", | |
"string" | |
], | |
"description":"Plot points of a bar chart for tracing distribution of variable", | |
"oneOf":[ | |
{ | |
"type":"object" | |
}, | |
{ | |
"type":"string", | |
"enum":[ | |
"NA" | |
] | |
} | |
] | |
}, | |
"pdfPlotType":{ | |
"type":[ | |
"string", | |
"null" | |
], | |
"description":"Describes default type of plot appropriate to represent distribution of variable", | |
"oneOf":[ | |
{ | |
"type":"string", | |
"enum":[ | |
"bar", | |
"continuous", | |
"NA" | |
] | |
}, | |
{ | |
"type":"null" | |
} | |
] | |
}, | |
"pdfPlotX":{ | |
"type":[ | |
"array", | |
"null" | |
], | |
"description":"Plot points along x dimension for tracing distribution of variable", | |
"oneOf":[ | |
{ | |
"type":"array", | |
"items":{ | |
"type":"number" | |
} | |
}, | |
{ | |
"type":"null" | |
} | |
] | |
}, | |
"pdfPlotY":{ | |
"type":[ | |
"array", | |
"null" | |
], | |
"description":"Plot points along y dimension for tracing distribution of variable", | |
"oneOf":[ | |
{ | |
"type":"array", | |
"items":{ | |
"type":"number" | |
} | |
}, | |
{ | |
"type":"null" | |
} | |
] | |
}, | |
"cdfPlotType":{ | |
"type":[ | |
"string", | |
"null" | |
], | |
"description":"Describes default type of plot appropriate to represent cumulative distribution of variable", | |
"oneOf":[ | |
{ | |
"type":"string", | |
"enum":[ | |
"bar", | |
"continuous", | |
"NA" | |
] | |
}, | |
{ | |
"type":"null" | |
} | |
] | |
}, | |
"cdfPlotX":{ | |
"type":[ | |
"array", | |
"null" | |
], | |
"description":"Plot points along x dimension for tracing cumulative distribution of variable", | |
"oneOf":[ | |
{ | |
"type":"array", | |
"items":{ | |
"type":"number" | |
} | |
}, | |
{ | |
"type":"null" | |
} | |
] | |
}, | |
"cdfPlotY":{ | |
"type":[ | |
"array", | |
"null" | |
], | |
"description":"Plot points along y dimension for tracing cumulative distribution of variable", | |
"oneOf":[ | |
{ | |
"type":"array", | |
"items":{ | |
"type":"number" | |
} | |
}, | |
{ | |
"type":"null" | |
} | |
] | |
}, | |
"interpretation":{ | |
"type":[ | |
"object", | |
"string" | |
], | |
"description":"Object containing descriptors to interpret variable", | |
"oneOf":[ | |
{ | |
"type":"object" | |
}, | |
{ | |
"type":"string", | |
"enum":[ | |
"NA" | |
] | |
} | |
] | |
}, | |
"tworavens":{ | |
"type":[ | |
"object", | |
"string" | |
], | |
"description":"Object containing metadata specifically used by TwoRavens platform", | |
"oneOf":[ | |
{ | |
"type":"object" | |
}, | |
{ | |
"type":"string", | |
"enum":[ | |
"NA" | |
] | |
} | |
] | |
} | |
}, | |
"required":[ | |
"variableName", | |
"description", | |
"numchar", | |
"nature", | |
"binary", | |
"interval", | |
"time", | |
"invalidCount", | |
"validCount", | |
"uniqueCount", | |
"median", | |
"mean", | |
"max", | |
"min", | |
"mode", | |
"modeFreq", | |
"fewestValues", | |
"fewestFreq", | |
"midpoint", | |
"midpointFreq", | |
"stdDev", | |
"herfindahlIndex", | |
"plotValues", | |
"pdfPlotType", | |
"pdfPlotX", | |
"pdfPlotY", | |
"cdfPlotType", | |
"cdfPlotX", | |
"cdfPlotY" | |
] | |
} | |
}, | |
"additionalProperties":false, | |
"minProperties":1 | |
} | |
}, | |
"required":[ | |
"dataset", | |
"variables" | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment