Skip to content

Instantly share code, notes, and snippets.

@raprasad
Created May 11, 2018 17:33
Show Gist options
  • Save raprasad/906b3068cc99d448665f2c1383611a0c to your computer and use it in GitHub Desktop.
Save raprasad/906b3068cc99d448665f2c1383611a0c to your computer and use it in GitHub Desktop.
schema draft 2
{
"type":"object",
"properties":{
"dataset":{
"type":"object",
"properties":{
"description":{
"type":"string"
},
"unitOfAnalysis":{
"type":"string"
},
"structure":{
"type":"string",
"enum":[
"wide",
"long",
"unknown"
]
},
"rowCount":{
"type":"integer",
"description":"Number of rows or observations in the dataset"
},
"variableCount":{
"type":"integer",
"description":"Number of variables or features or columns in the dataset"
},
"dataSource":{
"description":"Object containing descriptions of original source of dataset; may be null",
"type":[
"object",
"null"
],
"oneOf":[
{
"type":"object",
"properties":{
"name":{
"description":"Name such as a file name or database table name",
"type":"string"
},
"type":{
"description":"Type of data source",
"type":"string",
"enum":[
"File",
"SQL database"
]
},
"format":{
"description":"Pertains to the type. For example, a file that is \"text/csv\" or a SQL database that is \"Postgres version 9.1\"",
"type":"string"
},
"filesize":{
"description":"Filesize in bytes. (optional)",
"type":"integer"
}
},
"required":[
"name",
"type"
]
},
{
"type":"null"
}
]
},
"citation":{
"description":"Object containing a schema.org Dataset description",
"type":"object",
"properties":{
"name":{
"description":"Name of the dataset",
"type":"string"
}
}
}
},
"required":[
"description",
"unitOfAnalysis",
"structure",
"rowCount",
"variableCount"
]
},
"variables":{
"type":"object",
"patternProperties":{
"^[_a-zA-Z0-9]+$":{
"type":"object",
"properties":{
"variableName":{
"type":"string"
},
"description":{
"type":"string",
"description":"Brief explanation of the variable"
},
"numchar":{
"type":"string",
"description":"Describes the variable as numeric or character valued",
"enum":[
"character",
"numeric"
]
},
"nature":{
"type":"string",
"description":"Describes the classification of data as nominal, ordinal, ratio, interval, percent, or other.",
"enum":[
"interval",
"nominal",
"ordinal",
"percent",
"ratio",
"other"
]
},
"binary":{
"type":"boolean",
"description":"Signifies that the data can only take two values"
},
"interval":{
"type":"string",
"description":"Describes numeric variables as either continuously valued, or discrete valued",
"enum":[
"continuous",
"discrete"
]
},
"time":{
"description":"Signifies that the variable describes points in time",
"oneOf":[
{
"type":"boolean"
},
{
"type":"string",
"enum":[
"unknown"
]
}
]
},
"invalidCount":{
"type":"integer",
"description":"Counts the number of invalid observations, including missing values, nulls, NA's and any observation with a value enumerated in invalidSpecialCodes"
},
"invalidSpecialCodes":{
"type":"array",
"items":{
"type":"number"
},
"description":"Any numbers that represent invalid observations"
},
"validCount":{
"type":"integer",
"description":"Counts the number of valid observations"
},
"uniqueCount":{
"type":"integer",
"description":"Count of unique values, including invalid signifiers"
},
"median":{
"type":[
"number",
"string"
],
"description":"A central value in the distribution such that there are as many values equal or above, as there are equal or below this value.",
"oneOf":[
{
"type":"number"
},
{
"type":"string",
"enum":[
"NA"
]
}
]
},
"mean":{
"type":[
"number",
"string"
],
"description":"Average of all numeric values, which are not contained in invalidSpecialCodes",
"oneOf":[
{
"type":"number"
},
{
"type":"string",
"enum":[
"NA"
]
}
]
},
"max":{
"type":[
"number",
"string"
],
"description":"Largest numeric value observed in dataset, that is not contained in invalidSpecialCodes",
"oneOf":[
{
"type":"number"
},
{
"type":"string",
"enum":[
"NA"
]
}
]
},
"min":{
"type":[
"number",
"string"
],
"description":"Least numeric value observed in dataset, that is not contained in invalidSpecialCodes",
"oneOf":[
{
"type":"number"
},
{
"type":"string",
"enum":[
"NA"
]
}
]
},
"mode":{
"type":[
"array",
"string"
],
"description":"Value that occurs most frequently. Multiple values in the case of ties.",
"oneOf":[
{
"type":"array",
"items":{
"type":"number"
}
},
{
"type":"array",
"items":{
"type":"string"
}
},
{
"type":"string",
"enum":[
"NA"
]
}
]
},
"modeFreq":{
"type":[
"integer",
"string"
],
"description":"Number of times value of mode is observed in variable",
"oneOf":[
{
"type":"integer"
},
{
"type":"string",
"enum":[
"NA"
]
}
]
},
"fewestValues":{
"type":[
"array",
"string"
],
"description":"Value that occurs least frequently. Multiple values in the case of ties.",
"oneOf":[
{
"type":"array",
"items":{
"type":"number"
}
},
{
"type":"array",
"items":{
"type":"string"
}
},
{
"type":"string",
"enum":[
"NA"
]
}
]
},
"fewestFreq":{
"type":[
"integer",
"string"
],
"description":"Number of times value of fewestValues is observed in variable",
"oneOf":[
{
"type":"integer"
},
{
"type":"string",
"enum":[
"NA"
]
}
]
},
"midpoint":{
"type":[
"number",
"string"
],
"description":"The value equidistant from the reported min and max values",
"oneOf":[
{
"type":"number"
},
{
"type":"string",
"enum":[
"NA"
]
}
]
},
"midpointFreq":{
"type":[
"integer",
"string"
],
"description":"Number of observations with value equal to midpoint",
"oneOf":[
{
"type":"integer"
},
{
"type":"string",
"enum":[
"NA"
]
}
]
},
"stdDev":{
"type":[
"number",
"string"
],
"description":"Standard deviation of the values, measuring the spread between values, specifically using population formula",
"oneOf":[
{
"type":"number"
},
{
"type":"string",
"enum":[
"NA"
]
}
]
},
"herfindahlIndex":{
"type":[
"number",
"string"
],
"description":"Measure of heterogeneity of a categorical variable which gives the probability that any two randomly sampled observations have the same value",
"oneOf":[
{
"type":"number"
},
{
"type":"string",
"enum":[
"NA"
]
}
]
},
"plotValues":{
"type":[
"object",
"string"
],
"description":"Plot points of a bar chart for tracing distribution of variable",
"oneOf":[
{
"type":"object"
},
{
"type":"string",
"enum":[
"NA"
]
}
]
},
"pdfPlotType":{
"type":[
"string",
"null"
],
"description":"Describes default type of plot appropriate to represent distribution of variable",
"oneOf":[
{
"type":"string",
"enum":[
"bar",
"continuous",
"NA"
]
},
{
"type":"null"
}
]
},
"pdfPlotX":{
"type":[
"array",
"null"
],
"description":"Plot points along x dimension for tracing distribution of variable",
"oneOf":[
{
"type":"array",
"items":{
"type":"number"
}
},
{
"type":"null"
}
]
},
"pdfPlotY":{
"type":[
"array",
"null"
],
"description":"Plot points along y dimension for tracing distribution of variable",
"oneOf":[
{
"type":"array",
"items":{
"type":"number"
}
},
{
"type":"null"
}
]
},
"cdfPlotType":{
"type":[
"string",
"null"
],
"description":"Describes default type of plot appropriate to represent cumulative distribution of variable",
"oneOf":[
{
"type":"string",
"enum":[
"bar",
"continuous",
"NA"
]
},
{
"type":"null"
}
]
},
"cdfPlotX":{
"type":[
"array",
"null"
],
"description":"Plot points along x dimension for tracing cumulative distribution of variable",
"oneOf":[
{
"type":"array",
"items":{
"type":"number"
}
},
{
"type":"null"
}
]
},
"cdfPlotY":{
"type":[
"array",
"null"
],
"description":"Plot points along y dimension for tracing cumulative distribution of variable",
"oneOf":[
{
"type":"array",
"items":{
"type":"number"
}
},
{
"type":"null"
}
]
},
"interpretation":{
"type":[
"object",
"string"
],
"description":"Object containing descriptors to interpret variable",
"oneOf":[
{
"type":"object"
},
{
"type":"string",
"enum":[
"NA"
]
}
]
},
"tworavens":{
"type":[
"object",
"string"
],
"description":"Object containing metadata specifically used by TwoRavens platform",
"oneOf":[
{
"type":"object"
},
{
"type":"string",
"enum":[
"NA"
]
}
]
}
},
"required":[
"variableName",
"description",
"numchar",
"nature",
"binary",
"interval",
"time",
"invalidCount",
"validCount",
"uniqueCount",
"median",
"mean",
"max",
"min",
"mode",
"modeFreq",
"fewestValues",
"fewestFreq",
"midpoint",
"midpointFreq",
"stdDev",
"herfindahlIndex",
"plotValues",
"pdfPlotType",
"pdfPlotX",
"pdfPlotY",
"cdfPlotType",
"cdfPlotX",
"cdfPlotY"
]
}
},
"additionalProperties":false,
"minProperties":1
}
},
"required":[
"dataset",
"variables"
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment