Skip to content

Instantly share code, notes, and snippets.

@raprasad
Last active May 11, 2018 12:15
Show Gist options
  • Save raprasad/d842f90aa205fd909bd987bb595c2e5d to your computer and use it in GitHub Desktop.
Save raprasad/d842f90aa205fd909bd987bb595c2e5d to your computer and use it in GitHub Desktop.
Variable schema, initial draft (incomplete)

This describes a single variable within the variables section.

Note: The interpretation and tworavens sections are placeholders in this version

{
   "type":"object",
   "patternProperties":{
      "^[_a-zA-Z0-9]+$":{
         "type":"object",
         "properties":{
            "variableName":{
               "type":"string"
            },
            "description":{
               "type":"string",
               "description":"Brief explanation of the variable "
            },
            "numchar":{
               "type":"string",
               "description":"Describes the variable as numeric or character valued",
               "enum":[
                  "character",
                  "numeric"
               ]
            },
            "nature":{
               "type":"string",
               "description":"Describes the classification of data into Nominal, Ordinal, Ratio, Interval, Percentage.",
               "enum":[
                  "interval",
                  "nominal",
                  "ordinal",
                  "percent",
                  "ratio",
                  "other"
               ]
            },
            "binary":{
               "type":"boolean",
               "description":"Signifies that the data can only take two values"
            },
            "interval":{
               "type":"string",
               "description":"Describes numeric variables as either continuously valued, or discrete valued",
               "enum":[
                  "continuous",
                  "discrete"
               ]
            },
            "time":{
               "type":"string",
               "description":"Signifies that the variable describes points in time",
               "enum":[
                  "no",
                  "yes",
                  "unknown"
               ]
            },
            "invalidCount":{
               "type":"integer",
               "description":"Counts the number of invalid observations, including missing values, nulls, NA's and any observation with a value enumerated in invalidSpecialCodes"
            },
            "invalidSpecialCodes":{
               "type":"array",
               "items":{
                 "type":"number"
               },
               "description":"Any numbers that represent invalid observations"
            },
            "validCount":{
               "type":"integer",
               "description":"Counts the number of valid observations"
            },
            "uniqueCount":{
               "type":"integer",
               "description":"Count of unique values, including invalid signifiers"
            },
            "median":{
               "type":[
                  "number",
                  "string"
               ],
               "description":"A central value in the distribution such that there are as many values equal or above, as there are equal or below this value.",
               "oneOf":[
                  {
                     "type":"number"
                  },
                  {
                     "type":"string",
                     "enum":[
                        "NA"
                     ]
                  }
               ]
            },
            "mean":{
               "type":[
                  "number",
                  "string"
               ],
               "description":"Average of all numeric values, which are not contained in invalidSpecialCodes",
               "oneOf":[
                  {
                     "type":"number"
                  },
                  {
                     "type":"string",
                     "enum":[
                        "NA"
                     ]
                  }
               ]
            },
            "max":{
               "type":[
                  "number",
                  "string"
               ],
               "description":"Largest numeric value observed in dataset, that is not contained in invalidSpecialCodes",
               "oneOf":[
                  {
                     "type":"number"
                  },
                  {
                     "type":"string",
                     "enum":[
                        "NA"
                     ]
                  }
               ]
            },
            "min":{
               "type":[
                  "number",
                  "string"
               ],
               "description":"Least numeric value observed in dataset, that is not contained in invalidSpecialCodes",
               "oneOf":[
                  {
                     "type":"number"
                  },
                  {
                     "type":"string",
                     "enum":[
                        "NA"
                     ]
                  }
               ]
            },
            "mode":{
               "type":[
                  "array",
                  "string"
               ],
               "description":"Value that occurs most frequently.  Multiple values in the case of ties.",
               "oneOf":[
                  {
                     "type":"array",
                     "items":{
                        "type":"number"
                     }
                  },
                  {
                     "type":"string",
                     "enum":[
                        "NA"
                     ]
                  }
               ]
            },
            "modeFreq":{
               "type":[
                  "integer",
                  "string"
               ],
               "description":"Number of times value of mode is observed in variable",
               "oneOf":[
                  {
                     "type":"integer"
                  },
                  {
                     "type":"string",
                     "enum":[
                        "NA"
                     ]
                  }
               ]
            },
            "fewestValues":{
               "type":[
                  "array",
                  "string"
               ],
               "description":"Value that occurs least frequently.  Multiple values in the case of ties.",
               "oneOf":[
                  {
                     "type":"array",
                     "items":{
                        "type":"number"
                     }
                  },
                  {
                     "type":"string",
                     "enum":[
                        "NA"
                     ]
                  }
               ]
            },
            "fewestFreq":{
               "type":[
                  "integer",
                  "string"
               ],
               "description":"Number of times value of fewestValues is observed in variable",
               "oneOf":[
                  {
                     "type":"integer"
                  },
                  {
                     "type":"string",
                     "enum":[
                        "NA"
                     ]
                  }
               ]
            },
            "midpoint":{
               "type":[
                  "number",
                  "string"
               ],
               "description":"The value equidistant from the reported min and max values",
               "oneOf":[
                  {
                     "type":"number"
                  },
                  {
                     "type":"string",
                     "enum":[
                        "NA"
                     ]
                  }
               ]
            },
            "midpointFreq":{
               "type":[
                  "integer",
                  "string"
               ],
               "description":"Number of observations with value equal to minpoint",
               "oneOf":[
                  {
                     "type":"integer"
                  },
                  {
                     "type":"string",
                     "enum":[
                        "NA"
                     ]
                  }
               ]
            },
            "stdDev":{
               "type":[
                  "number",
                  "string"
               ],
               "description":"Standard deviation of the values, measuring the spread between values, specifically using population formula",
               "oneOf":[
                  {
                     "type":"number"
                  },
                  {
                     "type":"string",
                     "enum":[
                        "NA"
                     ]
                  }
               ]
            },
            "herfindahlIndex":{
               "type":[
                  "number",
                  "string"
               ],
               "description":"Measure of heterogeneity of a categorical variable which gives the probability that any two randomly sampled observations have the same value",
               "oneOf":[
                  {
                     "type":"number"
                  },
                  {
                     "type":"string",
                     "enum":[
                        "NA"
                     ]
                  }
               ]
            },
            "plotValues":{
               "type":[
                  "object",
                  "string"
               ],
               "description":"Plot points of a bar chart for tracing distribution of variable",
               "oneOf":[
                  {
                     "type":"object"
                  },
                  {
                     "type":"string",
                     "enum":[
                        "NA"
                     ]
                  }
               ]
            },
            "plotType":{
               "type":"string",
               "description":"Describes default type of plot appropriate to represent distribution of variable",
               "enum":[
                  "bar",
                  "continuous"
               ]
            },
            "plotX":{
               "type":[
                  "array",
                  "string"
               ],
               "description":"Plot points along x dimension for tracing distribution of variable",
               "oneOf":[
                  {
                     "type":"array",
                     "items":{
                        "type":"number"
                     }
                  },
                  {
                     "type":"string",
                     "enum":[
                        "NA"
                     ]
                  }
               ]
            },
            "plotY":{
               "type":[
                  "array",
                  "string"
               ],
               "description":"Plot points along y dimension for tracing distribution of variable",
               "oneOf":[
                  {
                     "type":"array",
                     "items":{
                        "type":"number"
                     }
                  },
                  {
                     "type":"string",
                     "enum":[
                        "NA"
                     ]
                  }
               ]
            },
            "cdfPlotType":{
               "type":[
                  "string",
                  "string"
               ],
               "description":"Describes default type of plot appropriate to represent cumulative distribution of variable",
               "oneOf":[
                  {
                     "type":"string"
                  },
                  {
                     "type":"string",
                     "enum":[
                        "NA"
                     ]
                  }
               ]
            },
            "cdfPlotX":{
               "type":[
                  "array",
                  "string"
               ],
               "description":"Plot points along x dimension for tracing cumulative distribution of variable",
               "oneOf":[
                  {
                     "type":"array",
                     "items":{
                        "type":"number"
                     }
                  },
                  {
                     "type":"string",
                     "enum":[
                        "NA"
                     ]
                  }
               ]
            },
            "cdfPlotY":{
               "type":[
                  "array",
                  "string"
               ],
               "description":"Plot points along y dimension for tracing cumulative distribution of variable",
               "oneOf":[
                  {
                     "type":"array",
                     "items":{
                        "type":"number"
                     }
                  },
                  {
                     "type":"string",
                     "enum":[
                        "NA"
                     ]
                  }
               ]
            },
            "interpretation":{
               "type":[
                  "object",
                  "string"
               ],
               "description":"Object containing descriptors to interpret variable",
               "oneOf":[
                  {
                     "type":"object"
                  },
                  {
                     "type":"string",
                     "enum":[
                        "NA"
                     ]
                  }
               ]
            },
            "tworavens":{
               "type":[
                  "object",
                  "string"
               ],
               "description":"Object containing metadata specifically used by TwoRavens platform",
               "oneOf":[
                  {
                     "type":"object"
                  },
                  {
                     "type":"string",
                     "enum":[
                        "NA"
                     ]
                  }
               ]
            }
         },
         "required":[
            "variableName", "description", "numchar", "nature", "binary", "interval", "time", "invalidCount", "validCount", "uniqueCount", "median", "mean", "max", "min", "mode", "modeFreq", "fewestValues", "fewestFreq", "midpoint", "midpointFreq", "stdDev", "herfindahlIndex", "plotValues", "plotType", "plotX", "plotY", "cdfPlotType", "cdfPlotX", "cdfPlotY"
         ]
      }
   },
   "additionalProperties":false,
   "minProperties":1
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment