Last active
February 4, 2020 20:38
-
-
Save cimentadaj/8d50f7863fb1922840fe19fbd829a40c to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Make sure to install both | |
# devtools::install_github("timriffe/DDSQLtools") | |
library(DDSQLtools) | |
# install.packages("tictoc") | |
library(tictoc) | |
############################# issue 1 ######################################### | |
############################################################################### | |
# Works | |
myLocations <- c(28,492,570,674,308,96,196,8,376,662,670,642,84,188,442,100,192, | |
170,414,616,320,480,218,818,222,300,558,40,52,348)#470) | |
tic() | |
deaths <- get_recorddata(dataProcessIds = "Register", | |
indicatorTypeIds = "Deaths by age and sex", | |
locIds = myLocations, | |
locAreaTypeIds = "Whole area", | |
subGroupIds = "Total or All groups", | |
isComplete = "Abridged", | |
startYear = 1920, | |
endYear = 2020) | |
toc() | |
deaths2 <- get_recorddata(dataProcessIds = 9, startYear = 1920, endYear = 2020, indicatorTypeIds = 20, isComplete = 0, locIds = c(28, 492, 570, 674, 308, 96, 196, 8, 376, 662, 670, 642, 84, 188, 442, 100, 192, 170, 414, 616, 320, 480, 218, 818, 222, 300, 558, 40, 52, 348, 470), locAreaTypeIds = 2, subGroupIds = 2) | |
# Doesn't work (only adds one location) | |
myLocations <- c(28)#492,570,674,308,96,196,8,376,662,670,642,84,188,442,100,192,170,414,616,320,480,218,818,222,300,558,40,52,348,470,591) | |
deaths <- get_recorddata(dataProcessIds = "Register", | |
indicatorTypeIds = "Deaths by age and sex", | |
locIds = myLocations, | |
locAreaTypeIds = "Whole area", | |
subGroupIds = "Total or All groups", | |
isComplete = "Abridged", | |
startYear = 1920, | |
endYear = 2020) | |
## TODO! | |
## This error is because the URL is too long and the API breaks. | |
## Jorge: Add more documentation about string limit and the number of selected number of countries. | |
## Figure out the maximum length of the string and whether you can send a warning. | |
############################# issue 2 ######################################### | |
############################################################################### | |
# All your locations | |
myLocations <- c(28, 492, 570, 674, 308, 96, 196, 8, 376, 662, 670, 642, 84, | |
188, 442, 100, 192, 170, 414, 616, 320, 480, 218, 818, 222, | |
300, 558, 40, 52, 348, 470, 620, 702, 858, 56, 780, 388, 246, | |
352, 591, 144, 862) | |
tic() | |
# Loop through each location with `lapply` | |
deaths <- lapply(myLocations, function(x) { | |
# Measure time of beginning | |
tic() | |
res <- get_recorddata(dataProcessIds = 9, | |
startYear = 1920, | |
endYear = 2020, | |
indicatorTypeIds = 20, | |
isComplete = 0, | |
locIds = x, | |
locAreaTypeIds = 2, | |
subGroupIds = 2) | |
# Print time it took to make the request | |
cat("Country", x, ":") | |
toc() | |
# return the result | |
res | |
}) | |
toc() | |
# It took 26 minutes | |
# Merge all separate country data frames into | |
# one data frame. | |
deaths_bind <- do.call(rbind, deaths) | |
deaths_bind | |
## DONE! | |
## Jorge: Put examples on how to download several countries and how to deal with | |
## downloading a lot of data. Avoiding loops, avoiding string characters, etc... | |
## Add base/tidyverse examples in all vignettes | |
############################# issue 3 ######################################### | |
############################################################################### | |
## Check what is the default of isComplete because you should be able to | |
## request this data without specifying isComplete. You shouldn't need to specify | |
## isComplete in indicators with (sex, moratality). | |
## DONE! | |
## Put down notes to send to Dennis | |
## http://24.239.36.16:9654/un3/api/structureddatarecords?dataProcessIds=9&startYear=1920&endYear=2020&indicatorTypeIds=14&isComplete=2&locIds=28&locAreaTypeIds=2&subGroupIds=2 | |
births <- lapply(myLocations, function(x) { | |
tic() | |
res <- get_recorddata(dataProcessIds = 9, | |
startYear = 1920, | |
endYear = 2020, | |
indicatorTypeIds = 14, # Change this to 14, Births by sex | |
isComplete = 2, # Change this to 2 (Total) rather than Abridged | |
locIds = x, | |
locAreaTypeIds = 2, | |
subGroupIds = 2) | |
cat("Country", x, ":") | |
toc() | |
res | |
}) | |
# Takes 2 minutes | |
births_bind <- do.call(rbind, births) | |
births_bind | |
############################# issue 4 ######################################### | |
############################################################################### | |
## DONE! | |
## Jorge: This comes from the API but make sure that the API is returning all columns | |
## and you're not limiting the number of columns. In particular, we'd like to get | |
## the indicatorID because the indicatorTypeID is a broad group of indicator. | |
## Why are some of these columnas complete NA? Make sure that it comes from the API | |
## or R. | |
############################# issue 5 ######################################### | |
############################################################################### | |
## DONE! | |
## Would it be possible to add indicatorID as parameter? Check swagger and the | |
## API docs. If so, add as a parameter in `get_recordata`. | |
# For example: | |
res <- get_recorddata(dataProcessIds = 9, | |
startYear = 1920, | |
endYear = 2020, | |
indicatorTypeIds = c(16, 20), # Births by age of mother and deaths by age and sex | |
isComplete = 0, # Change this back to abridged | |
locIds = 28, | |
locAreaTypeIds = 2, | |
subGroupIds = 2) | |
# Both indicators | |
unique(res$indicatorName) | |
############################# issue 6 ######################################### | |
############################################################################### | |
## DONE! | |
## Where is created SeriesID? What does it mean? What is it really measuring and is | |
## stable? | |
############################# issue 7 ######################################### | |
############################################################################### | |
## DONE! | |
## This already comes form the API as 0. For example, search for "DataCatalogID" | |
## in this request: http://24.239.36.16:9654/un3/api/structureddatarecords?dataProcessIds=9&startYear=1920&endYear=2020&indicatorTypeIds=14&isComplete=2&locIds=28&locAreaTypeIds=2&subGroupIds=2 | |
############################# issue 8 ######################################### | |
############################################################################### | |
res <- get_recorddata(dataProcessIds = "Census", | |
startYear = 1920, | |
endYear = 2020, | |
indicatorTypeIds = 7, | |
isComplete = "Total", | |
locIds = "Denmark", | |
locAreaTypeIds = "Whole area", | |
subGroupIds = "Total or All groups") | |
# Should be now DD/MM/YYYY, documented in `get_recorddata`. | |
unique(res$TimeStart) | |
unique(res$TimeEnd) | |
## TODO | |
# Check that the date is the same format from other countries | |
############################# issue 9 ######################################### | |
############################################################################### | |
## DONE! | |
## from the API | |
############################# issue 10 ######################################## | |
############################################################################### | |
## TODO | |
# Ok, so we want a new order for the fields. This could be done from | |
# the R package or from the DB. We can sort it out and we'll adapt | |
# DDSQLtools. | |
# Make sure the order of columns is (reduced) and have the same order as the | |
# front-end DB. | |
# An idea would be to reduce the two columns (id and label) into a factor | |
# or a labelled vector from haven. | |
res <- get_recorddata(dataProcessIds = 9, # Register | |
startYear = 1920, | |
endYear = 2020, | |
indicatorTypeIds = 14, # Births by sex | |
isComplete = 2, # Total | |
locIds = 28, # Antigua and Barbuda | |
locAreaTypeIds = 2, # Whole area | |
subGroupIds = 2) # Total | |
## res_two <- | |
## res %>% | |
## select(matches("[a-z]ID$"), matches("[a-z]Name")) %>% | |
## as_tibble() %>% | |
## names() | |
## almost <- table(gsub("ID|Name", "", res_two)) | |
## vars_keep <- names(almost[almost == 2]) | |
## dput(setNames(paste0(vars_keep, "ID"), | |
## paste0(vars_keep, "Name"))) | |
# Error happens here. Waiting for Dennis comment | |
res <- get_recorddata(dataProcessIds = 2, | |
indicatorTypeIds = 8, | |
isComplete = 0, | |
locIds = 156, | |
locAreaTypeIds = 2, | |
subGroupIds = 2) | |
############################# issue 11 ######################################## | |
############################################################################### | |
## DONE! | |
# Yes, we're following the API. If needed, let us know and we'll switch. | |
############################# issue 12 ######################################## | |
############################################################################### | |
## DONE! | |
## Check whether the order columns are being returned and if not, contact | |
## Dennis about adding these fields in the response. | |
############################# Notes ########################################### | |
############################################################################### | |
## ## Issue 3 | |
## For some indicators, we shouldn't need to specify `isComplete`. For example: | |
## http://24.239.36.16:9654/un3/api/structureddatarecords?dataProcessIds=9&startYear=1920&endYear=2020&indicatorTypeIds=14&locIds=28&locAreaTypeIds=2&subGroupIds=2 | |
## Throws an error when not specifying `isComplete` but not when set | |
## http://24.239.36.16:9654/un3/api/structureddatarecords?dataProcessIds=9&startYear=1920&endYear=2020&indicatorTypeIds=14&isComplete=2&locIds=28&locAreaTypeIds=2&subGroupIds=2 | |
## By default all indicator should have `Total` as default because none of them are disaggregated by age distribution in the DB. I think this applies to all indicators the we're currently supporting in the API. | |
## ## Issue 4 | |
## When request the `indicatortypes` endpoint (http://24.239.36.16:9654/un3/api/indicatortypes), Patrick mentioned that we're not getting some columns they need. Currently we have: | |
## PK_IndicatorTypeID, ComponentID, ComponentName, Name, ShortName, SortOrder, Description | |
## And we should have: | |
## ComponentID, ComponentName, IndicatorTypeID (not labeled PK_IndicatorTypeID), IndicatorTypeName, IndicatorID, IndicatorName, IndicatorShortName, SortOrder, Description | |
## The comparison: | |
## ComponentID = is ok | |
## ComponentName = is ok | |
## PK_IndicatorTypeID = should be IndicatorTypeID | |
## IndicatorTypeName = I think we're missing this one (not sure if it's the current field `Name`) | |
## IndicatorID = currently missing from the response | |
## IndicatorName = I presume this is what we currently have as `Name` which should be renamed | |
## IndicatorShortName = I presume this is our current `ShortName`. It should be renamed and it is currently **empty**. It shouldn't be. | |
## SortOrder = is ok | |
## Description = name is ok but it is currently **empty**. It shouldn't be. | |
## Patrick, should the columns come in the same order as outlined here? If so, then we'll need to adapt the order of the response. | |
## Note from Jorge: Why are these fields empty in the `structureddatarecords`? IndicatorShortName, ReferenceYearEnd, ReferenceYearMid, FieldWorkStart, FieldWorkEnd, FieldWorkMiddle, PeriodStart, PeriodEnd, PeriodSpan, PeriodMiddle, Weight. Should they be? | |
## For example, from this request: http://24.239.36.16:9654/un3/api/structureddatarecords?dataProcessIds=9&startYear=1920&endYear=2020&indicatorTypeIds=14&isComplete=2&locIds=492&locAreaTypeIds=2&subGroupIds=2 | |
## ## Issue 5 | |
## In the current `structureddatarecords` endpoint there's no parameter that allows to request `IndicatorID`-- only `indicatorTypeIds`. See the parameter list in swagger (http://24.239.36.16:9654/un3/swagger/ui/index#!/StructuredData/StructuredData_GetStructuredDataRecords). Patrick mentioned that they'll need that parameter as well. | |
## Patrick: I've renamed the previous `get_indicators` to be `get_indicatortypes` and created a new `get_indicators` that returns the available `IndicatorID` | |
## ## Issue 6 | |
## Adapting Patrick's message (all of this is based on the result to a response from `structureddatarecords`). Patrick, please correct me if there's something wrong. | |
## For ages, SQL has already a field for SortOrder (I believe he refers to the column `agesort`, which is currently constant). | |
## Column `id`, it is unclear what this is about. I presume it refers to dynamic IDs created by the SQL/API. Currently it is only with 0 in R, and in anycase it shoud be a more meaningful name if it refers to a series ID, may be SeriesID. Ideally it would be best if such SeriesID would be stable and between different queries return the same ID for the same specific combination of keys. I don’t think it is currently the case. (Note from Jorge: I've checked this on the API and R and from the request I made they all return 1 rather than 0). | |
## PK_SubGroupID PK_SubGroupTypeID PK_DataProcessTypeID: these fields are unnecessary since you have the right version without PK_ prefix already with the right values | |
## DataProcessID is missing and PK_DataProcessID currently has 0 and should not appear. | |
## All SQL fields with PK_ prefix must be simplified without PK_ prefix. | |
## Note from Jorge: Towards the middle of our call we discussed the conecpt of the SeriesID and how it's a bit unclear what it actually means and why it is not being returned. | |
## ## Issue 7 | |
## DataCatalogID is always 0 in every request: http://24.239.36.16:9654/un3/api/structureddatarecords?dataProcessIds=9&startYear=1920&endYear=2020&indicatorTypeIds=14&isComplete=2&locIds=28&locAreaTypeIds=2&subGroupIds=2 | |
## Patrick mentioned that it shouldn't be 0. | |
## ## Issue 9 | |
## Copying Patrick's answer: For some recordsets our coding of the dates is incorrect for some vital events like births and deaths (e.g., same date for TimeStart and TimeEnd and TimeDuration=0) for some data sources (e.g., UNSD, Demographic Yearbook). We will correct our SQL database. | |
## Not sure if that's on the API side or the person responsible for the database. Just leaving it here in case Patrick wants to comment. | |
## ## Issue 12 | |
## Copying Patrick's answer: Get from SQL/API extra set of SortOrder fields for selected set of fields – Dennis most likely needs to add them into the standard SQL DB export structure after each of the fieldname with SortORder as suffix for each of them: | |
## Sex | |
## Age | |
## DataProcess | |
## DataProcessType | |
## DataStatus | |
## StatisticalConcept | |
## DataReliability | |
## DataType | |
## DataSource | |
## **I believe this is from the result of structureddatarecords** |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment