# Source: GitHub gist by @jtrecenti
# (jtrecenti/f67052f5ec8c229d2e40fe4563ee9524), created November 21, 2024 13:58.
# Reading the data dictionary -----------------------------------------------
# Every sheet of the workbook is loaded into a list named after the sheets.
f_dict <- "data-raw/dicionario_de_dados_agregados_por_setores_censitarios.xlsx"
nms <- readxl::excel_sheets(f_dict)
tabelas_dict <- purrr::map(
  purrr::set_names(nms),
  function(aba) readxl::read_excel(f_dict, aba)
)

# The first sheet gets a constant `Tipo` column placed right before `Tema`
# (presumably so it lines up with sheets 3-5 when stacked -- confirm).
tabelas_dict[[1]] <- dplyr::mutate(
  tabelas_dict[[1]],
  Tipo = "-",
  .before = Tema
)

# Sheets 1 and 3-5 are stacked into a single table; the `Tabela` id column
# holds the sheet name each row came from.
dict_completo <- janitor::clean_names(
  dplyr::bind_rows(tabelas_dict[c(1, 3:5)], .id = "Tabela")
)

# Sheet 2 is kept as its own table.
dict_siglas <- janitor::clean_names(tabelas_dict[[2]])
# Zip archives ---------------------------------------------------------------
# Base URL of the directory listing that holds the zipped CSV files.
u_base <- "https://ftp.ibge.gov.br/Censos/Censo_Demografico_2022/Agregados_por_Setores_Censitarios/Agregados_por_Setor_csv/"

# Fetch the listing page and collect the href of every link containing ".zip".
f_links <- u_base |>
  httr::GET() |>
  httr::content() |>
  xml2::xml_find_all(xpath = "//a[contains(@href, '.zip')]") |>
  xml2::xml_attr(attr = "href")
# Download one zip archive into "data-raw/agregados/zip/", unless a file with
# the same name is already there.
#
# endpoint: href of the archive, relative to the top-level `u_base` URL.
# Returns the httr response when a download happens, and NULL (invisibly)
# when the file already exists.
baixar_zip <- function(endpoint) {
  u <- paste0(u_base, endpoint)
  # The local file name comes from the endpoint being downloaded.
  f_zip <- paste0("data-raw/agregados/zip/", basename(endpoint))
  if (!file.exists(f_zip)) {
    # Make sure the destination directory exists before httr writes into it.
    fs::dir_create(dirname(f_zip))
    httr::GET(u, httr::write_disk(f_zip, overwrite = TRUE))
  }
}
# Download every archive found on the listing page.
f_links |>
  purrr::walk(baixar_zip, .progress = TRUE)

# Unzipping ------------------------------------------------------------------
# Extract each entry of the zip directory into the csv directory.
purrr::walk(
  fs::dir_ls("data-raw/agregados/zip"),
  function(arquivo_zip) {
    zip::unzip(arquivo_zip, exdir = "data-raw/agregados/csv")
  },
  .progress = TRUE
)
# Converting csv to parquet --------------------------------------------------
# Paths of all entries in the directory holding the extracted CSV files.
f_csv <- fs::dir_ls(path = "data-raw/agregados/csv")
# Read one "basico" CSV file.
#
# x: path to the file; it is read with readr::read_csv2() as latin1 text.
# Columns v0001-v0007 and AREA_KM2 are parsed as numbers ("," as decimal mark,
# "." as grouping mark); every other column is kept as character. The values
# "", "NA" and "." are treated as missing.
# Returns the data with names normalised by janitor::clean_names().
ler_basico <- function(x) {
  tipos <- readr::cols(
    .default = readr::col_character(),
    v0001 = readr::col_number(),
    v0002 = readr::col_number(),
    v0003 = readr::col_number(),
    v0004 = readr::col_number(),
    v0005 = readr::col_number(),
    v0006 = readr::col_number(),
    v0007 = readr::col_number(),
    AREA_KM2 = readr::col_number()
  )
  localidade <- readr::locale(
    encoding = "latin1",
    decimal_mark = ",",
    grouping_mark = "."
  )
  dados <- readr::read_csv2(
    x,
    na = c("", "NA", "."),
    col_types = tipos,
    locale = localidade,
    show_col_types = FALSE
  )
  janitor::clean_names(dados)
}
# Read one non-"basico" CSV file.
#
# x: path to the file; it is read with readr::read_csv2() as latin1 text.
# The sector id column is kept as character and every other column is parsed
# as a number ("," as decimal mark, "." as grouping mark). The values "", "X"
# and "NA" are treated as missing.
# Returns the data with names normalised by janitor::clean_names().
ler_variaveis <- function(x) {
  # Files matching this pattern spell the id column "CD_setor"; the others
  # use "CD_SETOR".
  id_minusculo <- stringr::str_detect(
    x,
    "alfabetizacao|domicilio[1-3]|demografia"
  )
  cls <- if (id_minusculo) {
    readr::cols(
      CD_setor = readr::col_character(),
      .default = readr::col_number()
    )
  } else {
    readr::cols(
      CD_SETOR = readr::col_character(),
      .default = readr::col_number()
    )
  }
  localidade <- readr::locale(
    encoding = "latin1",
    decimal_mark = ",",
    grouping_mark = "."
  )
  dados <- readr::read_csv2(
    x,
    col_types = cls,
    locale = localidade,
    na = c("", "X", "NA")
  )
  janitor::clean_names(dados)
}
# Convert one CSV file to Parquet, skipping files that were already converted.
#
# x: path to a CSV file whose path contains a "/csv/" directory component.
# Returns the Parquet path: the input path with its extension set to
# ".parquet" and the first "/csv/" replaced by "/parquet/".
ler_converter_parquet <- function(x) {
  f_parquet <- x |>
    fs::path_ext_set(".parquet") |>
    stringr::str_replace("/csv/", "/parquet/")
  if (!file.exists(f_parquet)) {
    usethis::ui_info("Convertendo {basename(x)}...")
    # Paths containing "_basico_" go through their own reader; everything
    # else goes through the generic one.
    da <- if (stringr::str_detect(x, "_basico_")) {
      ler_basico(x)
    } else {
      ler_variaveis(x)
    }
    # Make sure the output directory exists before writing into it.
    fs::dir_create(fs::path_dir(f_parquet))
    arrow::write_parquet(da, f_parquet)
  }
  f_parquet
}
# Convert every listed CSV file, showing a progress bar.
f_csv |>
  purrr::walk(ler_converter_parquet, .progress = TRUE)
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment