Some of the rows have more columns than the first row of column names.
writeLines(
"a\tb\tc\td\te\tf
1\t2\t3\t4\t5\t6\t
1\t2\t3\t4\t5\t6\t7\t
1\t2\t3\t4\t5\t6\t7\t8
1\t2\t3\t4\t5\t6\t7\t8\t9\t10\t11
1\t2\t3\t4\t5\t6
",
"test.tsv"
)Note how the last column has entries with tabs \t in it:
# 2021+ behavior
readr::with_edition(
2,
readr::read_tsv("test.tsv", show_col_types = FALSE)
)
#> Warning: One or more parsing issues, call `problems()` on your data frame for details,
#> e.g.:
#> dat <- vroom(...)
#> problems(dat)
#> # A tibble: 5 × 6
#> a b c d e f
#> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 1 2 3 4 5 "6"
#> 2 1 2 3 4 5 "6\t7"
#> 3 1 2 3 4 5 "6\t7\t8"
#> 4 1 2 3 4 5 "6\t7\t8\t9\t10\t11"
#> 5 1 2 3 4 5 "6"These superfluous cells would be lopped off in the past.
# 2020 behavior, before vroom's parsing engine was adopted
readr::with_edition(
1,
readr::read_tsv("test.tsv", show_col_types = FALSE)
)
#> Warning: 4 parsing failures.
#> row col expected actual file
#> 1 -- 6 columns 7 columns 'test.tsv'
#> 2 -- 6 columns 8 columns 'test.tsv'
#> 3 -- 6 columns 8 columns 'test.tsv'
#> 4 -- 6 columns 11 columns 'test.tsv'
#> # A tibble: 5 × 6
#> a b c d e f
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 2 3 4 5 6
#> 2 1 2 3 4 5 6
#> 3 1 2 3 4 5 6
#> 4 1 2 3 4 5 6
#> 5 1 2 3 4 5 6A sensible solution would be read each line separate as a tsv and bind them together.
raw_lines <- readr::read_lines("test.tsv")
rows <- raw_lines |>
tail(-1) |>
lapply(
function(x) {
readr::read_tsv(
I(x),
col_types = readr::cols(.default = readr::col_character()),
col_names = FALSE,
show_col_types = FALSE
)
}
) |>
purrr::list_rbind()
rows
#> # A tibble: 5 × 11
#> X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 1 2 3 4 5 6 <NA> <NA> <NA> <NA> <NA>
#> 2 1 2 3 4 5 6 7 <NA> <NA> <NA> <NA>
#> 3 1 2 3 4 5 6 7 8 <NA> <NA> <NA>
#> 4 1 2 3 4 5 6 7 8 9 10 11
#> 5 1 2 3 4 5 6 <NA> <NA> <NA> <NA> <NA>Then do some surgery to preserve the column names we do have.
first_row <- readr::read_tsv(
I(head(raw_lines, 2)),
show_col_types = FALSE
)
#> Warning: One or more parsing issues, call `problems()` on your data frame for details,
#> e.g.:
#> dat <- vroom(...)
#> problems(dat)
names(rows)[seq_along(colnames(first_row))] <- colnames(first_row)
readr::type_convert(rows)
#>
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#> a = col_double(),
#> b = col_double(),
#> c = col_double(),
#> d = col_double(),
#> e = col_double(),
#> f = col_double(),
#> X7 = col_double(),
#> X8 = col_double(),
#> X9 = col_double(),
#> X10 = col_double(),
#> X11 = col_double()
#> )
#> # A tibble: 5 × 11
#> a b c d e f X7 X8 X9 X10 X11
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 2 3 4 5 6 NA NA NA NA NA
#> 2 1 2 3 4 5 6 7 NA NA NA NA
#> 3 1 2 3 4 5 6 7 8 NA NA NA
#> 4 1 2 3 4 5 6 7 8 9 10 11
#> 5 1 2 3 4 5 6 NA NA NA NA NAThe meltr package will read in the tsv into a long format.
m <- meltr::melt_tsv("test.tsv")
#> Registered S3 methods overwritten by 'meltr':
#> method from
#> print.date_names readr
#> print.locale readr
m
#> # A tibble: 50 × 4
#> row col data_type value
#> <dbl> <dbl> <chr> <chr>
#> 1 1 1 character a
#> 2 1 2 character b
#> 3 1 3 character c
#> 4 1 4 character d
#> 5 1 5 character e
#> 6 1 6 logical f
#> 7 2 1 integer 1
#> 8 2 2 integer 2
#> 9 2 3 integer 3
#> 10 2 4 integer 4
#> # ℹ 40 more rowsSo we can pivot this into a nice format and have the row-binding happen automatically.
m_wide <- m |>
dplyr::filter(row != 1) |>
tidyr::pivot_wider(
id_cols = row,
names_from = col,
values_from = value
) |>
dplyr::select(-row)
m_wide
#> # A tibble: 9 × 11
#> `1` `2` `3` `4` `5` `6` `7` `8` `9` `10` `11`
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 1 2 3 4 5 6 <NA> <NA> <NA> <NA> <NA>
#> 2 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
#> 3 1 2 3 4 5 6 7 <NA> <NA> <NA> <NA>
#> 4 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
#> 5 1 2 3 4 5 6 7 8 <NA> <NA> <NA>
#> 6 1 2 3 4 5 6 7 8 9 10 11
#> 7 1 2 3 4 5 6 <NA> <NA> <NA> <NA> <NA>
#> 8 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
#> 9 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>Let’s merge the names we do know into the column names and apply readr’s name repair strategy.
current_names <- m |>
dplyr::filter(row == 1) |>
dplyr::pull(value)
new_names <- rep("", length(names(m_wide)))
new_names[seq_along(current_names)] <- current_names
names(m_wide) <- vctrs::vec_as_names(new_names, repair = "unique_quiet")
readr::type_convert(m_wide)
#>
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#> a = col_double(),
#> b = col_double(),
#> c = col_double(),
#> d = col_double(),
#> e = col_double(),
#> f = col_double(),
#> ...7 = col_double(),
#> ...8 = col_double(),
#> ...9 = col_double(),
#> ...10 = col_double(),
#> ...11 = col_double()
#> )
#> # A tibble: 9 × 11
#> a b c d e f ...7 ...8 ...9 ...10 ...11
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 2 3 4 5 6 NA NA NA NA NA
#> 2 NA NA NA NA NA NA NA NA NA NA NA
#> 3 1 2 3 4 5 6 7 NA NA NA NA
#> 4 NA NA NA NA NA NA NA NA NA NA NA
#> 5 1 2 3 4 5 6 7 8 NA NA NA
#> 6 1 2 3 4 5 6 7 8 9 10 11
#> 7 1 2 3 4 5 6 NA NA NA NA NA
#> 8 NA NA NA NA NA NA NA NA NA NA NA
#> 9 NA NA NA NA NA NA NA NA NA NA NACreated on 2024-07-25 with reprex v2.1.1