Some of the rows have more columns than the first row of column names.
writeLines(
"a\tb\tc\td\te\tf
1\t2\t3\t4\t5\t6\t
1\t2\t3\t4\t5\t6\t7\t
1\t2\t3\t4\t5\t6\t7\t8
1\t2\t3\t4\t5\t6\t7\t8\t9\t10\t11
1\t2\t3\t4\t5\t6
",
"test.tsv"
)
Note how the last column has entries with tabs \t
in it:
# 2021+ behavior
readr::with_edition(
2,
readr::read_tsv("test.tsv", show_col_types = FALSE)
)
#> Warning: One or more parsing issues, call `problems()` on your data frame for details,
#> e.g.:
#> dat <- vroom(...)
#> problems(dat)
#> # A tibble: 5 × 6
#> a b c d e f
#> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 1 2 3 4 5 "6"
#> 2 1 2 3 4 5 "6\t7"
#> 3 1 2 3 4 5 "6\t7\t8"
#> 4 1 2 3 4 5 "6\t7\t8\t9\t10\t11"
#> 5 1 2 3 4 5 "6"
These superfluous cells would be lopped off in the past.
# 2020 behavior, before vroom's parsing engine was adopted
readr::with_edition(
1,
readr::read_tsv("test.tsv", show_col_types = FALSE)
)
#> Warning: 4 parsing failures.
#> row col expected actual file
#> 1 -- 6 columns 7 columns 'test.tsv'
#> 2 -- 6 columns 8 columns 'test.tsv'
#> 3 -- 6 columns 8 columns 'test.tsv'
#> 4 -- 6 columns 11 columns 'test.tsv'
#> # A tibble: 5 × 6
#> a b c d e f
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 2 3 4 5 6
#> 2 1 2 3 4 5 6
#> 3 1 2 3 4 5 6
#> 4 1 2 3 4 5 6
#> 5 1 2 3 4 5 6
A sensible solution would be read each line separate as a tsv and bind them together.
raw_lines <- readr::read_lines("test.tsv")
rows <- raw_lines |>
tail(-1) |>
lapply(
function(x) {
readr::read_tsv(
I(x),
col_types = readr::cols(.default = readr::col_character()),
col_names = FALSE,
show_col_types = FALSE
)
}
) |>
purrr::list_rbind()
rows
#> # A tibble: 5 × 11
#> X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 1 2 3 4 5 6 <NA> <NA> <NA> <NA> <NA>
#> 2 1 2 3 4 5 6 7 <NA> <NA> <NA> <NA>
#> 3 1 2 3 4 5 6 7 8 <NA> <NA> <NA>
#> 4 1 2 3 4 5 6 7 8 9 10 11
#> 5 1 2 3 4 5 6 <NA> <NA> <NA> <NA> <NA>
Then do some surgery to preserve the column names we do have.
first_row <- readr::read_tsv(
I(head(raw_lines, 2)),
show_col_types = FALSE
)
#> Warning: One or more parsing issues, call `problems()` on your data frame for details,
#> e.g.:
#> dat <- vroom(...)
#> problems(dat)
names(rows)[seq_along(colnames(first_row))] <- colnames(first_row)
readr::type_convert(rows)
#>
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#> a = col_double(),
#> b = col_double(),
#> c = col_double(),
#> d = col_double(),
#> e = col_double(),
#> f = col_double(),
#> X7 = col_double(),
#> X8 = col_double(),
#> X9 = col_double(),
#> X10 = col_double(),
#> X11 = col_double()
#> )
#> # A tibble: 5 × 11
#> a b c d e f X7 X8 X9 X10 X11
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 2 3 4 5 6 NA NA NA NA NA
#> 2 1 2 3 4 5 6 7 NA NA NA NA
#> 3 1 2 3 4 5 6 7 8 NA NA NA
#> 4 1 2 3 4 5 6 7 8 9 10 11
#> 5 1 2 3 4 5 6 NA NA NA NA NA
The meltr package will read in the tsv into a long format.
m <- meltr::melt_tsv("test.tsv")
#> Registered S3 methods overwritten by 'meltr':
#> method from
#> print.date_names readr
#> print.locale readr
m
#> # A tibble: 50 × 4
#> row col data_type value
#> <dbl> <dbl> <chr> <chr>
#> 1 1 1 character a
#> 2 1 2 character b
#> 3 1 3 character c
#> 4 1 4 character d
#> 5 1 5 character e
#> 6 1 6 logical f
#> 7 2 1 integer 1
#> 8 2 2 integer 2
#> 9 2 3 integer 3
#> 10 2 4 integer 4
#> # ℹ 40 more rows
So we can pivot this into a nice format and have the row-binding happen automatically.
m_wide <- m |>
dplyr::filter(row != 1) |>
tidyr::pivot_wider(
id_cols = row,
names_from = col,
values_from = value
) |>
dplyr::select(-row)
m_wide
#> # A tibble: 9 × 11
#> `1` `2` `3` `4` `5` `6` `7` `8` `9` `10` `11`
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 1 2 3 4 5 6 <NA> <NA> <NA> <NA> <NA>
#> 2 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
#> 3 1 2 3 4 5 6 7 <NA> <NA> <NA> <NA>
#> 4 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
#> 5 1 2 3 4 5 6 7 8 <NA> <NA> <NA>
#> 6 1 2 3 4 5 6 7 8 9 10 11
#> 7 1 2 3 4 5 6 <NA> <NA> <NA> <NA> <NA>
#> 8 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
#> 9 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
Let’s merge the names we do know into the column names and apply readr’s name repair strategy.
current_names <- m |>
dplyr::filter(row == 1) |>
dplyr::pull(value)
new_names <- rep("", length(names(m_wide)))
new_names[seq_along(current_names)] <- current_names
names(m_wide) <- vctrs::vec_as_names(new_names, repair = "unique_quiet")
readr::type_convert(m_wide)
#>
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#> a = col_double(),
#> b = col_double(),
#> c = col_double(),
#> d = col_double(),
#> e = col_double(),
#> f = col_double(),
#> ...7 = col_double(),
#> ...8 = col_double(),
#> ...9 = col_double(),
#> ...10 = col_double(),
#> ...11 = col_double()
#> )
#> # A tibble: 9 × 11
#> a b c d e f ...7 ...8 ...9 ...10 ...11
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 2 3 4 5 6 NA NA NA NA NA
#> 2 NA NA NA NA NA NA NA NA NA NA NA
#> 3 1 2 3 4 5 6 7 NA NA NA NA
#> 4 NA NA NA NA NA NA NA NA NA NA NA
#> 5 1 2 3 4 5 6 7 8 NA NA NA
#> 6 1 2 3 4 5 6 7 8 9 10 11
#> 7 1 2 3 4 5 6 NA NA NA NA NA
#> 8 NA NA NA NA NA NA NA NA NA NA NA
#> 9 NA NA NA NA NA NA NA NA NA NA NA
Created on 2024-07-25 with reprex v2.1.1