20 newsgroups python/R

why use reticulate when you can make life really difficult for yourself?

Carry out the TF-IDF procedure from https://umap-learn.readthedocs.io/en/latest/sparse.html in Python, then save the result in a form palatable to R:

# change this
your_data_path = "/your/data/path"

import os

import numpy as np  # required below: the script calls np.column_stack / np.savetxt
import sklearn.datasets
import sklearn.feature_extraction.text

# Fetch the pre-vectorized 20 newsgroups data (all subsets) and apply an
# L1-normalized TF-IDF transform to the resulting sparse matrix.
ng20v = sklearn.datasets.fetch_20newsgroups_vectorized(subset="all")
ng20tfidf = sklearn.feature_extraction.text.TfidfTransformer(norm='l1').fit_transform(ng20v.data)

# COO format exposes parallel row / col / data arrays, one entry per stored
# value, which maps directly onto a triplet (i, j, x) representation.
ng20tfidfcoo = ng20tfidf.tocoo()
# Two-column array of (row, col) indices; column_stack avoids building a
# Python list of tuples for every stored entry.
ng20tfidfrc = np.column_stack((ng20tfidfcoo.row, ng20tfidfcoo.col))
# save row/column (or i,j) ints -- written as integers (0-indexed) rather than
# in savetxt's default floating-point format
np.savetxt(os.path.join(your_data_path, "20NGtfidf-rc.csv"), ng20tfidfrc, fmt="%d", delimiter=',')
# save actual float values (default format keeps full double precision)
np.savetxt(os.path.join(your_data_path, "20NGtfidf-x.csv"), ng20tfidfcoo.data, delimiter=',')

# save the labels: integer class ids (0-indexed), then one class name per line
np.savetxt(os.path.join(your_data_path, "20NGtfidf-y.csv"), ng20v.target, fmt="%d")
np.savetxt(os.path.join(your_data_path, "20NGtfidf-ynames.csv"), ng20v.target_names, fmt = "%s")

over in R:

# change this
your_data_path <- "/your/data/path"

# Read the triplet representation written by the Python step:
# (row, col) index pairs in one file, the matching values in another.
ng20rc <- read.csv(file.path(your_data_path, "20NGtfidf-rc.csv"), header = FALSE)
ng20x <- read.csv(file.path(your_data_path, "20NGtfidf-x.csv"), header = FALSE)
ng20tfidf <- cbind(ng20rc, ng20x)
colnames(ng20tfidf) <- c("row", "col", "tdfif")

# Assemble the sparse matrix. The indices are 0-based, hence index1 = FALSE
# and the "+ 1" when deriving the dimensions from the largest index.
# sparseMatrix() lives in the Matrix package; it is namespace-qualified so the
# script does not depend on Matrix having been attached beforehand.
ng20tfidfs <-
  Matrix::sparseMatrix(
    i = ng20tfidf$row,
    j = ng20tfidf$col,
    x = ng20tfidf$tdfif,
    dims = c(max(ng20tfidf$row) + 1, max(ng20tfidf$col) + 1),
    index1 = FALSE
  )

# you might want to save ng20tfidfs for future processing
# save(file = file.path(your_data_path, "ng20tfidfs.Rda"), ng20tfidfs)

# The label file holds 0-based class ids; add 1 so they can be used to index
# R vectors.
ng20y <- 1 + read.csv(
  file.path(your_data_path, "20NGtfidf-y.csv"),
  header = FALSE
)[["V1"]]

# Class names, one per line of the file.
ng20ynames <- read.csv(
  file.path(your_data_path, "20NGtfidf-ynames.csv"),
  header = FALSE
)[["V1"]]

# Newsgroup name of every document, as a factor whose levels follow the order
# of the names file.
ng20ng <- factor(ng20ynames[ng20y], levels = ng20ynames)

# Use irlba for a fast accurate truncated SVD: keep the scores ($x) of the
# first 100 principal components, centering but not scaling the columns.
# `scale.` is spelled out in full rather than relying on partial matching
# of `scale`.
ng20tfidf_pca100 <- irlba::prcomp_irlba(ng20tfidfs,
    n = 100, retx = TRUE, center = TRUE,
    scale. = FALSE
  )$x

# subtract 1 from ng20y to get the labeling back to 0 indexed
# The label columns are named directly here, so their names do not depend on
# the number of components requested above.
ng20t100 <- data.frame(
  ng20tfidf_pca100,
  Label = factor(ng20y - 1),
  NG = ng20ng
)

jlmelville/ng20.md