Create n-grams for large text files (very fast)
source("fast-ngrams.R") | |
con <- file("path_to_file", "r") | |
data <- readLines(con, encoding = 'UTF-8') | |
close(con) | |
data <- clean(data) | |
onegram <- text_to_ngrams(decode(data), 1) | |
bigram <- text_to_ngrams(decode(data), 2) | |
trigram <- text_to_ngrams(decode(data, 3)) | |
# How to calculate ngrams for a term | |
sum(blogs_ngram[,colnames(onegram) == 'term']) |
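Since text_to_ngrams returns a sparse document-term matrix, column sums give corpus-wide frequencies. A minimal sketch of ranking the most frequent terms ('term' above is a placeholder for whatever token you want to look up):

# Top 10 most frequent unigrams across the corpus
# (colSums() from the Matrix package works on sparse matrices)
head(sort(colSums(onegram), decreasing = TRUE), 10)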
fast-ngrams.R
library(stringi)
library(Matrix)
library(tm)
library(pbapply)

# Build up to n-grams from a list of token vectors.
# Each element of dat is a character vector of tokens for one document;
# the result keeps the unigrams and appends the 2..n-grams.
find_ngrams <- function(dat, n = 1, verbose = FALSE) {
  stopifnot(is.list(dat))
  stopifnot(is.numeric(n))
  stopifnot(n > 0)
  if (n == 1) return(dat)
  pblapply(dat, function(y) {
    if (length(y) <= 1) return(y)
    c(y, unlist(lapply(2:n, function(n_i) {
      if (n_i > length(y)) return(NULL)
      # embed() produces the sliding windows; rev() restores token order
      do.call(paste, unname(as.data.frame(embed(rev(y), n_i), stringsAsFactors = FALSE)), quote = FALSE)
    })))
  })
}

# Tokenize sentences and return a sparse document-term matrix
# whose columns are the 1..n-grams.
text_to_ngrams <- function(sents, n = 2) {
  tokens <- stri_split_fixed(sents, ' ')
  tokens <- find_ngrams(tokens, n = n, verbose = TRUE)
  token_vector <- unlist(tokens)
  bagofwords <- unique(token_vector)
  n.ids <- sapply(tokens, length)
  i <- rep(seq_along(n.ids), n.ids)    # row index: which document
  j <- match(token_vector, bagofwords) # column index: which n-gram
  M <- sparseMatrix(i = i, j = j, x = 1L) # duplicate (i, j) pairs are summed into counts
  colnames(M) <- bagofwords
  return(M)
}

# Basic tm preprocessing: drop numbers and punctuation,
# collapse whitespace, stem each token
clean <- function(docs) {
  docs <- removeNumbers(docs)
  docs <- removePunctuation(docs)
  docs <- stripWhitespace(docs)
  docs <- stemDocument(docs)
  return(docs)
}

# Strip non-ASCII characters; iconv() is base R, so no extra package is needed.
# sub = "" drops untranslatable characters instead of turning the whole line into NA.
decode <- function(text) {
  iconv(text, from = "UTF-8", to = "ASCII", sub = "")
}
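As a quick sanity check of how the embed()/rev() trick expands tokens, here is a toy example (the input sentences are invented for illustration; expected output is shown in comments, assuming fast-ngrams.R has been sourced):

source("fast-ngrams.R")

# Unigrams are kept and bigrams are appended; the bigrams come out in
# reverse document order because of rev(), which is harmless for counting.
find_ngrams(list(c("the", "quick", "brown", "fox")), n = 2)
# [[1]]
# [1] "the"         "quick"       "brown"       "fox"
# [5] "brown fox"   "quick brown" "the quick"

# Two toy documents -> a 2-row sparse matrix with one column per distinct n-gram
M <- text_to_ngrams(c("the quick brown fox", "the lazy dog"), n = 2)
dim(M)
sum(M[, colnames(M) == "the"])  # "the" occurs twice across both documents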
I was searching for a way to create n-grams for large text files without long pauses, and to be honest the solution Zach (https://github.com/zachmayer) gave in his answer at http://stackoverflow.com/questions/31570437/really-fast-word-ngram-vectorization-in-r really did the trick. I improved on it a bit and published it here for people who need it.