Mining Massive Datasets Programming Assignment 2: Finding Similar Sentences
#
# MMDS Programming Assignment 2: Finding Similar Sentences
# This assignment is an optional challenge and it won't count in your final grade.
# Your task is to quickly find the number of pairs of sentences that are at word-level edit distance at most 1. Two sentences S1 and S2 are at edit distance 1 if S1 can be transformed into S2 by adding, removing, or substituting a single word.
# For example, consider the following sentences, where each letter represents a word:
# S1: A B C D
# S2: A B X D
# S3: A B C
# S4: A B X C
# Then the following pairs of sentences are at word edit distance 1 or less: (S1, S2), (S1, S3), (S2, S4), (S3, S4).
# The input data has 9,397,023 sentences, one per line, each with the sentence id at the beginning of the line. The zip-compressed file is around 500MB and is located at https://d396qusza40orc.cloudfront.net/mmds/datasets/sentences.txt.zip.
# All sentences in the input data are at least 10 words long. A straightforward LSH approach like the one taught in class for Jaccard similarity can be used to solve this problem, although it may not necessarily be the fastest approach.
# Submit the number of similar pairs you find as a number without commas. For example, if you find 12,345 pairs, submit 12345.
#
## Install any missing required R packages.
packages <- c('tm', 'fastmatch')
if (length(setdiff(packages, rownames(installed.packages()))) > 0) {
  install.packages(setdiff(packages, rownames(installed.packages())))
}
#
library(tm)
library(fastmatch)
# Toy sentences for testing (each word is a token).
s <- c('dog fat cow', 'dog fat hoe', 'skinny puppy slick', 'cat dog eats bones', 'a really long sentence', 'a really changed long sentence', 'skinny puppy gets slick')
s1b <- 'A B C D'
s2b <- 'A B X D'
s3b <- 'A B C'
s4b <- 'A B X C'
#s <- c(s1b, s2b, s3b, s4b)
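# A hedged sanity check, not part of the solution: an exact word-level
# Levenshtein distance computed by dynamic programming. The helper name
# wordEditDistance is our own addition. It confirms the pairs listed in the
# header, e.g. wordEditDistance(s1b, s2b) == 1 and wordEditDistance(s1b, s4b) == 2.
wordEditDistance <- function(a, b) {
  # Tokenize each sentence into words.
  wa <- strsplit(a, ' ')[[1]]
  wb <- strsplit(b, ' ')[[1]]
  n <- length(wa)
  m <- length(wb)
  # d[i + 1, j + 1] holds the edit distance between wa[1:i] and wb[1:j].
  d <- matrix(0, n + 1, m + 1)
  d[, 1] <- 0:n
  d[1, ] <- 0:m
  for (i in 1:n) {
    for (j in 1:m) {
      cost <- if (wa[i] == wb[j]) 0 else 1
      d[i + 1, j + 1] <- min(d[i, j + 1] + 1,  # remove a word
                             d[i + 1, j] + 1,  # add a word
                             d[i, j] + cost)   # substitute (or match)
    }
  }
  d[n + 1, m + 1]
}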
# Download the dataset if it does not exist. If the download fails, try the option method = 'curl'.
fileName <- 'sentences.txt.zip'
if (!file.exists(fileName)) {
  download.file('https://d396qusza40orc.cloudfront.net/mmds/datasets/sentences.txt.zip', fileName)
}
# Read the data (nrows = 10000 limits this run to the first 10,000 sentences).
s <- read.table(unz(fileName, gsub('.zip', '', fileName)), sep = '\n', nrows = 10000, stringsAsFactors = FALSE)
# Remove the leading sentence id from each line.
s <- sapply(s, function(f) gsub('^\\d+ ', '', f))
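# A quick illustration of the id-stripping step (the sample line is made up):
gsub('^\\d+ ', '', '42 the quick brown fox')  # returns 'the quick brown fox'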
# Start the clock.
ptm <- proc.time()
# Create a corpus of the documents.
corpus <- Corpus(VectorSource(s))
# Create a document-term matrix; wordLengths = c(1, Inf) keeps single-letter words as terms.
docmatrix <- DocumentTermMatrix(corpus, list(wordLengths = c(1, Inf)))
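# Optional inspection (output depends on the loaded data): peek at the term
# dictionary that the sentences will be encoded against.
head(docmatrix$dimnames$Terms)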
textIndices <- function(text, words) {
  # Converts text into a vector of index values into the dictionary of words.
  # The text is tokenized into a list of words. 'words' should be a character (string) vector (i.e., documentTermMatrix$dimnames$Terms).
  fmatch(scan_tokenizer(tolower(text)), words)
}
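# A hedged usage example, not needed for the solution: encode the first
# sentence and map the indices back to their dictionary terms.
exampleIndices <- textIndices(s[1], docmatrix$dimnames$Terms)
docmatrix$dimnames$Terms[exampleIndices]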
# Convert sentences into vectors of index values into the dictionary.
indices <- lapply(s, function(text) textIndices(text, docmatrix$dimnames$Terms))
# Find sentences at edit distance 1. Check for a +/- difference of a single word.
distances <- apply(as.matrix(indices), 1, function(row1) {
  str1 <- unlist(row1)
  apply(as.matrix(indices), 1, function(row2) {
    str2 <- unlist(row2)
    # Find the words (as dictionary indices) common to both sentences.
    inter <- intersect(str1, str2)
    # Approximate the word-edit distance as the larger count of unmatched words in either sentence; this ignores word order and duplicates (http://stackoverflow.com/a/28878126/2596404).
    max(length(str1[!(str1 %in% inter)]), length(str2[!(str2 %in% inter)]))
  })
})
# Stop the clock.
proc.time() - ptm
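# Note: the nested apply above compares all pairs, which is O(n^2) in the
# number of sentences and is only practical for the 10,000-sentence sample.
# A hedged sketch of one pruning idea (our own, not from the assignment):
# sentences at word edit distance <= 1 differ in length by at most one word,
# so candidates only need to be drawn from sentences of length l and l + 1.
lens <- sapply(indices, length)
groups <- split(seq_along(indices), lens)
# For each length l, compare pairs within groups[[as.character(l)]] and pairs
# spanning that group and the group of length l + 1; skip all other pairs.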
# Count the number of pairs at edit distance 1 (divide by 2 to discard a->b, b->a duplicates).
editDistance1 <- length(which(distances == 1)) / 2
# Count the number of identical sentences (subtract the length of the corpus to discard a->a) and divide by 2 to count pairs (discard a->b, b->a duplicates).
editDistance0 <- (length(which(distances == 0)) - length(s)) / 2
ed1 <- editDistance1 + editDistance0
ed1