Mining Massive Datasets Programming Assignment 2: Finding Similar Sentences
#
# MMDS Programming Assignment 2: Finding Similar Sentences
# This assignment is an optional challenge and it won't count in your final grade.
# Your task is to quickly find the number of pairs of sentences that are at word-level edit distance at most 1. Two sentences S1 and S2 are at edit distance 1 if S1 can be transformed into S2 by adding, removing, or substituting a single word.
# For example, consider the following sentences, where each letter represents a word:
# S1: A B C D
# S2: A B X D
# S3: A B C
# S4: A B X C
# Then the following pairs of sentences are at word edit distance 1 or less: (S1, S2), (S1, S3), (S2, S4), (S3, S4).
# The input data has 9,397,023 sentences, one per line, each with the sentence id at the beginning of the line. The zip-compressed file is around 500MB and is located at https://d396qusza40orc.cloudfront.net/mmds/datasets/sentences.txt.zip.
# All sentences in the input data are at least 10 words long. A straightforward LSH approach like the one taught in class for Jaccard similarity can be used to solve this problem, although it may not necessarily be the fastest approach.
# Submit the number of similar pairs you find as a number without commas. For example, if you find 12,345 pairs, submit 12345.
#
## Install any missing required R packages.
packages <- c('tm', 'fastmatch')
if (length(setdiff(packages, rownames(installed.packages()))) > 0) {
  install.packages(setdiff(packages, rownames(installed.packages())))
}
#
library(tm)
library(fastmatch)
# Toy sentences for testing (each word is a token).
s <- c('dog fat cow', 'dog fat hoe', 'skinny puppy slick', 'cat dog eats bones', 'a really long sentence', 'a really changed long sentence', 'skinny puppy gets slick')
s1b <- 'A B C D'
s2b <- 'A B X D'
s3b <- 'A B C'
s4b <- 'A B X C'
#s <- c(s1b, s2b, s3b, s4b)
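# A hedged sanity check, not part of the solution: an exact word-level
# Levenshtein distance computed by dynamic programming. The helper name
# wordEditDistance is our own addition. It confirms the pairs listed in the
# header, e.g. wordEditDistance(s1b, s2b) == 1 and wordEditDistance(s1b, s4b) == 2.
wordEditDistance <- function(a, b) {
  # Tokenize each sentence into words.
  wa <- strsplit(a, ' ')[[1]]
  wb <- strsplit(b, ' ')[[1]]
  n <- length(wa)
  m <- length(wb)
  # d[i + 1, j + 1] holds the edit distance between wa[1:i] and wb[1:j].
  d <- matrix(0, n + 1, m + 1)
  d[, 1] <- 0:n
  d[1, ] <- 0:m
  for (i in 1:n) {
    for (j in 1:m) {
      cost <- if (wa[i] == wb[j]) 0 else 1
      d[i + 1, j + 1] <- min(d[i, j + 1] + 1,  # remove a word
                             d[i + 1, j] + 1,  # add a word
                             d[i, j] + cost)   # substitute (or match)
    }
  }
  d[n + 1, m + 1]
}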
# Download the dataset if it does not exist. If the download fails, try the option method = 'curl'.
fileName <- 'sentences.txt.zip'
if (!file.exists(fileName)) {
  download.file('https://d396qusza40orc.cloudfront.net/mmds/datasets/sentences.txt.zip', fileName)
}
# Read the data (nrows = 10000 limits this run to the first 10,000 sentences).
s <- read.table(unz(fileName, gsub('.zip', '', fileName)), sep = '\n', nrows = 10000, stringsAsFactors = FALSE)
# Remove the leading sentence id from each line.
s <- sapply(s, function(f) gsub('^\\d+ ', '', f))
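# A quick illustration of the id-stripping step (the sample line is made up):
gsub('^\\d+ ', '', '42 the quick brown fox')  # returns 'the quick brown fox'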
# Start the clock.
ptm <- proc.time()
# Create a corpus of the documents.
corpus <- Corpus(VectorSource(s))
# Create a document-term matrix; wordLengths = c(1, Inf) keeps single-letter words as terms.
docmatrix <- DocumentTermMatrix(corpus, list(wordLengths = c(1, Inf)))
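# Optional inspection (output depends on the loaded data): peek at the term
# dictionary that the sentences will be encoded against.
head(docmatrix$dimnames$Terms)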
textIndices <- function(text, words) {
  # Converts text into a vector of index values into the dictionary of words.
  # The text is tokenized into a list of words. 'words' should be a character (string) vector (i.e., documentTermMatrix$dimnames$Terms).
  fmatch(scan_tokenizer(tolower(text)), words)
}
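# A hedged usage example, not needed for the solution: encode the first
# sentence and map the indices back to their dictionary terms.
exampleIndices <- textIndices(s[1], docmatrix$dimnames$Terms)
docmatrix$dimnames$Terms[exampleIndices]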
# Convert sentences into vectors of index values into the dictionary.
indices <- lapply(s, function(text) textIndices(text, docmatrix$dimnames$Terms))
# Find sentences at edit distance 1. Check for a +/- difference of a single word.
distances <- apply(as.matrix(indices), 1, function(row1) {
  str1 <- unlist(row1)
  apply(as.matrix(indices), 1, function(row2) {
    str2 <- unlist(row2)
    # Find the words (as dictionary indices) common to both sentences.
    inter <- intersect(str1, str2)
    # Approximate the word-edit distance as the larger count of unmatched words in either sentence; this ignores word order and duplicates (http://stackoverflow.com/a/28878126/2596404).
    max(length(str1[!(str1 %in% inter)]), length(str2[!(str2 %in% inter)]))
  })
})
# Stop the clock.
proc.time() - ptm
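# Note: the nested apply above compares all pairs, which is O(n^2) in the
# number of sentences and is only practical for the 10,000-sentence sample.
# A hedged sketch of one pruning idea (our own, not from the assignment):
# sentences at word edit distance <= 1 differ in length by at most one word,
# so candidates only need to be drawn from sentences of length l and l + 1.
lens <- sapply(indices, length)
groups <- split(seq_along(indices), lens)
# For each length l, compare pairs within groups[[as.character(l)]] and pairs
# spanning that group and the group of length l + 1; skip all other pairs.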
# Count the number of pairs at edit distance 1 (divide by 2 to discard a->b, b->a duplicates).
editDistance1 <- length(which(distances == 1)) / 2
# Count the number of identical sentences (subtract the length of the corpus to discard a->a) and divide by 2 to count pairs (discard a->b, b->a duplicates).
editDistance0 <- (length(which(distances == 0)) - length(s)) / 2
ed1 <- editDistance1 + editDistance0
ed1