sureshgorakala · October 24, 2017 05:03
diff --git a/information_retreival_part1.r b/information_retreival_part1.r
 #load required packages
 if(!require("readtext"))
  install.packages("readtext")
 library(readtext)

 if(!require("tm"))
  install.packages("tm")
 library(tm)

 if(!require("stringr"))
  install.packages("stringr")
 library(stringr)

 if(!require("qdap"))
  install.packages("qdap")
 library(qdap)

 if(!require("slam"))
  install.packages("slam")
 library(slam)

 #data files are uploaded at below location:
 #https://github.com/sureshgorakala/machinelearning/tree/master/data

 # Load the news articles from the working directory.
 # The "*.txt" glob also matches query.txt (the search-query file read below),
 # so that file is dropped here; otherwise the queries themselves would be
 # indexed as a document and ranked against the real articles.
 news_docs <- readtext("*.txt")
 news_docs <- news_docs[basename(news_docs[, 1]) != "query.txt", ]
 if (nrow(news_docs) == 0) {
   stop("no news documents found: expected *.txt files besides query.txt",
        call. = FALSE)
 }
 # genX() comes from qdap; presumably it removes the text between the given
 # left/right markers (bracketed spans such as " [1]") -- confirm in qdap docs.
 news_list <- lapply(news_docs[, 2], function(x) genX(x, " [", "]"))
 N.docs <- length(news_list)
 names(news_list) <- news_docs[, 1]

 # Load the search queries: query.txt holds one query per line.
 search_queries <- readtext("query.txt", dvsep = "\n")
 # Split on LF or CRLF so a Windows-edited file does not leave a trailing "\r"
 # on every query (which would break exact-match lookups later), and skip
 # blank lines, which would otherwise become empty query documents.
 queries_list <- unlist(strsplit(search_queries[1, 2], "\r?\n"))
 queries_list <- queries_list[nzchar(trimws(queries_list))]
 N.query <- length(queries_list)
 # seq_len() stays empty when there are no queries, unlike 1:N.query.
 names(queries_list) <- paste0("query", seq_len(N.query))

 # Build one corpus holding the articles followed by the queries, so that both
 # are weighted against the same vocabulary and document frequencies.
 newscorpus <- VectorSource(c(news_list, queries_list))
 # NOTE(review): kept from the original; it is unclear whether current tm
 # versions read a `Names` field on the source. Column names are set
 # explicitly on the matrix below either way.
 newscorpus$Names <- c(names(news_list), names(queries_list))
 newscorpus_preproc <- Corpus(newscorpus)

 # Clean the text. NOTE(review): punctuation is removed before stop words, so
 # contractions (e.g. "don't" -> "dont") presumably escape the stop-word
 # filter; the order is kept so existing scores do not shift.
 newscorpus_preproc <- tm_map(newscorpus_preproc, stripWhitespace)
 newscorpus_preproc <- tm_map(newscorpus_preproc, removePunctuation)
 newscorpus_preproc <- tm_map(newscorpus_preproc, content_transformer(tolower))
 newscorpus_preproc <- tm_map(newscorpus_preproc, removeWords,
                              stopwords("english"))

 # Term-document matrix with unnormalised tf-idf weights; columns are renamed
 # to the article/query names in corpus order.
 tdm <- TermDocumentMatrix(
   newscorpus_preproc,
   control = list(weighting = function(x) weightTfIdf(x, normalize = FALSE))
 )
 tdm_mat <- as.matrix(tdm)
 colnames(tdm_mat) <- c(names(news_list), names(queries_list))

 # Scale every column to unit Euclidean length so the dot products below are
 # cosine similarities. A column with no surviving terms has norm 0; dividing
 # by it would give NaN scores, so such columns are left as all zeros.
 col_norms <- sqrt(colSums(tdm_mat^2))
 col_norms[col_norms == 0] <- 1
 tfidf_mat <- sweep(tdm_mat, 2, col_norms, "/")

 # Separate the query columns from the article columns. drop = FALSE keeps
 # both as matrices (with their column names) even when there is only one
 # query or only one article.
 query.vectors <- tfidf_mat[, N.docs + seq_len(N.query), drop = FALSE]
 tfidf_mat <- tfidf_mat[, seq_len(N.docs), drop = FALSE]

 # Similarity scores: one row per query, one column per article.
 doc.scores <- t(query.vectors) %*% tfidf_mat

 results.df <- data.frame(querylist = queries_list, doc.scores,
                          stringsAsFactors = FALSE)

 # Show the best-matching documents for one search query.
 #
 # Arguments:
 #   query   Query text; must equal one entry of results$querylist.
 #   n       Maximum number of documents to return (default 3).
 #   results Score table with a `querylist` column plus one numeric score
 #           column per document. Defaults to the global results.df.
 #
 # Returns a data frame with columns `score` and `docs`, sorted by decreasing
 # score. Only documents scoring above 0 are listed, so fewer than n rows
 # (possibly none) come back instead of NA padding.
 # Stops with an error when the query is not present in `results`.
 showTopresults <- function(query, n = 3, results = results.df) {
   hit <- which(results$querylist == query)
   if (length(hit) == 0) {
     stop("query not found in results: ", query, call. = FALSE)
   }

   # Read the scores straight from the numeric columns; transposing the mixed
   # row would format them as text and lose precision. If the same query text
   # occurs more than once, the first occurrence is used.
   score_cols <- setdiff(names(results), "querylist")
   scores <- unlist(results[hit[1], score_cols, drop = FALSE],
                    use.names = FALSE)
   ranked <- data.frame(
     score = as.numeric(scores),
     docs = score_cols,
     stringsAsFactors = FALSE
   )

   # Keep real matches only (NA/NaN scores count as no match), best first.
   ranked <- ranked[!is.na(ranked$score) & ranked$score > 0, , drop = FALSE]
   ranked <- ranked[order(ranked$score, decreasing = TRUE), , drop = FALSE]
   rownames(ranked) <- NULL

   head(ranked, n)
 }

 # Example lookup. The argument has to equal one line of query.txt exactly,
 # because showTopresults() matches it with `==` against results.df$querylist.
 showTopresults("narendra modi visit to washington")
	#load required packages
	if(!require("readtext"))
	install.packages("readtext")
	library(readtext)

	if(!require("tm"))
	install.packages("tm")
	library(tm)

	if(!require("stringr"))
	install.packages("stringr")
	library(stringr)

	if(!require("qdap"))
	install.packages("qdap")
	library(qdap)

	if(!require("slam"))
	install.packages("slam")
	library(slam)

	#data files are uploaded at below location:
	#https://github.com/sureshgorakala/machinelearning/tree/master/data

	# Load the news articles from the working directory.
	# The "*.txt" glob also matches query.txt (the search-query file read below),
	# so that file is dropped here; otherwise the queries themselves would be
	# indexed as a document and ranked against the real articles.
	news_docs <- readtext("*.txt")
	news_docs <- news_docs[basename(news_docs[, 1]) != "query.txt", ]
	if (nrow(news_docs) == 0) {
		stop("no news documents found: expected *.txt files besides query.txt",
			call. = FALSE)
	}
	# genX() comes from qdap; presumably it removes the text between the given
	# left/right markers (bracketed spans such as " [1]") -- confirm in qdap docs.
	news_list <- lapply(news_docs[, 2], function(x) genX(x, " [", "]"))
	N.docs <- length(news_list)
	names(news_list) <- news_docs[, 1]

	# Load the search queries: query.txt holds one query per line.
	search_queries <- readtext("query.txt", dvsep = "\n")
	# Split on LF or CRLF so a Windows-edited file does not leave a trailing "\r"
	# on every query (which would break exact-match lookups later), and skip
	# blank lines, which would otherwise become empty query documents.
	queries_list <- unlist(strsplit(search_queries[1, 2], "\r?\n"))
	queries_list <- queries_list[nzchar(trimws(queries_list))]
	N.query <- length(queries_list)
	# seq_len() stays empty when there are no queries, unlike 1:N.query.
	names(queries_list) <- paste0("query", seq_len(N.query))

	# Build one corpus holding the articles followed by the queries, so that both
	# are weighted against the same vocabulary and document frequencies.
	newscorpus <- VectorSource(c(news_list, queries_list))
	# NOTE(review): kept from the original; it is unclear whether current tm
	# versions read a `Names` field on the source. Column names are set
	# explicitly on the matrix below either way.
	newscorpus$Names <- c(names(news_list), names(queries_list))
	newscorpus_preproc <- Corpus(newscorpus)

	# Clean the text. NOTE(review): punctuation is removed before stop words, so
	# contractions (e.g. "don't" -> "dont") presumably escape the stop-word
	# filter; the order is kept so existing scores do not shift.
	newscorpus_preproc <- tm_map(newscorpus_preproc, stripWhitespace)
	newscorpus_preproc <- tm_map(newscorpus_preproc, removePunctuation)
	newscorpus_preproc <- tm_map(newscorpus_preproc, content_transformer(tolower))
	newscorpus_preproc <- tm_map(newscorpus_preproc, removeWords,
		stopwords("english"))

	# Term-document matrix with unnormalised tf-idf weights; columns are renamed
	# to the article/query names in corpus order.
	tdm <- TermDocumentMatrix(
		newscorpus_preproc,
		control = list(weighting = function(x) weightTfIdf(x, normalize = FALSE))
	)
	tdm_mat <- as.matrix(tdm)
	colnames(tdm_mat) <- c(names(news_list), names(queries_list))

	# Scale every column to unit Euclidean length so the dot products below are
	# cosine similarities. A column with no surviving terms has norm 0; dividing
	# by it would give NaN scores, so such columns are left as all zeros.
	col_norms <- sqrt(colSums(tdm_mat^2))
	col_norms[col_norms == 0] <- 1
	tfidf_mat <- sweep(tdm_mat, 2, col_norms, "/")

	# Separate the query columns from the article columns. drop = FALSE keeps
	# both as matrices (with their column names) even when there is only one
	# query or only one article.
	query.vectors <- tfidf_mat[, N.docs + seq_len(N.query), drop = FALSE]
	tfidf_mat <- tfidf_mat[, seq_len(N.docs), drop = FALSE]

	# Similarity scores: one row per query, one column per article.
	doc.scores <- t(query.vectors) %*% tfidf_mat

	results.df <- data.frame(querylist = queries_list, doc.scores,
		stringsAsFactors = FALSE)

	# Show the best-matching documents for one search query.
	#
	# Arguments:
	#   query   Query text; must equal one entry of results$querylist.
	#   n       Maximum number of documents to return (default 3).
	#   results Score table with a `querylist` column plus one numeric score
	#           column per document. Defaults to the global results.df.
	#
	# Returns a data frame with columns `score` and `docs`, sorted by decreasing
	# score. Only documents scoring above 0 are listed, so fewer than n rows
	# (possibly none) come back instead of NA padding.
	# Stops with an error when the query is not present in `results`.
	showTopresults <- function(query, n = 3, results = results.df) {
		hit <- which(results$querylist == query)
		if (length(hit) == 0) {
			stop("query not found in results: ", query, call. = FALSE)
		}

		# Read the scores straight from the numeric columns; transposing the mixed
		# row would format them as text and lose precision. If the same query text
		# occurs more than once, the first occurrence is used.
		score_cols <- setdiff(names(results), "querylist")
		scores <- unlist(results[hit[1], score_cols, drop = FALSE],
			use.names = FALSE)
		ranked <- data.frame(
			score = as.numeric(scores),
			docs = score_cols,
			stringsAsFactors = FALSE
		)

		# Keep real matches only (NA/NaN scores count as no match), best first.
		ranked <- ranked[!is.na(ranked$score) & ranked$score > 0, , drop = FALSE]
		ranked <- ranked[order(ranked$score, decreasing = TRUE), , drop = FALSE]
		rownames(ranked) <- NULL

		head(ranked, n)
	}

	# Example lookup. The argument has to equal one line of query.txt exactly,
	# because showTopresults() matches it with `==` against results.df$querylist.
	showTopresults("narendra modi visit to washington")