Latent Semantic Analysis (LSA) [simple example]
#!/usr/bin/python
# reference => http://www.puffinwarellc.com/index.php/news-and-articles/articles/33.html
from numpy import zeros
from scipy.linalg import svd
from math import log  # needed for TFIDF
from numpy import asarray, sum
titles = ["The Neatest Little Guide to Stock Market Investing", | |
"Investing For Dummies, 4th Edition", | |
"The Little Book of Common Sense Investing: The Only Way to Guarantee Your Fair Share of Stock Market Returns", | |
"The Little Book of Value Investing", | |
"Value Investing: From Graham to Buffett and Beyond", | |
"Rich Dad's Guide to Investing: What the Rich Invest in, That the Poor and the Middle Class Do Not!", | |
"Investing in Real Estate, 5th Edition", | |
"Stock Investing For Dummies", | |
"Rich Dad's Advisors: The ABC's of Real Estate Investing: The Secrets of Finding Hidden Profits Most Investors Miss" | |
] | |
stopwords = ['and','edition','for','in','little','of','the','to'] | |
ignorechars = ''',:'!''' | |
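
# For example, after lower-casing, stripping the ignorechars and dropping the
# stopwords, the title "Investing For Dummies, 4th Edition" is indexed only
# under the words "investing", "dummies" and "4th".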
class LSA(object):
    def __init__(self, stopwords, ignorechars):
        self.stopwords = stopwords
        self.ignorechars = ignorechars
        self.wdict = {}   # word -> list of document indices it occurs in
        self.dcount = 0   # number of documents parsed so far
    def parse(self, doc):
        words = doc.split()
        for w in words:
            w = w.lower().translate(None, self.ignorechars)
            if w in self.stopwords:
                continue
            elif w in self.wdict:
                self.wdict[w].append(self.dcount)
            else:
                self.wdict[w] = [self.dcount]
        self.dcount += 1
    # rows -> keywords (words that occur more than once across the titles), cols -> document index
    def build(self):
        self.keys = [k for k in self.wdict.keys() if len(self.wdict[k]) > 1]
        self.keys.sort()
        self.A = zeros([len(self.keys), self.dcount])
        for i, k in enumerate(self.keys):
            for d in self.wdict[k]:
                self.A[i,d] += 1
    def calc(self):
        self.U, self.S, self.Vt = svd(self.A)
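    # svd factors A as U * diag(S) * Vt: U maps words to "concepts", S gives the
    # weight of each concept, and Vt maps concepts to documents.  Keeping only the
    # first k components of each gives the best rank-k approximation of A.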
    def TFIDF(self):
        # Optional TF-IDF weighting: replace each raw count with
        # (count / keyword total of the document) * log(number of documents / documents containing the word)
        WordsPerDoc = sum(self.A, axis=0)
        DocsPerWord = sum(asarray(self.A > 0, 'i'), axis=1)
        rows, cols = self.A.shape
        for i in range(rows):
            for j in range(cols):
                self.A[i,j] = (self.A[i,j] / WordsPerDoc[j]) * log(float(cols) / DocsPerWord[i])
    def printA(self):
        print 'Here is the count matrix'
        print self.A
    def printSVD(self):
        print 'Here are the singular values'
        print self.S
        # The -1 sign flip only matches the convention used in the reference
        # article; singular vectors are defined up to a sign.
        print 'Here are the first 3 columns of the U matrix'
        print -1*self.U[:, 0:3]
        print 'Here are the first 3 rows of the Vt matrix'
        print -1*self.Vt[0:3, :]
    @staticmethod
    def main():
        mylsa = LSA(stopwords, ignorechars)
        for t in titles:
            mylsa.parse(t)
        mylsa.build()
        mylsa.printA()
        # mylsa.TFIDF() could be called here to weight the counts before the SVD
        mylsa.calc()
        mylsa.printSVD()

if __name__ == '__main__':
    LSA.main()
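
A possible extension (a sketch, not part of the original gist): keep the first three singular triplets, as printSVD does, and compare titles by cosine similarity in that reduced "concept" space. The snippet below is meant to be appended to the script above; the helper names (run_lsa, doc_similarity, K) are illustrative, not from the reference article.

from numpy import dot
from numpy.linalg import norm

K = 3  # number of latent dimensions to keep, matching printSVD

def run_lsa():
    lsa = LSA(stopwords, ignorechars)
    for t in titles:
        lsa.parse(t)
    lsa.build()
    # lsa.TFIDF()  # optional: weight the counts before the SVD
    lsa.calc()
    return lsa

def doc_similarity(lsa, i, j, k=K):
    # Document d is represented by the k-dimensional vector S[:k] * Vt[:k, d];
    # the cosine between two such vectors measures how close the titles are
    # in the latent concept space.
    vi = lsa.S[:k] * lsa.Vt[:k, i]
    vj = lsa.S[:k] * lsa.Vt[:k, j]
    return dot(vi, vj) / (norm(vi) * norm(vj))

lsa = run_lsa()
print(doc_similarity(lsa, 0, 2))  # two of the stock-market titles
print(doc_similarity(lsa, 0, 6))  # a stock-market title vs. a real-estate title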