Tathagat Dasgupta Tathagatd96

Senior Associate Consultant at Infosys

Tathagatd96 / gist:6639c56315ff1c3503f9f0d8c2a47597

Created May 24, 2017 10:12

	2257

	From: [email protected] (Michael Collier)
	Subject: Converting images to HP LaserJet III?
	Nntp-Posting-Host: hampton

	comp.graphics

	[1 1 3 3 3 3 3 2 2 2]

Tathagatd96 / gist:762893f59fd0eb4a2ec7a4b3890b5cfe

Created May 24, 2017 10:12

	#SVM Implementation

	# Same vectorizer -> tf-idf front end, with a linear SVM as the final step:
	# SGDClassifier with hinge loss, trained by stochastic gradient descent.
	# max_iter=5 with tol=None runs exactly 5 passes over the data, which is what
	# the removed `n_iter=5` argument used to request.
	text_clf=Pipeline([('vect',CountVectorizer()),('tfidf',TfidfTransformer()),('clf',SGDClassifier(loss='hinge',alpha=1e-3,max_iter=5,tol=None,random_state=42))])
	text_clf.fit(twenty_train.data,twenty_train.target)
	# NOTE(review): doc_test / twenty_test are not defined in this snippet; they
	# are expected to exist already from the test-set evaluation step.
	predicted=text_clf.predict(doc_test)

	# print() with a single argument behaves the same on Python 2 and Python 3.
	print("SVM Accuracy:")
	# Mean of the element-wise equality = fraction of correct predictions.
	print(np.mean(predicted==twenty_test.target))

Tathagatd96 / gist:781c4ca4d024b7c9ee76dd856cb64970

Created May 24, 2017 10:12

	#Performance on test set

	# Fetch the 'test' split, restricted to `categories` and shuffled with a
	# fixed seed (42) so runs are repeatable.
	twenty_test=fetch_20newsgroups(subset='test',categories=categories,shuffle=True,random_state=42)
	doc_test=twenty_test.data
	# text_clf is the already-fitted pipeline; it takes raw document strings.
	predicted=text_clf.predict(doc_test)
	# print() with a single argument behaves the same on Python 2 and Python 3.
	print("Classifier Accuracy:")
	# Mean of the element-wise equality = fraction of correct predictions.
	print(np.mean(predicted==twenty_test.target))

Tathagatd96 / gist:5b9c307ee81a81cfd102932c6658d5d9

Created May 24, 2017 10:12

	#Building a pipeline

	# Chain the three stages (token counts -> tf-idf weighting -> multinomial
	# naive Bayes) and fit them in one go on the raw training documents.
	# fit() returns the pipeline itself, so text_clf ends up bound to the
	# fitted estimator.
	text_clf=Pipeline([
		('vect',CountVectorizer()),
		('tfidf',TfidfTransformer()),
		('clf',MultinomialNB()),
	]).fit(twenty_train.data,twenty_train.target)

Tathagatd96 / gist:b81c9411e7e89a874206f7ae99295ac0

Created May 24, 2017 10:11

	#Classifier Training

	# Train multinomial naive Bayes on the tf-idf matrix of the training set.
	clf=MultinomialNB()
	clf.fit(X_train_tfidf,twenty_train.target)

	# Two unseen documents to classify.
	docs_new=['God is love','OpenGL on the GPU is fast']

	# Reuse the already-fitted vectorizer and tf-idf transformer: transform()
	# only, never fit(), so the new documents are projected onto the features
	# learned from the training data.
	X_new_counts=count_vect.transform(docs_new)
	X_new_tfidf=tfidf_transformer.transform(X_new_counts)

	# One predicted label per entry of docs_new.
	predicted=clf.predict(X_new_tfidf)

Tathagatd96 / gist:32b56ebc81a2dce2791eff565259d6d9

Created May 24, 2017 10:11

	#tf-idf
	# Learn the idf weights from the training count matrix, then reweight that
	# same matrix. The fitted transformer is kept for transforming new documents.
	tfidf_transformer=TfidfTransformer()
	tfidf_transformer.fit(X_train_counts)

	X_train_tfidf=tfidf_transformer.transform(X_train_counts)
	# Shape of the weighted training matrix.
	print(X_train_tfidf.shape)

Tathagatd96 / gist:5956acaae32f7a6046a957b127ebbd15

Created May 24, 2017 10:11

	# Quick look at the loaded training set.

	# Number of training documents. print() with a single argument behaves the
	# same on Python 2 and Python 3.
	print(len(twenty_train.data))

	# First three lines of the first document.
	print("\n".join(twenty_train.data[0].split("\n")[:3]))

	# Category name of the first document: target holds integer indices into
	# target_names.
	print(twenty_train.target_names[twenty_train.target[0]])

	# Raw integer labels of the first ten documents...
	print(twenty_train.target[:10])

	# ...and the same ten labels as category names. The loop body must be
	# indented under the `for`, otherwise Python raises IndentationError.
	for t in twenty_train.target[:10]:
		print(twenty_train.target_names[t])

Tathagatd96 / gist:b98c26e83b64a6ce18684b57e8f0c39a

Created May 24, 2017 10:10

	# NOTE(review): the return value of load_files is discarded, so this call
	# only matters if reading the local copy has a needed side effect. The path
	# is machine-specific; confirm whether this line is still required.
	sklearn.datasets.load_files("C://Users/Tathagat Dasgupta/Desktop/ML Project/20news-18828")

	# Work with four of the newsgroup categories only.
	categories=['alt.atheism','soc.religion.christian','comp.graphics','sci.med']
	# print() with a single argument behaves the same on Python 2 and Python 3.
	print("hello")

	# Training split restricted to `categories`, shuffled with a fixed seed (42)
	# so runs are repeatable.
	twenty_train=fetch_20newsgroups(subset='train',categories=categories,shuffle=True,random_state=42)

Tathagatd96 / gist:592836736210f3dabfec9266f1c14d60

Created May 24, 2017 10:10

	from sklearn.datasets import fetch_20newsgroups
	import sklearn.datasets
	from sklearn.feature_extraction.text import CountVectorizer
	from sklearn.feature_extraction.text import TfidfTransformer
	from sklearn.naive_bayes import MultinomialNB
	from sklearn.pipeline import Pipeline
	from sklearn.linear_model import SGDClassifier
	import numpy as np

Tathagatd96 / gist:0b05a46c5705a7dbc403d4c0d62272cb

Created May 23, 2017 16:03

	# Number of training documents. print() with a single argument behaves the
	# same on Python 2 and Python 3.
	print(len(twenty_train.data))

	# First three lines of the first document.
	print("\n".join(twenty_train.data[0].split("\n")[:3]))

	# Category name of the first document: target holds integer indices into
	# target_names.
	print(twenty_train.target_names[twenty_train.target[0]])

	#Preprocessing

	#Tokenizing text