dipanjanS’s gists

dipanjanS / effective_data_viz_29.py

Created January 10, 2018 21:54

	# Visualizing 5-D mix data using bubble charts
	# leveraging the concepts of hue, size and facets
	def _bubble_scatter(x, y, sizes, **kwargs):
	    """Scatter x against y with marker areas scaled from `sizes`."""
	    # The size column is received as a data argument, so it covers the
	    # same rows as x and y instead of the whole `wines` frame.
	    return plt.scatter(x, y, s=sizes * 2, **kwargs)

	# `height` is the current name of the facet-size argument (formerly `size`)
	g = sns.FacetGrid(wines, col="wine_type", hue='quality_label',
	                  col_order=['red', 'white'],
	                  hue_order=['low', 'medium', 'high'],
	                  aspect=1.2, height=3.5,
	                  palette=sns.light_palette('black', 4)[1:])
	# pass the bubble-size column through map() so it is subset per facet/hue
	g.map(_bubble_scatter, "residual sugar", "alcohol", "total sulfur dioxide",
	      alpha=0.8, edgecolor='white', linewidth=0.5)
	fig = g.fig
	# leave room above the facets for the figure-level title
	fig.subplots_adjust(top=0.8, wspace=0.3)
	fig.suptitle('Wine Type - Sulfur Dioxide - Residual Sugar - Alcohol - Quality', fontsize=14)

dipanjanS / effective_data_viz_30.py

Created January 10, 2018 22:05

	# Visualizing 6-D mix data using scatter charts
	# leveraging the concepts of hue, size, depth and shape
	# Figure with a single 3-D axes and a figure-level title
	fig = plt.figure(figsize=(8, 6))
	t = fig.suptitle(
	    'Wine Residual Sugar - Alcohol Content - Acidity - Total Sulfur Dioxide - Type - Quality',
	    fontsize=14,
	)
	ax = fig.add_subplot(111, projection='3d')

	# Pull the three coordinate columns out of `wines` as plain lists
	xs, ys, zs = (
	    list(wines[column])
	    for column in ('residual sugar', 'alcohol', 'fixed acidity')
	)
	# One (x, y, z) tuple per row
	data_points = list(zip(xs, ys, zs))

dipanjanS / effective_data_viz_31.py

Last active January 24, 2018 04:31

	# Visualizing 6-D mix data using scatter charts
	# leveraging the concepts of hue, facets and size
	def _bubble_scatter(x, y, sizes, **kwargs):
	    """Scatter x against y with marker areas scaled from `sizes`."""
	    # The size column is received as a data argument, so it covers the
	    # same rows as x and y instead of the whole `wines` frame.
	    return plt.scatter(x, y, s=sizes * 2, **kwargs)

	# `height` is the current name of the facet-size argument (formerly `size`)
	g = sns.FacetGrid(wines, row='wine_type', col="quality",
	                  hue='quality_label', height=4)
	# pass the bubble-size column through map() so it is subset per facet/hue
	g.map(_bubble_scatter, "residual sugar", "alcohol", "total sulfur dioxide",
	      alpha=0.5, edgecolor='k', linewidth=0.5)
	fig = g.fig
	fig.set_size_inches(18, 8)
	# leave room above the facets for the figure-level title
	fig.subplots_adjust(top=0.85, wspace=0.3)
	fig.suptitle('Wine Type - Sulfur Dioxide - Residual Sugar - Alcohol - Quality Class - Quality Rating', fontsize=14)
	l = g.add_legend(title='Wine Quality Class')

dipanjanS / feature_engg_text_1.py

Created January 27, 2018 17:38

	# Toy text corpus: each document is written next to its category label,
	# then split into the two parallel lists `corpus` and `labels`.
	_labeled_docs = [
	    ('The sky is blue and beautiful.', 'weather'),
	    ('Love this blue and beautiful sky!', 'weather'),
	    ('The quick brown fox jumps over the lazy dog.', 'animals'),
	    ("A king's breakfast has sausages, ham, bacon, eggs, toast and beans", 'food'),
	    ('I love green eggs, ham, sausages and bacon!', 'food'),
	    ('The brown fox is quick and the blue dog is lazy!', 'animals'),
	    ('The sky is very blue and the sky is very beautiful today', 'weather'),
	    ('The dog is lazy but the brown fox is quick!', 'animals'),
	]
	corpus = [document for document, _ in _labeled_docs]
	labels = [category for _, category in _labeled_docs]

dipanjanS / feature_engg_text_2.py

Created January 27, 2018 18:23

	# Shared tokenizer and English stop-word list
	wpt = nltk.WordPunctTokenizer()
	stop_words = nltk.corpus.stopwords.words('english')

	def normalize_document(doc):
	    """Clean a single document string and split it into tokens.

	    Characters other than letters and whitespace are removed, the text
	    is lower-cased and stripped, and the result is tokenized with `wpt`.
	    """
	    # lower case and remove special characters\whitespaces
	    # The flags are passed by keyword: the fourth positional parameter of
	    # re.sub is `count`, so positional flags would cap the number of
	    # replacements instead of acting as flags.
	    doc = re.sub(r'[^a-zA-Z\s]', '', doc, flags=re.I | re.A)
	    doc = doc.lower().strip()
	    # tokenize document
	    tokens = wpt.tokenize(doc)
	    # NOTE(review): the listing is cut off after tokenization; whatever
	    # follows (presumably stop-word filtering and a return value) is not
	    # visible here - restore it from the full gist before use.

dipanjanS / feature_engg_text_3.py

Created January 28, 2018 07:16

	from sklearn.feature_extraction.text import CountVectorizer

	# Fit the count vectorizer on the normalized corpus and keep the result
	# as a dense array
	cv = CountVectorizer(min_df=0.0, max_df=1.0)
	cv_matrix = cv.fit_transform(norm_corpus).toarray()
	# notebook-style display of the document-term counts
	cv_matrix

dipanjanS / feature_engg_text_4.py

Created January 28, 2018 07:19

	# get all unique words in the corpus
	# get_feature_names() is deprecated in favour of get_feature_names_out();
	# fall back to the old name only on releases that lack the new one.
	# list() keeps `vocab` a plain list, as before.
	vocab = list(cv.get_feature_names_out()
	             if hasattr(cv, 'get_feature_names_out')
	             else cv.get_feature_names())
	# show document feature vectors
	pd.DataFrame(cv_matrix, columns=vocab)

dipanjanS / feature_engg_text_5.py

Created January 28, 2018 07:32

	# you can set the n-gram range to 1,2 to get unigrams as well as bigrams
	bv = CountVectorizer(ngram_range=(2,2))
	bv_matrix = bv.fit_transform(norm_corpus)

	bv_matrix = bv_matrix.toarray()
	# get_feature_names() is deprecated in favour of get_feature_names_out();
	# fall back to the old name only on releases that lack the new one.
	# list() keeps `vocab` a plain list, as before.
	vocab = list(bv.get_feature_names_out()
	             if hasattr(bv, 'get_feature_names_out')
	             else bv.get_feature_names())
	# show document feature vectors, one column per vocabulary entry
	pd.DataFrame(bv_matrix, columns=vocab)

dipanjanS / feature_engg_text_6.py

Created January 28, 2018 07:47

	from sklearn.feature_extraction.text import TfidfVectorizer

	# Fit the TF-IDF vectorizer on the normalized corpus and densify the result
	tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)
	tv_matrix = tv.fit_transform(norm_corpus)
	tv_matrix = tv_matrix.toarray()

	# get_feature_names() is deprecated in favour of get_feature_names_out();
	# fall back to the old name only on releases that lack the new one.
	# list() keeps `vocab` a plain list, as before.
	vocab = list(tv.get_feature_names_out()
	             if hasattr(tv, 'get_feature_names_out')
	             else tv.get_feature_names())
	# display the weights rounded to two decimals, one column per vocabulary entry
	pd.DataFrame(np.round(tv_matrix, 2), columns=vocab)

dipanjanS / feature_engg_text_7.py

Created January 28, 2018 09:08

	from sklearn.metrics.pairwise import cosine_similarity

	# Cosine similarity computed from `tv_matrix` alone, wrapped in a
	# DataFrame for display
	similarity_matrix = cosine_similarity(tv_matrix)
	similarity_df = pd.DataFrame(data=similarity_matrix)
	# notebook-style display of the similarity table
	similarity_df

Dipanjan (DJ) Sarkar dipanjanS