import nltk
from nltk.corpus import stopwords

# nltk.download('stopwords')
stopwords.words("english")[:10]  # <-- preview the first ten English stopwords
def preprocess_text(text: str, remove_stopwords: bool) -> str:
    """This utility function sanitizes a string by:
    - removing links
    - removing special characters
    - removing numbers
    - removing stopwords
    - transforming to lowercase
    - removing excessive whitespace
    Args:
        text (str): the input text you want to clean
        remove_stopwords (bool): whether or not to remove stopwords
    Returns:
        str: the cleaned text
    """
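A minimal sketch of a body that carries out the steps listed in the docstring, assuming re for the regex-based cleanup and the NLTK English stopword list loaded earlier (the exact regular expressions are illustrative choices, not necessarily the original ones):

import re
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english"))  # requires nltk.download('stopwords')

def preprocess_text(text: str, remove_stopwords: bool) -> str:
    # remove links
    text = re.sub(r"http\S+", "", text)
    # remove special characters and numbers (keep letters and whitespace only)
    text = re.sub(r"[^A-Za-z\s]+", " ", text)
    # transform to lowercase
    text = text.lower()
    if remove_stopwords:
        # remove English stopwords
        text = " ".join(w for w in text.split() if w not in stop_words)
    # remove excessive whitespace
    text = " ".join(text.split())
    return text

Applied with something like df['cleaned'] = df['text'].apply(lambda t: preprocess_text(t, remove_stopwords=True)), where the raw-text column name 'text' is an assumption, it produces the df['cleaned'] column that the TF-IDF step below consumes.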
# assign the cluster labels and the PCA vectors to columns of the original dataframe
df['cluster'] = clusters
df['x0'] = x0
df['x1'] = x1
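An optional peek at the augmented dataframe, using the columns just created:

# show the cluster assignment and the 2D coordinates for the first few documents
print(df[['cluster', 'x0', 'x1']].head())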
import numpy as np
import pandas as pd

def get_top_keywords(n_terms):
    """This function returns the top keywords for each KMeans centroid"""
    df = pd.DataFrame(X.todense()).groupby(clusters).mean()  # group the TF-IDF vectors by cluster and average them
    terms = vectorizer.get_feature_names_out()  # access the terms of the TF-IDF vocabulary
    for i, r in df.iterrows():
        print('\nCluster {}'.format(i))
        print(','.join([terms[t] for t in np.argsort(r)[-n_terms:]]))  # for each row, find the n terms with the highest score

get_top_keywords(10)
# map each cluster to a descriptive label
cluster_map = {0: "sport", 1: "tecnologia", 2: "religione"}
# apply the mapping
df['cluster'] = df['cluster'].map(cluster_map)
import matplotlib.pyplot as plt
import seaborn as sns

# set the figure size
plt.figure(figsize=(12, 7))
# set the title
plt.title("Raggruppamento TF-IDF + KMeans 20newsgroup", fontdict={"fontsize": 18})
# set the axis labels
plt.xlabel("X0", fontdict={"fontsize": 16})
plt.ylabel("X1", fontdict={"fontsize": 16})
# create a scatter plot with seaborn, where hue is the class used to group the points
sns.scatterplot(data=df, x='x0', y='x1', hue='cluster', palette="viridis")
plt.show()
from sklearn.feature_extraction.text import TfidfVectorizer

# initialize the vectorizer
vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=5, max_df=0.95)
# fit_transform applies TF-IDF to the clean texts - we save the sparse matrix of vectors in X
X = vectorizer.fit_transform(df['cleaned'])
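Before clustering, it can help to sanity-check the vectorizer output; a short, optional inspection using only the objects defined above:

# X is a sparse document-term matrix: one row per document, one column per term
# that survived the min_df / max_df thresholds
print(X.shape)
# size of the learned vocabulary
print(len(vectorizer.get_feature_names_out()))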
from sklearn.cluster import KMeans

# initialize kmeans with 3 centroids
kmeans = KMeans(n_clusters=3, random_state=42)
# fit the model
kmeans.fit(X)
# store cluster labels in a variable
clusters = kmeans.labels_
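A quick, optional way to see how the documents are distributed across the three clusters, using only what is defined above:

import numpy as np

# number of documents assigned to each cluster label (0, 1, 2)
print(np.bincount(clusters))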
from sklearn.decomposition import PCA

# initialize PCA with 2 components
pca = PCA(n_components=2, random_state=42)
# pass our X to the pca and store the reduced vectors into pca_vecs
pca_vecs = pca.fit_transform(X.toarray())
# save our two dimensions into x0 and x1
x0 = pca_vecs[:, 0]
x1 = pca_vecs[:, 1]
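It is worth checking how much variance the two components actually retain; with high-dimensional TF-IDF vectors this is usually low, so the 2D scatter plot should be read as a rough visual summary rather than a faithful projection:

# fraction of the total variance explained by each of the two components
print(pca.explained_variance_ratio_)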