import nltk
from nltk.corpus import stopwords

# nltk.download('stopwords')
stopwords.words("english")[:10]  # <-- preview the first ten English stopwords
def preprocess_text(text: str, remove_stopwords: bool) -> str:
    """This utility function sanitizes a string by:
    - removing links
    - removing special characters
    - removing numbers
    - removing stopwords
    - transforming to lowercase
    - removing excessive whitespace
    Args:
        text (str): the input text you want to clean
        remove_stopwords (bool): whether or not to remove stopwords
    Returns:
        str: the cleaned text
    """
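A minimal sketch of a body that carries out the steps listed in the docstring, assuming re for the regex-based cleanup and the NLTK English stopword list loaded earlier (the exact regular expressions are illustrative choices, not necessarily the original ones):

import re
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english"))  # requires nltk.download('stopwords')

def preprocess_text(text: str, remove_stopwords: bool) -> str:
    # remove links
    text = re.sub(r"http\S+", "", text)
    # remove special characters and numbers (keep letters and whitespace only)
    text = re.sub(r"[^A-Za-z\s]+", " ", text)
    # transform to lowercase
    text = text.lower()
    if remove_stopwords:
        # remove English stopwords
        text = " ".join(w for w in text.split() if w not in stop_words)
    # remove excessive whitespace
    text = " ".join(text.split())
    return text

Applied with something like df['cleaned'] = df['text'].apply(lambda t: preprocess_text(t, remove_stopwords=True)), where the raw-text column name 'text' is an assumption, it produces the df['cleaned'] column that the TF-IDF step below consumes.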
# assign the cluster labels and the PCA vectors to columns of the original dataframe
df['cluster'] = clusters
df['x0'] = x0
df['x1'] = x1
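An optional peek at the augmented dataframe, using the columns just created:

# show the cluster assignment and the 2D coordinates for the first few documents
print(df[['cluster', 'x0', 'x1']].head())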
import numpy as np
import pandas as pd

def get_top_keywords(n_terms):
    """This function returns the top keywords for each KMeans centroid"""
    df = pd.DataFrame(X.todense()).groupby(clusters).mean()  # group the TF-IDF vectors by cluster and average them
    terms = vectorizer.get_feature_names_out()  # access the terms of the TF-IDF vocabulary
    for i, r in df.iterrows():
        print('\nCluster {}'.format(i))
        print(','.join([terms[t] for t in np.argsort(r)[-n_terms:]]))  # for each row, find the n terms with the highest score

get_top_keywords(10)
# map each cluster to a descriptive label
cluster_map = {0: "sport", 1: "tecnologia", 2: "religione"}
# apply the mapping
df['cluster'] = df['cluster'].map(cluster_map)
import matplotlib.pyplot as plt
import seaborn as sns

# set the figure size
plt.figure(figsize=(12, 7))
# set the title
plt.title("Raggruppamento TF-IDF + KMeans 20newsgroup", fontdict={"fontsize": 18})
# set the axis labels
plt.xlabel("X0", fontdict={"fontsize": 16})
plt.ylabel("X1", fontdict={"fontsize": 16})
# create a scatter plot with seaborn, where hue is the class used to group the points
sns.scatterplot(data=df, x='x0', y='x1', hue='cluster', palette="viridis")
plt.show()
from sklearn.feature_extraction.text import TfidfVectorizer

# initialize the vectorizer
vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=5, max_df=0.95)
# fit_transform applies TF-IDF to the clean texts - we save the sparse matrix of vectors in X
X = vectorizer.fit_transform(df['cleaned'])
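Before clustering, it can help to sanity-check the vectorizer output; a short, optional inspection using only the objects defined above:

# X is a sparse document-term matrix: one row per document, one column per term
# that survived the min_df / max_df thresholds
print(X.shape)
# size of the learned vocabulary
print(len(vectorizer.get_feature_names_out()))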
from sklearn.cluster import KMeans

# initialize kmeans with 3 centroids
kmeans = KMeans(n_clusters=3, random_state=42)
# fit the model
kmeans.fit(X)
# store cluster labels in a variable
clusters = kmeans.labels_
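A quick, optional way to see how the documents are distributed across the three clusters, using only what is defined above:

import numpy as np

# number of documents assigned to each cluster label (0, 1, 2)
print(np.bincount(clusters))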
from sklearn.decomposition import PCA

# initialize PCA with 2 components
pca = PCA(n_components=2, random_state=42)
# pass our X to the pca and store the reduced vectors into pca_vecs
pca_vecs = pca.fit_transform(X.toarray())
# save our two dimensions into x0 and x1
x0 = pca_vecs[:, 0]
x1 = pca_vecs[:, 1]
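It is worth checking how much variance the two components actually retain; with high-dimensional TF-IDF vectors this is usually low, so the 2D scatter plot should be read as a rough visual summary rather than a faithful projection:

# fraction of the total variance explained by each of the two components
print(pca.explained_variance_ratio_)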