This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn import model_selection
from sklearn import ensemble, neighbors
import matplotlib.pyplot as plt

# Models to test, stored as (display name, unfitted estimator) pairs.
# Add further candidates by extending this list.
models = [
    ("RandomForest", ensemble.RandomForestClassifier()),
    ("KNC", neighbors.KNeighborsClassifier()),
]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# importiamo le librerie necessarie da sklearn | |
from sklearn.datasets import fetch_20newsgroups | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.cluster import KMeans | |
from sklearn.decomposition import PCA | |
# importiamo le altre librerie necessarie | |
import pandas as pd | |
import numpy as np |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import nltk
from nltk.corpus import stopwords

# Uncomment to fetch the stopword corpus if it is not installed yet:
# nltk.download('stopwords')
stopwords.words("english")[:10]  # <-- load the English stopwords (first ten shown)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Wrap the entries of dataset.data in a single-column DataFrame named "corpus".
# NOTE(review): `dataset` is defined outside this snippet - presumably the
# result of fetch_20newsgroups; confirm against the loading code.
df = pd.DataFrame(dataset.data, columns=["corpus"])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def preprocess_text(text: str, remove_stopwords: bool) -> str: | |
"""Funzione che pulisce il testo in input andando a | |
- rimuovere i link | |
- rimuovere i caratteri speciali | |
- rimuovere i numeri | |
- rimuovere le stopword | |
- trasformare in minuscolo | |
- rimuovere spazi bianchi eccessivi | |
Argomenti: | |
text (str): testo da pulire |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Run preprocess_text on every document (stopword removal enabled) and store
# the result in a new "cleaned" column. Series.apply forwards extra keyword
# arguments to the function, so no lambda wrapper is needed.
df["cleaned"] = df["corpus"].apply(preprocess_text, remove_stopwords=True)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Initialise the vectoriser: sublinear term-frequency scaling, dropping terms
# that occur in fewer than 5 documents or in more than 95% of them.
vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=5, max_df=0.95)
# fit_transform applies TF-IDF to the cleaned texts; the resulting matrix of
# document vectors is stored in X.
X = vectorizer.fit_transform(df["cleaned"])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.cluster import KMeans

# Initialise k-means with 3 centroids. n_init is pinned explicitly because
# scikit-learn changed its default from 10 to "auto" in version 1.4; pinning
# it keeps the clustering reproducible across library versions.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
# Fit the model on the TF-IDF matrix.
kmeans.fit(X)
# Store the cluster label assigned to each point.
clusters = kmeans.labels_
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.decomposition import PCA

# Initialise PCA with 2 components.
pca = PCA(n_components=2, random_state=42)
# Pass X to PCA as a dense array.
# NOTE(review): X.toarray() materialises the whole matrix in memory - this may
# be expensive for a large vocabulary; confirm it fits before scaling up.
pca_vecs = pca.fit_transform(X.toarray())
# Keep the two projected dimensions in x0 and x1.
x0 = pca_vecs[:, 0]
x1 = pca_vecs[:, 1]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# import the dataset from sklearn | |
from sklearn.datasets import fetch_20newsgroups | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.cluster import KMeans | |
from sklearn.decomposition import PCA | |
# import other required libs | |
import pandas as pd | |
import numpy as np |
OlderNewer