Skip to content

Instantly share code, notes, and snippets.

View andrea-dagostino's full-sized avatar

Andrea D'Agostino andrea-dagostino

View GitHub Profile
# Square matrix with one row and one column per post; cell [i, j] receives the
# similarity of article i against article j.
n_posts = len(posts)
M = np.zeros((n_posts, n_posts))
for i, row_i in tqdm(posts.iterrows(), total=n_posts, desc='1st level'):  # outer loop picks article i
    for j, row_j in posts.iterrows():  # inner loop picks article j
        # i and j are the index labels yielded by iterrows(); they are used as
        # positions in M, so posts is expected to carry a 0..n-1 index.
        M[i, j] = compute_similarity(row_i.article, row_j.article)
def compute_similarity(a, b):
    """Return the TF-IDF similarity between the texts ``a`` and ``b``.

    The module-level ``vectorizer`` is re-fitted on just these two documents,
    so the weights depend only on the pair being compared. The result is
    entry ``[0, 1]`` of the product of the TF-IDF matrix with its transpose,
    i.e. the dot product of the two document vectors (the cosine similarity
    when the vectorizer L2-normalises its rows, which is scikit-learn's
    default -- confirm against the vectorizer configuration).
    """
    tfidf = vectorizer.fit_transform([a, b])
    # Use ``@`` rather than ``*``: ``*`` is a matrix product only for the
    # legacy scipy ``spmatrix`` classes and is element-wise for sparse arrays.
    pairwise = (tfidf @ tfidf.T).toarray()
    return pairwise[0, 1]
from sklearn.feature_extraction.text import TfidfVectorizer
# Shared TF-IDF vectorizer: texts are split by the custom ``preprocess``
# tokenizer and the words listed in ``ita_stopwords`` are discarded.
vectorizer = TfidfVectorizer(
    tokenizer=preprocess,
    stop_words=ita_stopwords,
)
# Translation table for str.translate: every ASCII punctuation code point is
# mapped to None, which deletes the character.
remove_punctuation_map = {ord(symbol): None for symbol in string.punctuation}


def preprocess(text):
    """Lower-case ``text``, strip ASCII punctuation and return its NLTK word tokens."""
    cleaned = text.lower().translate(remove_punctuation_map)
    return nltk.word_tokenize(cleaned)
# Keep only the rows whose URL contains 'post'. na=False makes a missing URL
# count as "no match" instead of yielding a NaN mask that cannot be used for
# boolean indexing.
# reset_index() renumbers the rows from 0 (the previous index is kept as a
# column, as before) and returns a new frame instead of mutating a filtered
# slice of df in place.
posts = df[df.url.str.contains('post', na=False)].reset_index()
if __name__ == "__main__":
    # Sites handed to create_dataset; add further entries here.
    list_of_websites = ["https://www.diariodiunanalista.it/"]

    # Build the dataset and persist it without the index column.
    df = create_dataset(list_of_websites)
    df.to_csv("dataset.csv", index=False)
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import string
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn import datasets
from sklearn import metrics
# Create a synthetic dataset for a binary classification task.
X, y = datasets.make_classification(n_samples=2000, n_features=20, n_classes=2, random_state=42)
# Number of folds for KFold according to Sturges' rule, k = 1 + log2(n).
# The rule is defined with the base-2 logarithm; the natural logarithm
# underestimates k (8 instead of 11 for n = 2000).
sturges = int(1 + np.log2(len(X)))
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn import datasets
from sklearn import metrics
# Create a synthetic dataset for a binary classification task.
X, y = datasets.make_classification(n_samples=2000, n_features=20, n_classes=2, random_state=42)
# Number of folds for KFold according to Sturges' rule, k = 1 + log2(n).
# The rule is defined with the base-2 logarithm; the natural logarithm
# underestimates k (8 instead of 11 for n = 2000).
sturges = int(1 + np.log2(len(X)))
# Two independent, initially empty lists: one for training accuracies and one
# for test accuracies.
train_accs, test_accs = [], []
# Column names, listed one per line.
cols = [
    'fixed.acidity',
    'volatile.acidity',
    'citric.acid',
    'residual.sugar',
    'chlorides',
    'free.sulfur.dioxide',
    'total.sulfur.dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
]
# init a loop where we dynamically change the value of max_depth
for depth in range(1, 25):