Andrea D'Agostino andrea-dagostino

Data scientist. Founder of diariodiunanalista.it and writer @ Medium

andrea-dagostino / fuzzy_logic_tagging_ita4.py

Created October 8, 2022 13:42

fuzzy_logic_tagging

	def fuzzy_tagging(tags, articles):
	"""
	Take a list of predefined tags and a list of text contents to tag.
	Intended to return a pandas DataFrame with the tagged articles
	(NOTE(review): the return statement is outside this excerpt - confirm).
	"""
	results = []
	# loop over the tags
	for i, tag in enumerate(tags):
	d = {}
	# fuzzy-match the current tag against the articles, asking for at most 4 matches
	ranking = process.extract(tag, articles, limit=4)

andrea-dagostino / fuzzy_logic_tagging_ita3.py

Created October 8, 2022 13:40

fuzzy_logic_tagging

	# load the dataset and keep only the rows whose url contains 'post',
	# renumbering the surviving rows from zero
	df = pd.read_csv('dataset.csv')
	posts = df[df['url'].str.contains('post')].reset_index(drop=True)
	# plain list with the text of every post
	articles = list(posts['article'])

andrea-dagostino / fuzzy_logic_tagging_ita2.py

Created October 8, 2022 13:39

fuzzy_logic_tagging

	# these are the tags we want to apply to our documents.
	# change this list as you see fit
	tags = [
	"machine learning",
	"clustering",
	"carriera",
	"progetto",
	"consigli",
	"analytics",
	"deep learning",

andrea-dagostino / fuzzy_logic_tagging_ita1.py

Created October 8, 2022 13:38

fuzzy_logic_tagging

	from thefuzz import process
	import pandas as pd

andrea-dagostino / text_sim_tfidf_eng4.py

Created October 3, 2022 22:52

text_sim_tfidf

	top = similarity_df[similarity_df > 0.4] # keep only similarities above 0.4; adjust this threshold as needed
	mask = np.triu(np.ones_like(top)) # upper-triangular array of ones with the same shape as top

	# let's create the viz
	plt.figure(figsize=(12, 12))
	sns.heatmap(
	top,
	square=True,
	annot=True,
	robust=True,

andrea-dagostino / text_sim_tfidf_eng2.py

Last active October 3, 2022 22:16

text_sim_tfidf

	labels = posts.url.str.split('/').str[3:].str[1] # article titles taken from the url: the fifth '/'-separated segment
	similarity_df = pd.DataFrame(M, columns=labels, index=labels) # wrap the matrix M in a dataframe labelled by title on both axes
	mask = np.triu(np.ones_like(similarity_df)) # upper-triangular mask, meant to hide the top half of the heatmap

	# let's create the viz
	plt.figure(figsize=(12, 12))
	sns.heatmap(
	similarity_df,
	square=True,
	annot=True,

andrea-dagostino / text_sim_tfidf_eng1.py

Created October 3, 2022 22:13

text_sim_tfidf

	# Square matrix holding the similarity of article i (row) with article j (column).
	# Its side is the number of posts, whatever that happens to be.
	M = np.zeros((posts.shape[0], posts.shape[0]))

	# Positions come from enumerate() rather than from the index labels yielded by
	# iterrows(): labels only coincide with matrix positions when the index is 0..n-1,
	# so a filtered or non-reset index would misplace values or raise IndexError.
	for i, (_, row) in enumerate(tqdm(posts.iterrows(), total=posts.shape[0], desc='1st level')):
		for j, (_, next_row) in enumerate(posts.iterrows()):
			# fill cell (i, j) with the similarity between the two article texts
			M[i, j] = compute_similarity(row.article, next_row.article)

andrea-dagostino / text_sim_tfidf_ita9.py

Created October 3, 2022 17:59

text_sim_tfidf

	import pandas as pd
	import numpy as np

	import nltk
	from nltk.corpus import stopwords
	import string

	from sklearn.feature_extraction.text import TfidfVectorizer

	from tqdm import tqdm

andrea-dagostino / text_sim_tfidf_ita8.py

Last active October 3, 2022 18:00

text_sim_tfidf

	top = similarity_df[similarity_df > 0.4] # adjust the threshold here
	mask = np.triu(np.ones_like(top))


	sns.heatmap(
	top,
	square=True,
	annot=True,
	robust=True,
	fmt='.2f',

andrea-dagostino / text_sim_tfidf_ita7.py

Created October 3, 2022 09:07

text_sim_tfidf

	labels = posts.url.str.split('/').str[3:].str[1] # extract the article titles from the urls
	similarity_df = pd.DataFrame(M, columns=labels, index=labels) # create a dataframe
	mask = np.triu(np.ones_like(similarity_df)) # apply a mask to remove the upper part of the heatmap

	# create the visualization
	plt.figure(figsize=(12, 12))
	sns.heatmap(
	similarity_df,
	square=True,
	annot=True,