This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def fuzzy_tagging(tags, articles): | |
""" | |
Questa funzione riceve in input una lista di tag predefiniti e la lista di contenuto testuale da taggare. | |
Restituisce un dataframe Pandas con gli articoli taggati | |
""" | |
results = [] | |
# ciclo nei tag | |
for i, tag in enumerate(tags): | |
d = {} | |
ranking = process.extract(tag, articles, limit=4) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load the dataset and keep only the rows whose URL contains 'post'.
df = pd.read_csv('dataset.csv')
# na=False makes rows with a missing URL count as "no match" instead of
# putting NaN into the boolean mask (which pandas refuses to index with).
# Chaining reset_index(drop=True) yields a fresh 0..n-1 index without mutating
# a filtered slice of df in place.
posts = df[df.url.str.contains('post', na=False)].reset_index(drop=True)
# Plain Python list of the article texts, in the same order as `posts`.
articles = list(posts.article)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# These are the tags we want to apply to our documents.
# Change this list as you see fit.
tags = [ | |
"machine learning", | |
"clustering", | |
"carriera", | |
"progetto", | |
"consigli", | |
"analytics", | |
"deep learning", |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from thefuzz import process | |
import pandas as pd |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Keep only the similarity scores above the threshold; cells that do not pass
# the comparison come back as NaN.
top = similarity_df[similarity_df > 0.4]  # change this threshold as needed
# Upper-triangular array of ones (diagonal included) with the same shape as `top`.
mask = np.triu(np.ones_like(top))
# Create the figure for the visualization.
plt.figure(figsize=(12, 12))
sns.heatmap( | |
top, | |
square=True, | |
annot=True, | |
robust=True, |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Extract a label for each article from its URL: the fifth '/'-separated piece.
# NOTE(review): assumes URLs shaped like 'https://host/section/slug' — confirm.
labels = posts.url.str.split('/').str[4]
# Wrap the similarity matrix in a DataFrame labelled on both axes.
similarity_df = pd.DataFrame(M, columns=labels, index=labels)
# Upper-triangular array of ones (diagonal included), meant as a mask to hide
# the top half of the heatmap.
mask = np.triu(np.ones_like(similarity_df))
# Create the figure for the visualization.
plt.figure(figsize=(12, 12))
sns.heatmap( | |
similarity_df, | |
square=True, | |
annot=True, |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Square (n_posts x n_posts) matrix; M[i, j] holds the similarity of article i
# with article j.
M = np.zeros((posts.shape[0], posts.shape[0]))
# enumerate() gives positional indices, so filling M does not depend on the
# DataFrame index being exactly 0..n-1 (iterrows() yields index labels instead).
for i, text_i in enumerate(tqdm(posts.article, total=posts.shape[0], desc='1st level')):
    for j, text_j in enumerate(posts.article):
        # Every ordered pair is scored, including an article against itself.
        M[i, j] = compute_similarity(text_i, text_j)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
import nltk | |
from nltk.corpus import stopwords | |
import string | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from tqdm import tqdm |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Keep only the similarity scores above the threshold; cells that do not pass
# the comparison come back as NaN.
top = similarity_df[similarity_df > 0.4]  # change the threshold here
# Upper-triangular array of ones (diagonal included) with the same shape as `top`.
mask = np.triu(np.ones_like(top))
sns.heatmap( | |
top, | |
square=True, | |
annot=True, | |
robust=True, | |
fmt='.2f', |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Extract a label for each article from its URL: the fifth '/'-separated piece.
# NOTE(review): assumes URLs shaped like 'https://host/section/slug' — confirm.
labels = posts.url.str.split('/').str[4]
# Wrap the similarity matrix in a DataFrame labelled on both axes.
similarity_df = pd.DataFrame(M, columns=labels, index=labels)
# Upper-triangular array of ones (diagonal included), meant as a mask to hide
# the top half of the heatmap.
mask = np.triu(np.ones_like(similarity_df))
# Create the figure for the visualization.
plt.figure(figsize=(12, 12))
sns.heatmap( | |
similarity_df, | |
square=True, | |
annot=True, |