andrea-dagostino · November 22, 2021 13:35
diff --git a/clustering_ita_4.py b/clustering_ita_4.py
 def preprocess_text(text: str, remove_stopwords: bool) -> str:
     """Clean raw text by stripping links, non-letters and (optionally) stopwords.

     The steps are applied in this order:

     - remove links (any run of non-whitespace characters starting with ``http``)
     - replace every run of characters outside ``A-Za-z`` (digits, punctuation,
       whitespace, accented letters) with a single space
     - optionally drop English stopwords
     - lowercase the result and trim leading/trailing whitespace

     Args:
         text: The text to clean.
         remove_stopwords: Whether to remove English stopwords.

     Returns:
         The cleaned text.
     """
     # Links go first: once punctuation is stripped a URL would be split
     # into stray words ("http", "example", "com") instead of disappearing.
     text = re.sub(r"http\S+", "", text)
     # Collapse anything that is not an ASCII letter into one space.
     text = re.sub(r"[^A-Za-z]+", " ", text)
     if remove_stopwords:
         # Fetch the stopwords once and keep them in a set, instead of
         # calling stopwords.words() again for every single token.
         english_stopwords = set(stopwords.words("english"))
         tokens = nltk.word_tokenize(text)
         tokens = [w for w in tokens if w.lower() not in english_stopwords]
         text = " ".join(tokens)
     # Lowercase and trim the surrounding whitespace left by the substitutions.
     return text.lower().strip()
	def preprocess_text(text: str, remove_stopwords: bool) -> str:
	    """Clean raw text by stripping links, non-letters and (optionally) stopwords.

	    The steps are applied in this order:

	    - remove links (any run of non-whitespace characters starting with ``http``)
	    - replace every run of characters outside ``A-Za-z`` (digits, punctuation,
	      whitespace, accented letters) with a single space
	    - optionally drop English stopwords
	    - lowercase the result and trim leading/trailing whitespace

	    Args:
	        text: The text to clean.
	        remove_stopwords: Whether to remove English stopwords.

	    Returns:
	        The cleaned text.
	    """
	    # Links go first: once punctuation is stripped a URL would be split
	    # into stray words ("http", "example", "com") instead of disappearing.
	    text = re.sub(r"http\S+", "", text)
	    # Collapse anything that is not an ASCII letter into one space.
	    text = re.sub(r"[^A-Za-z]+", " ", text)
	    if remove_stopwords:
	        # Fetch the stopwords once and keep them in a set, instead of
	        # calling stopwords.words() again for every single token.
	        english_stopwords = set(stopwords.words("english"))
	        tokens = nltk.word_tokenize(text)
	        tokens = [w for w in tokens if w.lower() not in english_stopwords]
	        text = " ".join(tokens)
	    # Lowercase and trim the surrounding whitespace left by the substitutions.
	    return text.lower().strip()