Last active
March 27, 2023 11:59
-
-
Save andrea-dagostino/38ba6640383b10942f565a2e2c2368a8 to your computer and use it in GitHub Desktop.
fuzzy_logic_tagging
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def fuzzy_tagging(tags, articles, *, limit=4, threshold=55):
    """
    Tag articles by fuzzy-matching each predefined tag against the texts.

    For every tag, ``process.extract(tag, articles, limit=limit)`` is called
    and each returned match is attributed to the article it refers to.
    The first tag matched for an article is always kept; any further tag is
    appended (comma-separated) only if its confidence reaches ``threshold``.

    Args:
        tags: iterable of predefined tags.
        articles: list of textual content to be tagged. It must support
            ``.index(...)``, which is used to map a match back to its
            position (the first occurrence wins for duplicated texts).
        limit: number of best-ranked articles requested per tag.
        threshold: minimum confidence required to append an additional tag
            to an article that already has one.

    Returns:
        A pandas DataFrame made of ``posts`` plus a ``tag`` column, aligned
        on index. Articles that never appear in a ranking get no tag (NaN
        after the concat).
    """
    # NOTE(review): `process` is resolved from module scope; presumably the
    # fuzzywuzzy/thefuzz `process` module -- confirm against the imports.
    labels = {}
    for tag in tags:
        ranking = process.extract(tag, articles, limit=limit)
        for match in ranking:
            # match[0] is the matched article text, match[1] its confidence.
            # Indexing (not unpacking) keeps this tolerant of longer tuples.
            position = articles.index(match[0])
            confidence = match[1]
            if labels.get(position):
                # The article already carries a tag: only add strong matches.
                if confidence >= threshold:
                    labels[position] += ', ' + str(tag)
            else:
                # First tag seen for this article is kept unconditionally.
                labels[position] = str(tag)

    # Building the Series straight from the dict (instead of going through an
    # intermediate DataFrame + set_index) also works when nothing matched.
    tag_column = pd.Series(labels, name='tag')

    # NOTE(review): `posts` is not a parameter; it is looked up in module
    # scope at call time. Presumably it is the DataFrame the articles were
    # taken from, with a default positional index -- confirm with the caller.
    tagged_df = pd.concat([posts, tag_column], axis=1)
    return tagged_df
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment