Skip to content

Instantly share code, notes, and snippets.

@fclesio
Created July 3, 2019 10:43
Show Gist options
  • Save fclesio/497d32ceb2d670252c2a4a34f606455c to your computer and use it in GitHub Desktop.
Save fclesio/497d32ceb2d670252c2a4a34f606455c to your computer and use it in GitHub Desktop.
def get_lexical_diversity(df, artist):
    """Print and return the lexical diversity of one artist's lyrics.

    Lexical diversity is the number of distinct words (compared
    case-insensitively) divided by the total number of
    whitespace-separated words, rounded to two decimal places.

    Args:
        df: DataFrame with an 'artist' column and a 'lyric' column of text.
        artist: Value matched against the 'artist' column to select rows.

    Returns:
        The rounded ratio, or 0.0 when the artist has no words at all
        (no matching rows, or only missing/empty lyrics).
    """
    # Missing lyrics (None/NaN) carry no words; dropping them avoids a
    # TypeError when the text is tokenized.
    lyrics = df.loc[df['artist'] == artist, 'lyric'].dropna()

    distinct_words = set()
    total_words = 0
    for lyric in lyrics:
        words = lyric.lower().split()
        distinct_words.update(words)  # feeds the numerator
        total_words += len(words)  # feeds the denominator

    # Guard the division: an unknown artist or empty lyrics yield no words.
    if total_words:
        lexical_diversity = round(len(distinct_words) / total_words, 2)
    else:
        lexical_diversity = 0.0

    print(f'Lexical Diversity for {artist}: {lexical_diversity}')
    return lexical_diversity
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment