-
-
Save jeff082chen/2158c380bb6e229a54b0a5e63b5bc1f0 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Word and character frequency statistics for an article.
#
# `text` must already hold the string that was read in; this script does not
# define it.
from collections import Counter

# Process the text: lower-case it and split on whitespace.
words = text.lower().split()

# Pre-seed every tracked character so that characters which never occur
# still show up with a count of 0.
character_counts = {char: 0 for char in "abcdefghijklmnopqrstuvwxyz0123456789"}

word_counts = Counter()
for word in words:
    # Remove punctuation from words
    cleaned_word = "".join(char for char in word if char.isalnum())
    if not cleaned_word:
        # The token was punctuation only; it is not counted as a word.
        continue
    word_counts[cleaned_word] += 1
    for char in cleaned_word:
        # str.isalnum() also accepts non-ASCII letters and digits; only the
        # 36 tracked characters are tallied, everything else is skipped.
        if char in character_counts:
            character_counts[char] += 1

# Total number of words
total_words = sum(word_counts.values())

# Total number of word's type
total_word_types = len(word_counts)

# The first three words that most frequently appear in the article and their
# number. Words with equal counts keep the order in which they first appeared.
most_frequent_words = word_counts.most_common(3)

# First three words that only appear once, in lexical order
words_appear_once = sorted(word for word, count in word_counts.items() if count == 1)[:3]

# Bare expression: yields the results as a tuple when evaluated interactively;
# it has no effect when the file is run as a plain script.
total_words, total_word_types, most_frequent_words, words_appear_once, sorted(character_counts.items())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment