@TahaRostami
Created September 19, 2024 21:20
A quantitative analysis of some well-known vocabulary books, along with proposed metrics and methods to enhance the vocabulary learning experience

Optimizing Vocabulary Learning: Evaluating and Enhancing Coverage Metrics for Effective English Learner Resources

Some proposed metrics for evaluating vocabulary books, such as coverage of frequent words for English learners, have shortcomings. After highlighting these weaknesses, I propose a metric that accounts for both the coverage of essential words and the effort required of the learner.
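
To make the combined metric concrete, here is a minimal sketch written as a small Python function: it rewards coverage of frequent words and divides by the number of running words the learner has to read, which stands in for effort. The function name and scaling are illustrative; the actual computation used in the analysis is the Vocabulary Goodness Score in the code further below.

def vocabulary_goodness(coverage_score, total_running_words):
    # coverage_score: weighted fraction of frequency-list headwords covered by the book, in [0, 1]
    # total_running_words: total number of English words in the book, a proxy for learner effort
    return coverage_score * 100 / total_running_words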

Afterward, an approach is proposed to optimize this metric. It begins by sampling words according to their frequency and dividing them into two groups: words that are easy to learn automatically and words that need to be learned intentionally. The selected words are then organized so that the main focus falls on the intentionally learnable words, while the automatically learnable words are used to elaborate on and contextualize the main words and to build reading examples.
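
Below is a minimal sketch of this selection step, assuming the frequency list is available as a word-to-band mapping (like the word_dict built in the code further below). The inverse-rank sampling weights and the cutoff that treats the 1k-3k bands as automatically learnable are illustrative assumptions rather than the exact procedure.

import random

def sample_and_split(word_to_band, k=1000, seed=0):
    # Sample words with probability proportional to an inverse-rank weight, so that
    # more frequent bands ('1k', '2k', ...) are drawn more often than rarer ones.
    words_, weights = [], []
    for w, band in word_to_band.items():
        # ASSUMPTION: bands look like '1k'..'25k'; anything else gets the lowest weight.
        rank = int(band.rstrip('k')) if isinstance(band, str) and band.endswith('k') else 25
        words_.append(w)
        weights.append(1.0 / rank)
    random.seed(seed)
    sampled = random.choices(words_, weights=weights, k=k)
    # Split the sample: very frequent words are assumed to be learnable automatically
    # through exposure; the rest are earmarked for intentional study.
    automatic = {w for w in sampled if word_to_band[w] in ('1k', '2k', '3k')}
    intentional = set(sampled) - automatic
    return automatic, intentional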

Finally, a quantitative analysis and comparison of several well-known vocabulary books is provided, covering 504 Absolutely Essential Words, 4000 Essential English Words, Barron’s 1100 Words You Need to Know, Merriam-Webster Vocabulary Builder, General English Under Scrutiny, and Oxford Word Skills Basic Book. The analysis concludes with remarks on what makes a vocabulary book effective.

# Core code for a quantitative analysis and comparison of some well-known vocabulary books
import fitz # PyMuPDF
import re
import nltk
from nltk.corpus import words
from nltk.tokenize import word_tokenize
import pandas as pd
# Ensure necessary NLTK data is available
nltk.download('words')
nltk.download('punkt')
def load_BNC_COCA_list(file_path):
    df = pd.read_excel(file_path, usecols=['List ', 'Headword ', 'Related forms'])
    H = 0
    M = 0
    L = 0
    # Initialize dictionaries to store word frequencies and related words
    word_dict = {}
    related_to_headword_map = {}
    # Populate word_dict and related_to_headword_map from the frequency list
    for index, row in df.iterrows():
        headword = row['Headword ']
        # Ensure headword is a valid string
        if isinstance(headword, str):
            headword = headword.strip().lower()
            list_value = row['List '].strip() if isinstance(row['List '], str) else None
            word_dict[headword] = list_value
            if list_value in ['1k', '2k', '3k']:
                H += 1
            elif list_value in ['4k', '5k', '6k', '7k', '8k', '9k']:
                M += 1
            else:
                L += 1
            # Process related forms, if available
            related_forms = row['Related forms']
            if isinstance(related_forms, str):
                related_words = re.findall(r'(\w+)', related_forms.lower())
                # Map related words to the headword
                for related_word in related_words:
                    related_word = related_word.strip().lower()
                    word_dict[related_word] = list_value
                    related_to_headword_map[related_word] = headword
    return word_dict, related_to_headword_map, H, M, L
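# Example of the expected return values (illustrative; the actual contents depend on the
# spreadsheet): word_dict maps every headword and related form to its frequency band,
# e.g. {'go': '1k', 'goes': '1k', ...}; related_to_headword_map maps each related form back
# to its headword, e.g. {'goes': 'go'}; H, M, and L count the headwords in the high-frequency
# (1k-3k), mid-frequency (4k-9k), and remaining bands, respectively.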
# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path, to_page=None):
    doc = fitz.open(pdf_path)
    text = ""
    for i, page in enumerate(doc):
        if to_page is not None and i >= to_page:
            break
        text += page.get_text()
    return text, len(doc) if to_page is None else to_page
# Function to extract total and unique English words from the text
def extract_english_words(text):
    english_words = set(words.words())  # NLTK word list
    word_list = word_tokenize(text.lower())  # Tokenize and lowercase the text
    total_words = [word for word in word_list if word.isalpha() and word in english_words]  # Filter English words
    unique_words = set(total_words)  # Set of unique English words
    return total_words, unique_words
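# Note: only alphabetic tokens that appear in NLTK's 'words' corpus are kept, so tokens
# absent from that list (including some inflected forms and proper nouns) are dropped.
# Illustrative example: for "The cat sat on the mat." this would typically return
# ['the', 'cat', 'sat', 'on', 'the', 'mat'] and {'the', 'cat', 'sat', 'on', 'mat'},
# assuming all of these tokens appear in the NLTK word list.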
# Load BNC/COCA frequency data
word_dict, related_to_headword_map, H,M,L = load_BNC_COCA_list('BNC_COCA_lists.xlsx')
base_path = "vocab_books_path\\"
w504 = "504 Absolutely Essential Words 6th"
w4000 = "4000 Essential English Words 1"
w1100 = "Barrons 1100 Words You Need to Know 5th"
wweb = "Merriam Webster Vocabulary Builder"
oxf = "Oxford Word Skills Basic Book"
uns = "Under Scrutiny"
def get_book_path(title):
    return f"{base_path}{title}.pdf"
def calc_stat_version_2(title, to_pages=None):
    pdf_path = get_book_path(title)
    text, cnt_pages = extract_text_from_pdf(pdf_path, to_pages)
    total_words, unique_words = extract_english_words(text)
    headwords = list({related_to_headword_map.get(uw, uw) for uw in unique_words})
    # Initialize counters for word frequency categories
    high_freq_count, mid_freq_count, low_freq_count, not_available_count = 0, 0, 0, 0
    unique_word_to_cat = {}
    # Classify each unique word into high, mid, low frequency or not available
    for headword in headwords:
        word_category = word_dict.get(headword, 'N/A')
        unique_word_to_cat[headword] = word_category
        if word_category in ['1k', '2k', '3k']:
            high_freq_count += 1
        elif word_category in ['4k', '5k', '6k', '7k', '8k', '9k']:
            mid_freq_count += 1
        elif word_category != 'N/A':
            low_freq_count += 1
        else:
            not_available_count += 1
    total_unique_words = len(unique_words)
    print_results(title, total_words, unique_words, high_freq_count, mid_freq_count, low_freq_count,
                  not_available_count, total_unique_words)
def print_results(title, total_words, unique_words, high_freq_count, mid_freq_count, low_freq_count,
                  not_available_count, total_unique_words):
    print(f"{title}\n{'-' * 50}")
    print(f"Total number of English words: {len(total_words)}")
    print(f"Number of unique English words: {len(unique_words)}")
    # Calculate the fraction of unique words in each frequency category
    high_freq_percentage = high_freq_count / total_unique_words
    mid_freq_percentage = mid_freq_count / total_unique_words
    low_freq_percentage = low_freq_count / total_unique_words
    not_available_percentage = not_available_count / total_unique_words
    # Assign weights for difficulty
    high_freq_weight = H / (H + M + L)  # Less weight since it reduces difficulty
    mid_freq_weight = M / (H + M + L)  # Moderate weight
    low_freq_weight = L / (H + M + L)  # High weight since it increases difficulty more
    not_available_weight = L / (H + M + L)  # High weight since it increases difficulty more
    # Calculate difficulty score as a weighted average of the category fractions
    difficulty_score = (
        (not_available_weight * not_available_percentage) +
        (low_freq_weight * low_freq_percentage) +
        (mid_freq_weight * mid_freq_percentage) +
        (high_freq_weight * high_freq_percentage)
    ) / (not_available_weight + low_freq_weight + mid_freq_weight + high_freq_weight)
    print(f"\nBook's Difficulty Level: {difficulty_score:.4f}")
    print(f"  High-frequency words: {high_freq_count} ({high_freq_percentage:.2%})")
    print(f"  Mid-frequency words: {mid_freq_count} ({mid_freq_percentage:.2%})")
    print(f"  Low-frequency words: {low_freq_count} ({low_freq_percentage:.2%})")
    print(f"  Not available in frequency list: {not_available_count} ({not_available_percentage:.2%})")
    # Frequent Words Coverage: fraction of each BNC/COCA frequency group covered by the book
    high_freq_coverage = high_freq_count / H
    mid_freq_coverage = mid_freq_count / M
    low_freq_coverage = low_freq_count / L
    # Weights for each frequency group
    high_freq_weight = 1.0 - H / (H + M + L)
    mid_freq_weight = 1.0 - M / (H + M + L)
    low_freq_weight = 1.0 - L / (H + M + L)
    # Calculating coverage score
    coverage_score = (
        (high_freq_weight * high_freq_coverage) +
        (mid_freq_weight * mid_freq_coverage) +
        (low_freq_weight * low_freq_coverage)
    ) / (high_freq_weight + mid_freq_weight + low_freq_weight)
    print(f"\nBook's Frequent Words Coverage: {coverage_score:.4f}")
    print(f"  High-frequency words: {high_freq_count} ({high_freq_coverage:.2%})")
    print(f"  Mid-frequency words: {mid_freq_count} ({mid_freq_coverage:.2%})")
    print(f"  Low-frequency words: {low_freq_count} ({low_freq_coverage:.2%})")
    # Vocabulary Goodness Score: coverage relative to the reading effort (total running words)
    print(f"\nVocabulary Goodness Score: {coverage_score * 100 / len(total_words):.4f}")
    print(f"{'-' * 50}\n")
# The analysis of 504 Absolutely Essential Words is limited to the pages before the bonus
# vocabulary section, to concentrate on the book's core content
calc_stat_version_2(w504, to_pages=144)
calc_stat_version_2(oxf)
calc_stat_version_2(w4000)
calc_stat_version_2(wweb)
calc_stat_version_2(w1100)
calc_stat_version_2(uns)