|
# Core code for a quantitative analysis and comparison of several well-known vocabulary books
|
import fitz  # PyMuPDF
import re

import nltk
from nltk.corpus import words
from nltk.tokenize import word_tokenize

import pandas as pd

# Ensure the necessary NLTK data is available
nltk.download('words')
nltk.download('punkt')
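
# Assumption about the environment: newer NLTK releases look for the tokenizer
# tables under 'punkt_tab'; requesting it as well avoids a LookupError there and
# is a harmless no-op on older versions.
nltk.download('punkt_tab')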
|
|
|
def load_BNC_COCA_list(file_path):
    df = pd.read_excel(file_path, usecols=['List ', 'Headword ', 'Related forms'])

    # Counters for the number of high-, mid- and low-frequency headwords in the list
    H = 0
    M = 0
    L = 0

    # Initialize dictionaries to store word frequencies and related words
    word_dict = {}
    related_to_headword_map = {}

    # Populate word_dict and related_to_headword_map from the frequency list
    for index, row in df.iterrows():
        headword = row['Headword ']

        # Ensure headword is a valid string
        if isinstance(headword, str):
            headword = headword.strip().lower()
            list_value = row['List '].strip() if isinstance(row['List '], str) else None
            word_dict[headword] = list_value

            if list_value in ['1k', '2k', '3k']:
                H += 1
            elif list_value in ['4k', '5k', '6k', '7k', '8k', '9k']:
                M += 1
            else:
                L += 1

            # Process related forms, if available
            related_forms = row['Related forms']
            if isinstance(related_forms, str):
                related_words = re.findall(r'(\w+)', related_forms.lower())

                # Map related words to the headword
                for related_word in related_words:
                    related_word = related_word.strip().lower()
                    word_dict[related_word] = list_value
                    related_to_headword_map[related_word] = headword

    return word_dict, related_to_headword_map, H, M, L
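
# Minimal sketch of the expected spreadsheet layout (an assumption for illustration,
# not taken from the real BNC/COCA file): the column headers carry trailing spaces
# ('List ', 'Headword ') to match the usecols argument above, and 'Related forms'
# holds a comma-separated string of inflections/derivations. This helper is purely
# illustrative and is never called by the analysis below.
def _demo_bnc_coca_frame():
    return pd.DataFrame({
        'List ': ['1k', '5k'],
        'Headword ': ['go', 'abate'],
        'Related forms': ['goes, going, went, gone', 'abates, abated, abating'],
    })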
|
|
|
# Function to extract text from a PDF, optionally stopping after to_page pages
def extract_text_from_pdf(pdf_path, to_page=None):
    doc = fitz.open(pdf_path)
    text = ""
    for i, page in enumerate(doc):
        if to_page is not None and i >= to_page:
            break
        text += page.get_text()
    page_count = len(doc) if to_page is None else to_page
    doc.close()
    return text, page_count
|
|
|
# Function to extract the total and unique English words from the text
def extract_english_words(text):
    english_words = set(words.words())  # NLTK word list (largely base forms, so some inflections may be filtered out)
    word_list = word_tokenize(text.lower())  # Tokenize and lowercase the text
    total_words = [word for word in word_list if word.isalpha() and word in english_words]  # Keep only recognized English words
    unique_words = set(total_words)  # Set of unique English words
    return total_words, unique_words
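
# Quick sanity check (illustrative assumption, not part of the original analysis):
# flip the flag to see which tokens of a sample sentence survive the English-word
# filter above.
RUN_TOKENIZER_DEMO = False
if RUN_TOKENIZER_DEMO:
    demo_total, demo_unique = extract_english_words("The quick brown fox jumps over the lazy dog.")
    print("Demo tokens kept:", demo_total)
    print("Demo unique tokens:", sorted(demo_unique))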
|
|
|
# Load BNC/COCA frequency data
word_dict, related_to_headword_map, H, M, L = load_BNC_COCA_list('BNC_COCA_lists.xlsx')

base_path = "vocab_books_path\\"

w504 = "504 Absolutely Essential Words 6th"
w4000 = "4000 Essential English Words 1"
w1100 = "Barrons 1100 Words You Need to Know 5th"
wweb = "Merriam Webster Vocabulary Builder"
oxf = "Oxford Word Skills Basic Book"
uns = "Under Scrutiny"

def get_book_path(title):
    return f"{base_path}{title}.pdf"
|
|
|
def calc_stat_version_2(title, to_pages=None):
    pdf_path = get_book_path(title)
    text, cnt_pages = extract_text_from_pdf(pdf_path, to_pages)

    total_words, unique_words = extract_english_words(text)

    # Collapse each unique word form to its BNC/COCA headword (or itself if no mapping exists)
    headwords = list({related_to_headword_map.get(uw, uw) for uw in unique_words})

    # Initialize counters for word frequency categories
    high_freq_count, mid_freq_count, low_freq_count, not_available_count = 0, 0, 0, 0
    unique_word_to_cat = {}

    # Classify each headword as high, mid, low frequency or not available
    for headword in headwords:
        word_category = word_dict.get(headword, 'N/A')
        unique_word_to_cat[headword] = word_category

        if word_category in ['1k', '2k', '3k']:
            high_freq_count += 1
        elif word_category in ['4k', '5k', '6k', '7k', '8k', '9k']:
            mid_freq_count += 1
        elif word_category != 'N/A':
            low_freq_count += 1
        else:
            not_available_count += 1

    # Note: the shares reported below are taken over the unique word forms, while the
    # counts above are over headwords, so they need not sum to 100%.
    total_unique_words = len(unique_words)

    print_results(title, total_words, unique_words, high_freq_count, mid_freq_count, low_freq_count,
                  not_available_count, total_unique_words)
|
|
|
|
|
def print_results(title, total_words, unique_words, high_freq_count, mid_freq_count, low_freq_count,
                  not_available_count, total_unique_words):
    # H, M and L are the band sizes computed by load_BNC_COCA_list (module-level globals)
    print(f"{title}\n{'-' * 50}")
    print(f"Total number of English words: {len(total_words)}")
    print(f"Number of unique English words: {len(unique_words)}")

    # Calculate the share of each frequency category (fractions of the unique words)
    high_freq_percentage = high_freq_count / total_unique_words
    mid_freq_percentage = mid_freq_count / total_unique_words
    low_freq_percentage = low_freq_count / total_unique_words
    not_available_percentage = not_available_count / total_unique_words

    # Assign weights for difficulty, proportional to each band's share of the BNC/COCA list
    high_freq_weight = H / (H + M + L)      # Less weight, since high-frequency words reduce difficulty
    mid_freq_weight = M / (H + M + L)       # Moderate weight
    low_freq_weight = L / (H + M + L)       # High weight, since low-frequency words increase difficulty more
    not_available_weight = L / (H + M + L)  # Same high weight for words outside the list

    # Calculate the difficulty score as the weighted average of the category shares
    difficulty_score = (
        (not_available_weight * not_available_percentage) +
        (low_freq_weight * low_freq_percentage) +
        (mid_freq_weight * mid_freq_percentage) +
        (high_freq_weight * high_freq_percentage)
    ) / (not_available_weight + low_freq_weight + mid_freq_weight + high_freq_weight)

    print(f"\nBook's Difficulty Level: {difficulty_score:.4f}")
    print(f"  High-frequency words: {high_freq_count} ({high_freq_percentage:.2%})")
    print(f"  Mid-frequency words: {mid_freq_count} ({mid_freq_percentage:.2%})")
    print(f"  Low-frequency words: {low_freq_count} ({low_freq_percentage:.2%})")
    print(f"  Not available in frequency list: {not_available_count} ({not_available_percentage:.2%})")

    # Frequent words coverage: fraction of each BNC/COCA band that appears in the book
    high_freq_coverage = high_freq_count / H
    mid_freq_coverage = mid_freq_count / M
    low_freq_coverage = low_freq_count / L

    # Weights for each frequency group: the complement of each band's share, so smaller bands weigh more
    high_freq_weight = 1.0 - H / (H + M + L)
    mid_freq_weight = 1.0 - M / (H + M + L)
    low_freq_weight = 1.0 - L / (H + M + L)

    # Calculate the coverage score as the weighted average of the band coverages
    coverage_score = (
        (high_freq_weight * high_freq_coverage) +
        (mid_freq_weight * mid_freq_coverage) +
        (low_freq_weight * low_freq_coverage)
    ) / (high_freq_weight + mid_freq_weight + low_freq_weight)

    print(f"\nBook's Frequent Words Coverage: {coverage_score:.4f}")
    print(f"  High-frequency words: {high_freq_count} ({high_freq_coverage:.2%})")
    print(f"  Mid-frequency words: {mid_freq_count} ({mid_freq_coverage:.2%})")
    print(f"  Low-frequency words: {low_freq_count} ({low_freq_coverage:.2%})")

    # Vocabulary Goodness Score: the coverage score normalised by the book's total word count,
    # so books that achieve the same coverage with less text score higher (100 is just a rescaling factor)
    print(f"\nVocabulary Goodness Score: {coverage_score * 100 / len(total_words):.4f}")
    print(f"{'-' * 50}\n")
|
|
# The analysis of 504 Absolutely Essential Words is limited to the pages before the
# bonus vocabulary section, so that it concentrates on the book's core content
calc_stat_version_2(w504, to_pages=144)

calc_stat_version_2(oxf)
calc_stat_version_2(w4000)
calc_stat_version_2(wweb)
calc_stat_version_2(w1100)
calc_stat_version_2(uns)