|
# Core code for a quantitative analysis and comparison of several well-known vocabulary books
|
import fitz  # PyMuPDF
import re

import nltk
from nltk.corpus import words
from nltk.tokenize import word_tokenize

import pandas as pd

# Ensure the necessary NLTK data is available
nltk.download('words')
nltk.download('punkt')
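
# Assumption about the environment: newer NLTK releases look for the tokenizer
# tables under 'punkt_tab'; requesting it as well avoids a LookupError there and
# is a harmless no-op on older versions.
nltk.download('punkt_tab')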
|
|
|
def load_BNC_COCA_list(file_path):
    df = pd.read_excel(file_path, usecols=['List ', 'Headword ', 'Related forms'])

    # Counters for the number of high-, mid- and low-frequency headwords in the list
    H = 0
    M = 0
    L = 0

    # Initialize dictionaries to store word frequencies and related words
    word_dict = {}
    related_to_headword_map = {}

    # Populate word_dict and related_to_headword_map from the frequency list
    for index, row in df.iterrows():
        headword = row['Headword ']

        # Ensure headword is a valid string
        if isinstance(headword, str):
            headword = headword.strip().lower()
            list_value = row['List '].strip() if isinstance(row['List '], str) else None
            word_dict[headword] = list_value

            if list_value in ['1k', '2k', '3k']:
                H += 1
            elif list_value in ['4k', '5k', '6k', '7k', '8k', '9k']:
                M += 1
            else:
                L += 1

            # Process related forms, if available
            related_forms = row['Related forms']
            if isinstance(related_forms, str):
                related_words = re.findall(r'(\w+)', related_forms.lower())

                # Map related words to the headword
                for related_word in related_words:
                    related_word = related_word.strip().lower()
                    word_dict[related_word] = list_value
                    related_to_headword_map[related_word] = headword

    return word_dict, related_to_headword_map, H, M, L
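
# Minimal sketch of the expected spreadsheet layout (an assumption for illustration,
# not taken from the real BNC/COCA file): the column headers carry trailing spaces
# ('List ', 'Headword ') to match the usecols argument above, and 'Related forms'
# holds a comma-separated string of inflections/derivations. This helper is purely
# illustrative and is never called by the analysis below.
def _demo_bnc_coca_frame():
    return pd.DataFrame({
        'List ': ['1k', '5k'],
        'Headword ': ['go', 'abate'],
        'Related forms': ['goes, going, went, gone', 'abates, abated, abating'],
    })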
|
|
|
# Function to extract text from a PDF, optionally stopping after to_page pages
def extract_text_from_pdf(pdf_path, to_page=None):
    doc = fitz.open(pdf_path)
    text = ""
    for i, page in enumerate(doc):
        if to_page is not None and i >= to_page:
            break
        text += page.get_text()
    page_count = len(doc) if to_page is None else to_page
    doc.close()
    return text, page_count
|
|
|
# Function to extract the total and unique English words from the text
def extract_english_words(text):
    english_words = set(words.words())  # NLTK word list (largely base forms, so some inflections may be filtered out)
    word_list = word_tokenize(text.lower())  # Tokenize and lowercase the text
    total_words = [word for word in word_list if word.isalpha() and word in english_words]  # Keep only recognized English words
    unique_words = set(total_words)  # Set of unique English words
    return total_words, unique_words
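
# Quick sanity check (illustrative assumption, not part of the original analysis):
# flip the flag to see which tokens of a sample sentence survive the English-word
# filter above.
RUN_TOKENIZER_DEMO = False
if RUN_TOKENIZER_DEMO:
    demo_total, demo_unique = extract_english_words("The quick brown fox jumps over the lazy dog.")
    print("Demo tokens kept:", demo_total)
    print("Demo unique tokens:", sorted(demo_unique))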
|
|
|
# Load BNC/COCA frequency data
word_dict, related_to_headword_map, H, M, L = load_BNC_COCA_list('BNC_COCA_lists.xlsx')

base_path = "vocab_books_path\\"

w504 = "504 Absolutely Essential Words 6th"
w4000 = "4000 Essential English Words 1"
w1100 = "Barrons 1100 Words You Need to Know 5th"
wweb = "Merriam Webster Vocabulary Builder"
oxf = "Oxford Word Skills Basic Book"
uns = "Under Scrutiny"

def get_book_path(title):
    return f"{base_path}{title}.pdf"
|
|
|
def calc_stat_version_2(title, to_pages=None):
    pdf_path = get_book_path(title)
    text, cnt_pages = extract_text_from_pdf(pdf_path, to_pages)

    total_words, unique_words = extract_english_words(text)

    # Collapse each unique word form to its BNC/COCA headword (or itself if no mapping exists)
    headwords = list({related_to_headword_map.get(uw, uw) for uw in unique_words})

    # Initialize counters for word frequency categories
    high_freq_count, mid_freq_count, low_freq_count, not_available_count = 0, 0, 0, 0
    unique_word_to_cat = {}

    # Classify each headword as high, mid, low frequency or not available
    for headword in headwords:
        word_category = word_dict.get(headword, 'N/A')
        unique_word_to_cat[headword] = word_category

        if word_category in ['1k', '2k', '3k']:
            high_freq_count += 1
        elif word_category in ['4k', '5k', '6k', '7k', '8k', '9k']:
            mid_freq_count += 1
        elif word_category != 'N/A':
            low_freq_count += 1
        else:
            not_available_count += 1

    # Note: the shares reported below are taken over the unique word forms, while the
    # counts above are over headwords, so they need not sum to 100%.
    total_unique_words = len(unique_words)

    print_results(title, total_words, unique_words, high_freq_count, mid_freq_count, low_freq_count,
                  not_available_count, total_unique_words)
|
|
|
|
|
def print_results(title, total_words, unique_words, high_freq_count, mid_freq_count, low_freq_count,
                  not_available_count, total_unique_words):
    # H, M and L are the band sizes computed by load_BNC_COCA_list (module-level globals)
    print(f"{title}\n{'-' * 50}")
    print(f"Total number of English words: {len(total_words)}")
    print(f"Number of unique English words: {len(unique_words)}")

    # Calculate the share of each frequency category (fractions of the unique words)
    high_freq_percentage = high_freq_count / total_unique_words
    mid_freq_percentage = mid_freq_count / total_unique_words
    low_freq_percentage = low_freq_count / total_unique_words
    not_available_percentage = not_available_count / total_unique_words

    # Assign weights for difficulty, proportional to each band's share of the BNC/COCA list
    high_freq_weight = H / (H + M + L)      # Less weight, since high-frequency words reduce difficulty
    mid_freq_weight = M / (H + M + L)       # Moderate weight
    low_freq_weight = L / (H + M + L)       # High weight, since low-frequency words increase difficulty more
    not_available_weight = L / (H + M + L)  # Same high weight for words outside the list

    # Calculate the difficulty score as the weighted average of the category shares
    difficulty_score = (
        (not_available_weight * not_available_percentage) +
        (low_freq_weight * low_freq_percentage) +
        (mid_freq_weight * mid_freq_percentage) +
        (high_freq_weight * high_freq_percentage)
    ) / (not_available_weight + low_freq_weight + mid_freq_weight + high_freq_weight)

    print(f"\nBook's Difficulty Level: {difficulty_score:.4f}")
    print(f"  High-frequency words: {high_freq_count} ({high_freq_percentage:.2%})")
    print(f"  Mid-frequency words: {mid_freq_count} ({mid_freq_percentage:.2%})")
    print(f"  Low-frequency words: {low_freq_count} ({low_freq_percentage:.2%})")
    print(f"  Not available in frequency list: {not_available_count} ({not_available_percentage:.2%})")

    # Frequent words coverage: fraction of each BNC/COCA band that appears in the book
    high_freq_coverage = high_freq_count / H
    mid_freq_coverage = mid_freq_count / M
    low_freq_coverage = low_freq_count / L

    # Weights for each frequency group: the complement of each band's share, so smaller bands weigh more
    high_freq_weight = 1.0 - H / (H + M + L)
    mid_freq_weight = 1.0 - M / (H + M + L)
    low_freq_weight = 1.0 - L / (H + M + L)

    # Calculate the coverage score as the weighted average of the band coverages
    coverage_score = (
        (high_freq_weight * high_freq_coverage) +
        (mid_freq_weight * mid_freq_coverage) +
        (low_freq_weight * low_freq_coverage)
    ) / (high_freq_weight + mid_freq_weight + low_freq_weight)

    print(f"\nBook's Frequent Words Coverage: {coverage_score:.4f}")
    print(f"  High-frequency words: {high_freq_count} ({high_freq_coverage:.2%})")
    print(f"  Mid-frequency words: {mid_freq_count} ({mid_freq_coverage:.2%})")
    print(f"  Low-frequency words: {low_freq_count} ({low_freq_coverage:.2%})")

    # Vocabulary Goodness Score: the coverage score normalised by the book's total word count,
    # so books that achieve the same coverage with less text score higher (100 is just a rescaling factor)
    print(f"\nVocabulary Goodness Score: {coverage_score * 100 / len(total_words):.4f}")
    print(f"{'-' * 50}\n")
|
|
# The analysis of 504 Absolutely Essential Words is limited to the pages before the
# bonus vocabulary section, so that it concentrates on the book's core content
calc_stat_version_2(w504, to_pages=144)

calc_stat_version_2(oxf)
calc_stat_version_2(w4000)
calc_stat_version_2(wweb)
calc_stat_version_2(w1100)
calc_stat_version_2(uns)