paceaux · April 6, 2026 15:24
diff --git a/char-ngrams.py b/char-ngrams.py
 from collections import Counter
 import re

 def char_ngrams(word, size=1):
    """
    Generates character-level n-grams for a given string with start/end markers.

    The input is handled as a sequence of symbols: the start tag ``<S>``,
    every character of ``word``, then the end tag ``</S>``. Each tag occupies
    exactly one position in the sliding window, so a literal ``^`` or ``$``
    inside ``word`` stays an ordinary character and is never turned into a tag.

    Args:
        word (str): The input string to process.
        size (int): The length of the n-gram window. Defaults to 1.

    Returns:
        list: A list of n-grams with a start and end tag. The list is empty
            when the window is longer than the tagged word.

    Raises:
        ValueError: If ``size`` is smaller than 1.
    """
    if size < 1:
        raise ValueError(f"size must be a positive integer, got {size!r}")

    # str() keeps non-string input working, as it would in an f-string.
    # Tags are added as whole symbols rather than placeholder characters,
    # so nothing in the word itself can collide with them.
    symbols = ["<S>", *str(word), "</S>"]

    # A negative count (window wider than the word) gives an empty range.
    window_count = len(symbols) - size + 1

    return ["".join(symbols[start:start + size]) for start in range(window_count)]


 def get_ngrams(words, size=1):
    """
    Builds the character n-grams of every word in a collection.

    Args:
        words (list of str): The strings to break into n-grams.
        size (int): The length of the n-gram window. Defaults to 1.

    Returns:
        list of list: One inner list per input word, in input order, each
                      holding the result of char_ngrams for that word.
    """
    return [char_ngrams(entry, size) for entry in words]

 def get_ngram_counts(words, size=1):
    """
    Counts how often each character n-gram occurs across a list of words.

    Args:
        words (list of str): The strings to analyze.
        size (int): The length of the n-gram window. Defaults to 1.

    Returns:
        collections.Counter: A counter mapping each n-gram to the number of
                             times it appears over all words combined.
    """
    # Flatten the per-word n-gram lists into a single stream and count it.
    return Counter(
        gram
        for word_grams in get_ngrams(words, size)
        for gram in word_grams
    )

 def get_ngram_probs(words, size=1):
    """
    Computes the relative frequency of every character n-gram.

    Each probability is the n-gram's count divided by the total number of
    n-grams produced from the input.

    Args:
        words (list of str): The strings to analyze.
        size (int): The length of the n-gram window. Defaults to 1.

    Returns:
        dict: A dictionary keyed by n-gram whose values are the
              probabilities, rounded to 3 decimal places.
    """
    frequencies = get_ngram_counts(words, size)
    grand_total = frequencies.total()

    # With no n-grams there is nothing to iterate, so no division happens.
    return {
        gram: round(occurrences / grand_total, 3)
        for gram, occurrences in frequencies.items()
    }
diff --git a/word_ngrams.py b/word_ngrams.py
 from typing import List

 def bigram(sentence: str) -> List[str]:
    """
    Generates a list of bigrams from a given string.

    Normalizes the input by removing basic punctuation
    (commas, periods, semicolons) and converting the text
    to lowercase before processing. Words are separated on any
    run of whitespace (spaces, tabs, newlines).

    Args:
        sentence (str): The input string to be processed into bigrams.

    Returns:
        List[str]: A list of hyphenated word pairs (e.g., ["hello-world", "world-again"]).
            Empty when the sentence has fewer than two words.

    Example:
        >>> bigram("Hello world, again.")
        ['hello-world', 'world-again']
    """
    # Punctuation is deleted, not replaced by a space, so "a,b" becomes "ab".
    for mark in ",.;":
        sentence = sentence.replace(mark, "")

    # split() with no argument breaks on any whitespace run and never yields
    # empty tokens, so tabs, newlines and repeated spaces are all handled.
    words = sentence.lower().split()

    # Pair every word with its successor.
    return [f"{left}-{right}" for left, right in zip(words, words[1:])]
	from collections import Counter
	import re

	def char_ngrams(word, size=1):
	    """
	    Generates character-level n-grams for a given string with start/end markers.

	    The start tag ``<S>`` and end tag ``</S>`` each count as one position
	    in the sliding window. Characters of ``word`` are never rewritten, so a
	    literal ``^`` or ``$`` in the input is preserved.

	    Args:
	        word (str): The input string to process.
	        size (int): The length of the n-gram window. Defaults to 1.

	    Returns:
	        list: A list of n-grams with a start and end tag; empty when the
	            window is longer than the tagged word.

	    Raises:
	        ValueError: If ``size`` is smaller than 1.
	    """
	    if size < 1:
	        raise ValueError(f"size must be a positive integer, got {size!r}")

	    # Tags are inserted as whole symbols so they cannot collide with any
	    # character of the word; str() mirrors f-string formatting of the input.
	    symbols = ["<S>", *str(word), "</S>"]

	    # range() of a non-positive count is empty, which covers oversized windows.
	    window_count = len(symbols) - size + 1

	    return ["".join(symbols[start:start + size]) for start in range(window_count)]


	def get_ngrams(words, size=1):
	    """
	    Processes a collection of words into their respective character n-grams.

	    Args:
	        words (list of str): A list of strings to be tokenized into n-grams.
	        size (int): The length of the n-gram window. Defaults to 1.

	    Returns:
	        list of list: A nested list where each inner list contains
	            the n-grams for a specific word, in input order.
	    """
	    return [char_ngrams(word, size) for word in words]

	def get_ngram_counts(words, size=1):
	    """
	    Calculates the frequency of all character n-grams across a list of words.

	    Args:
	        words (list of str): A list of strings to analyze.
	        size (int): The length of the n-gram window. Defaults to 1.

	    Returns:
	        collections.Counter: A counter object mapping each n-gram to its
	            occurrence count over all words combined.
	    """
	    counts = Counter()

	    # Fold each word's n-gram list into the shared tally.
	    for word_grams in get_ngrams(words, size):
	        counts.update(word_grams)

	    return counts

	def get_ngram_probs(words, size=1):
	    """
	    Calculates the probability distribution of character n-grams.

	    The probability is calculated as the count of a specific n-gram divided
	    by the total number of all n-grams generated from the input.

	    Args:
	        words (list of str): A list of strings to analyze.
	        size (int): The length of the n-gram window. Defaults to 1.

	    Returns:
	        dict: A dictionary where keys are n-grams and values are their
	            probabilities, rounded to 3 decimal places.
	    """
	    counts = get_ngram_counts(words, size)
	    total = counts.total()

	    # An empty counter yields an empty dict, so the division never runs
	    # with a zero total.
	    return {gram: round(count / total, 3) for gram, count in counts.items()}
	from typing import List

	def bigram(sentence: str) -> List[str]:
	    """
	    Generates a list of bigrams from a given string.

	    Normalizes the input by removing basic punctuation
	    (commas, periods, semicolons) and converting the text
	    to lowercase before processing. Any run of whitespace
	    (spaces, tabs, newlines) separates two words.

	    Args:
	        sentence (str): The input string to be processed into bigrams.

	    Returns:
	        List[str]: A list of hyphenated word pairs (e.g., ["hello-world", "world-again"]).
	            Empty when the sentence has fewer than two words.

	    Example:
	        >>> bigram("Hello world, again.")
	        ['hello-world', 'world-again']
	    """
	    # Remove the supported punctuation marks; they are dropped outright,
	    # not replaced by a separator.
	    for mark in ",.;":
	        sentence = sentence.replace(mark, "")

	    # Argument-less split() handles every kind of whitespace and discards
	    # empty tokens, so no separate filtering step is needed.
	    words = sentence.lower().split()

	    # Join each word with the one that follows it.
	    return ["-".join(pair) for pair in zip(words, words[1:])]