Last active
April 6, 2026 15:24
-
-
Save paceaux/db1a54667685773f4d8980cb9c4be015 to your computer and use it in GitHub Desktop.
Ngrams n data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from collections import Counter | |
| import re | |
def char_ngrams(word, size=1):
    """
    Generates character-level n-grams for a given string with start/end markers.

    The word is framed by a start marker ``<S>`` and an end marker ``</S>``.
    Each marker occupies exactly one position of the sliding window, i.e. it
    counts as a single "character" of an n-gram.

    Args:
        word (str): The input string to process.
        size (int): The length of the n-gram window. Defaults to 1.

    Returns:
        list: The n-grams in left-to-right order. Empty when the window is
            longer than the framed word (``len(word) + 2`` positions).

    Raises:
        ValueError: If ``size`` is smaller than 1.

    Example:
        >>> char_ngrams("ab", 2)
        ['<S>a', 'ab', 'b</S>']
    """
    if size < 1:
        raise ValueError(f"size must be a positive integer, got {size!r}")
    # Work on a token list instead of framing the string with sentinel
    # characters: a literal '^' or '$' inside the word must stay as it is
    # and never be rewritten into a boundary marker.
    tokens = ["<S>", *word, "</S>"]
    # range() is empty when size exceeds the number of tokens, so an
    # oversized window simply produces no n-grams.
    return [
        "".join(tokens[start:start + size])
        for start in range(len(tokens) - size + 1)
    ]
def get_ngrams(words, size=1):
    """
    Builds the character n-grams of every word in a collection.

    Args:
        words (list of str): The strings to break into n-grams.
        size (int): The length of the n-gram window. Defaults to 1.

    Returns:
        list of list: One inner list per input word, in input order, holding
            that word's n-grams as produced by ``char_ngrams``.
    """
    return [char_ngrams(entry, size) for entry in words]
def get_ngram_counts(words, size=1):
    """
    Tallies how often each character n-gram occurs across a list of words.

    Args:
        words (list of str): The strings to analyze.
        size (int): The length of the n-gram window. Defaults to 1.

    Returns:
        collections.Counter: Maps every n-gram found in ``words`` to the
            number of times it occurs over all words combined.
    """
    per_word = get_ngrams(words, size)
    # Flatten the per-word lists so every n-gram is counted in one pass.
    return Counter(gram for grams in per_word for gram in grams)
def get_ngram_probs(words, size=1):
    """
    Computes the relative frequency of every character n-gram.

    Each n-gram's probability is its occurrence count divided by the total
    number of n-grams generated from all input words.

    Args:
        words (list of str): The strings to analyze.
        size (int): The length of the n-gram window. Defaults to 1.

    Returns:
        dict: Maps each n-gram to its probability, rounded to 3 decimal
            places. Empty when no n-grams were generated.
    """
    tally = get_ngram_counts(words, size)
    denominator = tally.total()
    # With an empty tally the comprehension never evaluates the division,
    # so a zero denominator cannot raise here.
    return {gram: round(hits / denominator, 3) for gram, hits in tally.items()}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from typing import List | |
def bigram(sentence: str) -> List[str]:
    """
    Generates a list of word bigrams from a given string.

    Normalizes the input by removing basic punctuation
    (commas, periods, semicolons) and converting the text
    to lowercase before processing. Words are separated on any
    run of whitespace (spaces, tabs, newlines).

    Args:
        sentence (str): The input string to be processed into bigrams.

    Returns:
        List[str]: A list of hyphenated word pairs (e.g., ["hello-world", "world-again"]).
            Empty when the sentence holds fewer than two words.

    Example:
        >>> bigram("Hello world, again.")
        ['hello-world', 'world-again']
    """
    # Strip commas, periods and semicolons in a single pass, then lowercase.
    cleaned = sentence.translate(str.maketrans("", "", ",.;")).lower()
    # split() without an argument breaks on any whitespace run and never
    # yields empty strings, so tabs, newlines and repeated spaces all act
    # as plain word separators.
    words = cleaned.split()
    # Pair each word with its successor; zip stops at the shorter sequence,
    # so the last word is never paired with a missing neighbour.
    return ["-".join(pair) for pair in zip(words, words[1:])]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment