Skip to content

Instantly share code, notes, and snippets.

@paceaux
Last active April 6, 2026 15:24
Show Gist options
  • Select an option

  • Save paceaux/db1a54667685773f4d8980cb9c4be015 to your computer and use it in GitHub Desktop.

Select an option

Save paceaux/db1a54667685773f4d8980cb9c4be015 to your computer and use it in GitHub Desktop.
Ngrams n data
from collections import Counter
import re
def char_ngrams(word, size=1):
    """
    Generates character-level n-grams for a given string with start/end markers.

    The word is framed by a start tag ``<S>`` and an end tag ``</S>``. Each tag
    occupies exactly one position in the sliding window, just like a single
    character of the word.

    Args:
        word (str): The input string to process.
        size (int): The length of the n-gram window. Defaults to 1.

    Returns:
        list: A list of n-grams with a start and end tag. The list is empty
        when the window is longer than the framed word.

    Raises:
        ValueError: If ``size`` is smaller than 1.

    Example:
        >>> char_ngrams("cat", 2)
        ['<S>c', 'ca', 'at', 't</S>']
    """
    if size < 1:
        raise ValueError(f"size must be a positive integer, got {size!r}")
    # Slide the window over a list of tokens rather than over a string framed
    # with '^' / '$' sentinels: this way a literal '^' or '$' inside the word
    # is kept as-is instead of being rewritten into a boundary tag.
    tokens = ["<S>", *str(word), "</S>"]
    window_count = len(tokens) - size + 1  # <= 0 when the window does not fit
    return ["".join(tokens[start:start + size]) for start in range(window_count)]
def get_ngrams(words, size=1):
    """
    Tokenizes every word of a collection into character n-grams.

    Args:
        words (list of str): The strings to break into n-grams.
        size (int): The length of the n-gram window. Defaults to 1.

    Returns:
        list of list: One inner list per input word, in input order, holding
        the n-grams produced by ``char_ngrams`` for that word.
    """
    return [char_ngrams(entry, size) for entry in words]
def get_ngram_counts(words, size=1):
    """
    Tallies how often each character n-gram occurs across a list of words.

    Args:
        words (list of str): The strings to analyze.
        size (int): The length of the n-gram window. Defaults to 1.

    Returns:
        collections.Counter: A counter mapping each n-gram to the number of
        times it occurs over all words combined.
    """
    per_word_grams = get_ngrams(words, size)
    # Flatten the per-word lists and let Counter do the tallying in one go.
    return Counter(gram for word_grams in per_word_grams for gram in word_grams)
def get_ngram_probs(words, size=1):
    """
    Computes the relative frequency of every character n-gram.

    Each probability is the n-gram's occurrence count divided by the total
    number of n-grams generated from the whole input.

    Args:
        words (list of str): The strings to analyze.
        size (int): The length of the n-gram window. Defaults to 1.

    Returns:
        dict: A dictionary mapping each n-gram to its probability, rounded
        to 3 decimal places. Empty when no n-grams were generated.
    """
    frequencies = get_ngram_counts(words, size)
    grand_total = frequencies.total()
    # With no n-grams the comprehension body never runs, so a zero total
    # cannot cause a division error.
    return {
        gram: round(occurrences / grand_total, 3)
        for gram, occurrences in frequencies.items()
    }
from typing import List
def bigram(sentence: str) -> List[str]:
    """
    Generates a list of bigrams from a given string.

    Normalizes the input by removing basic punctuation
    (commas, periods, semicolons) and converting the text
    to lowercase before processing. Words are separated on any
    run of whitespace (spaces, tabs, newlines).

    Args:
        sentence (str): The input string to be processed into bigrams.

    Returns:
        List[str]: A list of hyphenated word pairs (e.g., ["hello-world", "world-again"]).
        Empty when the sentence holds fewer than two words.

    Example:
        >>> bigram("Hello world, again.")
        ['hello-world', 'world-again']
    """
    # Drop the supported punctuation marks in a single pass, then lowercase.
    cleaned = sentence.translate(str.maketrans("", "", ",.;")).lower()
    # split() with no argument breaks on any whitespace run and never yields
    # empty strings, so tabs, newlines and repeated spaces are all handled.
    words = cleaned.split()
    # Pair every word with its successor.
    return [f"{left}-{right}" for left, right in zip(words, words[1:])]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment