Skip to content

Instantly share code, notes, and snippets.

@SolidAlloy
Created April 22, 2026 17:44
Show Gist options
  • Select an option

  • Save SolidAlloy/1b9aa4b79d43b9947cb56471a21d91c2 to your computer and use it in GitHub Desktop.

Select an option

Save SolidAlloy/1b9aa4b79d43b9947cb56471a21d91c2 to your computer and use it in GitHub Desktop.
Word frequency and n-grams generator
package main
// The program takes corpora in txt and csv formats and generates a word frequency list, as well as n-gram lists.
// The word frequency list can be used for Cyanophage, while n-grams are in a format suitable for dariogoetz's Keyboard Layout Optimizer.
// The program goes through 7 GB of text in 10 minutes.
// I wrote it in Odin because I'm practicing it, but also because any naive implementation in a compiled language will be better than Python.
// The following corpora were used for Ukrainian:
// https://github.com/kateryna-bobrovnyk/ukr-twi-corpus
// https://lang.org.ua/en/ubertext/ (wikipedia & fiction, split into sentences)
import "core:fmt"
import "core:slice"
import "core:unicode/utf8"
import "core:strings"
import "core:bufio"
import os "core:os/os2"
import "core:io"
import "core:log"
import "core:math"
import "core:encoding/csv"
import "core:unicode"
// Adds `weight` to the tally of a single candidate word.
//
// Inputs:
// - word_buf:    UTF-8 bytes of the word; it is only read, never retained.
// - weight:      amount added to the word's entry in `word_counts`.
// - word_counts: word -> accumulated weight. New keys are cloned with
//                context.allocator, so they outlive `word_buf`.
// - char_counts: accepted for signature parity with the other process_*
//                procedures; this procedure does not touch it.
//
// Candidates that contain no letter at all (e.g. "-" or "...") are ignored.
// The first "..." inside a word is collapsed to a single "." before counting.
process_word :: proc(word_buf: []u8, weight: f64, word_counts: ^map[string]f64, char_counts: ^[9000]f64) {
	word := string(word_buf)

	has_letter := false
	for r in word {
		if unicode.is_letter(r) {
			has_letter = true
			break
		}
	}
	if !has_letter {
		return
	}

	// The normalized spelling lives in the temp allocator; it is only used for
	// the lookup and is cloned below if it has to become a permanent map key.
	key := word
	if strings.contains(word, "...") {
		key, _ = strings.replace(word, "...", ".", 1, context.temp_allocator)
	}

	if slot, found := &word_counts[key]; found {
		slot^ += weight
		return
	}
	word_counts[strings.clone(key, context.allocator)] = weight
}
// Scans one line of text rune by rune, updating both the per-character tally
// and (through process_word) the per-word tally.
//
// Every rune is lowercased first. Character counting folds dash and quote
// variants onto a single key and only counts the whitelisted letters and
// punctuation. Word building accepts the whitelisted letters plus '.', ','
// and '-'; any other rune ends the current word and hands it to process_word.
// Inside words 'ґ' is stored as 'г', while the character tally keeps 'ґ'
// as its own key.
//
// Words are assembled in a fixed 256-byte buffer; runes that no longer fit
// are dropped.
process_line :: proc(line: string, weight: f64, word_counts: ^map[string]f64, char_counts: ^[9000]f64) {
	word_buf: [256]u8
	word_len := 0

	offset := 0
	for offset < len(line) {
		decoded, size := utf8.decode_rune_in_string(line[offset:])
		offset += size
		char := unicode.to_lower(decoded)

		// --- character frequencies ---
		if int(char) < 9000 {
			counted := char
			switch char {
			case '–', '—':
				counted = '-'
			case '\'', '’':
				counted = '’'
			case '«', '»':
				counted = '"'
			}
			switch counted {
			case 'а'..='щ', 'ь', 'ю', 'я', 'є', 'і', 'ї', 'ґ', '.', ',', '-', ' ', '!', '"', ':', '?', '(', ')', '’':
				char_counts[int(counted)] += weight
			}
		}

		// --- word assembly ---
		// 'ґ' and 'г' encode to the same number of bytes, so `size` is a valid
		// space check for the substituted letter as well.
		letter: rune = 'г' if char == 'ґ' else char
		switch letter {
		case 'а'..='щ', 'ь', 'ю', 'я', 'є', 'і', 'ї', '.', ',', '-':
			if word_len + size <= len(word_buf) {
				encoded, encoded_len := utf8.encode_rune(letter)
				word_len += copy(word_buf[word_len:], encoded[:encoded_len])
			}
		case:
			process_word(word_buf[:word_len], weight, word_counts, char_counts)
			word_len = 0
		}
	}

	// Flush whatever word was still being built when the line ended.
	process_word(word_buf[:word_len], weight, word_counts, char_counts)
}
// Reads the CSV corpus `name` and feeds the text column of every record
// through process_line with the given `weight`.
//
// The first record is treated as a header and discarded. For each following
// record that has more than TEXT_COLUMN fields, the text field is split into
// lines and each line is processed.
//
// Returns the total number of bytes of text-column data that were processed;
// the caller uses it to scale the weights of the other corpora.
//
// Panics if the file cannot be opened or its header cannot be read.
process_csv :: proc(name: string, weight: f64, word_counts: ^map[string]f64, char_counts: ^[9000]f64) -> u64 {
	// Zero-based index of the column that holds the text.
	TEXT_COLUMN :: 3

	file, open_err := os.open(name)
	if open_err != nil {
		fmt.panicf("Failed to open {}: {}", name, open_err)
	}
	defer { close_err := os.close(file); assert(close_err == nil) }

	reader: csv.Reader
	csv.reader_init(&reader, os.to_reader(file))
	defer csv.reader_destroy(&reader)

	// Skip the header row.
	if _, header_err := csv.read(&reader, context.temp_allocator); header_err != nil {
		fmt.panicf("Failed to read the header of {}: {}", name, header_err)
	}

	records_count: int
	processed_bytes: u64
	for {
		// Everything allocated while handling one record is temporary.
		defer free_all(context.temp_allocator)

		record, read_err := csv.read(&reader, context.temp_allocator)
		if read_err != nil {
			// An I/O-level error (EOF included) means no more data will come,
			// so retrying would loop forever. Parse errors only affect the
			// current record: log them and keep going with what was returned.
			if stream_err, is_stream_err := read_err.(io.Error); is_stream_err {
				if stream_err != .EOF {
					log.errorf("{}: stopped after read error: {}", name, stream_err)
				}
				break
			}
			log.debugf("Error reading line: {}", read_err)
		}

		if len(record) > TEXT_COLUMN {
			text_field := record[TEXT_COLUMN]
			lines, _ := strings.split_lines(text_field, context.temp_allocator)
			for line in lines {
				process_line(line, weight, word_counts, char_counts)
			}
			processed_bytes += u64(len(text_field))

			records_count += 1
			if records_count % 1_000_000 == 0 {
				log.infof("{}: {}m", name, records_count / 1_000_000)
			}
		}
	}
	return processed_bytes
}
// Reads the plain-text corpus `name` line by line and feeds every line
// through process_line.
//
// The effective weight is `weight * csv_bytes / file_size`, i.e. the base
// weight scaled by the ratio between the CSV text volume and this file's
// size. The ratio is computed in floating point: an integer division would
// truncate it (to 0 whenever this file is larger than the CSV text).
//
// An empty file contributes nothing and is skipped, which also avoids a
// division by zero. Panics if the file cannot be opened or its size cannot
// be determined.
process_txt :: proc(name: string, weight: f64, word_counts: ^map[string]f64, char_counts: ^[9000]f64, csv_bytes: u64) {
	file, open_err := os.open(name)
	if open_err != nil {
		fmt.panicf("Failed to open {}: {}", name, open_err)
	}
	defer { close_err := os.close(file); assert(close_err == nil) }

	file_size, size_err := os.file_size(file)
	if size_err != nil {
		fmt.panicf("Failed to get the size of {}: {}", name, size_err)
	}
	if file_size <= 0 {
		return
	}
	scaled_weight := weight * (f64(csv_bytes) / f64(file_size))

	reader: bufio.Reader
	bufio.reader_init(&reader, os.to_reader(file))
	defer bufio.reader_destroy(&reader)

	lines_count: int
	for {
		// The line is only needed for the duration of one iteration.
		defer free_all(context.temp_allocator)

		line, read_err := bufio.reader_read_string(&reader, '\n', context.temp_allocator)

		// Process before looking at the error: a final line without a
		// trailing newline is returned together with EOF.
		if len(line) > 0 {
			process_line(line, scaled_weight, word_counts, char_counts)
			lines_count += 1
			if lines_count % 1_000_000 == 0 {
				log.infof("{}: {}m", name, lines_count / 1_000_000)
			}
		}

		if read_err != nil {
			if read_err != .EOF {
				log.errorf("{}: stopped after read error: {}", name, read_err)
			}
			break
		}
	}
}
// Entry point. Builds weighted word and character tallies from the three
// corpora, derives 1-, 2- and 3-gram tallies from the word list, logs the
// character distribution, and writes:
//   - words-ukrainian.json : {"word": weight, ...} sorted by descending weight
//   - 1-grams.txt, 2-grams.txt, 3-grams.txt : "<rounded weight> <gram>" lines,
//     sorted by descending weight
//
// A failure to create one output file is logged and does not prevent the
// remaining files from being written.
main :: proc() {
	context.logger = log.create_console_logger()

	word_counts := make(map[string]f64, 2048)
	char_counts: [9000]f64

	// The CSV corpus goes first: its processed byte count is what the text
	// corpora scale their weights against.
	processed_csv_bytes := process_csv("social.csv", 1.2, &word_counts, &char_counts)
	process_txt("wiki.txt", 0.6, &word_counts, &char_counts, processed_csv_bytes)
	process_txt("literature.txt", 0.3, &word_counts, &char_counts, processed_csv_bytes)

	grams_1 := make(map[[1]rune]f64, 35)
	grams_2 := make(map[[2]rune]f64, 100)
	grams_3 := make(map[[3]rune]f64, 300)

	// N-grams are taken from inside words only, each weighted by the weight
	// of the word it occurs in.
	for word, word_weight in word_counts {
		gram_runes := utf8.string_to_runes(word, context.temp_allocator)
		gram1: [1]rune
		gram2: [2]rune
		gram3: [3]rune
		for i in 0..<len(gram_runes) {
			gram1[0] = gram_runes[i]
			if count, exists := &grams_1[gram1]; exists {
				count^ += word_weight
			} else {
				grams_1[gram1] = word_weight
			}

			if i < len(gram_runes) - 1 {
				gram2[0] = gram_runes[i]
				gram2[1] = gram_runes[i+1]
				if count, exists := &grams_2[gram2]; exists {
					count^ += word_weight
				} else {
					grams_2[gram2] = word_weight
				}
			}

			if i < len(gram_runes) - 2 {
				gram3[0] = gram_runes[i]
				gram3[1] = gram_runes[i+1]
				gram3[2] = gram_runes[i+2]
				if count, exists := &grams_3[gram3]; exists {
					count^ += word_weight
				} else {
					grams_3[gram3] = word_weight
				}
			}
		}
		free_all(context.temp_allocator)
	}

	// A string (word or n-gram) together with its accumulated weight.
	Word_Freq :: struct {
		word:  string,
		count: f64,
	}

	// A single character together with its accumulated weight.
	Char_Freq :: struct {
		char:  rune,
		count: f64,
	}

	// Log every counted character as a percentage of all counted characters,
	// most frequent first.
	{
		char_freqs := make([dynamic]Char_Freq, 0, 100)
		defer delete(char_freqs)
		for count, i in char_counts {
			if count > 0 {
				append(&char_freqs, Char_Freq {rune(i), count})
			}
		}
		slice.sort_by(char_freqs[:], proc(i, j: Char_Freq) -> bool {
			return i.count > j.count
		})

		total: f64 = 0
		for freq in char_freqs {
			total += freq.count
		}
		for freq in char_freqs {
			log.debugf("{}: %.3f%%", freq.char, freq.count / total * 100)
		}
	}

	// Writes the word tally to `name` as a JSON object ordered by descending
	// weight. Lives in its own procedure so that a failure to create the file
	// only abandons this one output.
	write_word_file :: proc(name: string, counts: map[string]f64) {
		word_freqs := make([dynamic]Word_Freq, 0, len(counts))
		defer delete(word_freqs)
		for word, count in counts {
			append(&word_freqs, Word_Freq {word, count})
		}
		slice.sort_by(word_freqs[:], proc(i, j: Word_Freq) -> bool {
			return i.count > j.count
		})

		out_fd, out_err := os.open(name, os.O_WRONLY | os.O_CREATE | os.O_TRUNC, os.Permissions{.Read_User, .Write_User, .Read_Group, .Read_Other})
		if out_err != nil {
			log.errorf("Error creating output file {}: {}", name, out_err)
			return
		}
		defer os.close(out_fd)

		file_stream := os.to_stream(out_fd)
		writer: bufio.Writer
		bufio.writer_init(&writer, file_stream)
		defer bufio.writer_destroy(&writer)

		bufio.writer_write_string(&writer, "{\n")
		for i := 0; i < len(word_freqs); i += 1 {
			// JSON forbids a trailing comma after the last member.
			comma := "," if i < len(word_freqs) - 1 else ""
			bufio.writer_write_string(&writer, fmt.tprintfln(" \"{}\": %v%s", word_freqs[i].word, word_freqs[i].count, comma))
			// The formatted line is not needed once it has been handed to the
			// writer; releasing it here keeps temp memory from growing with
			// the size of the vocabulary.
			free_all(context.temp_allocator)
		}
		bufio.writer_write_string(&writer, "}\n")

		if flush_err := bufio.writer_flush(&writer); flush_err != nil {
			log.errorf("Error writing output file {}: {}", name, flush_err)
		}
	}

	// Writes one n-gram tally to `name`, one "<rounded weight> <gram>" line
	// per entry, ordered by descending weight.
	write_gram_file :: proc(name: string, dict: map[$T]f64) {
		// The sorted list, the gram strings and the formatted lines are all
		// temp allocations; release them on every exit path.
		defer free_all(context.temp_allocator)

		gram_counts := make([dynamic]Word_Freq, 0, len(dict), context.temp_allocator)
		for runes, count in dict {
			runes := runes // addressable copy so it can be sliced
			append(&gram_counts, Word_Freq {utf8.runes_to_string(runes[:], context.temp_allocator), count})
		}
		slice.sort_by(gram_counts[:], proc(i, j: Word_Freq) -> bool {
			return i.count > j.count
		})

		out_fd, out_err := os.open(name, os.O_WRONLY | os.O_CREATE | os.O_TRUNC, os.Permissions{.Read_User, .Write_User, .Read_Group, .Read_Other})
		if out_err != nil {
			log.errorf("Error creating output file {}: {}", name, out_err)
			return
		}
		defer os.close(out_fd)

		file_stream := os.to_stream(out_fd)
		writer: bufio.Writer
		bufio.writer_init(&writer, file_stream)
		defer bufio.writer_destroy(&writer)

		for i in 0..<len(gram_counts) {
			bufio.writer_write_string(&writer, fmt.tprintfln("{} {}", i64(math.round(gram_counts[i].count)), gram_counts[i].word))
		}

		if flush_err := bufio.writer_flush(&writer); flush_err != nil {
			log.errorf("Error writing output file {}: {}", name, flush_err)
		}
	}

	write_word_file("words-ukrainian.json", word_counts)
	write_gram_file("1-grams.txt", grams_1)
	write_gram_file("2-grams.txt", grams_2)
	write_gram_file("3-grams.txt", grams_3)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment