Created
April 22, 2026 17:44
-
-
Save SolidAlloy/1b9aa4b79d43b9947cb56471a21d91c2 to your computer and use it in GitHub Desktop.
Word frequency and n-grams generator
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| package main | |
| // The program takes corpora in txt and csv formats and generates a word frequency list, as well as n-gram lists. | |
| // The word frequency list can be used for Cyanophage, while n-grams are in a format suitable for dariogoetz's Keyboard Layout Optimizer. | |
| // The program goes through 7 GB of text in 10 minutes. | |
| // I wrote it in Odin because I'm practicing it, but also because even a naive implementation in a compiled language will be faster than Python. | |
| // The following corpora were used for Ukrainian: | |
| // https://github.com/kateryna-bobrovnyk/ukr-twi-corpus | |
| // https://lang.org.ua/en/ubertext/ (wikipedia & fiction, split into sentences) | |
| import "core:fmt" | |
| import "core:slice" | |
| import "core:unicode/utf8" | |
| import "core:strings" | |
| import "core:bufio" | |
| import os "core:os/os2" | |
| import "core:io" | |
| import "core:log" | |
| import "core:math" | |
| import "core:encoding/csv" | |
| import "core:unicode" | |
// process_word adds `weight` to the running count of the word held in `word_buf`.
//
// Words without a single letter (pure punctuation such as "-" or ".,") are ignored.
// If the word contains "...", its first occurrence is collapsed to "." before counting.
//
// word_buf:    UTF-8 bytes of the word; only read, never retained.
// weight:      amount added to the word's count.
// word_counts: table updated in place; new keys are cloned with context.allocator
//              so the table does not reference the caller's buffer.
// char_counts: not used by this procedure.
process_word :: proc(word_buf: []u8, weight: f64, word_counts: ^map[string]f64, char_counts: ^[9000]f64) {
	word := string(word_buf)

	has_letter := false
	for r in word {
		if unicode.is_letter(r) {
			has_letter = true
			break
		}
	}
	if !has_letter {
		return
	}

	// The lookup key may live in temporary memory; it is only cloned when inserted.
	key := word
	if strings.contains(word, "...") {
		key, _ = strings.replace(word, "...", ".", 1, context.temp_allocator)
	}

	if existing, found := &word_counts[key]; found {
		existing^ += weight
		return
	}

	owned_key := strings.clone(key, context.allocator)
	word_counts[owned_key] = weight
}
// process_line scans one line of text rune by rune, updating both tables.
//
// Every rune is lower-cased first. Then, independently:
//   * Character table: dash variants are counted as '-', both apostrophe forms as '’'
//     and guillemets as '"'. Only the Ukrainian letters and the listed punctuation
//     (including the space) are tallied in `char_counts`, indexed by code point.
//   * Word table: Ukrainian letters plus '.', ',' and '-' are appended to the current
//     word ('ґ' is stored as 'г'). Any other rune ends the word, which is handed to
//     process_word. Runes that no longer fit in the 256-byte buffer are dropped.
//
// line:        text to scan.
// weight:      amount added per character / per word occurrence.
// word_counts: word table, updated through process_word.
// char_counts: per-code-point totals for code points below 9000.
process_line :: proc(line: string, weight: f64, word_counts: ^map[string]f64, char_counts: ^[9000]f64) {
	word_buf: [256]u8
	word_len := 0

	offset := 0
	for offset < len(line) {
		decoded, size := utf8.decode_rune_in_string(line[offset:])
		offset += size
		char := unicode.to_lower(decoded)

		// --- character frequencies ---
		if int(char) < 9000 {
			tallied := char
			switch char {
			case '–', '—':
				tallied = '-'
			case '\'', '’':
				tallied = '’'
			case '«', '»':
				tallied = '"'
			}

			switch tallied {
			case 'а'..='щ', 'ь', 'ю', 'я', 'є', 'і', 'ї', 'ґ', '.', ',', '-', ' ', '!', '"', ':', '?', '(', ')', '’':
				char_counts[int(tallied)] += weight
			}
		}

		// --- word assembly ---
		switch char {
		case 'а'..='щ', 'ь', 'ю', 'я', 'є', 'і', 'ї', 'ґ', '.', ',', '-':
			stored: rune = char
			if char == 'ґ' {
				stored = 'г'
			}
			encoded, encoded_len := utf8.encode_rune(stored)
			if word_len + encoded_len <= len(word_buf) {
				copy(word_buf[word_len:], encoded[:encoded_len])
				word_len += encoded_len
			}
		case:
			// Separator: flush whatever has been collected (possibly nothing).
			process_word(word_buf[:word_len], weight, word_counts, char_counts)
			word_len = 0
		}
	}

	// The line may end in the middle of a word.
	process_word(word_buf[:word_len], weight, word_counts, char_counts)
}
// process_csv feeds the text column of a CSV corpus through process_line.
//
// The first record is read and discarded as a header. For every following record that
// has more than three fields, the fourth field (index 3) is split into lines and each
// line is processed with the given weight. Read errors other than EOF are logged and
// the loop carries on with the next read.
//
// name:        path of the CSV file; opening or closing it must succeed (asserted).
// weight:      weight passed to process_line for every line.
// word_counts: word table, updated in place.
// char_counts: character table, updated in place.
//
// Returns the total byte length of all text fields that were processed.
process_csv :: proc(name: string, weight: f64, word_counts: ^map[string]f64, char_counts: ^[9000]f64) -> u64 {
	TEXT_COLUMN :: 3

	file, open_err := os.open(name); assert(open_err == nil)
	defer { close_err := os.close(file); assert(close_err == nil) }

	reader: csv.Reader
	csv.reader_init(&reader, os.to_reader(file))
	defer csv.reader_destroy(&reader)

	// Skip the header row.
	_, header_err := csv.read(&reader, context.temp_allocator); assert(header_err == nil)

	total_bytes: u64
	records_seen: int

	for {
		fields, read_err := csv.read(&reader, context.temp_allocator)
		if read_err != nil {
			if read_err == io.Error.EOF {
				break
			}
			log.debugf("Error reading line: {}", read_err)
		}

		if len(fields) > TEXT_COLUMN {
			text := fields[TEXT_COLUMN]

			text_lines, _ := strings.split_lines(text, context.temp_allocator)
			for text_line in text_lines {
				process_line(text_line, weight, word_counts, char_counts)
			}

			total_bytes += u64(len(text))
			records_seen += 1
			if records_seen % 1_000_000 == 0 {
				log.infof("{}: {}m", name, records_seen / 1_000_000)
			}
		}

		// Everything read for this record lives in temporary memory.
		free_all(context.temp_allocator)
	}

	return total_bytes
}
// process_txt feeds a plain-text corpus through process_line, one line at a time.
//
// The per-line weight is `weight` multiplied by csv_bytes / file_size, computed in
// floating point. Presumably this balances corpora of different sizes against the CSV
// corpus -- confirm with the author.
//
// name:        path of the text file; opening or closing it must succeed (asserted).
// weight:      base weight of this corpus before size scaling.
// word_counts: word table, updated in place.
// char_counts: character table, updated in place.
// csv_bytes:   number of text bytes previously processed from the CSV corpus.
//
// An empty file contributes nothing. A final line without a trailing newline is still
// processed. Reading stops at EOF or at the first read error (which is logged).
process_txt :: proc(name: string, weight: f64, word_counts: ^map[string]f64, char_counts: ^[9000]f64, csv_bytes: u64) {
	file, open_err := os.open(name); assert(open_err == nil)
	defer { close_err := os.close(file); assert(close_err == nil) }

	file_size, size_err := os.file_size(file); assert(size_err == nil)
	if file_size <= 0 {
		// Nothing to read, and the size ratio below would divide by zero.
		return
	}

	// Convert before dividing: an integer division would truncate the ratio
	// (down to 0 whenever the text file is larger than the CSV text).
	scaled_weight := weight * (f64(csv_bytes) / f64(file_size))

	reader: bufio.Reader
	bufio.reader_init(&reader, os.to_reader(file))
	defer bufio.reader_destroy(&reader)

	lines_count: int
	for {
		line, io_err := bufio.reader_read_string(&reader, '\n', context.temp_allocator)

		// Data can be returned together with an error (e.g. the last line at EOF),
		// so handle the data first and the error afterwards.
		if len(line) > 0 {
			lines_count += 1
			process_line(line, scaled_weight, word_counts, char_counts)
			if lines_count % 1_000_000 == 0 {
				log.infof("{}: {}m", name, lines_count / 1_000_000)
			}
		}
		free_all(context.temp_allocator)

		if io_err != nil {
			if io_err != io.Error.EOF {
				log.debugf("Error reading line: %v\n", io_err)
			}
			// Stop on EOF and on any other error; retrying a failed read could loop forever.
			break
		}
	}
}
// main builds the weighted word table from the three corpora, derives 1-, 2- and
// 3-gram tables from it, logs character frequencies, and writes the results:
//   words-ukrainian.json - JSON object mapping each word to its weighted count
//   1-grams.txt, 2-grams.txt, 3-grams.txt - one "<rounded count> <gram>" per line
// All outputs are sorted by descending count. A failure to create or write one
// output file is logged and does not prevent the remaining files from being written.
main :: proc() {
	context.logger = log.create_console_logger()

	word_counts := make(map[string]f64, 2048)
	char_counts: [9000]f64

	// The CSV corpus goes first: its processed byte count is used to scale the text corpora.
	processed_csv_bytes := process_csv("social.csv", 1.2, &word_counts, &char_counts)
	process_txt("wiki.txt", 0.6, &word_counts, &char_counts, processed_csv_bytes)
	process_txt("literature.txt", 0.3, &word_counts, &char_counts, processed_csv_bytes)

	Word_Freq :: struct {
		word:  string,
		count: f64,
	}

	Char_Freq :: struct {
		char:  rune,
		count: f64,
	}

	// add_weight adds `weight` to counts[key], inserting the key if it is missing.
	add_weight :: proc(counts: ^map[$K]f64, key: K, weight: f64) {
		if count, exists := &counts[key]; exists {
			count^ += weight
		} else {
			counts[key] = weight
		}
	}

	// write_word_file writes `counts` to `name` as a JSON object, most frequent word first.
	write_word_file :: proc(name: string, counts: map[string]f64) {
		word_freqs := make([dynamic]Word_Freq, 0, len(counts))
		defer delete(word_freqs)
		for word, count in counts {
			append(&word_freqs, Word_Freq{word, count})
		}
		slice.sort_by(word_freqs[:], proc(a, b: Word_Freq) -> bool {
			return a.count > b.count
		})

		out_fd, out_err := os.open(name, os.O_WRONLY | os.O_CREATE | os.O_TRUNC, os.Permissions{.Read_User, .Write_User, .Read_Group, .Read_Other})
		if out_err != nil {
			log.error("Error creating output file.")
			return
		}
		defer os.close(out_fd)

		writer: bufio.Writer
		bufio.writer_init(&writer, os.to_stream(out_fd))
		defer bufio.writer_destroy(&writer)

		bufio.writer_write_string(&writer, "{\n")
		for freq, i in word_freqs {
			comma := "," if i < len(word_freqs) - 1 else ""
			bufio.writer_write_string(&writer, fmt.tprintfln(" \"{}\": %v%s", freq.word, freq.count, comma))
			// The formatted line has been copied into the writer's buffer; release it
			// now so temporary memory does not grow with the size of the vocabulary.
			free_all(context.temp_allocator)
		}
		bufio.writer_write_string(&writer, "}\n")

		// Flushing reports any error encountered by the preceding buffered writes.
		if flush_err := bufio.writer_flush(&writer); flush_err != nil {
			log.errorf("Error writing {}: {}", name, flush_err)
		}
	}

	// write_gram_file writes `dict` to `name`, one "<rounded count> <gram>" per line,
	// most frequent gram first. Keys are fixed-size rune arrays.
	write_gram_file :: proc(name: string, dict: map[$T]f64) {
		gram_counts := make([dynamic]Word_Freq, 0, len(dict), context.temp_allocator)
		for runes, count in dict {
			runes := runes // local copy so the array can be sliced
			append(&gram_counts, Word_Freq{utf8.runes_to_string(runes[:], context.temp_allocator), count})
		}
		slice.sort_by(gram_counts[:], proc(a, b: Word_Freq) -> bool {
			return a.count > b.count
		})

		out_fd, out_err := os.open(name, os.O_WRONLY | os.O_CREATE | os.O_TRUNC, os.Permissions{.Read_User, .Write_User, .Read_Group, .Read_Other})
		if out_err != nil {
			log.error("Error creating output file.")
			return
		}
		defer os.close(out_fd)

		writer: bufio.Writer
		bufio.writer_init(&writer, os.to_stream(out_fd))
		defer bufio.writer_destroy(&writer)

		for gram in gram_counts {
			bufio.writer_write_string(&writer, fmt.tprintfln("{} {}", i64(math.round(gram.count)), gram.word))
		}
		if flush_err := bufio.writer_flush(&writer); flush_err != nil {
			log.errorf("Error writing {}: {}", name, flush_err)
		}

		// gram_counts and its strings live in temporary memory; release them together.
		free_all(context.temp_allocator)
	}

	// --- n-gram tables: every word contributes its weight to each gram it contains ---
	grams_1 := make(map[[1]rune]f64, 35)
	grams_2 := make(map[[2]rune]f64, 100)
	grams_3 := make(map[[3]rune]f64, 300)

	for word, word_weight in word_counts {
		runes := utf8.string_to_runes(word, context.temp_allocator)
		for i in 0..<len(runes) {
			add_weight(&grams_1, [1]rune{runes[i]}, word_weight)
			if i + 1 < len(runes) {
				add_weight(&grams_2, [2]rune{runes[i], runes[i+1]}, word_weight)
			}
			if i + 2 < len(runes) {
				add_weight(&grams_3, [3]rune{runes[i], runes[i+1], runes[i+2]}, word_weight)
			}
		}
		free_all(context.temp_allocator)
	}

	// --- character frequencies: logged as percentages, most frequent first ---
	{
		char_freqs := make([dynamic]Char_Freq, 0, 100)
		defer delete(char_freqs)
		for count, i in char_counts {
			if count > 0 {
				append(&char_freqs, Char_Freq{rune(i), count})
			}
		}
		slice.sort_by(char_freqs[:], proc(a, b: Char_Freq) -> bool {
			return a.count > b.count
		})

		total: f64 = 0
		for freq in char_freqs {
			total += freq.count
		}
		for freq in char_freqs {
			log.debugf("{}: %.3f%%", freq.char, freq.count / total * 100)
		}
	}

	// --- output files ---
	write_word_file("words-ukrainian.json", word_counts)
	write_gram_file("1-grams.txt", grams_1)
	write_gram_file("2-grams.txt", grams_2)
	write_gram_file("3-grams.txt", grams_3)
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment