SolidAlloy · April 22, 2026 17:44
diff --git a/main.odin b/main.odin
 package main

 // The program takes corpora in txt and csv formats and generates a word frequency list, as well as n-gram lists.
 // The word frequency list can be used for Cyanophage, while n-grams are in a format suitable for dariogoetz's Keyboard Layout Optimizer.
 // The program goes through 7 GB of text in 10 minutes.
 // I wrote it in Odin because I'm practicing it, but also because any naive implementation in a compiled language will be better than Python.

 // The following corpora were used for Ukrainian:
 // https://github.com/kateryna-bobrovnyk/ukr-twi-corpus
 // https://lang.org.ua/en/ubertext/ (wikipedia & fiction, split into sentences)

 import "core:fmt"
 import "core:slice"
 import "core:unicode/utf8"
 import "core:strings"
 import "core:bufio"
 import os "core:os/os2"
 import "core:io"
 import "core:log"
 import "core:math"
 import "core:encoding/csv"
 import "core:unicode"

 // process_word adds one occurrence of a candidate word to the word-frequency table.
 //
 // word_buf:    UTF-8 bytes of the candidate. The bytes are only viewed, never retained:
 //              a key is cloned with context.allocator the first time it is stored.
 // weight:      amount added to the word's entry.
 // word_counts: table that is updated in place.
 // char_counts: not used by this procedure.
 //
 // Candidates that contain no letter at all (including the empty slice) are ignored.
 // The first "..." inside a candidate is replaced by a single "." before counting.
 process_word :: proc(word_buf: []u8, weight: f64, word_counts: ^map[string]f64, char_counts: ^[9000]f64) {
   candidate := string(word_buf)

   has_letter := false
   for r in candidate {
      if unicode.is_letter(r) {
         has_letter = true
         break
      }
   }
   if !has_letter {
      return
   }

   // The rewritten form lives in temporary memory; it is cloned below
   // if it has to outlive this call.
   key := candidate
   if strings.contains(candidate, "...") {
      key, _ = strings.replace(candidate, "...", ".", 1, context.temp_allocator)
   }

   if slot, found := &word_counts[key]; found {
      slot^ += weight
      return
   }

   owned_key := strings.clone(key, context.allocator)
   word_counts[owned_key] = weight
 }

 // process_line scans one line of text rune by rune and updates both frequency tables.
 //
 // Every rune is lowercased first.
 //
 // Character statistics (char_counts, indexed by code point, hence the `< 9000` guard):
 // en/em dashes are tallied as '-', both apostrophe forms as '’' and guillemets as '"'.
 // Only the characters listed in the second switch are tallied.
 //
 // Words: a word is a run of the listed Ukrainian letters plus '.', ',' and '-';
 // 'ґ' is stored as 'г'. Any other rune ends the current word, which is handed to
 // process_word. A rune that no longer fits into the 256-byte buffer is dropped.
 process_line :: proc(line: string, weight: f64, word_counts: ^map[string]f64, char_counts: ^[9000]f64) {
   word_buf: [256]u8
   word_len := 0

   pos := 0
   for pos < len(line) {
      decoded, size := utf8.decode_rune_in_string(line[pos:])
      pos += size
      char := unicode.to_lower(decoded)

      if int(char) < 9000 {
         tallied := char
         switch char {
         case '–', '—': tallied = '-'
         case '\'', '’': tallied = '’'
         case '«', '»': tallied = '"'
         }

         switch tallied {
         case 'а'..='щ', 'ь', 'ю', 'я', 'є', 'і', 'ї', 'ґ', '.', ',', '-', ' ', '!', '"', ':', '?', '(', ')', '’':
            char_counts[int(tallied)] += weight
         }
      }

      // Decide what (if anything) this rune contributes to the current word.
      stored := char           // rune that is written into the buffer
      needed := size           // free bytes required before writing
      belongs_to_word := true

      switch char {
      case 'а'..='щ', 'ь', 'ю', 'я', 'є', 'і', 'ї', '.', ',', '-':
         // stored as is
      case 'ґ':
         stored = 'г'
         needed = utf8.rune_size('г')
      case:
         belongs_to_word = false
      }

      if !belongs_to_word {
         process_word(word_buf[:word_len], weight, word_counts, char_counts)
         word_len = 0
      } else if word_len + needed <= len(word_buf) {
         encoded, encoded_len := utf8.encode_rune(stored)
         copy(word_buf[word_len:], encoded[:encoded_len])
         word_len += encoded_len
      }
   }

   // Flush the word that was still open when the line ended.
   process_word(word_buf[:word_len], weight, word_counts, char_counts)
 }

 // process_csv reads the CSV corpus `name` and feeds the fourth field of every record
 // into the frequency tables, line by line.
 //
 // The first record is read and discarded. Records with fewer than four fields are
 // skipped. Read errors other than end-of-file are logged at debug level and reading
 // continues. Temporary allocations are released after every record.
 //
 // Returns the summed byte length of all text fields that were processed.
 // Failing to open or close the file, or to read the first record, trips an assert.
 process_csv :: proc(name: string, weight: f64, word_counts: ^map[string]f64, char_counts: ^[9000]f64) -> u64 {
   file, err := os.open(name); assert(err == nil)
   defer { err := os.close(file); assert(err == nil) }

   reader: csv.Reader
   csv.reader_init(&reader, os.to_reader(file))
   defer csv.reader_destroy(&reader)

   // The first record is not part of the corpus text.
   _, csv_err := csv.read(&reader, context.temp_allocator); assert(csv_err == nil)

   records_done := 0
   total_bytes: u64

   for {
      record, read_err := csv.read(&reader, context.temp_allocator)

      if read_err == io.Error.EOF {
         break
      }
      if read_err != nil {
         log.debugf("Error reading line: {}", read_err)
      }

      if len(record) > 3 {
         text := record[3]
         text_lines, _ := strings.split_lines(text, context.temp_allocator)

         for text_line in text_lines {
            process_line(text_line, weight, word_counts, char_counts)
         }

         total_bytes += u64(len(text))
         records_done += 1

         if records_done % 1_000_000 == 0 {
            log.infof("{}: {}m", name, records_done / 1_000_000)
         }
      }

      free_all(context.temp_allocator)
   }

   return total_bytes
 }

 // process_txt reads the plain-text corpus `name` line by line and feeds every line
 // into the frequency tables.
 //
 // The per-occurrence weight is `weight * csv_bytes / file_size`, so a file that is
 // N times larger than the processed CSV text contributes with an N times smaller
 // weight per occurrence. An empty file contributes nothing.
 //
 // Reading stops at end-of-file or at the first read error (logged at debug level).
 // Failing to open, close or stat the file trips an assert.
 process_txt :: proc(name: string, weight: f64, word_counts: ^map[string]f64, char_counts: ^[9000]f64, csv_bytes: u64) {
   file, err := os.open(name); assert(err == nil)
   defer { err := os.close(file); assert(err == nil) }

   file_size, size_err := os.file_size(file); assert(size_err == nil)
   if file_size <= 0 {
      // Nothing to read, and the ratio below would divide by zero.
      return
   }

   // The ratio must be computed in floating point: an integer division truncates
   // it to 0 whenever this file is larger than the CSV text, zeroing every weight.
   weight := weight * (f64(csv_bytes) / f64(file_size))

   reader: bufio.Reader
   bufio.reader_init(&reader, os.to_reader(file))
   defer bufio.reader_destroy(&reader)

   lines_count: int

   for {
      line, io_err := bufio.nreader_read_string(&reader, '\n', context.temp_allocator)

      // NOTE(review): assumes the text read before an error is returned together with
      // the error (as documented for bufio.reader_read_string) — confirm for this call.
      // Handling the text first keeps a final line that has no trailing newline.
      if len(line) > 0 {
         lines_count += 1
         process_line(line, weight, word_counts, char_counts)

         if lines_count % 1_000_000 == 0 {
            log.infof("{}: {}m", name, lines_count / 1_000_000)
         }
      }

      free_all(context.temp_allocator)

      if io_err != nil {
         if io_err != io.Error.EOF {
            log.debugf("Error reading line: %v\n", io_err)
         }
         // Stop on any error: retrying a failing reader would loop forever.
         break
      }
   }
 }

 // main builds the word-frequency list and the 1-, 2- and 3-gram lists.
 //
 // Steps:
 //   1. Read "social.csv", "wiki.txt" and "literature.txt" with base weights 1.2, 0.6 and 0.3.
 //   2. Derive n-gram weights from the word table: every n-gram inside a word receives
 //      that word's weight (n-grams never span two words).
 //   3. Log the share of every tallied character at debug level.
 //   4. Write "words-ukrainian.json", ordered by descending weight.
 //   5. Write "1-grams.txt", "2-grams.txt" and "3-grams.txt".
 main :: proc() {
   context.logger = log.create_console_logger()

   word_counts := make(map[string]f64, 2048)
   char_counts: [9000]f64

   processed_csv_bytes := process_csv("social.csv", 1.2, &word_counts, &char_counts)
   process_txt("wiki.txt", 0.6, &word_counts, &char_counts, processed_csv_bytes)
   process_txt("literature.txt", 0.3, &word_counts, &char_counts, processed_csv_bytes)

   grams_1 := make(map[[1]rune]f64, 35)
   grams_2 := make(map[[2]rune]f64, 100)
   grams_3 := make(map[[3]rune]f64, 300)

   // add_weight adds `amount` to the entry stored under `key`, creating it on first use.
   add_weight :: proc(counts: ^map[$K]f64, key: K, amount: f64) {
      if slot, found := &counts[key]; found {
         slot^ += amount
      } else {
         counts[key] = amount
      }
   }

   for word, word_weight in word_counts {
      runes := utf8.string_to_runes(word, context.temp_allocator)

      for r, i in runes {
         add_weight(&grams_1, [1]rune{r}, word_weight)

         if i + 1 < len(runes) {
            add_weight(&grams_2, [2]rune{r, runes[i+1]}, word_weight)
         }

         if i + 2 < len(runes) {
            add_weight(&grams_3, [3]rune{r, runes[i+1], runes[i+2]}, word_weight)
         }
      }
      free_all(context.temp_allocator)
   }

   Word_Freq :: struct {
      word:  string,
      count: f64,
   }

   Char_Freq :: struct {
      char:  rune,
      count: f64,
   }

   // Character distribution, most frequent first, as a percentage of all tallied characters.
   {
      char_freqs := make([dynamic]Char_Freq, 0, 100)
      defer delete(char_freqs)

      for count, code_point in char_counts {
         if count > 0 {
            append(&char_freqs, Char_Freq {rune(code_point), count})
         }
      }

      slice.sort_by(char_freqs[:], proc(a, b: Char_Freq) -> bool {
         return a.count > b.count
      })

      total : f64 = 0
      for freq in char_freqs {
         total += freq.count
      }

      for freq in char_freqs {
         log.debugf("{}: %.3f%%", freq.char, freq.count / total * 100)
      }
   }

   // Word list as one JSON object: "word": weight.
   {
      word_freqs := make([dynamic]Word_Freq, 0, len(word_counts))
      defer delete(word_freqs)

      for word, count in word_counts {
         append(&word_freqs, Word_Freq {word, count})
      }

      slice.sort_by(word_freqs[:], proc(a, b: Word_Freq) -> bool {
         return a.count > b.count
      })

      out_fd, out_err := os.open("words-ukrainian.json", os.O_WRONLY | os.O_CREATE | os.O_TRUNC, os.Permissions{.Read_User, .Write_User, .Read_Group, .Read_Other})
      if out_err != nil {
         log.error("Error creating output file.")
         return
      }
      defer os.close(out_fd)

      writer: bufio.Writer
      bufio.writer_init(&writer, os.to_stream(out_fd))
      defer bufio.writer_destroy(&writer)

      bufio.writer_write_string(&writer, "{\n")
      last := len(word_freqs) - 1
      for entry, i in word_freqs {
         comma := "," if i < last else ""
         bufio.writer_write_string(&writer, fmt.tprintfln("  \"{}\": %v%s", entry.word, entry.count, comma))
         // The formatted line is no longer needed once it has been handed to the writer;
         // releasing it here keeps temporary memory flat regardless of vocabulary size.
         free_all(context.temp_allocator)
      }
      bufio.writer_write_string(&writer, "}\n")

      // A failed flush means the file on disk is incomplete, so report it.
      if flush_err := bufio.writer_flush(&writer); flush_err != nil {
         log.errorf("Error writing words-ukrainian.json: {}", flush_err)
      }
   }

   // write_gram_file writes every n-gram of `dict` to the file `name`, one per line as
   // "<weight rounded to the nearest integer> <gram>", ordered by descending weight.
   write_gram_file :: proc(name: string, dict: map[$T]f64) {
      // All scratch data below lives in temporary memory; release it on every exit path.
      defer free_all(context.temp_allocator)

      gram_counts := make([dynamic]Word_Freq, 0, len(dict), context.temp_allocator)

      for gram, count in dict {
         gram := gram // local copy so the array can be sliced
         append(&gram_counts, Word_Freq {utf8.runes_to_string(gram[:], context.temp_allocator), count})
      }

      slice.sort_by(gram_counts[:], proc(a, b: Word_Freq) -> bool {
         return a.count > b.count
      })

      out_fd, out_err := os.open(name, os.O_WRONLY | os.O_CREATE | os.O_TRUNC, os.Permissions{.Read_User, .Write_User, .Read_Group, .Read_Other})
      if out_err != nil {
         log.error("Error creating output file.")
         return
      }
      defer os.close(out_fd)

      writer: bufio.Writer
      bufio.writer_init(&writer, os.to_stream(out_fd))
      defer bufio.writer_destroy(&writer)

      for entry in gram_counts {
         bufio.writer_write_string(&writer, fmt.tprintfln("{} {}", i64(math.round(entry.count)), entry.word))
      }

      if flush_err := bufio.writer_flush(&writer); flush_err != nil {
         log.errorf("Error writing {}: {}", name, flush_err)
      }
   }

   write_gram_file("1-grams.txt", grams_1)
   write_gram_file("2-grams.txt", grams_2)
   write_gram_file("3-grams.txt", grams_3)
 }
	package main

	// The program takes corpora in txt and csv formats and generates a word frequency list, as well as n-gram lists.
	// The word frequency list can be used for Cyanophage, while n-grams are in a format suitable for dariogoetz's Keyboard Layout Optimizer.
	// The program goes through 7 GB of text in 10 minutes.
	// I wrote it in Odin because I'm practicing it, but also because any naive implementation in a compiled language will be better than Python.

	// The following corpora were used for Ukrainian:
	// https://github.com/kateryna-bobrovnyk/ukr-twi-corpus
	// https://lang.org.ua/en/ubertext/ (wikipedia & fiction, split into sentences)

	import "core:fmt"
	import "core:slice"
	import "core:unicode/utf8"
	import "core:strings"
	import "core:bufio"
	import os "core:os/os2"
	import "core:io"
	import "core:log"
	import "core:math"
	import "core:encoding/csv"
	import "core:unicode"

	// process_word adds one occurrence of a candidate word to the word-frequency table.
	//
	// word_buf:    UTF-8 bytes of the candidate. The bytes are only viewed, never retained:
	//              a key is cloned with context.allocator the first time it is stored.
	// weight:      amount added to the word's entry.
	// word_counts: table that is updated in place.
	// char_counts: not used by this procedure.
	//
	// Candidates that contain no letter at all (including the empty slice) are ignored.
	// The first "..." inside a candidate is replaced by a single "." before counting.
	process_word :: proc(word_buf: []u8, weight: f64, word_counts: ^map[string]f64, char_counts: ^[9000]f64) {
		candidate := string(word_buf)

		has_letter := false
		for r in candidate {
			if unicode.is_letter(r) {
				has_letter = true
				break
			}
		}
		if !has_letter {
			return
		}

		// The rewritten form lives in temporary memory; it is cloned below
		// if it has to outlive this call.
		key := candidate
		if strings.contains(candidate, "...") {
			key, _ = strings.replace(candidate, "...", ".", 1, context.temp_allocator)
		}

		if slot, found := &word_counts[key]; found {
			slot^ += weight
			return
		}

		owned_key := strings.clone(key, context.allocator)
		word_counts[owned_key] = weight
	}

	// process_line scans one line of text rune by rune and updates both frequency tables.
	//
	// Every rune is lowercased first.
	//
	// Character statistics (char_counts, indexed by code point, hence the `< 9000` guard):
	// en/em dashes are tallied as '-', both apostrophe forms as '’' and guillemets as '"'.
	// Only the characters listed in the second switch are tallied.
	//
	// Words: a word is a run of the listed Ukrainian letters plus '.', ',' and '-';
	// 'ґ' is stored as 'г'. Any other rune ends the current word, which is handed to
	// process_word. A rune that no longer fits into the 256-byte buffer is dropped.
	process_line :: proc(line: string, weight: f64, word_counts: ^map[string]f64, char_counts: ^[9000]f64) {
		word_buf: [256]u8
		word_len := 0

		pos := 0
		for pos < len(line) {
			decoded, size := utf8.decode_rune_in_string(line[pos:])
			pos += size
			char := unicode.to_lower(decoded)

			if int(char) < 9000 {
				tallied := char
				switch char {
				case '–', '—': tallied = '-'
				case '\'', '’': tallied = '’'
				case '«', '»': tallied = '"'
				}

				switch tallied {
				case 'а'..='щ', 'ь', 'ю', 'я', 'є', 'і', 'ї', 'ґ', '.', ',', '-', ' ', '!', '"', ':', '?', '(', ')', '’':
					char_counts[int(tallied)] += weight
				}
			}

			// Decide what (if anything) this rune contributes to the current word.
			stored := char           // rune that is written into the buffer
			needed := size           // free bytes required before writing
			belongs_to_word := true

			switch char {
			case 'а'..='щ', 'ь', 'ю', 'я', 'є', 'і', 'ї', '.', ',', '-':
				// stored as is
			case 'ґ':
				stored = 'г'
				needed = utf8.rune_size('г')
			case:
				belongs_to_word = false
			}

			if !belongs_to_word {
				process_word(word_buf[:word_len], weight, word_counts, char_counts)
				word_len = 0
			} else if word_len + needed <= len(word_buf) {
				encoded, encoded_len := utf8.encode_rune(stored)
				copy(word_buf[word_len:], encoded[:encoded_len])
				word_len += encoded_len
			}
		}

		// Flush the word that was still open when the line ended.
		process_word(word_buf[:word_len], weight, word_counts, char_counts)
	}

	// process_csv reads the CSV corpus `name` and feeds the fourth field of every record
	// into the frequency tables, line by line.
	//
	// The first record is read and discarded. Records with fewer than four fields are
	// skipped. Read errors other than end-of-file are logged at debug level and reading
	// continues. Temporary allocations are released after every record.
	//
	// Returns the summed byte length of all text fields that were processed.
	// Failing to open or close the file, or to read the first record, trips an assert.
	process_csv :: proc(name: string, weight: f64, word_counts: ^map[string]f64, char_counts: ^[9000]f64) -> u64 {
		file, err := os.open(name); assert(err == nil)
		defer { err := os.close(file); assert(err == nil) }

		reader: csv.Reader
		csv.reader_init(&reader, os.to_reader(file))
		defer csv.reader_destroy(&reader)

		// The first record is not part of the corpus text.
		_, csv_err := csv.read(&reader, context.temp_allocator); assert(csv_err == nil)

		records_done := 0
		total_bytes: u64

		for {
			record, read_err := csv.read(&reader, context.temp_allocator)

			if read_err == io.Error.EOF {
				break
			}
			if read_err != nil {
				log.debugf("Error reading line: {}", read_err)
			}

			if len(record) > 3 {
				text := record[3]
				text_lines, _ := strings.split_lines(text, context.temp_allocator)

				for text_line in text_lines {
					process_line(text_line, weight, word_counts, char_counts)
				}

				total_bytes += u64(len(text))
				records_done += 1

				if records_done % 1_000_000 == 0 {
					log.infof("{}: {}m", name, records_done / 1_000_000)
				}
			}

			free_all(context.temp_allocator)
		}

		return total_bytes
	}

	// process_txt reads the plain-text corpus `name` line by line and feeds every line
	// into the frequency tables.
	//
	// The per-occurrence weight is `weight * csv_bytes / file_size`, so a file that is
	// N times larger than the processed CSV text contributes with an N times smaller
	// weight per occurrence. An empty file contributes nothing.
	//
	// Reading stops at end-of-file or at the first read error (logged at debug level).
	// Failing to open, close or stat the file trips an assert.
	process_txt :: proc(name: string, weight: f64, word_counts: ^map[string]f64, char_counts: ^[9000]f64, csv_bytes: u64) {
		file, err := os.open(name); assert(err == nil)
		defer { err := os.close(file); assert(err == nil) }

		file_size, size_err := os.file_size(file); assert(size_err == nil)
		if file_size <= 0 {
			// Nothing to read, and the ratio below would divide by zero.
			return
		}

		// The ratio must be computed in floating point: an integer division truncates
		// it to 0 whenever this file is larger than the CSV text, zeroing every weight.
		weight := weight * (f64(csv_bytes) / f64(file_size))

		reader: bufio.Reader
		bufio.reader_init(&reader, os.to_reader(file))
		defer bufio.reader_destroy(&reader)

		lines_count: int

		for {
			line, io_err := bufio.nreader_read_string(&reader, '\n', context.temp_allocator)

			// NOTE(review): assumes the text read before an error is returned together with
			// the error (as documented for bufio.reader_read_string) — confirm for this call.
			// Handling the text first keeps a final line that has no trailing newline.
			if len(line) > 0 {
				lines_count += 1
				process_line(line, weight, word_counts, char_counts)

				if lines_count % 1_000_000 == 0 {
					log.infof("{}: {}m", name, lines_count / 1_000_000)
				}
			}

			free_all(context.temp_allocator)

			if io_err != nil {
				if io_err != io.Error.EOF {
					log.debugf("Error reading line: %v\n", io_err)
				}
				// Stop on any error: retrying a failing reader would loop forever.
				break
			}
		}
	}

	// main builds the word-frequency list and the 1-, 2- and 3-gram lists.
	//
	// Steps:
	//   1. Read "social.csv", "wiki.txt" and "literature.txt" with base weights 1.2, 0.6 and 0.3.
	//   2. Derive n-gram weights from the word table: every n-gram inside a word receives
	//      that word's weight (n-grams never span two words).
	//   3. Log the share of every tallied character at debug level.
	//   4. Write "words-ukrainian.json", ordered by descending weight.
	//   5. Write "1-grams.txt", "2-grams.txt" and "3-grams.txt".
	main :: proc() {
		context.logger = log.create_console_logger()

		word_counts := make(map[string]f64, 2048)
		char_counts: [9000]f64

		processed_csv_bytes := process_csv("social.csv", 1.2, &word_counts, &char_counts)
		process_txt("wiki.txt", 0.6, &word_counts, &char_counts, processed_csv_bytes)
		process_txt("literature.txt", 0.3, &word_counts, &char_counts, processed_csv_bytes)

		grams_1 := make(map[[1]rune]f64, 35)
		grams_2 := make(map[[2]rune]f64, 100)
		grams_3 := make(map[[3]rune]f64, 300)

		// add_weight adds `amount` to the entry stored under `key`, creating it on first use.
		add_weight :: proc(counts: ^map[$K]f64, key: K, amount: f64) {
			if slot, found := &counts[key]; found {
				slot^ += amount
			} else {
				counts[key] = amount
			}
		}

		for word, word_weight in word_counts {
			runes := utf8.string_to_runes(word, context.temp_allocator)

			for r, i in runes {
				add_weight(&grams_1, [1]rune{r}, word_weight)

				if i + 1 < len(runes) {
					add_weight(&grams_2, [2]rune{r, runes[i+1]}, word_weight)
				}

				if i + 2 < len(runes) {
					add_weight(&grams_3, [3]rune{r, runes[i+1], runes[i+2]}, word_weight)
				}
			}
			free_all(context.temp_allocator)
		}

		Word_Freq :: struct {
			word: string,
			count: f64,
		}

		Char_Freq :: struct {
			char: rune,
			count: f64,
		}

		// Character distribution, most frequent first, as a percentage of all tallied characters.
		{
			char_freqs := make([dynamic]Char_Freq, 0, 100)
			defer delete(char_freqs)

			for count, code_point in char_counts {
				if count > 0 {
					append(&char_freqs, Char_Freq {rune(code_point), count})
				}
			}

			slice.sort_by(char_freqs[:], proc(a, b: Char_Freq) -> bool {
				return a.count > b.count
			})

			total : f64 = 0
			for freq in char_freqs {
				total += freq.count
			}

			for freq in char_freqs {
				log.debugf("{}: %.3f%%", freq.char, freq.count / total * 100)
			}
		}

		// Word list as one JSON object: "word": weight.
		{
			word_freqs := make([dynamic]Word_Freq, 0, len(word_counts))
			defer delete(word_freqs)

			for word, count in word_counts {
				append(&word_freqs, Word_Freq {word, count})
			}

			slice.sort_by(word_freqs[:], proc(a, b: Word_Freq) -> bool {
				return a.count > b.count
			})

			// The open flags are combined with a plain `|`; an escaped `\|` does not compile.
			out_fd, out_err := os.open("words-ukrainian.json", os.O_WRONLY | os.O_CREATE | os.O_TRUNC, os.Permissions{.Read_User, .Write_User, .Read_Group, .Read_Other})
			if out_err != nil {
				log.error("Error creating output file.")
				return
			}
			defer os.close(out_fd)

			writer: bufio.Writer
			bufio.writer_init(&writer, os.to_stream(out_fd))
			defer bufio.writer_destroy(&writer)

			bufio.writer_write_string(&writer, "{\n")
			last := len(word_freqs) - 1
			for entry, i in word_freqs {
				comma := "," if i < last else ""
				bufio.writer_write_string(&writer, fmt.tprintfln(" \"{}\": %v%s", entry.word, entry.count, comma))
				// The formatted line is no longer needed once it has been handed to the writer;
				// releasing it here keeps temporary memory flat regardless of vocabulary size.
				free_all(context.temp_allocator)
			}
			bufio.writer_write_string(&writer, "}\n")

			// A failed flush means the file on disk is incomplete, so report it.
			if flush_err := bufio.writer_flush(&writer); flush_err != nil {
				log.errorf("Error writing words-ukrainian.json: {}", flush_err)
			}
		}

		// write_gram_file writes every n-gram of `dict` to the file `name`, one per line as
		// "<weight rounded to the nearest integer> <gram>", ordered by descending weight.
		write_gram_file :: proc(name: string, dict: map[$T]f64) {
			// All scratch data below lives in temporary memory; release it on every exit path.
			defer free_all(context.temp_allocator)

			gram_counts := make([dynamic]Word_Freq, 0, len(dict), context.temp_allocator)

			for gram, count in dict {
				gram := gram // local copy so the array can be sliced
				append(&gram_counts, Word_Freq {utf8.runes_to_string(gram[:], context.temp_allocator), count})
			}

			slice.sort_by(gram_counts[:], proc(a, b: Word_Freq) -> bool {
				return a.count > b.count
			})

			out_fd, out_err := os.open(name, os.O_WRONLY | os.O_CREATE | os.O_TRUNC, os.Permissions{.Read_User, .Write_User, .Read_Group, .Read_Other})
			if out_err != nil {
				log.error("Error creating output file.")
				return
			}
			defer os.close(out_fd)

			writer: bufio.Writer
			bufio.writer_init(&writer, os.to_stream(out_fd))
			defer bufio.writer_destroy(&writer)

			for entry in gram_counts {
				bufio.writer_write_string(&writer, fmt.tprintfln("{} {}", i64(math.round(entry.count)), entry.word))
			}

			if flush_err := bufio.writer_flush(&writer); flush_err != nil {
				log.errorf("Error writing {}: {}", name, flush_err)
			}
		}

		write_gram_file("1-grams.txt", grams_1)
		write_gram_file("2-grams.txt", grams_2)
		write_gram_file("3-grams.txt", grams_3)
	}
No results found