Skip to content

Instantly share code, notes, and snippets.

@nikopartanen
Created March 20, 2019 13:07
Show Gist options
  • Save nikopartanen/e7d4b6745d947a56e2db06bc89dad166 to your computer and use it in GitHub Desktop.
Save nikopartanen/e7d4b6745d947a56e2db06bc89dad166 to your computer and use it in GitHub Desktop.
### This is Niko Partanen's example R script that splits the National
### Library of Finland's dataset OCR Ground Truth Pages (Swedish Fraktur)
### into line-image–text-file pairs that can be used with training
### models with Tesseract. Same approach works easily also with Ocropy
### Data source:
### https://digi.kansalliskirjasto.fi/opendata
library(tidyverse)
library(xml2)
library(measurements)
library(magick)
library(fs)
if (dir_exists("test")){
dir_delete("test")
}
if (dir_exists("train")){
dir_delete("train")
}
# We are taking a random sample when test and train set are
# divided, so setting seed is necessary to ensure reproducibility
set.seed(20170818)
# This function reads Alto file and saves it as individual files.
# Directory is simply the folder name, in this case `test` or `train`.
alto2lines <- function(xml_file, target_directory){
if (! dir_exists(target_directory)){
dir_create(target_directory)
}
xml_basename <- str_extract(xml_file, "[^/]+(?=-gt2.xml)")
xml <- read_xml(xml_file)
image_tif <- xml_file %>% str_replace_all("-gt2.xml$", ".tif") %>%
image_read() %>%
image_threshold(type = "white") %>%
image_convert(colorspace = "Gray") # simple binarization
# Alto files use tenth of millimeter as the measure unit,
# so units need to be converted to pixels -- this seems to work
mm10inch <- function(number){
measurements::conv_unit((number / 10), "mm", "inch") * 300
}
# This function saves individual line
save_line <- function(page_image, info){
crop_string <- str_glue("{info$width}x{info$height}+{info$hpos}+{info$vpos}")
cropped_line <- image_crop(page_image, crop_string)
image_write(cropped_line, str_glue("{target_directory}/{xml_basename}-{info$order_id}.tif"))
write_lines(info$content, str_glue("{target_directory}/{xml_basename}-{info$order_id}.gt.txt"))
}
# Saving the lines is done in the end of this function
xml %>% xml_find_all("//d1:TextLine") %>%
map_df(~ tibble(id = .x %>% xml_attr("ID"),
hpos = .x %>% xml2::xml_attr("HPOS") %>% as.numeric(),
vpos = .x %>% xml_attr("VPOS") %>% as.numeric(),
width = .x %>% xml_attr("WIDTH") %>% as.numeric(),
height = .x %>% xml_attr("HEIGHT") %>% as.numeric(),
content = .x %>% xml_find_all("./d1:String") %>% xml_attr("CONTENT") %>%
paste0(collapse = " "),
box_id = .x %>% xml_attr("ID"),
height_page = .x %>%
xml_find_first("//d1:Page") %>%
xml_attr("HEIGHT") %>% as.numeric,
width_page = .x %>%
xml_find_first("//d1:Page") %>%
xml_attr("WIDTH") %>% as.numeric)
) %>%
mutate_if(is.double, mm10inch) %>% # here we do mm10inch
filter(! content == '') %>%
mutate(order_id = 1:n() %>% str_pad(width = 4, pad = "0")) %>%
mutate(xmax = hpos + width,
xmin = hpos,
ymax = vpos + height,
ymin = vpos) %>%
mutate_if(is.double, round, digits = 0) %>%
split(.$order_id) %>%
walk(~ {save_line(page_image = image_tif, info = .x)})
}
# Here we list the files
gt_files <- dir("nlf_ocr_groundtruth_sv/",
pattern = "xml",
recursive = TRUE,
full.names = TRUE)
# Here the test and train set are split, 15% goes to testing.
# OCR training usually splits another test set while training,
# but for further evaluation this is useful.
test <- sample(gt_files, length(gt_files) * 0.15)
train <- gt_files[! (gt_files %in% test)]
# Both file lists are processed into lines
test %>%
walk(~ {alto2lines(xml_file = .x, target_directory = "test")})
train %>%
walk(~ {alto2lines(xml_file = .x, target_directory = "train")})
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment