Skip to content

Instantly share code, notes, and snippets.

@joewiz
Last active April 9, 2021 03:30
Show Gist options
  • Save joewiz/4c57546edfae74dfe82b4056a465c02b to your computer and use it in GitHub Desktop.
Save joewiz/4c57546edfae74dfe82b4056a465c02b to your computer and use it in GitHub Desktop.
Check a text for OCR typo patterns, using XQuery
xquery version "3.1";
(:~
: Find possible OCR errors in a text by checking for patterns that an OCR
: process is known to misread, e.g., "day" misread as "clay", or "France"
: misread as "Prance." If the OCR engine just misread some instances of these
: words but got other instances correct, then this query will highlight
: candidates for correction.
:
: The query lets you configure a source text and define pattern sets to be used.
:
: Pattern sets consist of four parameters:
:
: 1. "sic": the OCR error
: 2. "corr": the correct version
: 3. "anchored": whether the error appears at the start of a word, the end of
: the word, or anywhere in the word
: 4. "case-sensitive": whether the pattern should be treated as case-sensitive
: or case-insensitive
:
: In the case of the day/clay example above, the parameters would be:
:
: 1. "sic": "cl"
: 2. "corr": "d"
: 3. "anchored": "start"
: 4. "case-sensitive": true
:
: If we apply this pattern to the following text:
:
: "clay clown day down"
:
: ... the expected result would be:
:
: [
: {
: "sic": "clay"
: "corr": "day"
: },
: {
: "sic": "clown",
: "corr": "down"
: }
: ]
:
:)
declare namespace array="http://www.w3.org/2005/xpath-functions/array";
declare namespace map="http://www.w3.org/2005/xpath-functions/map";
declare namespace output="http://www.w3.org/2010/xslt-xquery-serialization";
declare namespace tei="http://www.tei-c.org/ns/1.0";
declare option output:method "adaptive";
declare variable $local:pattern-sets := (
(: States > Slates :)
map {
"sic-literal": "Sl",
"corr-literal": "St",
"anchored": "start",
"case-sensitive": true()
},
(: Mexiro > Mexico :)
map {
"sic-literal": "ro",
"corr-literal": "co",
"anchored": "end",
"case-sensitive": true()
},
(: Prance > France :)
map {
"sic-literal": "Pr",
"corr-literal": "Fr",
"anchored": "start",
"case-sensitive": true()
},
(: clay > day :)
map {
"sic-literal": "cl",
"corr-literal": "d",
"anchored": "start",
"case-sensitive": true()
},
(: elate > date :)
map {
"sic-literal": "el",
"corr-literal": "d",
"anchored": "start",
"case-sensitive": true()
}
);
(:~ Convert a string literal into a regular expression for the position :)
declare function local:literal-to-regex($literal as xs:string, $position as xs:string) {
switch ($position)
case "start" return "^(" || $literal || ")(.+)$"
case "end" return "^(.+)(" || $literal || ")$"
default (: anywhere :) return "^(.*)(" || $literal || ")(.*)$"
};
(:~ Find matches for a regular expression in a sequence of words :)
declare function local:find-matches($words as xs:string*, $regex as xs:string, $flags as xs:string) as map(*)* {
for $word in $words
let $analysis := analyze-string($word, $regex, $flags)[fn:match]
return
if (exists($analysis)) then
map {
"word": $word,
"match": $analysis/fn:match
}
else
()
};
(:~ Check a sequence of words for sic/corr pairs that match known OCR error patterns :)
declare function local:check-words($words as xs:string*, $pattern-set as map(*)) {
(: Transform the literals into regular expressions :)
let $sic-regex := local:literal-to-regex($pattern-set?sic-literal, $pattern-set?anchored)
let $corr-regex := local:literal-to-regex($pattern-set?corr-literal, $pattern-set?anchored)
let $regex-flags := if ($pattern-set?case-sensitive) then "" else "i"
(: Find any cases matching the known OCR errors :)
let $sics := local:find-matches($words, $sic-regex, $regex-flags)
return
if (exists($sics)) then
(: Find any cases matching the known correct versions :)
let $corrs := local:find-matches($words, $corr-regex, $regex-flags)
return
if (exists($corrs)) then
(: Compare every sic to every corr :)
for $sic in $sics, $corr in $corrs
let $sic-groups := $sic?match/fn:group
let $corr-groups := $corr?match/fn:group
return
(: Only compare if the OCR error occurs in the same place in the word :)
(: This is currently naive - stripping out all of the literals :)
if (count($sic-groups) eq count($corr-groups)) then
let $sic-segs := $sic-groups[. ne $pattern-set?sic-literal]/string()
let $corr-segs := $corr-groups[. ne $pattern-set?corr-literal]/string()
(: Do the non-error portions of the two words match? :)
let $segs-align :=
for-each-pair(
$sic-segs,
$corr-segs,
function($sic-seg, $corr-seg) {
$sic-seg eq $corr-seg
}
)
return
(: The first two conditions here are the result of the naive approach described above :)
if (exists($sic-segs) and exists($corr-segs) and (every $seg-align in $segs-align satisfies $seg-align)) then
map {
"sic": $sic?word,
"corr": $corr?word
}
else
()
else
()
else
()
else
()
};
(: Define the text here or reference an external document :)
let $text :=
"United Slates? of America United States: of America slates Mexico!
Mexiro. France) Prance day clay; down clown date elate*"
(:
(: Note: in BaseX, enter `SET XINCLUDE false` in the command window to ignore the XInclude in the FRUS vol :)
doc("https://github.com/HistoryAtState/frus/raw/eaaf9957b4e1ec2b0039db6d2ef8bc54864bdd1c/volumes/frus1861.xml")//tei:text//text() => string-join(" ")
:)
(: Split the text into words :)
let $words :=
$text
=> tokenize()
(: strip off trailing punctuation :)
=> for-each( function($item) { replace($item, "\p{P}+$", "") } )
=> distinct-values()
(: Attempt to find matches :)
let $attempts :=
for $pattern-set in $local:pattern-sets
return
map {
(: optionally return the original pattern set :)
(: "pattern-set": $pattern-set, :)
"results": array { local:check-words($words, $pattern-set) }
}
(: Filter out attempts that returned no results :)
(: Note: on eXist 5.2, comment out the predicate; see https://github.com/eXist-db/exist/issues/3240 :)
let $candidates := $attempts[array:size(?results) gt 0]
return
map {
"distinct-word-count": count($words),
"pattern-match-count": count($candidates?results?*),
"candidates": array {
(: Display just the result pairs, sorted alphabetically :)
$candidates?results?*
=> sort((), function($match) { ($match?sic, $match?corr) ! lower-case(.) })
}
}
{
"distinct-word-count": 16,
"pattern-match-count": 6,
"candidates": [
{
"corr": "day",
"sic": "clay"
},
{
"corr": "down",
"sic": "clown"
},
{
"corr": "date",
"sic": "elate"
},
{
"corr": "Mexico",
"sic": "Mexiro"
},
{
"corr": "France",
"sic": "Prance"
},
{
"corr": "States",
"sic": "Slates"
}
]
}
{
"distinct-word-count": 10357,
"pattern-match-count": 5,
"candidates": [
{
"corr": "day",
"sic": "clay"
},
{
"corr": "dear",
"sic": "clear"
},
{
"corr": "date",
"sic": "elate"
},
{
"corr": "France",
"sic": "Prance"
},
{
"corr": "States",
"sic": "Slates"
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment