Last active
April 9, 2021 03:30
-
-
Save joewiz/4c57546edfae74dfe82b4056a465c02b to your computer and use it in GitHub Desktop.
Check a text for OCR typo patterns, using XQuery
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
xquery version "3.1"; | |
(:~ | |
: Find possible OCR errors in a text by checking for patterns that an OCR | |
: process is known to misread, e.g., "day" misread as "clay", or "France" | |
: misread as "Prance." If the OCR engine just misread some instances of these | |
: words but got other instances correct, then this query will highlight | |
: candidates for correction. | |
: | |
: The query lets you configure a source text and define pattern sets to be used. | |
: | |
: Pattern sets consist of four parameters: | |
: | |
: 1. "sic": the OCR error | |
: 2. "corr": the correct version | |
: 3. "anchored": whether the error appears at the start of a word, the end of | |
: the word, or anywhere in the word | |
: 4. "case-sensitive": whether the pattern should be treated as case-sensitive | |
: or case-insensitive | |
: | |
: In the case of the day/clay example above, the parameters would be: | |
: | |
: 1. "sic": "cl" | |
: 2. "corr": "d" | |
: 3. "anchored": "start" | |
: 4. "case-sensitive": true | |
: | |
: If we apply this pattern to the following text: | |
: | |
: "clay clown day down" | |
: | |
: ... the expected result would be: | |
: | |
: [ | |
: { | |
: "sic": "clay" | |
: "corr": "day" | |
: }, | |
: { | |
: "sic": "clown", | |
: "corr": "down" | |
: } | |
: ] | |
: | |
:) | |
declare namespace array="http://www.w3.org/2005/xpath-functions/array"; | |
declare namespace map="http://www.w3.org/2005/xpath-functions/map"; | |
declare namespace output="http://www.w3.org/2010/xslt-xquery-serialization"; | |
declare namespace tei="http://www.tei-c.org/ns/1.0"; | |
declare option output:method "adaptive"; | |
declare variable $local:pattern-sets := ( | |
(: States > Slates :) | |
map { | |
"sic-literal": "Sl", | |
"corr-literal": "St", | |
"anchored": "start", | |
"case-sensitive": true() | |
}, | |
(: Mexiro > Mexico :) | |
map { | |
"sic-literal": "ro", | |
"corr-literal": "co", | |
"anchored": "end", | |
"case-sensitive": true() | |
}, | |
(: Prance > France :) | |
map { | |
"sic-literal": "Pr", | |
"corr-literal": "Fr", | |
"anchored": "start", | |
"case-sensitive": true() | |
}, | |
(: clay > day :) | |
map { | |
"sic-literal": "cl", | |
"corr-literal": "d", | |
"anchored": "start", | |
"case-sensitive": true() | |
}, | |
(: elate > date :) | |
map { | |
"sic-literal": "el", | |
"corr-literal": "d", | |
"anchored": "start", | |
"case-sensitive": true() | |
} | |
); | |
(:~ Convert a string literal into a regular expression for the position :) | |
declare function local:literal-to-regex($literal as xs:string, $position as xs:string) { | |
switch ($position) | |
case "start" return "^(" || $literal || ")(.+)$" | |
case "end" return "^(.+)(" || $literal || ")$" | |
default (: anywhere :) return "^(.*)(" || $literal || ")(.*)$" | |
}; | |
(:~ Find matches for a regular expression in a sequence of words :) | |
declare function local:find-matches($words as xs:string*, $regex as xs:string, $flags as xs:string) as map(*)* { | |
for $word in $words | |
let $analysis := analyze-string($word, $regex, $flags)[fn:match] | |
return | |
if (exists($analysis)) then | |
map { | |
"word": $word, | |
"match": $analysis/fn:match | |
} | |
else | |
() | |
}; | |
(:~ Check a sequence of words for sic/corr pairs that match known OCR error patterns :) | |
declare function local:check-words($words as xs:string*, $pattern-set as map(*)) { | |
(: Transform the literals into regular expressions :) | |
let $sic-regex := local:literal-to-regex($pattern-set?sic-literal, $pattern-set?anchored) | |
let $corr-regex := local:literal-to-regex($pattern-set?corr-literal, $pattern-set?anchored) | |
let $regex-flags := if ($pattern-set?case-sensitive) then "" else "i" | |
(: Find any cases matching the known OCR errors :) | |
let $sics := local:find-matches($words, $sic-regex, $regex-flags) | |
return | |
if (exists($sics)) then | |
(: Find any cases matching the known correct versions :) | |
let $corrs := local:find-matches($words, $corr-regex, $regex-flags) | |
return | |
if (exists($corrs)) then | |
(: Compare every sic to every corr :) | |
for $sic in $sics, $corr in $corrs | |
let $sic-groups := $sic?match/fn:group | |
let $corr-groups := $corr?match/fn:group | |
return | |
(: Only compare if the OCR error occurs in the same place in the word :) | |
(: This is currently naive - stripping out all of the literals :) | |
if (count($sic-groups) eq count($corr-groups)) then | |
let $sic-segs := $sic-groups[. ne $pattern-set?sic-literal]/string() | |
let $corr-segs := $corr-groups[. ne $pattern-set?corr-literal]/string() | |
(: Do the non-error portions of the two words match? :) | |
let $segs-align := | |
for-each-pair( | |
$sic-segs, | |
$corr-segs, | |
function($sic-seg, $corr-seg) { | |
$sic-seg eq $corr-seg | |
} | |
) | |
return | |
(: The first two conditions here are the result of the naive approach described above :) | |
if (exists($sic-segs) and exists($corr-segs) and (every $seg-align in $segs-align satisfies $seg-align)) then | |
map { | |
"sic": $sic?word, | |
"corr": $corr?word | |
} | |
else | |
() | |
else | |
() | |
else | |
() | |
else | |
() | |
}; | |
(: Define the text here or reference an external document :) | |
let $text := | |
"United Slates? of America United States: of America slates Mexico! | |
Mexiro. France) Prance day clay; down clown date elate*" | |
(: | |
(: Note: in BaseX, enter `SET XINCLUDE false` in the command window to ignore the XInclude in the FRUS vol :) | |
doc("https://github.com/HistoryAtState/frus/raw/eaaf9957b4e1ec2b0039db6d2ef8bc54864bdd1c/volumes/frus1861.xml")//tei:text//text() => string-join(" ") | |
:) | |
(: Split the text into words :) | |
let $words := | |
$text | |
=> tokenize() | |
(: strip off trailing punctuation :) | |
=> for-each( function($item) { replace($item, "\p{P}+$", "") } ) | |
=> distinct-values() | |
(: Attempt to find matches :) | |
let $attempts := | |
for $pattern-set in $local:pattern-sets | |
return | |
map { | |
(: optionally return the original pattern set :) | |
(: "pattern-set": $pattern-set, :) | |
"results": array { local:check-words($words, $pattern-set) } | |
} | |
(: Filter out attempts that returned no results :) | |
(: Note: on eXist 5.2, comment out the predicate; see https://github.com/eXist-db/exist/issues/3240 :) | |
let $candidates := $attempts[array:size(?results) gt 0] | |
return | |
map { | |
"distinct-word-count": count($words), | |
"pattern-match-count": count($candidates?results?*), | |
"candidates": array { | |
(: Display just the result pairs, sorted alphabetically :) | |
$candidates?results?* | |
=> sort((), function($match) { ($match?sic, $match?corr) ! lower-case(.) }) | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"distinct-word-count": 16, | |
"pattern-match-count": 6, | |
"candidates": [ | |
{ | |
"corr": "day", | |
"sic": "clay" | |
}, | |
{ | |
"corr": "down", | |
"sic": "clown" | |
}, | |
{ | |
"corr": "date", | |
"sic": "elate" | |
}, | |
{ | |
"corr": "Mexico", | |
"sic": "Mexiro" | |
}, | |
{ | |
"corr": "France", | |
"sic": "Prance" | |
}, | |
{ | |
"corr": "States", | |
"sic": "Slates" | |
} | |
] | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"distinct-word-count": 10357, | |
"pattern-match-count": 5, | |
"candidates": [ | |
{ | |
"corr": "day", | |
"sic": "clay" | |
}, | |
{ | |
"corr": "dear", | |
"sic": "clear" | |
}, | |
{ | |
"corr": "date", | |
"sic": "elate" | |
}, | |
{ | |
"corr": "France", | |
"sic": "Prance" | |
}, | |
{ | |
"corr": "States", | |
"sic": "Slates" | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment