Last active
April 9, 2021 03:22
-
-
Save joewiz/5889711 to your computer and use it in GitHub Desktop.
Split (or "tokenize") a string into "sentences", with XQuery. See http://joewiz.org/2013/06/29/one-paragraph-many-sentences/.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
xquery version "1.0";
(: A naive approach to sentence tokenization inspired by http://stackoverflow.com/a/2103653/659732
 :
 : Works well with edited text like newspapers. Parameters like punctuation can/should be edited;
 : see the section below called "criteria".
 :
 : For a more sophisticated approach, see Tibor Kiss and Jan Strunk, "Unsupervised Multilingual
 : Sentence Boundary Detection", Computational Linguistics, Volume 32, Issue 4, December 2006,
 : pp. 485-525. Also, see these discussions of sentence tokenization:
 : - http://nltk.org/book/ch06.html#sec-further-examples-of-supervised-classification
 : - http://www.robincamille.com/2012-02-18-nltk-sentence-tokenizer/
 :)
(: Splits text into a sequence of sentences.
 :
 : $string  zero or more strings; multiple items are joined with a single space
 :          before processing, because fn:tokenize accepts at most one string.
 : returns  one whitespace-normalized string per detected sentence, in document
 :          order; the empty sequence when the input contains no words.
 :
 : The text is broken into whitespace-delimited words, the first sentence is
 : peeled off with local:get-first-sentence, and the function recurses on the
 : words that remain.
 :)
declare function local:tokenize-sentences($string as xs:string*) as xs:string*
{
    let $words := tokenize(string-join($string, ' '), '\s+')[. ne '']
    return
        (: blank input must yield no sentences rather than a single empty one :)
        if (empty($words)) then
            ()
        else
            let $first-sentence := normalize-space(local:get-first-sentence($words, ''))
            (: the sentence is single-space separated after normalize-space, so
               counting its tokens tells us how many words were consumed :)
            let $word-count-of-sentence := count(tokenize($first-sentence, ' '))
            return
                (
                    $first-sentence,
                    if (count($words) gt $word-count-of-sentence) then
                        local:tokenize-sentences(
                            string-join(subsequence($words, $word-count-of-sentence + 1), ' ')
                        )
                    else
                        ()
                )
};
(: Accumulates words into $sentence until a word that ends the sentence is found.
 :
 : $words     the words still to be examined, in order
 : $sentence  the sentence text gathered so far (pass '' on the initial call)
 : returns    $sentence followed by each consumed word, every word preceded by
 :            a single space (so the result starts with a space when $sentence
 :            is ''); callers are expected to normalize whitespace.
 :
 : A word ends the sentence only if it ends with final punctuation, is not a
 : listed abbreviation, is followed by a capitalized word, and does not begin
 : a known phrase such as "U.S. Government".
 :)
declare function local:get-first-sentence($words as xs:string*, $sentence as xs:string) as xs:string {
    (: if there are no (more) words to check, we're done, so return whatever we have for the sentence :)
    if (empty($words)) then
        $sentence
    (: begin analyzing the word :)
    else
        let $word := subsequence($words, 1, 1)
        let $next := subsequence($words, 2, 1)
        let $rest := subsequence($words, 2)

        (: criteria :)
        let $final-punctuation-marks := '.?!'
        let $post-punctuation-possibilities := '’”"'')'
        let $pre-punctuation-possibilities := '‘“"''('
        (: any number of closing quotes/brackets may follow the final mark,
           so nested quotations such as  .’”  are recognized too :)
        let $final-punctuation-regex := concat('[', $final-punctuation-marks, '][', $post-punctuation-possibilities, ']*$')
        let $capitalized-test-regex := concat('^[', $pre-punctuation-possibilities, ']*?[A-Z]')
        let $words-with-ignorable-final-punctuation-marks := ('Mr.', 'Mrs.', 'Dr.', 'Amb.')
        let $known-phrases-with-ignorable-final-punctuation-marks := ('U.S. Government')

        (: test the word against the criteria :)
        let $word-ends-with-punctuation := matches($word, $final-punctuation-regex)
        (: when there is no next word, matches() sees a zero-length string and yields false :)
        let $next-word-is-capitalized := matches($next, $capitalized-test-regex)
        let $word-has-ignorable-punctuation := $word = $words-with-ignorable-final-punctuation-marks
        return
            (: The word stays inside the current sentence when any of these holds:
               - it does not end with punctuation (like "the" or "Minister");
               - it is a listed abbreviation (like "Mr.");
               - the next word is not capitalized (like "A.B.M. treaty" or "'What?' he asked."). :)
            if (not($word-ends-with-punctuation)
                or $word-has-ignorable-punctuation
                or not($next-word-is-capitalized)) then
                local:get-first-sentence(
                    $rest,
                    concat($sentence, ' ', $word)
                )
            (: if the word starts a known phrase that could be mistaken for the end of a sentence
               (like "U.S. Government"), keep going instead of ending the sentence. :)
            else
                let $words-as-string := string-join($words, ' ')
                let $matching-phrase :=
                    (
                        (: longest phrase first, so the most specific match wins :)
                        for $phrase in $known-phrases-with-ignorable-final-punctuation-marks
                        where $phrase ne '' and starts-with($words-as-string, $phrase)
                        order by string-length($phrase) descending
                        return $phrase
                    )[1]
                return
                    if (exists($matching-phrase)) then
                        let $phrase-tokens := tokenize($matching-phrase, ' ')
                        let $phrase-length := count($phrase-tokens)
                        let $last-matched-word := subsequence($words, $phrase-length, 1)
                        (: starts-with() also matches when the last word merely begins with the
                           phrase's last token ("Government." or "Governments"). In that case hold
                           the last word back so it is analyzed normally and keeps its own text;
                           always consume at least one word so the recursion makes progress. :)
                        let $consumed-count :=
                            if ($last-matched-word eq $phrase-tokens[last()]) then
                                $phrase-length
                            else
                                max(($phrase-length - 1, 1))
                        return
                            local:get-first-sentence(
                                subsequence($words, $consumed-count + 1),
                                (: append the original words, never the phrase text, so nothing is lost :)
                                concat($sentence, ' ', string-join(subsequence($words, 1, $consumed-count), ' '))
                            )
                    (: the word ends the sentence - we're done with this sentence! :)
                    else
                        concat($sentence, ' ', $word)
};
(: sample text taken from http://history.state.gov/historicaldocuments/frus1964-68v06/d213 :)
(: Query body: tokenizes the sample paragraph and wraps each sentence in a
   numbered <s> element inside a <p> that records the total sentence count. :)
let $source-text :=
'154613. You should arrange to deliver following note to North Vietnamese Embassy.
If in your opinion it can be done without creating an issue, we would prefer that
you ask North Vietnamese Charge to come to your Embassy to receive note. “The U.S.
Government agrees with the statement of the Government of the DRV, in its note of
April 27, that it is necessary for Hanoi and Washington to engage in conversations
promptly. The U.S. Government notes that the DRV has now agreed that representatives
of the two countries should hold private discussions for the sole purpose of
agreeing on a location and date. The U.S. Government notes that the DRV did not
respond to its suggestion of April 23 that we meet for this limited purpose in a
‘capital not previously considered by either side.’ The U.S. Government suggested
the DRV might wish to indicate three appropriate locations suitable for this limited
purpose. The U.S. Government does not consider that the suggestion of Warsaw is
responsive or acceptable. The U.S. Government is prepared for these limited discussions
on April 30 or several days thereafter. The U.S. Government would welcome the prompt
response of the DRV to this suggestion.”'
let $sentences := local:tokenize-sentences($source-text)
return
    <p sentence-count="{count($sentences)}">{
        (: $n is the 1-based position of the sentence within the paragraph :)
        for $sentence at $n in $sentences
        return
            <s n="{$n}">{$sentence}</s>
    }</p>
(: should return:
<p sentence-count="10">
<s n="1">154613.</s>
<s n="2">You should arrange to deliver following note to North Vietnamese Embassy.</s>
<s n="3">If in your opinion it can be done without creating an issue, we would prefer that you
ask North Vietnamese Charge to come to your Embassy to receive note.</s>
<s n="4">“The U.S. Government agrees with the statement of the Government of the DRV, in its
note of April 27, that it is necessary for Hanoi and Washington to engage in conversations
promptly.</s>
<s n="5">The U.S. Government notes that the DRV has now agreed that representatives of the two
countries should hold private discussions for the sole purpose of agreeing on a location and
date.</s>
<s n="6">The U.S. Government notes that the DRV did not respond to its suggestion of April 23
that we meet for this limited purpose in a ‘capital not previously considered by either
side.’</s>
<s n="7">The U.S. Government suggested the DRV might wish to indicate three appropriate
locations suitable for this limited purpose.</s>
<s n="8">The U.S. Government does not consider that the suggestion of Warsaw is responsive or
acceptable.</s>
<s n="9">The U.S. Government is prepared for these limited discussions on April 30 or several
days thereafter.</s>
<s n="10">The U.S. Government would welcome the prompt response of the DRV to this
suggestion.”</s>
</p>
:)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment