Skip to content

Instantly share code, notes, and snippets.

@joewiz
Last active April 9, 2021 03:22
Show Gist options
  • Save joewiz/5889711 to your computer and use it in GitHub Desktop.
Save joewiz/5889711 to your computer and use it in GitHub Desktop.
Split (or "tokenize") a string into "sentences", with XQuery. See http://joewiz.org/2013/06/29/one-paragraph-many-sentences/.
xquery version "1.0";
(: A naive approach to sentence tokenization inspired by http://stackoverflow.com/a/2103653/659732
:
: Works well with edited text like newspapers. Parameters like punctuation can/should be edited;
: see the section below called "criteria".
:
: For a more sophisticated approach, see Tibor Kiss and Jan Strunk, "Unsupervised Multilingual
: Sentence Boundary Detection", Computational Linguistics, Volume 32, Issue 4, December 2006,
: pp. 485-525. Also, see these discussions of sentence tokenization:
: - http://nltk.org/book/ch06.html#sec-further-examples-of-supervised-classification
: - http://www.robincamille.com/2012-02-18-nltk-sentence-tokenizer/
:)
(:~
 : Splits text into "sentences".
 :
 : The text is broken into whitespace-separated words, the first sentence is
 : peeled off with local:get-first-sentence, and the function recurses on the
 : remaining words until none are left.
 :
 : @param $string the text to split; several strings are treated as one text
 :                joined by single spaces
 : @return one whitespace-normalized string per sentence, in document order;
 :         the empty sequence when the input holds no words
 :)
declare function local:tokenize-sentences($string as xs:string*) as xs:string*
{
    (: Join before tokenizing: fn:tokenize accepts at most one string, so a
       multi-item argument would otherwise raise a type error even though the
       signature allows it. :)
    let $words := tokenize(string-join($string, ' '), '\s+')[. ne '']
    return
        (: Empty or whitespace-only input has no sentences; without this guard
           a single zero-length string would be reported as a sentence. :)
        if (empty($words)) then
            ()
        else
            let $first-sentence := normalize-space(local:get-first-sentence($words, ''))
            (: The sentence is built from whole words separated by single spaces,
               so counting its tokens tells us how many words were consumed. :)
            let $word-count-of-sentence := count(tokenize($first-sentence, ' '))
            return
                (
                    $first-sentence,
                    if (count($words) gt $word-count-of-sentence) then
                        local:tokenize-sentences(
                            string-join(subsequence($words, $word-count-of-sentence + 1), ' ')
                        )
                    else
                        ()
                )
};
(:~
 : Collects words from the front of $words until one of them ends a sentence.
 :
 : A word ends the sentence only when all of the following hold:
 :   - it finishes with ".", "?" or "!", optionally followed by one closing
 :     quote or parenthesis;
 :   - it is not one of the listed abbreviations such as "Mr.";
 :   - the following word is capitalized, ignoring opening quotes or parentheses;
 :   - it does not start one of the known phrases such as "U.S. Government".
 : The punctuation sets, abbreviations and phrases in the "criteria" section
 : are meant to be edited to suit the text being processed.
 :
 : @param $words    the whitespace-free words still to be examined
 : @param $sentence the text accumulated so far; pass '' on the first call
 : @return $sentence followed by each consumed word, every word preceded by a
 :         single space, so callers normally apply normalize-space to the result
 :)
declare function local:get-first-sentence($words as xs:string*, $sentence as xs:string) as xs:string {
    (: if there are no more words to check, we're done, so return whatever we have for the sentence :)
    if (empty($words)) then
        $sentence
    else
        let $word := $words[1]
        let $next := $words[2]
        let $rest := subsequence($words, 2)

        (: criteria :)
        let $final-punctuation-marks := '.?!'
        let $post-punctuation-possibilities := '’”"'')'
        let $pre-punctuation-possibilities := '‘“"''('
        let $final-punctuation-regex := concat('[', $final-punctuation-marks, '][', $post-punctuation-possibilities, ']?$')
        let $capitalized-test-regex := concat('^[', $pre-punctuation-possibilities, ']*?[A-Z]')
        let $words-with-ignorable-final-punctuation-marks := ('Mr.', 'Mrs.', 'Dr.', 'Amb.')
        let $known-phrases-with-ignorable-final-punctuation-marks := ('U.S. Government')

        (: test the word against the criteria :)
        let $word-ends-with-punctuation := matches($word, $final-punctuation-regex)
        (: when $word is the last word, $next is empty and this is false :)
        let $next-word-is-capitalized := matches($next, $capitalized-test-regex)
        let $word-has-ignorable-punctuation := $word = $words-with-ignorable-final-punctuation-marks
        return
            (: The word stays inside the current sentence when
                 - it has no final punctuation, like "the" or "Minister", or
                 - it is a listed abbreviation, like "Mr.", or
                 - the next word is not capitalized, like "A.B.M. treaty" or
                   "'What?' he asked."
               All three cases take the same action, so they share one branch. :)
            if (not($word-ends-with-punctuation)
                or $word-has-ignorable-punctuation
                or not($next-word-is-capitalized)) then
                local:get-first-sentence($rest, concat($sentence, ' ', $word))
            else
                (: The word looks like a sentence end. Check whether it merely opens a
                   known phrase, like "U.S. Government"; the longest matching phrase wins. :)
                let $words-as-string := string-join($words, ' ')
                let $matching-phrase :=
                    (
                        for $phrase in $known-phrases-with-ignorable-final-punctuation-marks
                        where starts-with($words-as-string, $phrase)
                        order by string-length($phrase) descending
                        return $phrase
                    )[1]
                return
                    if (exists($matching-phrase)) then
                        let $phrase-tokens := tokenize($matching-phrase, ' ')
                        let $phrase-length := count($phrase-tokens)
                        (: The phrase test is a prefix match, so the last matched word may
                           carry extra characters, as in "Government." or "Governments".
                           Only swallow that word when it is exactly the phrase's last
                           token; otherwise stop before it so that its own punctuation is
                           examined on the next step. Always consume at least one word so
                           the recursion makes progress. :)
                        let $last-word-is-verbatim := $words[$phrase-length] eq $phrase-tokens[$phrase-length]
                        let $consumed-count :=
                            if ($last-word-is-verbatim) then
                                $phrase-length
                            else
                                max(($phrase-length - 1, 1))
                        return
                            local:get-first-sentence(
                                subsequence($words, $consumed-count + 1),
                                (: append the words as they appear in the text, not the
                                   phrase literal, so no characters are lost :)
                                concat($sentence, ' ', string-join(subsequence($words, 1, $consumed-count), ' '))
                            )
                    (: the word ends the sentence - we're done with this sentence! :)
                    else
                        concat($sentence, ' ', $word)
};
(: Demo query body: splits a sample paragraph, taken from
   http://history.state.gov/historicaldocuments/frus1964-68v06/d213, and wraps
   each resulting sentence in a numbered s element inside a p element that
   records the total number of sentences. :)
let $paragraph :=
'154613. You should arrange to deliver following note to North Vietnamese Embassy.
If in your opinion it can be done without creating an issue, we would prefer that
you ask North Vietnamese Charge to come to your Embassy to receive note. “The U.S.
Government agrees with the statement of the Government of the DRV, in its note of
April 27, that it is necessary for Hanoi and Washington to engage in conversations
promptly. The U.S. Government notes that the DRV has now agreed that representatives
of the two countries should hold private discussions for the sole purpose of
agreeing on a location and date. The U.S. Government notes that the DRV did not
respond to its suggestion of April 23 that we meet for this limited purpose in a
‘capital not previously considered by either side.’ The U.S. Government suggested
the DRV might wish to indicate three appropriate locations suitable for this limited
purpose. The U.S. Government does not consider that the suggestion of Warsaw is
responsive or acceptable. The U.S. Government is prepared for these limited discussions
on April 30 or several days thereafter. The U.S. Government would welcome the prompt
response of the DRV to this suggestion.”'
let $split := local:tokenize-sentences($paragraph)
return
    element p {
        (: total number of sentences found :)
        attribute sentence-count { count($split) },
        (: one s element per sentence, numbered from 1 :)
        for $text at $position in $split
        return
            element s {
                attribute n { $position },
                $text
            }
    }
(: should return:
<p sentence-count="10">
<s n="1">154613.</s>
<s n="2">You should arrange to deliver following note to North Vietnamese Embassy.</s>
<s n="3">If in your opinion it can be done without creating an issue, we would prefer that you
ask North Vietnamese Charge to come to your Embassy to receive note.</s>
<s n="4">“The U.S. Government agrees with the statement of the Government of the DRV, in its
note of April 27, that it is necessary for Hanoi and Washington to engage in conversations
promptly.</s>
<s n="5">The U.S. Government notes that the DRV has now agreed that representatives of the two
countries should hold private discussions for the sole purpose of agreeing on a location and
date.</s>
<s n="6">The U.S. Government notes that the DRV did not respond to its suggestion of April 23
that we meet for this limited purpose in a ‘capital not previously considered by either
side.’</s>
<s n="7">The U.S. Government suggested the DRV might wish to indicate three appropriate
locations suitable for this limited purpose.</s>
<s n="8">The U.S. Government does not consider that the suggestion of Warsaw is responsive or
acceptable.</s>
<s n="9">The U.S. Government is prepared for these limited discussions on April 30 or several
days thereafter.</s>
<s n="10">The U.S. Government would welcome the prompt response of the DRV to this
suggestion.”</s>
</p>
:)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment