Created
August 9, 2018 15:29
-
-
Save mbutterick/11f81fa7252825d2195826a3d557acb0 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#lang racket
(require rackunit pollen/decode pollen/template/html)
;; Sample input for the check at the bottom of the file: a Russian sentence
;; containing one word followed by space, em dash, space.
(define str "Крикну — а в ответ тишина.")
;; wrap-emdashes : string? -> list?
;; Splits `str` at every occurrence of "one or more Unicode letters followed
;; by space, em dash, space" and returns the pieces in their original order:
;;   * each occurrence becomes the tagged list
;;       (span ((class "no-wrap")) WORD thinsp mdash thinsp)
;;     where WORD is the matched run of letters and the surrounding
;;     " — " is replaced by the symbols `thinsp mdash thinsp`;
;;   * the text between occurrences is returned unchanged as strings
;;     (these gap strings may be empty, e.g. when `str` starts with a match).
(define (wrap-emdashes str)
  ;; One or more Unicode letters followed by space, em dash, space.
  ;; The letter run is parenthesized so the word is available as a submatch.
  (define pat #px"(\\p{L}+) — ")
  ;; `#:gap-select? #t` interleaves the text between matches with the matches
  ;; themselves; `#:match-select values` delivers each match as the list
  ;; (whole-match word), so the submatch does not have to be recovered by
  ;; running the regexp a second time on every piece.
  (define pieces
    (regexp-match* pat str #:match-select values #:gap-select? #t))
  (for/list ([piece (in-list pieces)])
    (match piece
      ;; a match: keep the word, swap " — " for the entity symbols
      [(list _ word)
       `(span ((class "no-wrap")) ,word thinsp mdash thinsp)]
      ;; a gap between matches: pass the text through untouched
      [(? string? gap) gap])))
;; End-to-end check: run `wrap-emdashes` over `str` via `decode`'s
;; `#:string-proc` hook and compare the rendered HTML.
;; NOTE(review): `thinsp` and `mdash` are emitted by `wrap-emdashes` as
;; symbols, which X-expression serialization writes as the entity references
;; `&thinsp;` / `&mdash;`; the expected literal therefore spells them out as
;; entities rather than as the decoded characters — confirm against `->html`.
(check-equal? (->html (decode str #:string-proc wrap-emdashes))
              "<span class=\"no-wrap\">Крикну&thinsp;&mdash;&thinsp;</span>а в ответ тишина.")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment