Created
January 28, 2011 08:11
-
-
Save aisamanra/799987 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;; guile-utf8.scm | |
;; A quickly hacked together set of functions to turn | |
;; utf-8-encoded text (which Guile sees as raw bytes) into | |
;; ASCII-encoded HTML, with a few other functions for | |
;; getting the code values out of a UTF-8 string. | |
;; anon-let is a hacky macro to create a new scope while | |
;; allowing definitions within it to bind in its enclosing | |
;; scope. It also allows define-local which only defines | |
;; within that scope. It is used here to close over constants | |
;; which needn't clutter the global namespace and to keep | |
;; small utility functions private. | |
(define-macro (anon-let clauses . body) | |
(define (forward body) | |
(map | |
(lambda (clause) | |
(if (list? (cadr clause)) | |
`(define ,(caadr clause) #f) | |
`(define ,(cadr clause) #f))) | |
(filter | |
(lambda (clause) | |
(and (list? clause) | |
(equal? (car clause) 'define))) | |
body))) | |
(define (later body) | |
(map | |
(lambda (clause) | |
(cond ((and (list? clause) | |
(equal? (car clause) 'define-local)) | |
`(define ,@(cdr clause))) | |
((or (not (list? clause)) | |
(not (equal? (car clause) 'define))) | |
clause) | |
((list? (cadr clause)) | |
`(set! ,(caadr clause) | |
(lambda ,(cdadr clause) | |
,@(cddr clause)))) | |
(else | |
`(set! ,(cadr clause) | |
,@(cddr clause))))) | |
body)) | |
`(begin | |
,@(forward body) | |
(let ,clauses | |
,@(later body)))) | |
;; returns the first n characters of the string s | |
(define (string-take s n) | |
(substring s 0 n)) | |
;; returns the string s without the first n characters | |
(define (string-drop s n) | |
(substring s n)) | |
;; returns the integer value of the first character of s | |
(define (first-char s) | |
(char->integer (string-ref s 0))) | |
(anon-let | |
((_1b-mask #b+10000000) ;; All of these are masks used | |
(_mb-mask #b+11000000) ;; to identify whether a given byte | |
(_2b-mask #b+11100000) ;; begins a 1- to 4-byte codepoint | |
(_3b-mask #b+11110000) ;; or is is in the middle of a | |
(_4b-mask #b+11111000)) ;; codepoint. | |
;; These, in conjunction with the masks above, are used | |
;; to determine what part of a codepoint a given byte | |
;; constitutes. | |
(define-local (one-byte? s) | |
(= (logand (first-char s) _1b-mask) #b+00000000)) | |
(define-local (two-byte? s) | |
(= (logand (first-char s) _2b-mask) #b+11000000)) | |
(define-local (three-byte? s) | |
(= (logand (first-char s) _3b-mask) #b+11100000)) | |
(define-local (four-byte? s) | |
(= (logand (first-char s) _4b-mask) #b+11110000)) | |
(define-local (mid-byte? s) | |
(= (logand (first-char s) _mb-mask) #b+10000000)) | |
;; Returns a list of strings where each (possibly multibyte) | |
;; string represents a single UTF-8 codepoint. | |
(define (ustring->list s) | |
(define (helper s l) | |
(cond ((equal? s "") | |
(reverse l)) | |
((one-byte? s) | |
(helper (string-drop s 1) | |
(cons (string-take s 1) l))) | |
((two-byte? s) | |
(helper (string-drop s 2) | |
(cons (string-take s 2) l))) | |
((three-byte? s) | |
(helper (string-drop s 3) | |
(cons (string-take s 3) l))) | |
((four-byte? s) | |
(helper (string-drop s 4) | |
(cons (string-take s 4) l))))) | |
(helper s '())) | |
;; flips the bits in a single byte | |
(define-local (flip-byte x) | |
(logxor x #b+11111111)) | |
;; Takes a string representing a single UTF-8 codepoint | |
;; and returns the integer representation of it. If the | |
;; string contains multiple codepoints, it returns the | |
;; last. | |
(define (uchar->integer s) | |
(define (helper s n) | |
(cond ((equal? s "") | |
n) | |
((one-byte? s) | |
(first-char s)) | |
((two-byte? s) | |
(helper (string-drop s 1) | |
(logand (flip-byte _2b-mask) | |
(first-char s)))) | |
((three-byte? s) | |
(helper (string-drop s 1) | |
(logand (flip-byte _3b-mask) | |
(first-char s)))) | |
((four-byte? s) | |
(helper (string-drop s 1) | |
(logand (flip-byte _4b-mask) | |
(first-char s)))) | |
((mid-byte? s) | |
(helper (string-drop s 1) | |
(logior (logand (flip-byte _mb-mask) | |
(first-char s)) | |
(ash n 6)))))) | |
(helper s 0)) | |
;; Returns a list of integers representing the unicode | |
;; values of the characters in the UTF-8 string. | |
(define (ustring->intlist s) | |
(map uchar->integer (ustring->list s))) | |
;; Takes a UTF-8 string and converts all non-ASCII | |
;; codepoints to HTML escape sequences. | |
(define (utf-8-to-html s) | |
(string-join | |
(map (lambda (char) | |
(cond ((= char 10) | |
"<br/>") | |
((< char 128) | |
(string (integer->char char))) | |
(else | |
(format #f "&#~a;" | |
(number->string char))))) | |
(ustring->intlist s)) | |
"")) | |
) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment