Skip to content

Instantly share code, notes, and snippets.

@informatimago
Created May 15, 2021 02:15
Show Gist options
  • Save informatimago/4a2298e501cd826475739bb544a63f6a to your computer and use it in GitHub Desktop.
Save informatimago/4a2298e501cd826475739bb544a63f6a to your computer and use it in GitHub Desktop.
;;;; -*- mode:lisp;coding:utf-8 -*-
;;;;**************************************************************************
;;;;FILE: babel-extension-test.lisp
;;;;LANGUAGE: Common-Lisp
;;;;SYSTEM: Common-Lisp
;;;;USER-INTERFACE: NONE
;;;;DESCRIPTION
;;;;
;;;; Tests decode-character.
;;;;
;;;;AUTHORS
;;;; <PJB> Pascal J. Bourguignon <[email protected]>
;;;;MODIFICATIONS
;;;; 2021-05-15 <PJB> Created.
;;;;BUGS
;;;;LEGAL
;;;; AGPL3
;;;;
;;;; Copyright Pascal J. Bourguignon 2021 - 2021
;;;;
;;;; This program is free software: you can redistribute it and/or modify
;;;; it under the terms of the GNU Affero General Public License as published by
;;;; the Free Software Foundation, either version 3 of the License, or
;;;; (at your option) any later version.
;;;;
;;;; This program is distributed in the hope that it will be useful,
;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;;;; GNU Affero General Public License for more details.
;;;;
;;;; You should have received a copy of the GNU Affero General Public License
;;;; along with this program. If not, see <http://www.gnu.org/licenses/>.
;;;;**************************************************************************
(defpackage "COM.INFORMATIMAGO.CLEXT.BABEL-EXTENSION.TEST"
(:use "COMMON-LISP"
"COM.INFORMATIMAGO.CLEXT.BABEL-EXTENSION"
"COM.INFORMATIMAGO.COMMON-LISP.CESARUM.SIMPLE-TEST")
(:export "TEST/ALL"))
(in-package "COM.INFORMATIMAGO.CLEXT.BABEL-EXTENSION.TEST")
(define-test test/decode-character/us-ascii ()
(let ((encoding :us-ascii)
(octets (make-array 10 :element-type '(unsigned-byte 8) :initial-element 32)))
(check equal
(multiple-value-list (decode-character octets :start 0 :end 0 :encoding encoding))
'(nil t 1))
(loop :for code :from 0 :to 127
:do (setf (aref octets 0) code)
(check equal
(multiple-value-list (decode-character octets :start 0 :end 1 :encoding encoding))
(list (code-char code) t 1)
(encoding code octets)))
(loop :for code :from 128 :to 255
:do (setf (aref octets 0) code)
(check equal
(multiple-value-list (decode-character octets :start 0 :end 1 :encoding encoding))
'(nil nil 1)
(encoding code octets)))
(loop :for code :from 0 :to 127
:do (setf (aref octets 0) code)
(setf (aref octets 1) 65)
(check equal
(multiple-value-list (decode-character octets :start 0 :end 2 :encoding encoding))
(list (code-char code) t 1)
(encoding code octets)))
(loop :for code :from 128 :to 255
:do (setf (aref octets 0) code)
(setf (aref octets 1) 65)
(check equal
(multiple-value-list (decode-character octets :start 0 :end 2 :encoding encoding))
'(nil nil 1)
(encoding code octets)))))
(define-test test/decode-character/iso-8859-1 ()
(let ((encoding :iso-8859-1)
(octets (make-array 10 :element-type '(unsigned-byte 8) :initial-element 32)))
(check equal
(multiple-value-list (decode-character octets :start 0 :end 0 :encoding encoding))
'(nil t 1))
(loop :for code :from 0 :to 255
:do (setf (aref octets 0) code)
(check equal
(multiple-value-list (decode-character octets :start 0 :end 1 :encoding encoding))
(list (code-char code) t 1)
(encoding code octets)))
(loop :for code :from 0 :to 255
:do (setf (aref octets 0) code)
(setf (aref octets 1) 65)
(check equal
(multiple-value-list (decode-character octets :start 0 :end 2 :encoding encoding))
(list (code-char code) t 1)
(encoding code octets)))))
(defun utf-8-to-octets (code octets &key (start 0) end)
(assert (<= code char-code-limit) (code)
"Code ~D should be a unicode code point between 0 and ~A"
code char-code-limit)
(cond
((<= code #x7f)
(assert (<= (+ start 1) (or end (length octets))))
(setf (aref octets start) code)
(incf start))
((<= code #x7ff)
(assert (<= (+ start 2) (or end (length octets))))
(setf (aref octets start) (dpb (ldb (byte 5 6) code) (byte 5 0) #b11000000))
(incf start)
(setf (aref octets start) (dpb (ldb (byte 6 0) code) (byte 6 0) #b10000000))
(incf start))
((<= code #xffff)
(assert (<= (+ start 3) (or end (length octets))))
(setf (aref octets start) (dpb (ldb (byte 4 12) code) (byte 4 0) #b11100000))
(incf start)
(setf (aref octets start) (dpb (ldb (byte 6 6) code) (byte 6 0) #b10000000))
(incf start)
(setf (aref octets start) (dpb (ldb (byte 6 0) code) (byte 6 0) #b10000000))
(incf start))
((<= code #x10ffff)
(assert (<= (+ start 4) (or end (length octets))))
(setf (aref octets start) (dpb (ldb (byte 3 18) code) (byte 3 0) #b11110000))
(incf start)
(setf (aref octets start) (dpb (ldb (byte 6 12) code) (byte 6 0) #b10000000))
(incf start)
(setf (aref octets start) (dpb (ldb (byte 6 6) code) (byte 6 0) #b10000000))
(incf start)
(setf (aref octets start) (dpb (ldb (byte 6 0) code) (byte 6 0) #b10000000))
(incf start))
(t
(error "Invalid unicode code-point for utf-8 encoding ~D (#x~:*~X)" code)))
(values start octets))
(define-test test/utf-8-to-octets ()
(let ((octets (make-array 10 :element-type '(unsigned-byte 8) :initial-element 32)))
(assert-true (= 1 (utf-8-to-octets #x45 octets)))
(assert-true (= 2 (utf-8-to-octets #x745 octets)))
(assert-true (= 3 (utf-8-to-octets #x7045 octets)))
(assert-true (= 4 (utf-8-to-octets #x100045 octets)))))
(define-test test/decode-character/utf-8 ()
(let ((encoding :utf-8)
(octets (make-array 10 :element-type '(unsigned-byte 8) :initial-element 32)))
(check equal
(multiple-value-list (decode-character octets :start 0 :end 0 :encoding encoding))
'(nil t 1))
;; Note: this includes the cases where code-char returns NIL:
(loop :for code :from 0 :below char-code-limit
:for next := (utf-8-to-octets code octets :start 0 :end (length octets))
:do (utf-8-to-octets 65 octets :start next :end (length octets))
(if (<= code #x7f)
(check equal
(multiple-value-list (decode-character octets :start 0 :end 1 :encoding encoding))
(list (code-char code) t next)
(encoding code octets))
(check equal
(multiple-value-list (decode-character octets :start 0 :end 1 :encoding encoding))
(list nil t next)
(encoding code octets)))
(if (<= code #x7ff)
(check equal
(multiple-value-list (decode-character octets :start 0 :end 2 :encoding encoding))
(list (code-char code) t next)
(encoding code octets))
(check equal
(multiple-value-list (decode-character octets :start 0 :end 2 :encoding encoding))
(list nil t next)
(encoding code octets)))
(if (<= code #xffff)
(check equal
(multiple-value-list (decode-character octets :start 0 :end 3 :encoding encoding))
(list (code-char code) t next)
(encoding code octets))
(check equal
(multiple-value-list (decode-character octets :start 0 :end 3 :encoding encoding))
(list nil t next)
(encoding code octets)))
(if (<= code #x10ffff)
(check equal
(multiple-value-list (decode-character octets :start 0 :end 4 :encoding encoding))
(list (code-char code) t next)
(encoding code octets))
(check equal
(multiple-value-list (decode-character octets :start 0 :end 4 :encoding encoding))
(list nil t next)
(encoding code octets))))
;; Testing invalid utf-8 code sequences:
(check equal (multiple-value-list (decode-character (replace octets #(130)) :encoding encoding))
'(nil nil 1)
(encoding octets))
(check equal (multiple-value-list (decode-character (replace octets #(#b11000000 #b00100001)) :encoding encoding))
'(nil nil 2)
(encoding octets))
(check equal (multiple-value-list (decode-character (replace octets #(#b11000000 #b11100001)) :encoding encoding))
'(nil nil 2)
(encoding octets))
(check equal (multiple-value-list (decode-character (replace octets #(#b11100000 #b10110011 #b00100001)) :encoding encoding))
'(nil nil 3)
(encoding octets))
(check equal (multiple-value-list (decode-character (replace octets #(#b11100000 #b10110011 #b11100001)) :encoding encoding))
'(nil nil 3)
(encoding octets))
(check equal (multiple-value-list (decode-character (replace octets #(#b11110000 #b00100001 #b10110011 #b10110011)) :encoding encoding))
'(nil nil 4)
(encoding octets))
(check equal (multiple-value-list (decode-character (replace octets #(#b11110000 #b10110011 #b00100001 #b10110011)) :encoding encoding))
'(nil nil 4)
(encoding octets))
(check equal (multiple-value-list (decode-character (replace octets #(#b11110000 #b10110011 #b10110011 #b00100001)) :encoding encoding))
'(nil nil 4)
(encoding octets))
(check equal (multiple-value-list (decode-character (replace octets #(#b11110000 #b11100001 #b10110011 #b10110011)) :encoding encoding))
'(nil nil 4)
(encoding octets))
(check equal (multiple-value-list (decode-character (replace octets #(#b11110000 #b10110011 #b11100001 #b10110011)) :encoding encoding))
'(nil nil 4)
(encoding octets))
(check equal (multiple-value-list (decode-character (replace octets #(#b11110000 #b10110011 #b10110011 #b11100001)) :encoding encoding))
'(nil nil 4)
(encoding octets))))
(define-test test/decode-character/eucjp ()
(let* ((encoding :eucjp)
(string "こんにちは / コンニチハ")
(octets (babel:string-to-octets string :encoding :eucjp)))
(loop
:for expected :across string
:for start := 0 :then (+ start size)
:for (character validp size) := (multiple-value-list (decode-character octets :start start :encoding encoding))
:do (assert-true character (character) "decode-character should have decoded a ~S character from ~A" encoding start)
(assert-true validp (validp) "decode-character should have decoded a valid ~S code sequence from ~A" encoding start)
(check char= character expected (encoding start octets character expected))
:finally (incf start size)
(check = start (length octets) (encoding start octets)))))
(define-test test/all ()
(test/decode-character/us-ascii)
(test/decode-character/iso-8859-1)
(test/utf-8-to-octets)
(test/decode-character/utf-8)
(test/decode-character/eucjp))
;; (test/all)
;;;; THE END ;;;;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment