Created
May 15, 2021 02:14
-
-
Save informatimago/e4e20f3b2fe7c3646d7c76b1930925c1 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;;;; -*- mode:lisp;coding:utf-8 -*- | |
;;;;************************************************************************** | |
;;;;FILE: babel-extension.lisp | |
;;;;LANGUAGE: Common-Lisp | |
;;;;SYSTEM: Common-Lisp | |
;;;;USER-INTERFACE: NONE | |
;;;;DESCRIPTION | |
;;;; | |
;;;; A function to test for code sequences. | |
;;;; | |
;;;;AUTHORS | |
;;;; <PJB> Pascal J. Bourguignon <[email protected]> | |
;;;;MODIFICATIONS | |
;;;; 2021-05-14 <PJB> Created | |
;;;;BUGS | |
;;;;LEGAL | |
;;;; AGPL3 | |
;;;; | |
;;;; Copyright Pascal J. Bourguignon 2021 - 2021 | |
;;;; | |
;;;; This program is free software: you can redistribute it and/or modify | |
;;;; it under the terms of the GNU Affero General Public License as published by | |
;;;; the Free Software Foundation, either version 3 of the License, or | |
;;;; (at your option) any later version. | |
;;;; | |
;;;; This program is distributed in the hope that it will be useful, | |
;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
;;;; GNU Affero General Public License for more details. | |
;;;; | |
;;;; You should have received a copy of the GNU Affero General Public License | |
;;;; along with this program. If not, see <http://www.gnu.org/licenses/>. | |
;;;;************************************************************************** | |
(defpackage "COM.INFORMATIMAGO.CLEXT.BABEL-EXTENSION" | |
(:use "COMMON-LISP" | |
"BABEL") | |
(:export "DECODE-CHARACTER")) | |
(in-package "COM.INFORMATIMAGO.CLEXT.BABEL-EXTENSION") | |
;; (defparameter *replacement-character* | |
;; (if (<= 65535 char-code-limit) ; does it really mean that the | |
;; ; implementation uses unicode? | |
;; | |
;; (code-char 65533) ; #\Replacement_Character | |
;; | |
;; ;; Let's assume ASCII: | |
;; (code-char 26) ; #\Sub | |
;; ;; SUB is #x3f in EBCDIC | |
;; ) | |
;; | |
;; "A replacement character.") | |
(defparameter *replacement-character* (code-char 65533) | |
;; TODO: Is it always the same for all encodings? | |
"The replacement character used by babel.") | |
(defun decode-character (octets &key (start 0) end (encoding :utf-8)) | |
;; we'll optimize :us-ascii, :iso-8895-1 and :utf-8 cases. | |
(let ((end (or end (length octets)))) | |
(case encoding | |
((:us-ascii :csascii :cp637 :ibm637 :us :iso646-us :ascii :iso-ir-6) | |
(if (<= end start) | |
(values nil t 1) | |
(let ((code (aref octets start))) | |
(if (<= 0 code 127) | |
(values (code-char code) t 1) | |
(values nil nil 1))))) | |
((:iso-8859-1 :iso_8859-1 :latin1 :l1 :ibm819 :cp819 :csisolatin1) | |
(if (<= end start) | |
(values nil t 1) | |
(let ((code (aref octets start))) | |
(if (<= 0 code 255) | |
(values (code-char code) t 1) | |
(values nil nil 1))))) | |
((:utf-8) | |
(if (<= end start) | |
(values nil t 1) | |
(let ((byte (aref octets start)) | |
(code 0)) | |
(cond | |
((<= 0 byte 127) ; 1 byte | |
(values (code-char byte) t 1)) | |
((<= #b11000000 byte #b11011111) ; 2 bytes | |
(setf code (ash (ldb (byte 5 0) byte) 6)) | |
(incf start) | |
(if (< start end) | |
(let ((byte (aref octets start))) | |
(if (<= #b10000000 byte #b10111111) | |
(progn | |
(setf code (dpb (ldb (byte 6 0) byte) (byte 6 0) code)) | |
(values (code-char code) t 2)) | |
(values nil nil 2))) | |
(values nil t 2))) | |
((<= #b11100000 byte #b11101111) ; 3 bytes | |
(setf code (ash (ldb (byte 4 0) byte) 12)) | |
(incf start) | |
(if (< start end) | |
(let ((byte (aref octets start))) | |
(if (<= #b10000000 byte #b10111111) | |
(progn | |
(setf code (dpb (ldb (byte 6 0) byte) (byte 6 6) code)) | |
(incf start) | |
(if (< start end) | |
(let ((byte (aref octets start))) | |
(if (<= #b10000000 byte #b10111111) | |
(progn | |
(setf code (dpb (ldb (byte 6 0) byte) (byte 6 0) code)) | |
(values (code-char code) t 3)) | |
(values nil nil 3))) | |
(values nil t 3))) | |
(values nil nil 3))) | |
(values nil t 3))) | |
((<= #b11110000 byte #b11110111) ; 4 bytes | |
(setf code (ash (ldb (byte 3 0) byte) 18)) | |
(incf start) | |
(if (< start end) | |
(let ((byte (aref octets start))) | |
(if (<= #b10000000 byte #b10111111) | |
(progn | |
(setf code (dpb (ldb (byte 6 0) byte) (byte 6 12) code)) | |
(incf start) | |
(if (< start end) | |
(let ((byte (aref octets start))) | |
(if (<= #b10000000 byte #b10111111) | |
(progn | |
(setf code (dpb (ldb (byte 6 0) byte) (byte 6 6) code)) | |
(incf start) | |
(if (< start end) | |
(let ((byte (aref octets start))) | |
(if (<= #b10000000 byte #b10111111) | |
(progn | |
(setf code (dpb (ldb (byte 6 0) byte) (byte 6 0) code)) | |
(values (code-char code) t 4)) | |
(values nil nil 4))) | |
(values nil t 4))) | |
(values nil nil 4))) | |
(values nil t 4))) | |
(values nil nil 4))) | |
(values nil t 4))) | |
(t | |
(values nil nil 1)))))) | |
(otherwise | |
(handler-case | |
(octets-to-string octets :start start :end end :errorp nil :encoding encoding) | |
(:no-error (string) | |
(if (= 1 (length string)) | |
(if (char= (aref string 0) *replacement-character*) | |
(values nil t 1) ; ??? | |
(values (aref string 0) t (length (string-to-octets string :encoding encoding)))) | |
(values (aref string 0) t (length (string-to-octets string :end 1 :encoding encoding))))) | |
(end-of-input-in-character () | |
(values nil t 1)) ; ??? | |
(character-out-of-range () | |
(values nil t 1)) ; ??? | |
(character-decoding-error () | |
(values nil nil 1) #|???|#)))))) | |
;;;; THE END ;;;; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment