Skip to content

Instantly share code, notes, and snippets.

@informatimago
Created May 15, 2021 02:14
Show Gist options
  • Save informatimago/e4e20f3b2fe7c3646d7c76b1930925c1 to your computer and use it in GitHub Desktop.
Save informatimago/e4e20f3b2fe7c3646d7c76b1930925c1 to your computer and use it in GitHub Desktop.
;;;; -*- mode:lisp;coding:utf-8 -*-
;;;;**************************************************************************
;;;;FILE: babel-extension.lisp
;;;;LANGUAGE: Common-Lisp
;;;;SYSTEM: Common-Lisp
;;;;USER-INTERFACE: NONE
;;;;DESCRIPTION
;;;;
;;;; A function to test for code sequences.
;;;;
;;;;AUTHORS
;;;; <PJB> Pascal J. Bourguignon <[email protected]>
;;;;MODIFICATIONS
;;;; 2021-05-14 <PJB> Created
;;;;BUGS
;;;;LEGAL
;;;; AGPL3
;;;;
;;;; Copyright Pascal J. Bourguignon 2021 - 2021
;;;;
;;;; This program is free software: you can redistribute it and/or modify
;;;; it under the terms of the GNU Affero General Public License as published by
;;;; the Free Software Foundation, either version 3 of the License, or
;;;; (at your option) any later version.
;;;;
;;;; This program is distributed in the hope that it will be useful,
;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;;;; GNU Affero General Public License for more details.
;;;;
;;;; You should have received a copy of the GNU Affero General Public License
;;;; along with this program. If not, see <http://www.gnu.org/licenses/>.
;;;;**************************************************************************
(defpackage "COM.INFORMATIMAGO.CLEXT.BABEL-EXTENSION"
(:use "COMMON-LISP"
"BABEL")
(:export "DECODE-CHARACTER"))
(in-package "COM.INFORMATIMAGO.CLEXT.BABEL-EXTENSION")
;; (defparameter *replacement-character*
;; (if (<= 65535 char-code-limit) ; does it really mean that the
;; ; implementation uses unicode?
;;
;; (code-char 65533) ; #\Replacement_Character
;;
;; ;; Let's assume ASCII:
;; (code-char 26) ; #\Sub
;; ;; SUB is #x3f in EBCDIC
;; )
;;
;; "A replacement character.")
(defparameter *replacement-character* (code-char 65533)
;; TODO: Is it always the same for all encodings?
"The replacement character used by babel.")
(defun decode-character (octets &key (start 0) end (encoding :utf-8))
;; we'll optimize :us-ascii, :iso-8895-1 and :utf-8 cases.
(let ((end (or end (length octets))))
(case encoding
((:us-ascii :csascii :cp637 :ibm637 :us :iso646-us :ascii :iso-ir-6)
(if (<= end start)
(values nil t 1)
(let ((code (aref octets start)))
(if (<= 0 code 127)
(values (code-char code) t 1)
(values nil nil 1)))))
((:iso-8859-1 :iso_8859-1 :latin1 :l1 :ibm819 :cp819 :csisolatin1)
(if (<= end start)
(values nil t 1)
(let ((code (aref octets start)))
(if (<= 0 code 255)
(values (code-char code) t 1)
(values nil nil 1)))))
((:utf-8)
(if (<= end start)
(values nil t 1)
(let ((byte (aref octets start))
(code 0))
(cond
((<= 0 byte 127) ; 1 byte
(values (code-char byte) t 1))
((<= #b11000000 byte #b11011111) ; 2 bytes
(setf code (ash (ldb (byte 5 0) byte) 6))
(incf start)
(if (< start end)
(let ((byte (aref octets start)))
(if (<= #b10000000 byte #b10111111)
(progn
(setf code (dpb (ldb (byte 6 0) byte) (byte 6 0) code))
(values (code-char code) t 2))
(values nil nil 2)))
(values nil t 2)))
((<= #b11100000 byte #b11101111) ; 3 bytes
(setf code (ash (ldb (byte 4 0) byte) 12))
(incf start)
(if (< start end)
(let ((byte (aref octets start)))
(if (<= #b10000000 byte #b10111111)
(progn
(setf code (dpb (ldb (byte 6 0) byte) (byte 6 6) code))
(incf start)
(if (< start end)
(let ((byte (aref octets start)))
(if (<= #b10000000 byte #b10111111)
(progn
(setf code (dpb (ldb (byte 6 0) byte) (byte 6 0) code))
(values (code-char code) t 3))
(values nil nil 3)))
(values nil t 3)))
(values nil nil 3)))
(values nil t 3)))
((<= #b11110000 byte #b11110111) ; 4 bytes
(setf code (ash (ldb (byte 3 0) byte) 18))
(incf start)
(if (< start end)
(let ((byte (aref octets start)))
(if (<= #b10000000 byte #b10111111)
(progn
(setf code (dpb (ldb (byte 6 0) byte) (byte 6 12) code))
(incf start)
(if (< start end)
(let ((byte (aref octets start)))
(if (<= #b10000000 byte #b10111111)
(progn
(setf code (dpb (ldb (byte 6 0) byte) (byte 6 6) code))
(incf start)
(if (< start end)
(let ((byte (aref octets start)))
(if (<= #b10000000 byte #b10111111)
(progn
(setf code (dpb (ldb (byte 6 0) byte) (byte 6 0) code))
(values (code-char code) t 4))
(values nil nil 4)))
(values nil t 4)))
(values nil nil 4)))
(values nil t 4)))
(values nil nil 4)))
(values nil t 4)))
(t
(values nil nil 1))))))
(otherwise
(handler-case
(octets-to-string octets :start start :end end :errorp nil :encoding encoding)
(:no-error (string)
(if (= 1 (length string))
(if (char= (aref string 0) *replacement-character*)
(values nil t 1) ; ???
(values (aref string 0) t (length (string-to-octets string :encoding encoding))))
(values (aref string 0) t (length (string-to-octets string :end 1 :encoding encoding)))))
(end-of-input-in-character ()
(values nil t 1)) ; ???
(character-out-of-range ()
(values nil t 1)) ; ???
(character-decoding-error ()
(values nil nil 1) #|???|#))))))
;;;; THE END ;;;;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment