Added support for more complex character encodings (ISO-2022-JP)

This commit is contained in:
Juan Jose Garcia Ripoll 2009-01-08 19:52:01 +01:00
parent 9fbd0a0e93
commit 1bf50bed96
10 changed files with 419 additions and 132 deletions

View file

@ -0,0 +1,55 @@
(cl:unless (cl:find-package :ISO-2022-JP)
(make-package :ISO-2022-JP))
(in-package :ISO-2022-JP)
(defun compose (bytes)
(loop for i in bytes
with cum = 0
do (setf cum (+ (ash cum 8) i))
finally (return cum)))
(defun mapping-hash-table (sequence &optional (mask 0))
(loop with hash = (make-hash-table :size (floor (* 1.5 (length sequence)))
:test 'eq)
for i from 0 below (length sequence) by 2
for multibyte = (elt sequence i)
for codepoint = (elt sequence (1+ i))
for unicode-char = (code-char codepoint)
when (zerop (logand multibyte mask))
do (progn
(setf (gethash multibyte hash) unicode-char)
(setf (gethash unicode-char hash) multibyte)
(when (> multibyte #xFF)
(setf (gethash (ash multibyte -8) hash) t)))
finally (return hash)))
(defun multimap (escapes tables)
(loop for seq in escapes
for table in tables
for table-cons = (member table tables :test 'eq)
do (progn
;; Change escape sequence into byte codes
(setf seq (mapcar #'char-code seq))
;; Store it in the hash table
(setf (gethash t table) seq)
(loop for other-table in tables
do (loop for i from 1
for precedings = (butlast seq i)
while precedings
do (setf (gethash (compose precedings) other-table) t)
finally (setf (gethash (compose seq) other-table) table-cons)))))
(nconc tables tables))
(defparameter ext::iso-2022-jp
(let* ((ascii-no-esc (mapping-hash-table (loop for i from 0 to 127
unless (= i (char-code #\esc))
nconc (list i i))))
(jis208 (mapping-hash-table (ext::load-encoding :jisx0208) #x8080))
(jis201 (mapping-hash-table (ext::load-encoding :jisx0201) #x80)))
(multimap '((#\Esc #\( #\B)
(#\Esc #\( #\J)
(#\Esc #\$ #\@)
(#\Esc #\$ #\B))
(list ascii-no-esc jis201 jis208 jis208))))
(delete-package :ISO-2022-JP)

View file

@ -0,0 +1,57 @@
(cl:unless (cl:find-package :ISO-2022-JP-1)
(make-package :ISO-2022-JP-1))
(in-package :ISO-2022-JP-1)
(defun compose (bytes)
(loop for i in bytes
with cum = 0
do (setf cum (+ (ash cum 8) i))
finally (return cum)))
(defun mapping-hash-table (sequence &optional (mask 0))
(loop with hash = (make-hash-table :size (floor (* 1.5 (length sequence)))
:test 'eq)
for i from 0 below (length sequence) by 2
for multibyte = (elt sequence i)
for codepoint = (elt sequence (1+ i))
for unicode-char = (code-char codepoint)
when (zerop (logand multibyte mask))
do (progn
(setf (gethash multibyte hash) unicode-char)
(setf (gethash unicode-char hash) multibyte)
(when (> multibyte #xFF)
(setf (gethash (ash multibyte -8) hash) t)))
finally (return hash)))
(defun multimap (escapes tables)
(loop for seq in escapes
for table in tables
for table-cons = (member table tables :test 'eq)
do (progn
;; Change escape sequence into byte codes
(setf seq (mapcar #'char-code seq))
;; Store it in the hash table
(setf (gethash t table) seq)
(loop for other-table in tables
do (loop for i from 1
for precedings = (butlast seq i)
while precedings
do (setf (gethash (compose precedings) other-table) t)
finally (setf (gethash (compose seq) other-table) table-cons)))))
(nconc tables tables))
(defparameter ext::iso-2022-jp-1
(let* ((ascii-no-esc (mapping-hash-table (loop for i from 0 to 127
unless (= i (char-code #\esc))
nconc (list i i))))
(jis212 (mapping-hash-table (ext::load-encoding :jisx0212) #x8080))
(jis208 (mapping-hash-table (ext::load-encoding :jisx0208) #x8080))
(jis201 (mapping-hash-table (ext::load-encoding :jisx0201) #x80)))
(multimap '((#\Esc #\( #\B)
(#\Esc #\( #\J)
(#\Esc #\$ #\@)
(#\Esc #\$ #\B)
(#\Esc #\$ #\( #\D))
(list ascii-no-esc jis201 jis208 jis208 jis212))))
(delete-package :ISO-2022-JP-1)

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@ -1,6 +1,6 @@
;;; -*- Mode: Lisp; Syntax: Common-Lisp; -*-
;;;
;;; Copyright (c) 2009, Giuseppe Attardi.
;;; Copyright (c) 2009, Juan Jose Garcia-Ripoll
;;;
;;; This program is free software; you can redistribute it and/or
;;; modify it under the terms of the GNU Library General Public
@ -9,130 +9,18 @@
;;;
;;; See file '../Copyright' for full details.
(defconstant +sequence-type+ '(unsigned-byte 16))
(defun read-mapping (url)
(let ((command (format nil "curl \"~A\" | sed '/^#.*$/d;s,0x,#x,g;s,U+\\([0-9A-Fa-f]*\\),#x\\1,g;s,#UNDEFINED,NIL # UNDEFINED,g;/LEAD BYTE/d' | grep -v '<reserverd>' | sed 's,# .*$,,g;/#x.*/!d' > tmp.txt" url)))
(unless (zerop (si::system command))
(error "Unable to retrieve file ~A" url)))
(let ((mapping '()))
(with-open-file (s "tmp.txt" :direction :input :external-format :utf-8)
(loop for line = (read-line s nil nil)
while line
do (with-input-from-string (aux line)
(let ((byte 0)
(unicode 0))
(when (and (setf byte (read aux nil nil))
(setf unicode (read aux nil nil)))
(unless (and (typep byte +sequence-type+)
(typep unicode +sequence-type+))
(error "Sequence type ~A is unable to capture this encoding (codes ~X and ~X found)"
+sequence-type+ byte unicode))
(setf mapping (list* unicode byte mapping)))))))
(unless mapping
(error "Error reading file ~A" url))
(si::system "rm -f tmp.txt")
(print (reduce #'max mapping :initial-value 0))
(make-array (length mapping) :element-type +sequence-type+ :initial-contents (nreverse mapping))))
(defun generate-mapping (name url output-file)
(let* ((mapping (read-mapping url)))
(format t "~&;;; Generating ~A~%;;; ~Tfrom ~A" output-file url)
(force-output t)
(if (pathname-type output-file)
(with-open-file (s output-file :direction :output :if-exists :supersede
:element-type +sequence-type+ :external-format :big-endian)
(write-byte (length mapping) s)
(write-sequence mapping s))
(with-open-file (s output-file :direction :output :if-exists :supersede)
(print mapping s)))))
(defconstant +all-mappings+
'(("ATARIST" "http://unicode.org/Public/MAPPINGS/VENDORS/MISC/ATARIST.TXT")
("ISO-8859-1" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-1.TXT")
("ISO-8859-2" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-2.TXT")
("ISO-8859-3" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-3.TXT")
("ISO-8859-4" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-4.TXT")
("ISO-8859-5" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-5.TXT")
("ISO-8859-6" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-6.TXT")
("ISO-8859-7" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-7.TXT")
("ISO-8859-8" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-8.TXT")
("ISO-8859-9" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-9.TXT")
("ISO-8859-10" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-10.TXT")
("ISO-8859-11" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-11.TXT")
("ISO-8859-13" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-13.TXT")
("ISO-8859-14" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-14.TXT")
("ISO-8859-15" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-15.TXT")
("ISO-8859-16" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-16.TXT")
("KOI8-R" "http://unicode.org/Public/MAPPINGS/VENDORS/MISC/KOI8-R.TXT")
("KOI8-U" "http://unicode.org/Public/MAPPINGS/VENDORS/MISC/KOI8-U.TXT")
("CP-856" "http://unicode.org/Public/MAPPINGS/VENDORS/MISC/CP856.TXT")
("CP-856" "http://unicode.org/Public/MAPPINGS/VENDORS/MISC/CP856.TXT")
("DOS-CP437" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP437.TXT")
("DOS-CP737" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP737.TXT")
("DOS-CP775" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP775.TXT")
("DOS-CP850" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP850.TXT")
("DOS-CP852" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP852.TXT")
("DOS-CP855" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP855.TXT")
("DOS-CP857" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP857.TXT")
("DOS-CP860" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP860.TXT")
("DOS-CP861" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP861.TXT")
("DOS-CP862" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP862.TXT")
("DOS-CP863" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP863.TXT")
("DOS-CP864" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP864.TXT")
("DOS-CP865" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP865.TXT")
("DOS-CP866" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP866.TXT")
("DOS-CP869" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP869.TXT")
("DOS-CP874" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP874.TXT")
; Redundant WINDOWS-CP874 DOS-CP874
;("WINDOWS-CP874" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP874.TXT")
("WINDOWS-CP932" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP932.TXT" "BIN")
("WINDOWS-CP936" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP936.TXT" "BIN")
("WINDOWS-CP949" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP949.TXT" "BIN")
("WINDOWS-CP950" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP950.TXT" "BIN")
("WINDOWS-CP1250" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1250.TXT")
("WINDOWS-CP1251" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1251.TXT")
("WINDOWS-CP1252" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT")
("WINDOWS-CP1253" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1253.TXT")
("WINDOWS-CP1254" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1254.TXT")
("WINDOWS-CP1255" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1255.TXT")
("WINDOWS-CP1256" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1256.TXT")
("WINDOWS-CP1257" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1257.TXT")
("WINDOWS-CP1258" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1258.TXT")
;("JISX0201" "http://unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0201.TXT")
;("JISX0212" "http://unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0212.TXT")
;("SHIFT-JIS" "http://unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/SHIFTJIS.TXT")
;Unable to parse because they output more than one Unicode character
;("SJIS-0213" "http://x0213.org/codetable/sjis-0213-2004-std.txt")
;("EUC-JISX0213" "http://x0213.org/codetable/euc-jis")
))
(defun copy-file (in out)
(let ((buffer (make-array 8192 :element-type '(unsigned-byte 8))))
(format t "~%;;; Copying ~A to ~A" in out)
(with-open-file (sin in :direction :input :element-type '(unsigned-byte 8))
(with-open-file (sout out :direction :output :element-type '(unsigned-byte 8)
:if-exists :supersede :if-does-not-exist :create)
(loop for nbytes = (read-sequence buffer sin)
until (zerop nbytes)
do (write-sequence buffer sout :end nbytes))))))
(load (merge-pathnames "tools" *load-pathname*))
(loop for entry in +all-mappings+
for name = (first entry)
for url = (second entry)
for type = (or (third entry) "BIN")
for orig = (make-pathname :name name :type type :defaults "ext:encodings;")
for copy = (ensure-directories-exist (make-pathname :name name :type type :defaults "build:encodings;"))
for orig = (make-pathname :name name :type "BIN" :defaults "ext:encodings;")
for copy = (merge-pathnames "build:encodings;" orig)
do (progn
(unless (probe-file orig)
(generate-mapping name url orig))
(let ((mapping (if (equalp name "JISX0208")
(mapcar #'rest (read-mapping name 3))
(read-mapping name))))
(dump-mapping-array mapping orig)))
(copy-file orig copy)))
(defconstant +aliases+
@ -197,4 +85,7 @@
do (with-open-file (out filename :direction :output :if-exists :supersede
:if-does-not-exist :create :element-type 'base-char)
(format t "~%;;; Creating alias ~A -> ~A, ~A" alias name filename)
(princ name out))))
(format out "(defparameter ext::~A 'ext::~A)" alias name))))
(copy-file "ext:encodings;ISO-2022-JP" "build:encodings;ISO-2022-JP")
(copy-file "ext:encodings;ISO-2022-JP-1" "build:encodings;ISO-2022-JP-1")

View file

@ -0,0 +1,194 @@
;;; -*- Mode: Lisp; Syntax: Common-Lisp; -*-
;;;
;;; Copyright (c) 2009, Juan Jose Garcia-Ripoll
;;;
;;; This program is free software; you can redistribute it and/or
;;; modify it under the terms of the GNU Library General Public
;;; License as published by the Free Software Foundation; either
;;; version 2 of the License, or (at your option) any later version.
;;;
;;; See file '../Copyright' for full details.
(defconstant +sequence-type+ '(unsigned-byte 16))
(defconstant +source-pathname+
(make-pathname :name nil :type nil
:defaults (merge-pathnames "ext:;sources;" *load-pathname*)))
(defconstant +all-mappings+
'(("ATARIST" "http://unicode.org/Public/MAPPINGS/VENDORS/MISC/ATARIST.TXT")
("ISO-8859-1" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-1.TXT")
("ISO-8859-2" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-2.TXT")
("ISO-8859-3" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-3.TXT")
("ISO-8859-4" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-4.TXT")
("ISO-8859-5" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-5.TXT")
("ISO-8859-6" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-6.TXT")
("ISO-8859-7" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-7.TXT")
("ISO-8859-8" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-8.TXT")
("ISO-8859-9" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-9.TXT")
("ISO-8859-10" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-10.TXT")
("ISO-8859-11" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-11.TXT")
("ISO-8859-13" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-13.TXT")
("ISO-8859-14" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-14.TXT")
("ISO-8859-15" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-15.TXT")
("ISO-8859-16" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-16.TXT")
("KOI8-R" "http://unicode.org/Public/MAPPINGS/VENDORS/MISC/KOI8-R.TXT")
("KOI8-U" "http://unicode.org/Public/MAPPINGS/VENDORS/MISC/KOI8-U.TXT")
("CP-856" "http://unicode.org/Public/MAPPINGS/VENDORS/MISC/CP856.TXT")
("CP-856" "http://unicode.org/Public/MAPPINGS/VENDORS/MISC/CP856.TXT")
("DOS-CP437" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP437.TXT")
("DOS-CP737" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP737.TXT")
("DOS-CP775" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP775.TXT")
("DOS-CP850" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP850.TXT")
("DOS-CP852" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP852.TXT")
("DOS-CP855" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP855.TXT")
("DOS-CP857" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP857.TXT")
("DOS-CP860" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP860.TXT")
("DOS-CP861" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP861.TXT")
("DOS-CP862" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP862.TXT")
("DOS-CP863" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP863.TXT")
("DOS-CP864" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP864.TXT")
("DOS-CP865" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP865.TXT")
("DOS-CP866" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP866.TXT")
("DOS-CP869" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP869.TXT")
("DOS-CP874" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP874.TXT")
; Redundant WINDOWS-CP874 DOS-CP874
;("WINDOWS-CP874" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP874.TXT")
("WINDOWS-CP932" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP932.TXT")
("WINDOWS-CP936" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP936.TXT")
("WINDOWS-CP949" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP949.TXT")
("WINDOWS-CP950" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP950.TXT")
("WINDOWS-CP1250" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1250.TXT")
("WINDOWS-CP1251" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1251.TXT")
("WINDOWS-CP1252" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT")
("WINDOWS-CP1253" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1253.TXT")
("WINDOWS-CP1254" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1254.TXT")
("WINDOWS-CP1255" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1255.TXT")
("WINDOWS-CP1256" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1256.TXT")
("WINDOWS-CP1257" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1257.TXT")
("WINDOWS-CP1258" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1258.TXT")
("JISX0201" "http://unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0201.TXT")
("JISX0208" "http://unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0208.TXT"
;; Fixes compatible with libiconv: we replace a reverse solidus with a
;; fullwidth reverse solidus, so that JISX0208 does not contain characters
;; in the ASCII range (Needed by ISO-2022-JP-1)
((#x815F #x2140 #xff3c)))
("JISX0212" "http://unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0212.TXT"
;; Fixes compatible with libiconv: we replace a tilde with a
;; fullwidth tilde, so that JISX0212 does not contain characters
;; in the ASCII range (Needed by ISO-2022-JP-1)
((#x2237 #xff5e)))
("SHIFT-JIS" "http://unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/SHIFTJIS.TXT")
;Unable to parse because they output more than one Unicode character
;("SJIS-0213" "http://x0213.org/codetable/sjis-0213-2004-std.txt")
;("EUC-JISX0213" "http://x0213.org/codetable/euc-jis")
))
(defun download (filename url)
(unless (probe-file filename)
(let ((command (format nil "curl \"~A\" > ~A" url filename)))
(unless (zerop (si::system command))
(error "Unable to retrieve file ~A" url)))))
(defun reformat (line)
(loop with l = (length line)
for i from 0 below l
for c = (char line i)
do (cond ((eql c #\#)
(return (if (zerop i) "" (subseq line 0 (1- i)))))
((not (standard-char-p c))
(setf (char line i) #\space))
((and (eql c #\0)
(let ((j (1+ i)))
(and (< j l) (member (char line j) '(#\x #\X)))))
(setf (char line i) #\#)))
finally (return line)))
(defun read-mapping (name &optional (n 2))
(let* ((source-file (make-pathname :name name :defaults +source-pathname+))
(record (find name +all-mappings+ :key #'first :test #'equalp))
(fixes (third record))
(source-url (fourth record)))
(unless (probe-file source-file)
(unless source-url
(error "Unknown encoding ~A" name))
(download file source-url))
(with-open-file (in source-file :direction :input)
(loop with output = '()
for line = (reformat (read-line in nil nil))
while line
unless (zerop (length line))
do (with-input-from-string (aux line)
(let ((byte-list (loop for byte = (read aux nil nil)
while byte
collect byte)))
(unless (/= (length byte-list) n)
(loop for i in fixes
when (= (first i) (first byte-list))
do (progn (setf byte-list i) (return)))
(push byte-list output))))
finally (return (nreverse output))))))
(defun mapping-hash-table (mapping)
(loop with hash = (make-hash-table :size (floor (* 1.5 (length mapping)))
:test 'eq)
for (multibyte codepoint) in mapping
for unicode-char = (code-char codepoint)
do (progn
(setf (gethash multibyte hash) unicode-char)
(setf (gethash unicode-char hash) multibyte)
(when (> multibyte #xFF)
(setf (gethash (ash multibyte -8) hash) t)))
finally (return hash)))
(defun dump-mapping-array (mapping-assoc output-file)
(let* ((mapping-list (reduce #'nconc mapping-assoc))
(mapping-array (make-array (length mapping-list) :element-type +sequence-type+
:initial-contents mapping-list)))
(format t "~%;;; Generating ~A" output-file)
(force-output t)
(with-open-file (s output-file :direction :output :if-exists :supersede
:element-type +sequence-type+ :external-format :big-endian)
(write-byte (length mapping-array) s)
(write-sequence mapping-array s))))
(defun copy-file (in out)
(let ((buffer (make-array 8192 :element-type '(unsigned-byte 8))))
(format t "~%;;; Copying ~A to ~A" in out)
(with-open-file (sin in :direction :input :element-type '(unsigned-byte 8))
(with-open-file (sout out :direction :output :element-type '(unsigned-byte 8)
:if-exists :supersede :if-does-not-exist :create)
(loop for nbytes = (read-sequence buffer sin)
until (zerop nbytes)
do (write-sequence buffer sout :end nbytes))))))
(defun all-valid-unicode-chars (mapping)
(if (consp mapping)
(loop for sublist on mapping
for i from 0 below 10
until (and (eq sublist mapping) (plusp i))
collect (all-valid-unicode-chars (first sublist)))
(concatenate 'string (loop for key being the hash-key in mapping
when (characterp key)
collect key))))
(defun compare-hashes (h1 h2)
(flet ((h1-in-h2 (h1 h2)
(loop for k being the hash-key in h1 using (hash-value v)
for v2 = (gethash k h2 nil)
unless (or (consp v2) (consp v) (equal v v2))
do (progn (print (list h1 k v h2 k v2))
(error)
(return nil))
finally (return t))))
(and (h1-in-h2 h1 h2)
(h1-in-h2 h2 h1))))

View file

@ -60,6 +60,12 @@
#define ecl_ftello ftello
#endif
/* Maximum number of bytes required to encode a character.
* This currently corresponds to (4 + 2) for the ISO-2022-JP-* encodings
* with 4 being the charset prefix, 2 for the character.
*/
#define ENCODING_BUFFER_MAX_SIZE 6
static cl_index ecl_read_byte8(cl_object stream, unsigned char *c, cl_index n);
static cl_index ecl_write_byte8(cl_object stream, unsigned char *c, cl_index n);
@ -565,7 +571,7 @@ eformat_unread_char(cl_object strm, int c)
}
{
cl_object l = Cnil;
unsigned char buffer[10];
unsigned char buffer[2*ENCODING_BUFFER_MAX_SIZE];
int ndx = 0;
cl_fixnum i = strm->stream.last_code[0];
if (i != EOF) {
@ -598,7 +604,7 @@ eformat_read_char(cl_object strm)
static int
eformat_write_char(cl_object strm, int c)
{
unsigned char buffer[4];
unsigned char buffer[ENCODING_BUFFER_MAX_SIZE];
int nbytes = strm->stream.encoder(strm, buffer, c);
if (nbytes == 0) {
character_size_overflow(strm, c);
@ -926,7 +932,7 @@ ucs_2_encoder(cl_object stream, unsigned char *buffer, int c)
}
/*
* USER DEFINED ENCODINGS
* USER DEFINED ENCODINGS. SIMPLE CASE.
*/
static int
@ -975,6 +981,84 @@ user_encoder(cl_object stream, unsigned char *buffer, int c)
}
}
/*
* USER DEFINED ENCODINGS. SIMPLE CASE.
*/
static int
user_multistate_decoder(cl_object stream, cl_eformat_read_byte8 read_byte8,
cl_object source)
{
cl_object table_list = stream->stream.format_table;
cl_object table = ECL_CONS_CAR(table_list);
cl_object character;
cl_fixnum i, j;
unsigned char buffer[ENCODING_BUFFER_MAX_SIZE];
for (i = j = 0; i < ENCODING_BUFFER_MAX_SIZE; i++) {
if (read_byte8(source, buffer+i, 1) < 1) {
return EOF;
}
j = (j << 8) | buffer[i];
character = ecl_gethash_safe(MAKE_FIXNUM(j), table, Cnil);
if (CHARACTERP(character)) {
return CHAR_CODE(character);
}
if (Null(character)) {
invalid_codepoint(stream, buffer[0]);
}
if (character == Ct) {
/* Need more characters */
continue;
}
if (CONSP(character)) {
/* Changed the state. */
stream->stream.format_table = table_list = character;
table = ECL_CONS_CAR(table_list);
i = j = 0;
continue;
}
break;
}
FEerror("Internal error in decoder table.", 0);
}
static int
user_multistate_encoder(cl_object stream, unsigned char *buffer, int c)
{
cl_object table_list = stream->stream.format_table;
cl_object p = table_list;
do {
cl_object table = ECL_CONS_CAR(p);
cl_object byte = ecl_gethash_safe(CODE_CHAR(c), table, Cnil);
if (!Null(byte)) {
cl_fixnum code = fix(byte);
int n = 0;
if (p != table_list) {
/* Must output a escape sequence */
cl_object x = ecl_gethash_safe(Ct, table, Cnil);
while (!Null(x)) {
buffer[0] = fix(ECL_CONS_CAR(x));
buffer++;
x = ECL_CONS_CDR(x);
n++;
}
stream->stream.format_table = p;
}
if (code > 0xFF) {
buffer[1] = code & 0xFF; code >>= 8;
buffer[0] = code;
return n+2;
} else {
buffer[0] = code;
return n+1;
}
}
p = ECL_CONS_CDR(p);
} while (p != table_list);
/* Exhausted all lists */
return 0;
}
/*
* UTF-8
*/
@ -2880,7 +2964,7 @@ set_stream_elt_type(cl_object stream, cl_fixnum byte_size, int flags,
switch (flags & ECL_STREAM_FORMAT) {
case ECL_STREAM_BINARY:
IO_STREAM_ELT_TYPE(stream) = cl_list(2, t, MAKE_FIXNUM(byte_size));
stream->stream.format = Cnil;
stream->stream.format = t;
stream->stream.ops->read_char = not_character_read_char;
stream->stream.ops->write_char = not_character_write_char;
break;
@ -2944,8 +3028,13 @@ set_stream_elt_type(cl_object stream, cl_fixnum byte_size, int flags,
IO_STREAM_ELT_TYPE(stream) = @'character';
byte_size = 8;
stream->stream.format = stream->stream.format_table;
stream->stream.encoder = user_encoder;
stream->stream.decoder = user_decoder;
if (CONSP(stream->stream.format)) {
stream->stream.encoder = user_multistate_encoder;
stream->stream.decoder = user_multistate_decoder;
} else {
stream->stream.encoder = user_encoder;
stream->stream.decoder = user_decoder;
}
break;
case ECL_STREAM_US_ASCII:
IO_STREAM_ELT_TYPE(stream) = @'base-char';
@ -2967,20 +3056,21 @@ set_stream_elt_type(cl_object stream, cl_fixnum byte_size, int flags,
FEerror("Invalid or unsupported external format ~A with code ~D",
2, external_format, MAKE_FIXNUM(flags));
}
t = @':LF';
if (stream->stream.ops->write_char == eformat_write_char &&
(flags & ECL_STREAM_CR)) {
cl_object key;
if (flags & ECL_STREAM_LF) {
stream->stream.ops->read_char = eformat_read_char_crlf;
stream->stream.ops->write_char = eformat_write_char_crlf;
key = @':CRLF';
t = @':CRLF';
} else {
stream->stream.ops->read_char = eformat_read_char_cr;
stream->stream.ops->write_char = eformat_write_char_cr;
key = @':CR';
t = @':CR';
}
stream->stream.format = cl_list(2, key, stream->stream.format);
}
stream->stream.format = cl_list(2, stream->stream.format, t);
{
cl_object (*read_byte)(cl_object);
void (*write_byte)(cl_object,cl_object);

View file

@ -291,8 +291,8 @@ the one defined in the ANSI standard. *print-base* is 10, *print-array* is t,
(defun ext:load-encoding (name)
(let ((filename (make-pathname :name (symbol-name name) :defaults "sys:encodings;")))
(cond ((probe-file filename)
(with-open-file (s filename :direction :input)
(read s)))
(load filename :verbose nil)
name)
((probe-file (setf filename (make-pathname :type "BIN" :defaults filename)))
(with-open-file (in filename :element-type '(unsigned-byte 16)
:external-format :big-endian)