Added support for more complex character encodings (ISO-2022-JP)
This commit is contained in:
parent
9fbd0a0e93
commit
1bf50bed96
10 changed files with 419 additions and 132 deletions
55
contrib/encodings/ISO-2022-JP
Normal file
55
contrib/encodings/ISO-2022-JP
Normal file
|
|
@ -0,0 +1,55 @@
|
|||
(cl:unless (cl:find-package :ISO-2022-JP)
|
||||
(make-package :ISO-2022-JP))
|
||||
(in-package :ISO-2022-JP)
|
||||
|
||||
(defun compose (bytes)
|
||||
(loop for i in bytes
|
||||
with cum = 0
|
||||
do (setf cum (+ (ash cum 8) i))
|
||||
finally (return cum)))
|
||||
|
||||
(defun mapping-hash-table (sequence &optional (mask 0))
|
||||
(loop with hash = (make-hash-table :size (floor (* 1.5 (length sequence)))
|
||||
:test 'eq)
|
||||
for i from 0 below (length sequence) by 2
|
||||
for multibyte = (elt sequence i)
|
||||
for codepoint = (elt sequence (1+ i))
|
||||
for unicode-char = (code-char codepoint)
|
||||
when (zerop (logand multibyte mask))
|
||||
do (progn
|
||||
(setf (gethash multibyte hash) unicode-char)
|
||||
(setf (gethash unicode-char hash) multibyte)
|
||||
(when (> multibyte #xFF)
|
||||
(setf (gethash (ash multibyte -8) hash) t)))
|
||||
finally (return hash)))
|
||||
|
||||
(defun multimap (escapes tables)
|
||||
(loop for seq in escapes
|
||||
for table in tables
|
||||
for table-cons = (member table tables :test 'eq)
|
||||
do (progn
|
||||
;; Change escape sequence into byte codes
|
||||
(setf seq (mapcar #'char-code seq))
|
||||
;; Store it in the hash table
|
||||
(setf (gethash t table) seq)
|
||||
(loop for other-table in tables
|
||||
do (loop for i from 1
|
||||
for precedings = (butlast seq i)
|
||||
while precedings
|
||||
do (setf (gethash (compose precedings) other-table) t)
|
||||
finally (setf (gethash (compose seq) other-table) table-cons)))))
|
||||
(nconc tables tables))
|
||||
|
||||
(defparameter ext::iso-2022-jp
|
||||
(let* ((ascii-no-esc (mapping-hash-table (loop for i from 0 to 127
|
||||
unless (= i (char-code #\esc))
|
||||
nconc (list i i))))
|
||||
(jis208 (mapping-hash-table (ext::load-encoding :jisx0208) #x8080))
|
||||
(jis201 (mapping-hash-table (ext::load-encoding :jisx0201) #x80)))
|
||||
(multimap '((#\Esc #\( #\B)
|
||||
(#\Esc #\( #\J)
|
||||
(#\Esc #\$ #\@)
|
||||
(#\Esc #\$ #\B))
|
||||
(list ascii-no-esc jis201 jis208 jis208))))
|
||||
|
||||
(delete-package :ISO-2022-JP)
|
||||
57
contrib/encodings/ISO-2022-JP-1
Normal file
57
contrib/encodings/ISO-2022-JP-1
Normal file
|
|
@ -0,0 +1,57 @@
|
|||
(cl:unless (cl:find-package :ISO-2022-JP-1)
|
||||
(make-package :ISO-2022-JP-1))
|
||||
(in-package :ISO-2022-JP-1)
|
||||
|
||||
(defun compose (bytes)
|
||||
(loop for i in bytes
|
||||
with cum = 0
|
||||
do (setf cum (+ (ash cum 8) i))
|
||||
finally (return cum)))
|
||||
|
||||
(defun mapping-hash-table (sequence &optional (mask 0))
|
||||
(loop with hash = (make-hash-table :size (floor (* 1.5 (length sequence)))
|
||||
:test 'eq)
|
||||
for i from 0 below (length sequence) by 2
|
||||
for multibyte = (elt sequence i)
|
||||
for codepoint = (elt sequence (1+ i))
|
||||
for unicode-char = (code-char codepoint)
|
||||
when (zerop (logand multibyte mask))
|
||||
do (progn
|
||||
(setf (gethash multibyte hash) unicode-char)
|
||||
(setf (gethash unicode-char hash) multibyte)
|
||||
(when (> multibyte #xFF)
|
||||
(setf (gethash (ash multibyte -8) hash) t)))
|
||||
finally (return hash)))
|
||||
|
||||
(defun multimap (escapes tables)
|
||||
(loop for seq in escapes
|
||||
for table in tables
|
||||
for table-cons = (member table tables :test 'eq)
|
||||
do (progn
|
||||
;; Change escape sequence into byte codes
|
||||
(setf seq (mapcar #'char-code seq))
|
||||
;; Store it in the hash table
|
||||
(setf (gethash t table) seq)
|
||||
(loop for other-table in tables
|
||||
do (loop for i from 1
|
||||
for precedings = (butlast seq i)
|
||||
while precedings
|
||||
do (setf (gethash (compose precedings) other-table) t)
|
||||
finally (setf (gethash (compose seq) other-table) table-cons)))))
|
||||
(nconc tables tables))
|
||||
|
||||
(defparameter ext::iso-2022-jp-1
|
||||
(let* ((ascii-no-esc (mapping-hash-table (loop for i from 0 to 127
|
||||
unless (= i (char-code #\esc))
|
||||
nconc (list i i))))
|
||||
(jis212 (mapping-hash-table (ext::load-encoding :jisx0212) #x8080))
|
||||
(jis208 (mapping-hash-table (ext::load-encoding :jisx0208) #x8080))
|
||||
(jis201 (mapping-hash-table (ext::load-encoding :jisx0201) #x80)))
|
||||
(multimap '((#\Esc #\( #\B)
|
||||
(#\Esc #\( #\J)
|
||||
(#\Esc #\$ #\@)
|
||||
(#\Esc #\$ #\B)
|
||||
(#\Esc #\$ #\( #\D))
|
||||
(list ascii-no-esc jis201 jis208 jis208 jis212))))
|
||||
|
||||
(delete-package :ISO-2022-JP-1)
|
||||
BIN
contrib/encodings/JISX0201.BIN
Normal file
BIN
contrib/encodings/JISX0201.BIN
Normal file
Binary file not shown.
BIN
contrib/encodings/JISX0208.BIN
Normal file
BIN
contrib/encodings/JISX0208.BIN
Normal file
Binary file not shown.
BIN
contrib/encodings/JISX0212.BIN
Normal file
BIN
contrib/encodings/JISX0212.BIN
Normal file
Binary file not shown.
BIN
contrib/encodings/SHIFT-JIS.BIN
Normal file
BIN
contrib/encodings/SHIFT-JIS.BIN
Normal file
Binary file not shown.
|
|
@ -1,6 +1,6 @@
|
|||
;;; -*- Mode: Lisp; Syntax: Common-Lisp; -*-
|
||||
;;;
|
||||
;;; Copyright (c) 2009, Giuseppe Attardi.
|
||||
;;; Copyright (c) 2009, Juan Jose Garcia-Ripoll
|
||||
;;;
|
||||
;;; This program is free software; you can redistribute it and/or
|
||||
;;; modify it under the terms of the GNU Library General Public
|
||||
|
|
@ -9,130 +9,18 @@
|
|||
;;;
|
||||
;;; See file '../Copyright' for full details.
|
||||
|
||||
(defconstant +sequence-type+ '(unsigned-byte 16))
|
||||
|
||||
(defun read-mapping (url)
|
||||
(let ((command (format nil "curl \"~A\" | sed '/^#.*$/d;s,0x,#x,g;s,U+\\([0-9A-Fa-f]*\\),#x\\1,g;s,#UNDEFINED,NIL # UNDEFINED,g;/LEAD BYTE/d' | grep -v '<reserverd>' | sed 's,# .*$,,g;/#x.*/!d' > tmp.txt" url)))
|
||||
(unless (zerop (si::system command))
|
||||
(error "Unable to retrieve file ~A" url)))
|
||||
(let ((mapping '()))
|
||||
(with-open-file (s "tmp.txt" :direction :input :external-format :utf-8)
|
||||
(loop for line = (read-line s nil nil)
|
||||
while line
|
||||
do (with-input-from-string (aux line)
|
||||
(let ((byte 0)
|
||||
(unicode 0))
|
||||
(when (and (setf byte (read aux nil nil))
|
||||
(setf unicode (read aux nil nil)))
|
||||
(unless (and (typep byte +sequence-type+)
|
||||
(typep unicode +sequence-type+))
|
||||
(error "Sequence type ~A is unable to capture this encoding (codes ~X and ~X found)"
|
||||
+sequence-type+ byte unicode))
|
||||
(setf mapping (list* unicode byte mapping)))))))
|
||||
(unless mapping
|
||||
(error "Error reading file ~A" url))
|
||||
(si::system "rm -f tmp.txt")
|
||||
(print (reduce #'max mapping :initial-value 0))
|
||||
(make-array (length mapping) :element-type +sequence-type+ :initial-contents (nreverse mapping))))
|
||||
|
||||
(defun generate-mapping (name url output-file)
|
||||
(let* ((mapping (read-mapping url)))
|
||||
(format t "~&;;; Generating ~A~%;;; ~Tfrom ~A" output-file url)
|
||||
(force-output t)
|
||||
(if (pathname-type output-file)
|
||||
(with-open-file (s output-file :direction :output :if-exists :supersede
|
||||
:element-type +sequence-type+ :external-format :big-endian)
|
||||
(write-byte (length mapping) s)
|
||||
(write-sequence mapping s))
|
||||
(with-open-file (s output-file :direction :output :if-exists :supersede)
|
||||
(print mapping s)))))
|
||||
|
||||
(defconstant +all-mappings+
|
||||
'(("ATARIST" "http://unicode.org/Public/MAPPINGS/VENDORS/MISC/ATARIST.TXT")
|
||||
|
||||
("ISO-8859-1" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-1.TXT")
|
||||
("ISO-8859-2" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-2.TXT")
|
||||
("ISO-8859-3" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-3.TXT")
|
||||
("ISO-8859-4" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-4.TXT")
|
||||
("ISO-8859-5" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-5.TXT")
|
||||
("ISO-8859-6" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-6.TXT")
|
||||
("ISO-8859-7" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-7.TXT")
|
||||
("ISO-8859-8" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-8.TXT")
|
||||
("ISO-8859-9" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-9.TXT")
|
||||
("ISO-8859-10" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-10.TXT")
|
||||
("ISO-8859-11" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-11.TXT")
|
||||
("ISO-8859-13" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-13.TXT")
|
||||
("ISO-8859-14" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-14.TXT")
|
||||
("ISO-8859-15" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-15.TXT")
|
||||
("ISO-8859-16" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-16.TXT")
|
||||
("KOI8-R" "http://unicode.org/Public/MAPPINGS/VENDORS/MISC/KOI8-R.TXT")
|
||||
("KOI8-U" "http://unicode.org/Public/MAPPINGS/VENDORS/MISC/KOI8-U.TXT")
|
||||
("CP-856" "http://unicode.org/Public/MAPPINGS/VENDORS/MISC/CP856.TXT")
|
||||
("CP-856" "http://unicode.org/Public/MAPPINGS/VENDORS/MISC/CP856.TXT")
|
||||
|
||||
("DOS-CP437" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP437.TXT")
|
||||
("DOS-CP737" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP737.TXT")
|
||||
("DOS-CP775" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP775.TXT")
|
||||
("DOS-CP850" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP850.TXT")
|
||||
("DOS-CP852" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP852.TXT")
|
||||
("DOS-CP855" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP855.TXT")
|
||||
("DOS-CP857" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP857.TXT")
|
||||
("DOS-CP860" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP860.TXT")
|
||||
("DOS-CP861" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP861.TXT")
|
||||
("DOS-CP862" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP862.TXT")
|
||||
("DOS-CP863" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP863.TXT")
|
||||
("DOS-CP864" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP864.TXT")
|
||||
("DOS-CP865" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP865.TXT")
|
||||
("DOS-CP866" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP866.TXT")
|
||||
("DOS-CP869" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP869.TXT")
|
||||
("DOS-CP874" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP874.TXT")
|
||||
|
||||
; Redundant WINDOWS-CP874 DOS-CP874
|
||||
;("WINDOWS-CP874" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP874.TXT")
|
||||
|
||||
("WINDOWS-CP932" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP932.TXT" "BIN")
|
||||
("WINDOWS-CP936" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP936.TXT" "BIN")
|
||||
("WINDOWS-CP949" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP949.TXT" "BIN")
|
||||
("WINDOWS-CP950" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP950.TXT" "BIN")
|
||||
|
||||
("WINDOWS-CP1250" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1250.TXT")
|
||||
("WINDOWS-CP1251" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1251.TXT")
|
||||
("WINDOWS-CP1252" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT")
|
||||
("WINDOWS-CP1253" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1253.TXT")
|
||||
("WINDOWS-CP1254" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1254.TXT")
|
||||
("WINDOWS-CP1255" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1255.TXT")
|
||||
("WINDOWS-CP1256" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1256.TXT")
|
||||
("WINDOWS-CP1257" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1257.TXT")
|
||||
("WINDOWS-CP1258" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1258.TXT")
|
||||
|
||||
;("JISX0201" "http://unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0201.TXT")
|
||||
;("JISX0212" "http://unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0212.TXT")
|
||||
;("SHIFT-JIS" "http://unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/SHIFTJIS.TXT")
|
||||
|
||||
;Unable to parse because they output more than one Unicode character
|
||||
;("SJIS-0213" "http://x0213.org/codetable/sjis-0213-2004-std.txt")
|
||||
;("EUC-JISX0213" "http://x0213.org/codetable/euc-jis")
|
||||
))
|
||||
|
||||
(defun copy-file (in out)
|
||||
(let ((buffer (make-array 8192 :element-type '(unsigned-byte 8))))
|
||||
(format t "~%;;; Copying ~A to ~A" in out)
|
||||
(with-open-file (sin in :direction :input :element-type '(unsigned-byte 8))
|
||||
(with-open-file (sout out :direction :output :element-type '(unsigned-byte 8)
|
||||
:if-exists :supersede :if-does-not-exist :create)
|
||||
(loop for nbytes = (read-sequence buffer sin)
|
||||
until (zerop nbytes)
|
||||
do (write-sequence buffer sout :end nbytes))))))
|
||||
(load (merge-pathnames "tools" *load-pathname*))
|
||||
|
||||
(loop for entry in +all-mappings+
|
||||
for name = (first entry)
|
||||
for url = (second entry)
|
||||
for type = (or (third entry) "BIN")
|
||||
for orig = (make-pathname :name name :type type :defaults "ext:encodings;")
|
||||
for copy = (ensure-directories-exist (make-pathname :name name :type type :defaults "build:encodings;"))
|
||||
for orig = (make-pathname :name name :type "BIN" :defaults "ext:encodings;")
|
||||
for copy = (merge-pathnames "build:encodings;" orig)
|
||||
do (progn
|
||||
(unless (probe-file orig)
|
||||
(generate-mapping name url orig))
|
||||
(let ((mapping (if (equalp name "JISX0208")
|
||||
(mapcar #'rest (read-mapping name 3))
|
||||
(read-mapping name))))
|
||||
(dump-mapping-array mapping orig)))
|
||||
(copy-file orig copy)))
|
||||
|
||||
(defconstant +aliases+
|
||||
|
|
@ -197,4 +85,7 @@
|
|||
do (with-open-file (out filename :direction :output :if-exists :supersede
|
||||
:if-does-not-exist :create :element-type 'base-char)
|
||||
(format t "~%;;; Creating alias ~A -> ~A, ~A" alias name filename)
|
||||
(princ name out))))
|
||||
(format out "(defparameter ext::~A 'ext::~A)" alias name))))
|
||||
|
||||
(copy-file "ext:encodings;ISO-2022-JP" "build:encodings;ISO-2022-JP")
|
||||
(copy-file "ext:encodings;ISO-2022-JP-1" "build:encodings;ISO-2022-JP-1")
|
||||
|
|
|
|||
194
contrib/encodings/tools.lisp
Normal file
194
contrib/encodings/tools.lisp
Normal file
|
|
@ -0,0 +1,194 @@
|
|||
;;; -*- Mode: Lisp; Syntax: Common-Lisp; -*-
|
||||
;;;
|
||||
;;; Copyright (c) 2009, Juan Jose Garcia-Ripoll
|
||||
;;;
|
||||
;;; This program is free software; you can redistribute it and/or
|
||||
;;; modify it under the terms of the GNU Library General Public
|
||||
;;; License as published by the Free Software Foundation; either
|
||||
;;; version 2 of the License, or (at your option) any later version.
|
||||
;;;
|
||||
;;; See file '../Copyright' for full details.
|
||||
|
||||
(defconstant +sequence-type+ '(unsigned-byte 16))
|
||||
|
||||
(defconstant +source-pathname+
|
||||
(make-pathname :name nil :type nil
|
||||
:defaults (merge-pathnames "ext:;sources;" *load-pathname*)))
|
||||
|
||||
(defconstant +all-mappings+
|
||||
'(("ATARIST" "http://unicode.org/Public/MAPPINGS/VENDORS/MISC/ATARIST.TXT")
|
||||
|
||||
("ISO-8859-1" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-1.TXT")
|
||||
("ISO-8859-2" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-2.TXT")
|
||||
("ISO-8859-3" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-3.TXT")
|
||||
("ISO-8859-4" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-4.TXT")
|
||||
("ISO-8859-5" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-5.TXT")
|
||||
("ISO-8859-6" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-6.TXT")
|
||||
("ISO-8859-7" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-7.TXT")
|
||||
("ISO-8859-8" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-8.TXT")
|
||||
("ISO-8859-9" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-9.TXT")
|
||||
("ISO-8859-10" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-10.TXT")
|
||||
("ISO-8859-11" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-11.TXT")
|
||||
("ISO-8859-13" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-13.TXT")
|
||||
("ISO-8859-14" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-14.TXT")
|
||||
("ISO-8859-15" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-15.TXT")
|
||||
("ISO-8859-16" "http://unicode.org/Public/MAPPINGS/ISO8859/8859-16.TXT")
|
||||
("KOI8-R" "http://unicode.org/Public/MAPPINGS/VENDORS/MISC/KOI8-R.TXT")
|
||||
("KOI8-U" "http://unicode.org/Public/MAPPINGS/VENDORS/MISC/KOI8-U.TXT")
|
||||
("CP-856" "http://unicode.org/Public/MAPPINGS/VENDORS/MISC/CP856.TXT")
|
||||
("CP-856" "http://unicode.org/Public/MAPPINGS/VENDORS/MISC/CP856.TXT")
|
||||
|
||||
("DOS-CP437" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP437.TXT")
|
||||
("DOS-CP737" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP737.TXT")
|
||||
("DOS-CP775" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP775.TXT")
|
||||
("DOS-CP850" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP850.TXT")
|
||||
("DOS-CP852" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP852.TXT")
|
||||
("DOS-CP855" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP855.TXT")
|
||||
("DOS-CP857" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP857.TXT")
|
||||
("DOS-CP860" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP860.TXT")
|
||||
("DOS-CP861" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP861.TXT")
|
||||
("DOS-CP862" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP862.TXT")
|
||||
("DOS-CP863" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP863.TXT")
|
||||
("DOS-CP864" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP864.TXT")
|
||||
("DOS-CP865" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP865.TXT")
|
||||
("DOS-CP866" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP866.TXT")
|
||||
("DOS-CP869" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP869.TXT")
|
||||
("DOS-CP874" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP874.TXT")
|
||||
|
||||
; Redundant WINDOWS-CP874 DOS-CP874
|
||||
;("WINDOWS-CP874" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP874.TXT")
|
||||
|
||||
("WINDOWS-CP932" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP932.TXT")
|
||||
("WINDOWS-CP936" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP936.TXT")
|
||||
("WINDOWS-CP949" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP949.TXT")
|
||||
("WINDOWS-CP950" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP950.TXT")
|
||||
|
||||
("WINDOWS-CP1250" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1250.TXT")
|
||||
("WINDOWS-CP1251" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1251.TXT")
|
||||
("WINDOWS-CP1252" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT")
|
||||
("WINDOWS-CP1253" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1253.TXT")
|
||||
("WINDOWS-CP1254" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1254.TXT")
|
||||
("WINDOWS-CP1255" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1255.TXT")
|
||||
("WINDOWS-CP1256" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1256.TXT")
|
||||
("WINDOWS-CP1257" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1257.TXT")
|
||||
("WINDOWS-CP1258" "http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1258.TXT")
|
||||
|
||||
("JISX0201" "http://unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0201.TXT")
|
||||
("JISX0208" "http://unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0208.TXT"
|
||||
;; Fixes compatible with libiconv: we replace a reverse solidus with a
|
||||
;; fullwidth reverse solidus, so that JISX0208 does not contain characters
|
||||
;; in the ASCII range (Needed by ISO-2022-JP-1)
|
||||
((#x815F #x2140 #xff3c)))
|
||||
|
||||
("JISX0212" "http://unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0212.TXT"
|
||||
;; Fixes compatible with libiconv: we replace a tilde with a
|
||||
;; fullwidth tilde, so that JISX0212 does not contain characters
|
||||
;; in the ASCII range (Needed by ISO-2022-JP-1)
|
||||
((#x2237 #xff5e)))
|
||||
|
||||
("SHIFT-JIS" "http://unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/SHIFTJIS.TXT")
|
||||
|
||||
;Unable to parse because they output more than one Unicode character
|
||||
;("SJIS-0213" "http://x0213.org/codetable/sjis-0213-2004-std.txt")
|
||||
;("EUC-JISX0213" "http://x0213.org/codetable/euc-jis")
|
||||
))
|
||||
|
||||
(defun download (filename url)
|
||||
(unless (probe-file filename)
|
||||
(let ((command (format nil "curl \"~A\" > ~A" url filename)))
|
||||
(unless (zerop (si::system command))
|
||||
(error "Unable to retrieve file ~A" url)))))
|
||||
|
||||
(defun reformat (line)
|
||||
(loop with l = (length line)
|
||||
for i from 0 below l
|
||||
for c = (char line i)
|
||||
do (cond ((eql c #\#)
|
||||
(return (if (zerop i) "" (subseq line 0 (1- i)))))
|
||||
((not (standard-char-p c))
|
||||
(setf (char line i) #\space))
|
||||
((and (eql c #\0)
|
||||
(let ((j (1+ i)))
|
||||
(and (< j l) (member (char line j) '(#\x #\X)))))
|
||||
(setf (char line i) #\#)))
|
||||
finally (return line)))
|
||||
|
||||
(defun read-mapping (name &optional (n 2))
|
||||
(let* ((source-file (make-pathname :name name :defaults +source-pathname+))
|
||||
(record (find name +all-mappings+ :key #'first :test #'equalp))
|
||||
(fixes (third record))
|
||||
(source-url (fourth record)))
|
||||
(unless (probe-file source-file)
|
||||
(unless source-url
|
||||
(error "Unknown encoding ~A" name))
|
||||
(download file source-url))
|
||||
(with-open-file (in source-file :direction :input)
|
||||
(loop with output = '()
|
||||
for line = (reformat (read-line in nil nil))
|
||||
while line
|
||||
unless (zerop (length line))
|
||||
do (with-input-from-string (aux line)
|
||||
(let ((byte-list (loop for byte = (read aux nil nil)
|
||||
while byte
|
||||
collect byte)))
|
||||
(unless (/= (length byte-list) n)
|
||||
(loop for i in fixes
|
||||
when (= (first i) (first byte-list))
|
||||
do (progn (setf byte-list i) (return)))
|
||||
(push byte-list output))))
|
||||
finally (return (nreverse output))))))
|
||||
|
||||
(defun mapping-hash-table (mapping)
|
||||
(loop with hash = (make-hash-table :size (floor (* 1.5 (length mapping)))
|
||||
:test 'eq)
|
||||
for (multibyte codepoint) in mapping
|
||||
for unicode-char = (code-char codepoint)
|
||||
do (progn
|
||||
(setf (gethash multibyte hash) unicode-char)
|
||||
(setf (gethash unicode-char hash) multibyte)
|
||||
(when (> multibyte #xFF)
|
||||
(setf (gethash (ash multibyte -8) hash) t)))
|
||||
finally (return hash)))
|
||||
|
||||
(defun dump-mapping-array (mapping-assoc output-file)
|
||||
(let* ((mapping-list (reduce #'nconc mapping-assoc))
|
||||
(mapping-array (make-array (length mapping-list) :element-type +sequence-type+
|
||||
:initial-contents mapping-list)))
|
||||
(format t "~%;;; Generating ~A" output-file)
|
||||
(force-output t)
|
||||
(with-open-file (s output-file :direction :output :if-exists :supersede
|
||||
:element-type +sequence-type+ :external-format :big-endian)
|
||||
(write-byte (length mapping-array) s)
|
||||
(write-sequence mapping-array s))))
|
||||
|
||||
(defun copy-file (in out)
|
||||
(let ((buffer (make-array 8192 :element-type '(unsigned-byte 8))))
|
||||
(format t "~%;;; Copying ~A to ~A" in out)
|
||||
(with-open-file (sin in :direction :input :element-type '(unsigned-byte 8))
|
||||
(with-open-file (sout out :direction :output :element-type '(unsigned-byte 8)
|
||||
:if-exists :supersede :if-does-not-exist :create)
|
||||
(loop for nbytes = (read-sequence buffer sin)
|
||||
until (zerop nbytes)
|
||||
do (write-sequence buffer sout :end nbytes))))))
|
||||
|
||||
(defun all-valid-unicode-chars (mapping)
|
||||
(if (consp mapping)
|
||||
(loop for sublist on mapping
|
||||
for i from 0 below 10
|
||||
until (and (eq sublist mapping) (plusp i))
|
||||
collect (all-valid-unicode-chars (first sublist)))
|
||||
(concatenate 'string (loop for key being the hash-key in mapping
|
||||
when (characterp key)
|
||||
collect key))))
|
||||
|
||||
(defun compare-hashes (h1 h2)
|
||||
(flet ((h1-in-h2 (h1 h2)
|
||||
(loop for k being the hash-key in h1 using (hash-value v)
|
||||
for v2 = (gethash k h2 nil)
|
||||
unless (or (consp v2) (consp v) (equal v v2))
|
||||
do (progn (print (list h1 k v h2 k v2))
|
||||
(error)
|
||||
(return nil))
|
||||
finally (return t))))
|
||||
(and (h1-in-h2 h1 h2)
|
||||
(h1-in-h2 h2 h1))))
|
||||
108
src/c/file.d
108
src/c/file.d
|
|
@ -60,6 +60,12 @@
|
|||
#define ecl_ftello ftello
|
||||
#endif
|
||||
|
||||
/* Maximum number of bytes required to encode a character.
|
||||
* This currently corresponds to (4 + 2) for the ISO-2022-JP-* encodings
|
||||
* with 4 being the charset prefix, 2 for the character.
|
||||
*/
|
||||
#define ENCODING_BUFFER_MAX_SIZE 6
|
||||
|
||||
static cl_index ecl_read_byte8(cl_object stream, unsigned char *c, cl_index n);
|
||||
static cl_index ecl_write_byte8(cl_object stream, unsigned char *c, cl_index n);
|
||||
|
||||
|
|
@ -565,7 +571,7 @@ eformat_unread_char(cl_object strm, int c)
|
|||
}
|
||||
{
|
||||
cl_object l = Cnil;
|
||||
unsigned char buffer[10];
|
||||
unsigned char buffer[2*ENCODING_BUFFER_MAX_SIZE];
|
||||
int ndx = 0;
|
||||
cl_fixnum i = strm->stream.last_code[0];
|
||||
if (i != EOF) {
|
||||
|
|
@ -598,7 +604,7 @@ eformat_read_char(cl_object strm)
|
|||
static int
|
||||
eformat_write_char(cl_object strm, int c)
|
||||
{
|
||||
unsigned char buffer[4];
|
||||
unsigned char buffer[ENCODING_BUFFER_MAX_SIZE];
|
||||
int nbytes = strm->stream.encoder(strm, buffer, c);
|
||||
if (nbytes == 0) {
|
||||
character_size_overflow(strm, c);
|
||||
|
|
@ -926,7 +932,7 @@ ucs_2_encoder(cl_object stream, unsigned char *buffer, int c)
|
|||
}
|
||||
|
||||
/*
|
||||
* USER DEFINED ENCODINGS
|
||||
* USER DEFINED ENCODINGS. SIMPLE CASE.
|
||||
*/
|
||||
|
||||
static int
|
||||
|
|
@ -975,6 +981,84 @@ user_encoder(cl_object stream, unsigned char *buffer, int c)
|
|||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* USER DEFINED ENCODINGS. SIMPLE CASE.
|
||||
*/
|
||||
|
||||
static int
|
||||
user_multistate_decoder(cl_object stream, cl_eformat_read_byte8 read_byte8,
|
||||
cl_object source)
|
||||
{
|
||||
cl_object table_list = stream->stream.format_table;
|
||||
cl_object table = ECL_CONS_CAR(table_list);
|
||||
cl_object character;
|
||||
cl_fixnum i, j;
|
||||
unsigned char buffer[ENCODING_BUFFER_MAX_SIZE];
|
||||
for (i = j = 0; i < ENCODING_BUFFER_MAX_SIZE; i++) {
|
||||
if (read_byte8(source, buffer+i, 1) < 1) {
|
||||
return EOF;
|
||||
}
|
||||
j = (j << 8) | buffer[i];
|
||||
character = ecl_gethash_safe(MAKE_FIXNUM(j), table, Cnil);
|
||||
if (CHARACTERP(character)) {
|
||||
return CHAR_CODE(character);
|
||||
}
|
||||
if (Null(character)) {
|
||||
invalid_codepoint(stream, buffer[0]);
|
||||
}
|
||||
if (character == Ct) {
|
||||
/* Need more characters */
|
||||
continue;
|
||||
}
|
||||
if (CONSP(character)) {
|
||||
/* Changed the state. */
|
||||
stream->stream.format_table = table_list = character;
|
||||
table = ECL_CONS_CAR(table_list);
|
||||
i = j = 0;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
FEerror("Internal error in decoder table.", 0);
|
||||
}
|
||||
|
||||
static int
|
||||
user_multistate_encoder(cl_object stream, unsigned char *buffer, int c)
|
||||
{
|
||||
cl_object table_list = stream->stream.format_table;
|
||||
cl_object p = table_list;
|
||||
do {
|
||||
cl_object table = ECL_CONS_CAR(p);
|
||||
cl_object byte = ecl_gethash_safe(CODE_CHAR(c), table, Cnil);
|
||||
if (!Null(byte)) {
|
||||
cl_fixnum code = fix(byte);
|
||||
int n = 0;
|
||||
if (p != table_list) {
|
||||
/* Must output a escape sequence */
|
||||
cl_object x = ecl_gethash_safe(Ct, table, Cnil);
|
||||
while (!Null(x)) {
|
||||
buffer[0] = fix(ECL_CONS_CAR(x));
|
||||
buffer++;
|
||||
x = ECL_CONS_CDR(x);
|
||||
n++;
|
||||
}
|
||||
stream->stream.format_table = p;
|
||||
}
|
||||
if (code > 0xFF) {
|
||||
buffer[1] = code & 0xFF; code >>= 8;
|
||||
buffer[0] = code;
|
||||
return n+2;
|
||||
} else {
|
||||
buffer[0] = code;
|
||||
return n+1;
|
||||
}
|
||||
}
|
||||
p = ECL_CONS_CDR(p);
|
||||
} while (p != table_list);
|
||||
/* Exhausted all lists */
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* UTF-8
|
||||
*/
|
||||
|
|
@ -2880,7 +2964,7 @@ set_stream_elt_type(cl_object stream, cl_fixnum byte_size, int flags,
|
|||
switch (flags & ECL_STREAM_FORMAT) {
|
||||
case ECL_STREAM_BINARY:
|
||||
IO_STREAM_ELT_TYPE(stream) = cl_list(2, t, MAKE_FIXNUM(byte_size));
|
||||
stream->stream.format = Cnil;
|
||||
stream->stream.format = t;
|
||||
stream->stream.ops->read_char = not_character_read_char;
|
||||
stream->stream.ops->write_char = not_character_write_char;
|
||||
break;
|
||||
|
|
@ -2944,8 +3028,13 @@ set_stream_elt_type(cl_object stream, cl_fixnum byte_size, int flags,
|
|||
IO_STREAM_ELT_TYPE(stream) = @'character';
|
||||
byte_size = 8;
|
||||
stream->stream.format = stream->stream.format_table;
|
||||
stream->stream.encoder = user_encoder;
|
||||
stream->stream.decoder = user_decoder;
|
||||
if (CONSP(stream->stream.format)) {
|
||||
stream->stream.encoder = user_multistate_encoder;
|
||||
stream->stream.decoder = user_multistate_decoder;
|
||||
} else {
|
||||
stream->stream.encoder = user_encoder;
|
||||
stream->stream.decoder = user_decoder;
|
||||
}
|
||||
break;
|
||||
case ECL_STREAM_US_ASCII:
|
||||
IO_STREAM_ELT_TYPE(stream) = @'base-char';
|
||||
|
|
@ -2967,20 +3056,21 @@ set_stream_elt_type(cl_object stream, cl_fixnum byte_size, int flags,
|
|||
FEerror("Invalid or unsupported external format ~A with code ~D",
|
||||
2, external_format, MAKE_FIXNUM(flags));
|
||||
}
|
||||
t = @':LF';
|
||||
if (stream->stream.ops->write_char == eformat_write_char &&
|
||||
(flags & ECL_STREAM_CR)) {
|
||||
cl_object key;
|
||||
if (flags & ECL_STREAM_LF) {
|
||||
stream->stream.ops->read_char = eformat_read_char_crlf;
|
||||
stream->stream.ops->write_char = eformat_write_char_crlf;
|
||||
key = @':CRLF';
|
||||
t = @':CRLF';
|
||||
} else {
|
||||
stream->stream.ops->read_char = eformat_read_char_cr;
|
||||
stream->stream.ops->write_char = eformat_write_char_cr;
|
||||
key = @':CR';
|
||||
t = @':CR';
|
||||
}
|
||||
stream->stream.format = cl_list(2, key, stream->stream.format);
|
||||
}
|
||||
stream->stream.format = cl_list(2, stream->stream.format, t);
|
||||
{
|
||||
cl_object (*read_byte)(cl_object);
|
||||
void (*write_byte)(cl_object,cl_object);
|
||||
|
|
|
|||
|
|
@ -291,8 +291,8 @@ the one defined in the ANSI standard. *print-base* is 10, *print-array* is t,
|
|||
(defun ext:load-encoding (name)
|
||||
(let ((filename (make-pathname :name (symbol-name name) :defaults "sys:encodings;")))
|
||||
(cond ((probe-file filename)
|
||||
(with-open-file (s filename :direction :input)
|
||||
(read s)))
|
||||
(load filename :verbose nil)
|
||||
name)
|
||||
((probe-file (setf filename (make-pathname :type "BIN" :defaults filename)))
|
||||
(with-open-file (in filename :element-type '(unsigned-byte 16)
|
||||
:external-format :big-endian)
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue