[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Emacs-diffs] Changes to emacs/lisp/international/utf-8.el
From: |
Kenichi Handa |
Subject: |
[Emacs-diffs] Changes to emacs/lisp/international/utf-8.el |
Date: |
Mon, 30 Sep 2002 02:35:14 -0400 |
Index: emacs/lisp/international/utf-8.el
diff -c emacs/lisp/international/utf-8.el:1.18
emacs/lisp/international/utf-8.el:1.19
*** emacs/lisp/international/utf-8.el:1.18 Wed Sep 11 16:57:50 2002
--- emacs/lisp/international/utf-8.el Mon Sep 30 02:35:13 2002
***************
*** 46,57 ****
;; Fixme: note that reading and writing invalid utf-8 may not be
;; idempotent -- to represent the bytes to fix that needs a new charset.
;;
! ;; Characters from other character sets can be encoded with
! ;; mule-utf-8 by populating the table `ucs-mule-to-mule-unicode' and
! ;; registering the translation with `register-char-codings'. Hash
! ;; tables `utf-8-subst-table' and `utf-8-subst-rev-table' are used to
! ;; support encoding and decoding of about a quarter of the CJK space
! ;; between U+3400 and U+DFFF.
;; UTF-8 is defined in RFC 2279. A sketch of the encoding is:
--- 46,58 ----
;; Fixme: note that reading and writing invalid utf-8 may not be
;; idempotent -- to represent the bytes to fix that needs a new charset.
;;
! ;; Characters from other character sets can be encoded with mule-utf-8
! ;; by populating the translation-table
! ;; `utf-translation-table-for-encode' and registering the translation
! ;; with `register-char-codings'. Hash tables
! ;; `utf-subst-table-for-decode' and `utf-subst-table-for-encode' are
! ;; used to support encoding and decoding of about a quarter of the CJK
! ;; space between U+3400 and U+DFFF.
;; UTF-8 is defined in RFC 2279. A sketch of the encoding is:
***************
*** 64,97 ****
;;; Code:
! (defvar ucs-mule-to-mule-unicode (make-translation-table)
! "Translation table for encoding to `mule-utf-8'.")
! (define-translation-table 'ucs-mule-to-mule-unicode
! ucs-mule-to-mule-unicode)
!
! (defvar utf-8-subst-table (make-hash-table :test 'eq))
! (defvar utf-8-subst-rev-table (make-hash-table :test 'eq))
! (define-translation-hash-table 'utf-8-subst-table utf-8-subst-table)
! (define-translation-hash-table 'utf-8-subst-rev-table utf-8-subst-rev-table)
!
! (defvar utf-8-translation-table-for-decode (make-translation-table)
! "Translation table applied after decoding utf-8 to mule-unicode.
! This is only actually applied to characters which would normally be
! decoded into mule-unicode-0100-24ff.")
! (define-translation-table 'utf-8-translation-table-for-decode
! utf-8-translation-table-for-decode)
;; Map Cyrillic and Greek to iso-8859 charsets, which take half the
;; space of mule-unicode. For Latin scripts this isn't very
;; important. Hebrew and Arabic might go here too when there's proper
;; support for them.
! (defvar utf-8-fragmentation-table (make-translation-table)
! "Char table normally mapping non-Latin mule-unicode-... characters to
iso8859.
! Used as the value of `utf-8-translation-table-for-decode' in
! `utf-8-fragment-on-decoding' mode.")
(mapc
(lambda (pair)
! (aset utf-8-fragmentation-table (car pair) (cdr pair)))
'((?$,1&d(B . ?,F4(B) (?$,1&e(B . ?,F5(B) (?$,1&f(B . ?,F6(B)
(?$,1&h(B . ?,F8(B) (?$,1&i(B . ?,F9(B)
(?$,1&j(B . ?,F:(B) (?$,1&l(B . ?,F<(B) (?$,1&n(B . ?,F>(B)
(?$,1&o(B . ?,F?(B) (?$,1&p(B . ?,address@hidden(B)
(?$,1&q(B . ?,FA(B) (?$,1&r(B . ?,FB(B) (?$,1&s(B . ?,FC(B)
(?$,1&t(B . ?,FD(B) (?$,1&u(B . ?,FE(B)
--- 65,122 ----
;;; Code:
! (defvar ucs-mule-to-mule-unicode (make-char-table 'translation-table nil)
! "Char table mapping characters to latin-iso8859-1 or mule-unicode-*.
!
! If `unify-8859-on-encoding-mode' is non-nil, this table populates the
! translation-table named `utf-translation-table-for-encode'.")
!
! (define-translation-table 'utf-translation-table-for-encode)
!
;; Map Cyrillic and Greek to iso-8859 charsets, which take half the
;; space of mule-unicode. For Latin scripts this isn't very
;; important. Hebrew and Arabic might go here too when there's proper
;; support for them.
!
! (defvar utf-fragmentation-table (make-char-table 'translation-table nil)
! "Char-table normally mapping non-Latin mule-unicode-* chars to iso-8859-*.
!
! If `utf-fragment-on-decoding' is non-nil, this table populates the
! translation-table named `utf-translation-table-for-decode'")
!
! (defvar utf-defragmentation-table (make-char-table 'translation-table nil)
! "Char-table for reverse mapping of `utf-fragmentation-table'.
!
! If `utf-fragment-on-decoding' is non-nil and
! `unify-8859-on-encoding-mode' is nil, this table populates the
! translation-table named `utf-translation-table-for-encode'")
!
! (define-translation-table 'utf-translation-table-for-decode)
!
!
! (defvar ucs-mule-cjk-to-unicode (make-hash-table :test 'eq)
! "Hash table mapping Emacs CJK character sets to Unicode code points.
!
! If `utf-translate-cjk' is non-nil, this table populates the
! translation-hash-table named `utf-subst-table-for-encode'.")
!
! (define-translation-hash-table 'utf-subst-table-for-encode
! (make-hash-table :test 'eq))
!
! (defvar ucs-unicode-to-mule-cjk (make-hash-table :test 'eq)
! "Hash table mapping Unicode code points to Emacs CJK character sets.
!
! If `utf-translate-cjk' is non-nil, this table populates the
! translation-hash-table named `utf-subst-table-for-decode'.")
!
! (define-translation-hash-table 'utf-subst-table-for-decode
! (make-hash-table :test 'eq))
!
(mapc
(lambda (pair)
! (aset utf-fragmentation-table (car pair) (cdr pair))
! (aset utf-defragmentation-table (cdr pair) (car pair)))
'((?$,1&d(B . ?,F4(B) (?$,1&e(B . ?,F5(B) (?$,1&f(B . ?,F6(B)
(?$,1&h(B . ?,F8(B) (?$,1&i(B . ?,F9(B)
(?$,1&j(B . ?,F:(B) (?$,1&l(B . ?,F<(B) (?$,1&n(B . ?,F>(B)
(?$,1&o(B . ?,F?(B) (?$,1&p(B . ?,address@hidden(B)
(?$,1&q(B . ?,FA(B) (?$,1&r(B . ?,FB(B) (?$,1&s(B . ?,FC(B)
(?$,1&t(B . ?,FD(B) (?$,1&u(B . ?,FE(B)
***************
*** 128,135 ****
(?$,1(w(B . ?,Lw(B) (?$,1(x(B . ?,Lx(B) (?$,1(y(B . ?,Ly(B)
(?$,1(z(B . ?,Lz(B) (?$,1({(B . ?,L{(B)
(?$,1(|(B . ?,L|(B) (?$,1(~(B . ?,L~(B) (?$,1((B . ?,L(B)))
! (defcustom utf-8-fragment-on-decoding nil
! "Whether or not to decode some scripts in UTF-8 text into iso8859 charsets.
Setting this means that the relevant Cyrillic and Greek characters are
decoded into the iso8859 charsets rather than into
mule-unicode-0100-24ff. The iso8859 charsets take half as much space
--- 153,161 ----
(?$,1(w(B . ?,Lw(B) (?$,1(x(B . ?,Lx(B) (?$,1(y(B . ?,Ly(B)
(?$,1(z(B . ?,Lz(B) (?$,1({(B . ?,L{(B)
(?$,1(|(B . ?,L|(B) (?$,1(~(B . ?,L~(B) (?$,1((B . ?,L(B)))
!
! (defcustom utf-fragment-on-decoding nil
! "Whether or not to decode some chars in UTF-8/16 text into iso8859 charsets.
Setting this means that the relevant Cyrillic and Greek characters are
decoded into the iso8859 charsets rather than into
mule-unicode-0100-24ff. The iso8859 charsets take half as much space
***************
*** 140,179 ****
Setting this variable outside customize has no effect."
:set (lambda (s v)
! (setq utf-8-translation-table-for-decode
! (if v
! utf-8-fragmentation-table
! (make-translation-table)))
! (define-translation-table 'utf-8-translation-table-for-decode
! utf-8-translation-table-for-decode)
(set-default s v))
:version "21.4"
:type 'boolean
:group 'mule)
! (defcustom utf-8-translate-cjk nil
! "Whether the `mule-utf-8' coding system should encode many CJK characters.
! Enabling this loads tables which enable the coding system to encode
! characters in the charsets `korean-ksc5601', `chinese-gb2312' and
`japanese-jisx0208', and to decode the corresponding unicodes into
such characters. This works by loading the library `utf-8-subst'; see
its commentary. The tables are fairly large (about 33000 entries), so this
option is not the default."
:link '(emacs-commentary-link "utf-8-subst")
:set (lambda (s v)
! (when v
! (require 'utf-8-subst)
! (let ((table (make-char-table 'translation-table)))
! (coding-system-put 'mule-utf-8 'safe-charsets
! (append (coding-system-get 'mule-utf-8
! 'safe-charsets)
! '(korean-ksc5601 chinese-gb2312
! japanese-jisx0208)))
! (maphash (lambda (k v)
! (aset table k v))
! utf-8-subst-rev-table)
! (register-char-codings 'mule-utf-8 table)))
(set-default s v))
:version "21.4"
:type 'boolean
--- 166,246 ----
Setting this variable outside customize has no effect."
:set (lambda (s v)
! (if v
! (progn
! (define-translation-table 'utf-translation-table-for-decode
! utf-fragmentation-table)
! ;; Even if unify-8859-on-encoding-mode is off, make
! ;; mule-utf-* encode characters in
! ;; utf-fragmentation-table.
! (unless (eq (get 'utf-translation-table-for-encode
! 'translation-table)
! ucs-mule-to-mule-unicode)
! (define-translation-table 'utf-translation-table-for-encode
! utf-defragmentation-table)
! (dolist (coding '(mule-utf-8 mule-utf-16-be mule-utf-16-le))
! (register-char-codings coding utf-defragmentation-table))))
! (define-translation-table 'utf-translation-table-for-decode)
! ;; When unify-8859-on-encoding-mode is off, be sure to make
! ;; mule-utf-* disabled for characters in
! ;; utf-fragmentation-table.
! (unless (eq (get 'utf-translation-table-for-encode
! 'translation-table)
! ucs-mule-to-mule-unicode)
! (define-translation-table 'utf-translation-table-for-encode)
! (map-char-table
! (lambda (key val)
! (if (and (>= key 128) val)
! (aset char-coding-system-table key
! (delq 'mule-utf-8
! (delq 'mule-utf-16-le
! (delq 'mule-utf-16-be
! (aref char-coding-system-table
! key)))))))
! utf-defragmentation-table)))
(set-default s v))
:version "21.4"
:type 'boolean
:group 'mule)
! (defcustom utf-translate-cjk nil
! "Whether the UTF based coding systems should decode/encode CJK characters.
! Enabling this loads tables which enable the coding systems:
! mule-utf-8, mule-utf-16-le, mule-utf-16-be
! to encode characters in the charsets `korean-ksc5601', `chinese-gb2312' and
`japanese-jisx0208', and to decode the corresponding unicodes into
such characters. This works by loading the library `utf-8-subst'; see
its commentary. The tables are fairly large (about 33000 entries), so this
option is not the default."
:link '(emacs-commentary-link "utf-8-subst")
:set (lambda (s v)
! (if v
! (progn
! (require 'utf-8-subst)
! (let ((table (make-char-table 'translation-table)))
! (maphash (lambda (k v)
! (aset table k t))
! ucs-mule-cjk-to-unicode)
! (register-char-codings 'mule-utf-8 table)
! (register-char-codings 'mule-utf-16-le table)
! (register-char-codings 'mule-utf-16-be table))
! (define-translation-hash-table 'utf-subst-table-for-decode
! ucs-unicode-to-mule-cjk)
! (define-translation-hash-table 'utf-subst-table-for-encode
! ucs-mule-cjk-to-unicode))
! (map-char-table
! (lambda (k v)
! (if (gethash k ucs-mule-cjk-to-unicode)
! (aset char-coding-system-table k
! (delq 'mule-utf-8
! (delq 'mule-utf-16-le
! (delq 'mule-utf-16-be v))))))
! char-coding-system-table)
! (define-translation-hash-table 'utf-subst-table-for-decode
! (make-hash-table :test 'eq))
! (define-translation-hash-table 'utf-subst-table-for-encode
! (make-hash-table :test 'eq)))
(set-default s v))
:version "21.4"
:type 'boolean
***************
*** 263,269 ****
(r1 %= 96)
(r1 += (r2 + 32))
(translate-character
! utf-8-translation-table-for-decode r0 r1)
(write-multibyte-character r0 r1))))))))
;; 3byte encoding
--- 330,336 ----
(r1 %= 96)
(r1 += (r2 + 32))
(translate-character
! utf-translation-table-for-decode r0 r1)
(write-multibyte-character r0 r1))))))))
;; 3byte encoding
***************
*** 308,321 ****
(r1 = (r7 + 32))
(r1 += ((r3 + 32) << 7))
(translate-character
! utf-8-translation-table-for-decode r0 r1)
(write-multibyte-character r0 r1))
;; mule-unicode-2500-33ff
;; Fixme: Perhaps allow translation via
! ;; utf-8-subst-table for #x2e80 up, so that we use
! ;; consistent charsets for all of CJK. Would need
! ;; corresponding change to encoding tables.
(if (r3 < #x3400)
((r0 = ,(charset-id 'mule-unicode-2500-33ff))
(r3 -= #x2500)
--- 375,389 ----
(r1 = (r7 + 32))
(r1 += ((r3 + 32) << 7))
(translate-character
! utf-translation-table-for-decode r0 r1)
(write-multibyte-character r0 r1))
;; mule-unicode-2500-33ff
;; Fixme: Perhaps allow translation via
! ;; utf-subst-table-for-decode for #x2e80 up, so
! ;; that we use consistent charsets for all of
! ;; CJK. Would need corresponding change to
! ;; encoding tables.
(if (r3 < #x3400)
((r0 = ,(charset-id 'mule-unicode-2500-33ff))
(r3 -= #x2500)
***************
*** 329,335 ****
;; them as eight-bit-{control|graphic}.
(if (r3 < #xd800)
((r4 = r3) ; don't zap r3
! (lookup-integer utf-8-subst-table r4 r5)
(if r7
;; got a translation
((write-multibyte-character r4 r5)
--- 397,403 ----
;; them as eight-bit-{control|graphic}.
(if (r3 < #xd800)
((r4 = r3) ; don't zap r3
! (lookup-integer utf-subst-table-for-decode r4 r5)
(if r7
;; got a translation
((write-multibyte-character r4 r5)
***************
*** 370,376 ****
(if (r0 < #xfe)
;; 4byte encoding
;; keep those bytes as eight-bit-{control|graphic}
! ;; Fixme: allow lookup in utf-8-subst-table.
((read r1 r2 r3)
;; r0 > #xf0, thus eight-bit-graphic
(write-multibyte-character r6 r0)
--- 438,444 ----
(if (r0 < #xfe)
;; 4byte encoding
;; keep those bytes as eight-bit-{control|graphic}
! ;; Fixme: allow lookup in utf-subst-table-for-decode.
((read r1 r2 r3)
;; r0 > #xf0, thus eight-bit-graphic
(write-multibyte-character r6 r0)
***************
*** 409,416 ****
"CCL program to decode UTF-8.
Basic decoding is done into the charsets ascii, latin-iso8859-1 and
! mule-unicode-*, but see also `utf-8-translation-table-for-decode' and
! `utf-8-subst-table'.
Encodings of un-representable Unicode characters are decoded asis into
eight-bit-control and eight-bit-graphic characters.")
--- 477,484 ----
"CCL program to decode UTF-8.
Basic decoding is done into the charsets ascii, latin-iso8859-1 and
! mule-unicode-*, but see also `utf-fragmentation-table' and
! `ucs-mule-cjk-to-unicode'.
Encodings of un-representable Unicode characters are decoded asis into
eight-bit-control and eight-bit-graphic characters.")
***************
*** 421,427 ****
(if (r5 < 0)
((r1 = -1)
(read-multibyte-character r0 r1)
! (translate-character ucs-mule-to-mule-unicode r0 r1))
(;; We have already done read-multibyte-character.
(r0 = r5)
(r1 = r6)
--- 489,495 ----
(if (r5 < 0)
((r1 = -1)
(read-multibyte-character r0 r1)
! (translate-character utf-translation-table-for-encode r0 r1))
(;; We have already done read-multibyte-character.
(r0 = r5)
(r1 = r6)
***************
*** 516,522 ****
((write #xc2)
(write r1)))))))
! ((lookup-character utf-8-subst-rev-table r0 r1)
(if r7 ; lookup succeeded
((r1 = (((r0 & #xf000) >> 12) | #xe0))
(r2 = ((r0 & #x3f) | #x80))
--- 584,590 ----
((write #xc2)
(write r1)))))))
! ((lookup-character utf-subst-table-for-encode r0 r1)
(if r7 ; lookup succeeded
((r1 = (((r0 & #xf000) >> 12) | #xe0))
(r2 = ((r0 & #x3f) | #x80))
***************
*** 538,547 ****
"CCL program to encode into UTF-8.")
- ;; Dummy definition so that the CCL can be checked correctly; the
- ;; actual data are loaded on demand.
- (unless (boundp 'ucs-mule-8859-to-mule-unicode) ; don't zap it
- (define-translation-table 'ucs-mule-8859-to-mule-unicode))
(define-ccl-program ccl-untranslated-to-ucs
`(0
--- 606,611 ----
***************
*** 648,654 ****
;; ucs-tables is preloaded
;; (defun utf-8-pre-write-conversion (beg end)
;; "Semi-dummy pre-write function effectively to autoload ucs-tables."
! ;; ;; Ensure translation table is loaded.
;; (require 'ucs-tables)
;; ;; Don't do this again.
;; (coding-system-put 'mule-utf-8 'pre-write-conversion nil)
--- 712,718 ----
;; ucs-tables is preloaded
;; (defun utf-8-pre-write-conversion (beg end)
;; "Semi-dummy pre-write function effectively to autoload ucs-tables."
! ;; ;; Ensure translation-table is loaded.
;; (require 'ucs-tables)
;; ;; Don't do this again.
;; (coding-system-put 'mule-utf-8 'pre-write-conversion nil)
***************
*** 657,689 ****
(make-coding-system
'mule-utf-8 4 ?u
"UTF-8 encoding for Emacs-supported Unicode characters.
! The supported Emacs character sets are the following, plus any other
! characters included in the tables `ucs-mule-to-mule-unicode' and
! `utf-8-subst-rev-table':
! ascii
! eight-bit-control
! eight-bit-graphic
! latin-iso8859-1
! latin-iso8859-2
! latin-iso8859-3
! latin-iso8859-4
! cyrillic-iso8859-5
! greek-iso8859-7
! hebrew-iso8859-8
! latin-iso8859-9
! latin-iso8859-14
! latin-iso8859-15
! mule-unicode-0100-24ff
! mule-unicode-2500-33ff
! mule-unicode-e000-ffff
!
! Unicode characters out of the ranges U+0000-U+33FF and U+E200-U+FFFF
! may be decoded into korean-ksc5601, chinese-gb2312, japanese-jisx0208
! \(see user option `utf-8-translate-cjk'); otherwise, sequences of
! eight-bit-control and eight-bit-graphic characters are used to
! preserve their byte sequences, and these are composed to display as a
! single character. Emacs characters that otherwise can't be encoded
! are encoded as U+FFFD."
'(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
'((safe-charsets
--- 721,741 ----
(make-coding-system
'mule-utf-8 4 ?u
"UTF-8 encoding for Emacs-supported Unicode characters.
! It supports Unicode characters of these ranges:
! U+0000..U+33FF, U+E000..U+FFFF.
! They correspond to these Emacs character sets:
! ascii, latin-iso8859-1, mule-unicode-0100-24ff,
! mule-unicode-2500-33ff, mule-unicode-e000-ffff
!
! On decoding (e.g. reading a file), Unicode characters not in the above
! ranges are decoded into sequences of eight-bit-control and
! eight-bit-graphic characters to preserve their byte sequences. The
! byte sequence is preserved on i/o for valid utf-8, but not necessarily
! for invalid utf-8.
!
! On encoding (e.g. writing a file), Emacs characters not belonging to
! any of the character sets listed above are encoded into the UTF-8 byte
! sequence representing U+FFFD (REPLACEMENT CHARACTER)."
'(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
'((safe-charsets
***************
*** 691,714 ****
eight-bit-control
eight-bit-graphic
latin-iso8859-1
- latin-iso8859-15
- latin-iso8859-14
- latin-iso8859-9
- hebrew-iso8859-8
- greek-iso8859-7
- cyrillic-iso8859-5
- latin-iso8859-4
- latin-iso8859-3
- latin-iso8859-2
- vietnamese-viscii-lower
- vietnamese-viscii-upper
- thai-tis620
- ipa
- ethiopic
- indian-is13194
- katakana-jisx0201
- chinese-sisheng
- lao
mule-unicode-0100-24ff
mule-unicode-2500-33ff
mule-unicode-e000-ffff)
--- 743,748 ----
***************
*** 716,722 ****
(coding-category . coding-category-utf-8)
(valid-codes (0 . 255))
;; (pre-write-conversion . utf-8-pre-write-conversion)
! (post-read-conversion . utf-8-post-read-conversion)))
(define-coding-system-alias 'utf-8 'mule-utf-8)
--- 750,760 ----
(coding-category . coding-category-utf-8)
(valid-codes (0 . 255))
;; (pre-write-conversion . utf-8-pre-write-conversion)
! (post-read-conversion . utf-8-post-read-conversion)
! (dependency unify-8859-on-encoding-mode
! unify-8859-on-decoding-mode
! utf-fragment-on-decoding
! utf-translate-cjk)))
(define-coding-system-alias 'utf-8 'mule-utf-8)