(linenum→info "unix/slp.c:2238")

emacs/22.1/src/charset.h

    1: /* Header for multibyte character handler.
    2:    Copyright (C) 2001, 2002, 2003, 2004, 2005,
    3:                  2006, 2007 Free Software Foundation, Inc.
    4:    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
    5:      2005, 2006, 2007
    6:      National Institute of Advanced Industrial Science and Technology (AIST)
    7:      Registration Number H14PRO021
    8: 
    9: This file is part of GNU Emacs.
   10: 
   11: GNU Emacs is free software; you can redistribute it and/or modify
   12: it under the terms of the GNU General Public License as published by
   13: the Free Software Foundation; either version 2, or (at your option)
   14: any later version.
   15: 
   16: GNU Emacs is distributed in the hope that it will be useful,
   17: but WITHOUT ANY WARRANTY; without even the implied warranty of
   18: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   19: GNU General Public License for more details.
   20: 
   21: You should have received a copy of the GNU General Public License
   22: along with GNU Emacs; see the file COPYING.  If not, write to
   23: the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
   24: Boston, MA 02110-1301, USA.  */
   25: 
   26: #ifndef EMACS_CHARSET_H
   27: #define EMACS_CHARSET_H
   28: 
   29: /* #define BYTE_COMBINING_DEBUG */
   30: 
   31: /*** GENERAL NOTE on CHARACTER SET (CHARSET) ***
   32: 
   33:   A character set ("charset" hereafter) is a meaningful collection
   34:   (i.e. language, culture, functionality, etc) of characters.  Emacs
   35:   handles multiple charsets at once.  Each charset corresponds to one
   36:   of the ISO charsets.  Emacs identifies a charset by a unique
   37:   identification number, whereas ISO identifies a charset by a triplet
   38:   of DIMENSION, CHARS and FINAL-CHAR.  So, hereafter, just saying
   39:   "charset" means an identification number (integer value).
   40: 
   41:   The value range of charsets is 0x00, 0x81..0xFE.  There are four
   42:   kinds of charset depending on DIMENSION (1 or 2) and CHARS (94 or
   43:   96).  For instance, a charset of DIMENSION2_CHARS94 contains 94x94
   44:   characters.
   45: 
   46:   Within Emacs Lisp, a charset is treated as a symbol which has a
   47:   property `charset'.  The property value is a vector containing
   48:   various information about the charset.  For readability of C code,
   49:   we use the following convention for C variable names:
   50:         charset_symbol: Emacs Lisp symbol of a charset
   51:         charset_id: Emacs Lisp integer of an identification number of a charset
   52:         charset: C integer of an identification number of a charset
   53: 
   54:   Each charset (except for ascii) is assigned a base leading-code
   55:   (range 0x80..0x9E).  In addition, a charset of 0xA0 or greater
   56:   (whose base leading-code is 0x9A..0x9D) is assigned an extended
   57:   leading-code (range 0xA0..0xFE).  In this case, each base
   58:   leading-code specifies the allowable range of extended leading-code
   59:   as shown in the table below.  A leading-code is used to represent a
   60:   character in Emacs' buffer and string.
   61: 
   62:   We call a charset which has extended leading-code a "private
   63:   charset" because those are mainly for a charset which is not yet
   64:   registered by ISO.  On the contrary, we call a charset which does
   65:   not have extended leading-code an "official charset".
   66: 
   67:   ---------------------------------------------------------------------------
   68:   charset       dimension      base leading-code  extended leading-code
   69:   ---------------------------------------------------------------------------
   70:   0x00          official dim1    -- none --             -- none --
   71:                 (ASCII)
   72:   0x01..0x7F    --never used--
   73:   0x80          official dim1    -- none --               -- none --
   74:                 (eight-bit-graphic)
   75:   0x81..0x8F    official dim1    same as charset   -- none --
   76:   0x90..0x99    official dim2       same as charset     -- none --
   77:   0x9A..0x9D    --never used--
   78:   0x9E          official dim1    same as charset  -- none --
   79:                 (eight-bit-control)
   80:   0x9F          --never used--
   81:   0xA0..0xDF    private dim1           0x9A              same as charset
   82:                 of 1-column width
   83:   0xE0..0xEF    private dim1           0x9B              same as charset
   84:                 of 2-column width
   85:   0xF0..0xF4    private dim2           0x9C              same as charset
   86:                 of 1-column width
   87:   0xF5..0xFE    private dim2           0x9D              same as charset
   88:                 of 2-column width
   89:   0xFF          --never used--
   90:   ---------------------------------------------------------------------------
   91: 
   92: */
   93: 
   94: /* Definition of special leading-codes.  */
   95: /* Base leading-codes that are always followed by an extended leading-code.  */
   96: #define LEADING_CODE_PRIVATE_11 0x9A /* for private DIMENSION1 of 1-column */
   97: #define LEADING_CODE_PRIVATE_12 0x9B /* for private DIMENSION1 of 2-column */
   98: #define LEADING_CODE_PRIVATE_21 0x9C /* for private DIMENSION2 of 1-column */
   99: #define LEADING_CODE_PRIVATE_22 0x9D /* for private DIMENSION2 of 2-column */
  100: 
  101: #define LEADING_CODE_8_BIT_CONTROL 0x9E /* for `eight-bit-control' */
  102: 
  103: /* Extended leading-code.  */
  104: /* First value of the extended leading-code range paired with each base code.  */
  105: #define LEADING_CODE_EXT_11 0xA0 /* follows LEADING_CODE_PRIVATE_11 */
  106: #define LEADING_CODE_EXT_12 0xE0 /* follows LEADING_CODE_PRIVATE_12 */
  107: #define LEADING_CODE_EXT_21 0xF0 /* follows LEADING_CODE_PRIVATE_21 */
  108: #define LEADING_CODE_EXT_22 0xF5 /* follows LEADING_CODE_PRIVATE_22 */
  109: /* Maximum value of extended leading-codes (upper end of the EXT_22 range).  */
  110: #define LEADING_CODE_EXT_MAX 0xFE
  111: 
  112: /* Minimum/maximum charset id of each DIMENSION; private ids equal their extended leading-codes.  */
  113: #define MIN_CHARSET_OFFICIAL_DIMENSION1 0x80
  114: #define MAX_CHARSET_OFFICIAL_DIMENSION1 0x8F
  115: #define MIN_CHARSET_OFFICIAL_DIMENSION2 0x90
  116: #define MAX_CHARSET_OFFICIAL_DIMENSION2 0x99
  117: #define MIN_CHARSET_PRIVATE_DIMENSION1  LEADING_CODE_EXT_11 /* 0xA0 */
  118: #define MIN_CHARSET_PRIVATE_DIMENSION2  LEADING_CODE_EXT_21 /* 0xF0 */
  119: 
  120: /* Maximum value of overall charset identification number.  */
  121: #define MAX_CHARSET 0xFE
  122: 
  123: /* Special charsets.  Trailing comments: single-byte character codes in each.  */
  124: #define CHARSET_ASCII           0        /* 0x00..0x7F */
  125: #define CHARSET_8_BIT_CONTROL   0x9E      /* 0x80..0x9F */
  126: #define CHARSET_8_BIT_GRAPHIC   0x80      /* 0xA0..0xFF */
  127: 
  128: extern int charset_latin_iso8859_1; /* ISO8859-1 (Latin-1) */
  129: extern int charset_jisx0208_1978; /* JISX0208.1978 (Japanese Kanji old set) */
  130: extern int charset_jisx0208;    /* JISX0208.1983 (Japanese Kanji) */
  131: extern int charset_katakana_jisx0201; /* JISX0201.Kana (Japanese Katakana) */
  132: extern int charset_latin_jisx0201; /* JISX0201.Roman (Japanese Roman) */
  133: extern int charset_big5_1;      /* Big5 Level 1 (Chinese Traditional) */
  134: extern int charset_big5_2;      /* Big5 Level 2 (Chinese Traditional) */
  135: extern int charset_mule_unicode_0100_24ff; /* presumably Unicode U+0100..U+24FF -- confirm */
  136: extern int charset_mule_unicode_2500_33ff; /* presumably Unicode U+2500..U+33FF -- confirm */
  137: extern int charset_mule_unicode_e000_ffff; /* presumably Unicode U+E000..U+FFFF -- confirm */
  138: 
  139: /* Nonzero if CH, viewed as an unsigned byte, is below 0xA0, i.e. an ASCII
  140:    character or a base leading-code.  Nowadays, any byte can be the first
  141:    byte of a character in a multibyte buffer/string, so this name is a misnomer.  */
  142: #define CHAR_HEAD_P(ch) ((unsigned char) (ch) < 0xA0)
  143: 
  144: /*** GENERAL NOTE on CHARACTER REPRESENTATION ***
  145: 
  146:   Firstly, the term "character" or "char" is used for a multilingual
  147:   character (of course, including ASCII characters), not for a byte in
  148:   computer memory.  We use the term "code" or "byte" for the latter
  149:   case.
  150: 
  151:   A character is identified by charset and one or two POSITION-CODEs.
  152:   POSITION-CODE is the position of the character in the charset.  A
  153:   character of DIMENSION1 charset has one POSITION-CODE: POSITION-CODE-1.
  154:   A character of DIMENSION2 charset has two POSITION-CODEs:
  155:   POSITION-CODE-1 and POSITION-CODE-2.  The code range of
  156:   POSITION-CODE is 0x20..0x7F.
  157: 
  158:   Emacs has two kinds of representation of a character: multi-byte
  159:   form (for buffers and strings) and single-word form (for character
  160:   objects in Emacs Lisp).  The latter is called "character code"
  161:   hereafter.  Both representations encode the information of charset
  162:   and POSITION-CODE but in a different way (for instance, the MSB of
  163:   POSITION-CODE is set in multi-byte form).
  164: 
  165:   For details of the multi-byte form, see the section "2. Emacs
  166:   internal format handlers" of `coding.c'.
  167: 
  168:   Emacs uses 19 bits for a character code.  The bits are divided into
  169:   3 fields: FIELD1(5bits):FIELD2(7bits):FIELD3(7bits).
  170: 
  171:   A character code of DIMENSION1 character uses FIELD2 to hold charset
  172:   and FIELD3 to hold POSITION-CODE-1.  A character code of DIMENSION2
  173:   character uses FIELD1 to hold charset, FIELD2 and FIELD3 to hold
  174:   POSITION-CODE-1 and POSITION-CODE-2 respectively.
  175: 
  176:   More precisely...
  177: 
  178:   FIELD2 of DIMENSION1 character (except for ascii, eight-bit-control,
  179:   and eight-bit-graphic) is "charset - 0x70".  This is to make all
  180:   character codes except for ASCII and 8-bit codes greater than 256.
  181:   So, the range of FIELD2 of DIMENSION1 character is 0, 1, or
  182:   0x11..0x7F.
  183: 
  184:   FIELD1 of DIMENSION2 character is "charset - 0x8F" for official
  185:   charset and "charset - 0xE0" for private charset.  So, the range of
  186:   FIELD1 of DIMENSION2 character is 0x01..0x1E.
  187: 
  188:   -----------------------------------------------------------------------------
  189:   charset               FIELD1 (5-bit)           FIELD2 (7-bit)    FIELD3 (7-bit)
  190:   -----------------------------------------------------------------------------
  191:   ascii                 0                 0                      0x00..0x7F
  192:   eight-bit-control     0               1                    0x00..0x1F
  193:   eight-bit-graphic     0               1                    0x20..0x7F
  194:   DIMENSION1            0             charset - 0x70     POSITION-CODE-1
  195:   DIMENSION2(o)         charset - 0x8F     POSITION-CODE-1     POSITION-CODE-2
  196:   DIMENSION2(p)         charset - 0xE0     POSITION-CODE-1     POSITION-CODE-2
  197:   -----------------------------------------------------------------------------
  198:   "(o)": official, "(p)": private
  199:   -----------------------------------------------------------------------------
  200: */
  201: 
  202: /* Masks of each field of character code: FIELD1 is bits 14..18, FIELD2 bits 7..13, FIELD3 bits 0..6.  */
  203: #define CHAR_FIELD1_MASK (0x1F << 14)
  204: #define CHAR_FIELD2_MASK (0x7F << 7)
  205: #define CHAR_FIELD3_MASK 0x7F
  206: 
  207: /* Macros to access each field of character C; each yields the field value shifted down to bit 0.  */
  208: #define CHAR_FIELD1(c) (((c) & CHAR_FIELD1_MASK) >> 14)
  209: #define CHAR_FIELD2(c) (((c) & CHAR_FIELD2_MASK) >> 7)
  210: #define CHAR_FIELD3(c) ((c) & CHAR_FIELD3_MASK)
  211: 
  212: /* Minimum character code of character of each DIMENSION (computed values in trailing comments).  */
  213: #define MIN_CHAR_OFFICIAL_DIMENSION1 \
  214:   ((0x81 - 0x70) << 7) /* 0x880 */
  215: #define MIN_CHAR_PRIVATE_DIMENSION1 \
  216:   ((MIN_CHARSET_PRIVATE_DIMENSION1 - 0x70) << 7) /* 0x1800 */
  217: #define MIN_CHAR_OFFICIAL_DIMENSION2 \
  218:   ((MIN_CHARSET_OFFICIAL_DIMENSION2 - 0x8F) << 14) /* 0x4000 */
  219: #define MIN_CHAR_PRIVATE_DIMENSION2 \
  220:   ((MIN_CHARSET_PRIVATE_DIMENSION2 - 0xE0) << 14) /* 0x40000 */
  221: /* Maximum character code currently used plus 1.  */
  222: #define MAX_CHAR (0x1F << 14) /* 0x7C000 */
  223: 
  224: /* 1 if C is a single byte character (its value is within 0..0xFF), else 0.  C is evaluated twice.  */
  225: #define SINGLE_BYTE_CHAR_P(c) (((unsigned)(c) & 0xFF) == (c))
  226: 
  227: /* 1 if BYTE is an ASCII character in itself, in multibyte mode, i.e. BYTE < 0x80 (also true for any negative BYTE).  */
  228: #define ASCII_BYTE_P(byte) ((byte) < 0x80)
  229: 
  230: /* A char-table containing information on each character set.
  231: 
  232:    Unlike ordinary char-tables, this doesn't contain any nested tables.
  233:    Only the top level elements are used.  Each element is a vector of
  234:    the following information:
  235:         CHARSET-ID, BYTES, DIMENSION, CHARS, WIDTH, DIRECTION,
  236:         LEADING-CODE-BASE, LEADING-CODE-EXT,
  237:         ISO-FINAL-CHAR, ISO-GRAPHIC-PLANE,
  238:         REVERSE-CHARSET, SHORT-NAME, LONG-NAME, DESCRIPTION,
  239:         PLIST.
  240: 
  241:    CHARSET-ID (integer) is the identification number of the charset.
  242: 
  243:    BYTES (integer) is the length of the multi-byte form of a character
  244:    in the charset: one of 1, 2, 3, and 4.
  245: 
  246:    DIMENSION (integer) is the number of bytes to represent a character: 1 or 2.
  247: 
  248:    CHARS (integer) is the number of characters in a dimension: 94 or 96.
  249: 
  250:    WIDTH (integer) is the number of columns a character in the charset
  251:    occupies on the screen: one of 0, 1, and 2.
  252: 
  253:    DIRECTION (integer) is the rendering direction of characters in the
  254:    charset.  If 0, render from left to right, else
  255:    render from right to left.
  256: 
  257:    LEADING-CODE-BASE (integer) is the base leading-code for the
  258:    charset.
  259: 
  260:    LEADING-CODE-EXT (integer) is the extended leading-code for the
  261:    charset.  All charsets of less than 0xA0 have the value 0.
  262: 
  263:    ISO-FINAL-CHAR (character) is the final character of the
  264:    corresponding ISO 2022 charset.  It is -1 for a charset
  265:    that is used only internally (e.g. `eight-bit-control').
  266: 
  267:    ISO-GRAPHIC-PLANE (integer) is the graphic plane to be invoked
  268:    while encoding to variants of ISO 2022 coding system, one of the
  269:    following: 0/graphic-plane-left(GL), 1/graphic-plane-right(GR).  It
  270:    is -1 for a charset that is used only internally
  271:    (e.g. `eight-bit-control').
  272: 
  273:    REVERSE-CHARSET (integer) is the charset which differs only in
  274:    DIRECTION (left-to-right vs. right-to-left) from the charset.  If
  275:    there's no such charset, the value is -1.
  276: 
  277:    SHORT-NAME (string) is the short name to refer to the charset.
  278: 
  279:    LONG-NAME (string) is the long name to refer to the charset.
  280: 
  281:    DESCRIPTION (string) is the description string of the charset.
  282: 
  283:    PLIST (property list) may contain any type of information a user
  284:    wants to put and get by functions `put-charset-property' and
  285:    `get-charset-property' respectively.  */
  286: extern Lisp_Object Vcharset_table;
  287: 
  288: /* Macros to access various information of CHARSET in Vcharset_table.
  289:    We provide these macros for efficiency.  No range check of CHARSET.  */
  290: 
  291: /* Return entry of CHARSET (C integer) in Vcharset_table: slot 0 for CHARSET_ASCII, else slot CHARSET + 128.  */
  292: #define CHARSET_TABLE_ENTRY(charset)                                    \
  293:   XCHAR_TABLE (Vcharset_table)->contents[((charset) == CHARSET_ASCII    \
  294:                                           ? 0 : (charset) + 128)]
  295: 
  296: /* Return information INFO-IDX (one of the CHARSET_*_IDX values) of CHARSET.  */
  297: #define CHARSET_TABLE_INFO(charset, info_idx) \
  298:   XVECTOR (CHARSET_TABLE_ENTRY (charset))->contents[info_idx]
  299: 
  300: #define CHARSET_ID_IDX (0) /* Index of each field in a Vcharset_table entry vector.  */
  301: #define CHARSET_BYTES_IDX (1)
  302: #define CHARSET_DIMENSION_IDX (2)
  303: #define CHARSET_CHARS_IDX (3)
  304: #define CHARSET_WIDTH_IDX (4)
  305: #define CHARSET_DIRECTION_IDX (5)
  306: #define CHARSET_LEADING_CODE_BASE_IDX (6)
  307: #define CHARSET_LEADING_CODE_EXT_IDX (7)
  308: #define CHARSET_ISO_FINAL_CHAR_IDX (8)
  309: #define CHARSET_ISO_GRAPHIC_PLANE_IDX (9)
  310: #define CHARSET_REVERSE_CHARSET_IDX (10)
  311: #define CHARSET_SHORT_NAME_IDX (11)
  312: #define CHARSET_LONG_NAME_IDX (12)
  313: #define CHARSET_DESCRIPTION_IDX (13)
  314: #define CHARSET_PLIST_IDX (14)
  315: /* Size of a vector of each entry of Vcharset_table (one more than the last index).  */
  316: #define CHARSET_MAX_IDX (15)
  317: 
  318: /* Field accessors for CHARSET's entry; the three fields documented as possibly -1 use XINT, the rest XFASTINT.  */
  319: #define CHARSET_BYTES(charset) \
  320:   XFASTINT (CHARSET_TABLE_INFO (charset, CHARSET_BYTES_IDX))
  321: #define CHARSET_DIMENSION(charset) \
  322:   XFASTINT (CHARSET_TABLE_INFO (charset, CHARSET_DIMENSION_IDX))
  323: #define CHARSET_CHARS(charset) \
  324:   XFASTINT (CHARSET_TABLE_INFO (charset, CHARSET_CHARS_IDX))
  325: #define CHARSET_WIDTH(charset) \
  326:   XFASTINT (CHARSET_TABLE_INFO (charset, CHARSET_WIDTH_IDX))
  327: #define CHARSET_DIRECTION(charset) \
  328:   XFASTINT (CHARSET_TABLE_INFO (charset, CHARSET_DIRECTION_IDX))
  329: #define CHARSET_LEADING_CODE_BASE(charset) \
  330:   XFASTINT (CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_BASE_IDX))
  331: #define CHARSET_LEADING_CODE_EXT(charset) \
  332:   XFASTINT (CHARSET_TABLE_INFO (charset, CHARSET_LEADING_CODE_EXT_IDX))
  333: #define CHARSET_ISO_FINAL_CHAR(charset) \
  334:   XINT (CHARSET_TABLE_INFO (charset, CHARSET_ISO_FINAL_CHAR_IDX))
  335: #define CHARSET_ISO_GRAPHIC_PLANE(charset) \
  336:   XINT (CHARSET_TABLE_INFO (charset, CHARSET_ISO_GRAPHIC_PLANE_IDX))
  337: #define CHARSET_REVERSE_CHARSET(charset) \
  338:   XINT (CHARSET_TABLE_INFO (charset, CHARSET_REVERSE_CHARSET_IDX))
  339: 
  340: /* Values of the DIRECTION field of a charset: 0 is left to right, nonzero right to left.  */
  341: #define CHARSET_DIRECTION_LEFT_TO_RIGHT 0
  342: #define CHARSET_DIRECTION_RIGHT_TO_LEFT 1
  343: 
  344: /* A vector of charset symbol indexed by charset-id.  This is used
  345:    only for returning charset symbol from C functions.  */
  346: extern Lisp_Object Vcharset_symbol_table;
  347: 
  348: /* Return symbol of CHARSET (C integer charset-id).  No range check of CHARSET.  */
  349: #define CHARSET_SYMBOL(charset) \
  350:   XVECTOR (Vcharset_symbol_table)->contents[charset]
  351: 
  352: /* 1 if CHARSET is in valid value range (0, 0x80..0x99, 0x9E, or 0xA0..0xFE), else 0.  CHARSET is evaluated repeatedly.  */
  353: #define CHARSET_VALID_P(charset)                                         \
  354:   ((charset) == 0                                                        \
  355:    || ((charset) > 0x80 && (charset) <= MAX_CHARSET_OFFICIAL_DIMENSION2) \
  356:    || ((charset) >= MIN_CHARSET_PRIVATE_DIMENSION1                       \
  357:        && (charset) <= MAX_CHARSET)                                      \
  358:    || ((charset) == CHARSET_8_BIT_CONTROL)                               \
  359:    || ((charset) == CHARSET_8_BIT_GRAPHIC))
  360: 
  361: /* 1 if CHARSET is within 0..MAX_CHARSET and its Vcharset_table entry is non-nil (already defined), else 0.  */
  362: #define CHARSET_DEFINED_P(charset)                      \
  363:   (((charset) >= 0) && ((charset) <= MAX_CHARSET)       \
  364:    && !NILP (CHARSET_TABLE_ENTRY (charset)))
  365: 
  366: /* Since the information CHARSET-BYTES and CHARSET-WIDTH of
  367:    Vcharset_table can be retrieved only by the first byte of
  368:    multi-byte form (an ASCII code or a base leading-code), we provide
  369:    tables indexed by that byte for faster retrieval.  The macros below
  370:    yield 1 for an ASCII byte without consulting the tables.  */
  371: extern int bytes_by_char_head[256];
  372: extern int width_by_char_head[256];
  373: 
  374: #define BYTES_BY_CHAR_HEAD(char_head)   \
  375:   (ASCII_BYTE_P (char_head) ? 1 : bytes_by_char_head[char_head])
  376: #define WIDTH_BY_CHAR_HEAD(char_head)   \
  377:   (ASCII_BYTE_P (char_head) ? 1 : width_by_char_head[char_head])
  378: 
  379: /* Charset (C integer) of the character code C, decoded from its fields.  C is evaluated several times.  */
  380: #define CHAR_CHARSET(c)                                                 \
  381:   (SINGLE_BYTE_CHAR_P (c)                                               \
  382:    ? (ASCII_BYTE_P (c)                                                  \
  383:       ? CHARSET_ASCII                                                   \
  384:       : (c) < 0xA0 ? CHARSET_8_BIT_CONTROL : CHARSET_8_BIT_GRAPHIC)     \
  385:    : ((c) < MIN_CHAR_OFFICIAL_DIMENSION2                                \
  386:       ? CHAR_FIELD2 (c) + 0x70                                          \
  387:       : ((c) < MIN_CHAR_PRIVATE_DIMENSION2                              \
  388:          ? CHAR_FIELD1 (c) + 0x8F                                      \
  389:          : CHAR_FIELD1 (c) + 0xE0)))
  390: 
  391: /* Check if two characters C1 and C2 belong to the same charset: compare FIELD2 if C1 is below MIN_CHAR_OFFICIAL_DIMENSION2, else FIELD1.  C1 is evaluated twice.  */
  392: #define SAME_CHARSET_P(c1, c2)                                  \
  393:   ((c1) < MIN_CHAR_OFFICIAL_DIMENSION2                          \
  394:    ? ((c1) & CHAR_FIELD2_MASK) == ((c2) & CHAR_FIELD2_MASK)     \
  395:    : ((c1) & CHAR_FIELD1_MASK) == ((c2) & CHAR_FIELD1_MASK))
  396: 
  397: /* Return a character of which charset is CHARSET and position-codes are C1 and C2.  DIMENSION1 character
  398:    ignores C2.  Outside the ASCII/8-bit charsets a position-code <= 0 is stored as 0; an undefined CHARSET below MIN_CHARSET_PRIVATE_DIMENSION2 is treated as DIMENSION1.  */
  399: #define MAKE_CHAR(charset, c1, c2)                                          \
  400:   ((charset) == CHARSET_ASCII                                               \
  401:    ? (c1) & 0x7F                                                            \
  402:    : (((charset) == CHARSET_8_BIT_CONTROL                                   \
  403:        || (charset) == CHARSET_8_BIT_GRAPHIC)                               \
  404:       ? ((c1) & 0x7F) | 0x80                                                \
  405:       : ((CHARSET_DEFINED_P (charset)                                       \
  406:           ? CHARSET_DIMENSION (charset) == 1                               \
  407:           : (charset) < MIN_CHARSET_PRIVATE_DIMENSION2)                            \
  408:          ? (((charset) - 0x70) << 7) | ((c1) <= 0 ? 0 : ((c1) & 0x7F))     \
  409:          : ((((charset)                                                            \
  410:               - ((charset) < MIN_CHARSET_PRIVATE_DIMENSION2 ? 0x8F : 0xE0)) \
  411:              << 14)                                                        \
  412:             | ((c2) <= 0 ? 0 : ((c2) & 0x7F))                              \
  413:             | ((c1) <= 0 ? 0 : (((c1) & 0x7F) << 7))))))
  414: 
  415: 
  416: /* If GENERICP is nonzero, return nonzero iff C is a valid normal or
  417:    generic character.  If GENERICP is zero, return nonzero iff C is a
  418:    valid normal character.  Negative C is never valid; a single byte C is valid without calling char_valid_p.  */
  419: #define CHAR_VALID_P(c, genericp)       \
  420:   ((c) >= 0                             \
  421:    && (SINGLE_BYTE_CHAR_P (c) || char_valid_p (c, genericp)))
  422: 
  423: /* This default value is used when nonascii-translation-table or
  424:    nonascii-insert-offset fail to convert unibyte character to a valid
  425:    multibyte character.  This makes a Latin-1 character: a byte 0xA0..0xFF plus 0x800 has FIELD2 0x11.  */
  426: 
  427: #define DEFAULT_NONASCII_INSERT_OFFSET 0x800
  428: 
  429: /* Parse multibyte string STR of length LENGTH and set BYTES to the
  430:    byte length of a character at STR, as given by its first byte.  */
  431: 
  432: #ifdef BYTE_COMBINING_DEBUG /* statement form: also abort if a char head or LENGTH comes before BYTES bytes */
  433: 
  434: #define PARSE_MULTIBYTE_SEQ(str, length, bytes)                 \
  435:   do {                                                          \
  436:     int i = 1;                                                  \
  437:     while (i < (length) && ! CHAR_HEAD_P ((str)[i])) i++;       \
  438:     (bytes) = BYTES_BY_CHAR_HEAD ((str)[0]);                    \
  439:     if ((bytes) > i)                                            \
  440:       abort ();                                                 \
  441:   } while (0)
  442: 
  443: #else  /* not BYTE_COMBINING_DEBUG: expression form, LENGTH is evaluated and discarded */
  444: 
  445: #define PARSE_MULTIBYTE_SEQ(str, length, bytes) \
  446:   ((void)(length), (bytes) = BYTES_BY_CHAR_HEAD ((str)[0]))
  447: 
  448: #endif /* not BYTE_COMBINING_DEBUG */
  449: 
  450: #define VALID_LEADING_CODE_P(code)      \
  451:   (! NILP (CHARSET_TABLE_ENTRY (code))) /* nonzero iff Vcharset_table has a non-nil entry for CODE */
  452: 
  453: /* Return 1 iff the byte sequence at unibyte string STR (LENGTH bytes)
  454:    is valid as a multibyte form.  By a side effect, BYTES is set to the byte
  455:    length implied by the first byte (it is assigned even when the result is 0).  STR is parenthesized at every use.  */
  456: 
  457: #define UNIBYTE_STR_AS_MULTIBYTE_P(str, length, bytes)          \
  458:   (((str)[0] < 0x80 || (str)[0] >= 0xA0)                        \
  459:    ? ((bytes) = 1)                                              \
  460:    : (((bytes) = BYTES_BY_CHAR_HEAD ((str)[0])),                \
  461:       ((bytes) <= (length)                                      \
  462:        && !CHAR_HEAD_P ((str)[1])                               \
  463:        && ((bytes) == 2                                         \
  464:            ? (str)[0] != LEADING_CODE_8_BIT_CONTROL            \
  465:            : (!CHAR_HEAD_P ((str)[2])                          \
  466:               && ((bytes) == 3                                 \
  467:                   ? (((str)[0] != LEADING_CODE_PRIVATE_11     \
  468:                       && (str)[0] != LEADING_CODE_PRIVATE_12) \
  469:                      || VALID_LEADING_CODE_P ((str)[1]))      \
  470:                   : (!CHAR_HEAD_P ((str)[3])                  \
  471:                      && VALID_LEADING_CODE_P ((str)[1]))))))))
  472: 
  473: 
  474: /* Return 1 iff the byte sequence at multibyte string STR is valid as
  475:    a unibyte form, i.e. its first byte is not LEADING_CODE_8_BIT_CONTROL.
  476:    By a side effect, BYTES is set to the byte length of one character at STR.  */
  477: 
  478: #define MULTIBYTE_STR_AS_UNIBYTE_P(str, bytes)  \
  479:   ((bytes) = BYTES_BY_CHAR_HEAD ((str)[0]),     \
  480:    (str)[0] != LEADING_CODE_8_BIT_CONTROL)
  481: 
  482: /* The charset of character C is stored in CHARSET, and the
  483:    position-codes of C are stored in C1 and C2 (all three must be lvalues).  For a single byte
  484:    character C1 is C itself.  We store -1 in C2 if the dimension of the charset is 1.  */
  485: 
  486: #define SPLIT_CHAR(c, charset, c1, c2)                                      \
  487:   (SINGLE_BYTE_CHAR_P (c)                                                   \
  488:    ? ((charset                                                              \
  489:        = (ASCII_BYTE_P (c)                                                  \
  490:           ? CHARSET_ASCII                                                  \
  491:           : ((c) < 0xA0 ? CHARSET_8_BIT_CONTROL : CHARSET_8_BIT_GRAPHIC))), \
  492:       c1 = (c), c2 = -1)                                                    \
  493:    : ((c) & CHAR_FIELD1_MASK                                                \
  494:       ? (charset = (CHAR_FIELD1 (c)                                         \
  495:                     + ((c) < MIN_CHAR_PRIVATE_DIMENSION2 ? 0x8F : 0xE0)),   \
  496:          c1 = CHAR_FIELD2 (c),                                             \
  497:          c2 = CHAR_FIELD3 (c))                                             \
  498:       : (charset = CHAR_FIELD2 (c) + 0x70,                                  \
  499:          c1 = CHAR_FIELD3 (c),                                             \
  500:          c2 = -1)))
  501: 
  502: /* Return 1 iff character C has valid printable glyph; every C below 0x80 is accepted without calling char_printable_p.  */
  503: #define CHAR_PRINTABLE_P(c) (ASCII_BYTE_P (c) || char_printable_p (c))
  504: 
  505: /* The charset of the character at STR is stored in CHARSET, and the
  506:    position-codes are stored in C1 and C2 (we store -1 in C2 if the character is just 2 bytes).  If the head byte
  507:    implies fewer than 2 or more than LEN bytes, or split_string fails, C1 = *STR, CHARSET = CHARSET_ASCII and C2 is not assigned.  */
  508: 
  509: #define SPLIT_STRING(str, len, charset, c1, c2)                         \
  510:   ((BYTES_BY_CHAR_HEAD ((unsigned char) *(str)) < 2                     \
  511:     || BYTES_BY_CHAR_HEAD ((unsigned char) *(str)) > (len)              \
  512:     || split_string (str, len, &(charset), &(c1), &(c2)) < 0)           \
  513:    ? (c1) = *(str), (charset) = CHARSET_ASCII                           \
  514:    : (charset))
  515: 
  516: /* Mapping table from ISO2022's charset (specified by DIMENSION,
  517:    CHARS, and FINAL_CHAR) to Emacs' charset.  Should be accessed by
  518:    macro ISO_CHARSET_TABLE (DIMENSION, CHARS, FINAL_CHAR), whose arguments are unwrapped with XINT.  */
  519: extern int iso_charset_table[2][2][128];
  520: 
  521: #define ISO_CHARSET_TABLE(dimension, chars, final_char) \
  522:   iso_charset_table[XINT (dimension) - 1][XINT (chars) > 94][XINT (final_char)]
  523: 
  524: #define BASE_LEADING_CODE_P(c) (BYTES_BY_CHAR_HEAD ((unsigned char) (c)) > 1) /* nonzero iff byte C heads a form longer than 1 byte */
  525: 
  526: /* Return how many bytes C will occupy in a multibyte buffer: 1 for 0x00..0x7F and 0xA0..0xFF, 2 for 0x80..0x9F, else char_bytes (C).  */
  527: #define CHAR_BYTES(c)                                   \
  528:   (SINGLE_BYTE_CHAR_P (c)                               \
  529:    ? ((ASCII_BYTE_P (c) || (c) >= 0xA0) ? 1 : 2)        \
  530:    : char_bytes (c))
  531: 
  532: /* The following two macros CHAR_STRING and STRING_CHAR are the main
  533:    entry points to convert between Emacs's two types of character
  534:    representations: multi-byte form and single-word form (character
  535:    code).  */
  536: 
  537: /* Store multi-byte form of the character C in STR.  The caller should
  538:    allocate at least MAX_MULTIBYTE_LENGTH bytes area at STR in
  539:    advance.  Returns the length of the multi-byte form.  If C is an