(linenum→info "unix/slp.c:2238")

ruby/1.9.0/enc/iso_8859_2.c

    1: /**********************************************************************
    2:   iso8859_2.c -  Oniguruma (regular expression library)
    3: **********************************************************************/
    4: /*-
    5:  * Copyright (c) 2002-2007  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
    6:  * All rights reserved.
    7:  *
    8:  * Redistribution and use in source and binary forms, with or without
    9:  * modification, are permitted provided that the following conditions
   10:  * are met:
   11:  * 1. Redistributions of source code must retain the above copyright
   12:  *    notice, this list of conditions and the following disclaimer.
   13:  * 2. Redistributions in binary form must reproduce the above copyright
   14:  *    notice, this list of conditions and the following disclaimer in the
   15:  *    documentation and/or other materials provided with the distribution.
   16:  *
   17:  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   18:  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   19:  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   20:  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   21:  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   22:  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   23:  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   24:  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   25:  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   26:  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   27:  * SUCH DAMAGE.
   28:  */
   29: 
   30: #include "regenc.h"
   31: 
   32: #define ENC_ISO_8859_2_TO_LOWER_CASE(c) EncISO_8859_2_ToLowerCaseTable[c]
   33: #define ENC_IS_ISO_8859_2_CTYPE(code,ctype) \
   34:   ((EncISO_8859_2_CtypeTable[code] & CTYPE_TO_BIT(ctype)) != 0)
   35: 
   36: static const UChar EncISO_8859_2_ToLowerCaseTable[256] = {
   37:   '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
   38:   '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
   39:   '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
   40:   '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
   41:   '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
   42:   '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
   43:   '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
   44:   '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
   45:   '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
   46:   '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
   47:   '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
   48:   '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
   49:   '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
   50:   '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
   51:   '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
   52:   '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
   53:   '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
   54:   '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
   55:   '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
   56:   '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
   57:   '\240', '\261', '\242', '\263', '\244', '\265', '\266', '\247',
   58:   '\250', '\271', '\272', '\273', '\274', '\255', '\276', '\277',
   59:   '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
   60:   '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
   61:   '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
   62:   '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
   63:   '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\327',
   64:   '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\337',
   65:   '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
   66:   '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
   67:   '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
   68:   '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377'
   69: };
   70: 
   71: static const unsigned short EncISO_8859_2_CtypeTable[256] = {
   72:   0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
   73:   0x4008, 0x420c, 0x4209, 0x4208, 0x4208, 0x4208, 0x4008, 0x4008,
   74:   0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
   75:   0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008, 0x4008,
   76:   0x4284, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
   77:   0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
   78:   0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0, 0x78b0,
   79:   0x78b0, 0x78b0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x41a0,
   80:   0x41a0, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x7ca2, 0x74a2,
   81:   0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
   82:   0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2, 0x74a2,
   83:   0x74a2, 0x74a2, 0x74a2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x51a0,
   84:   0x41a0, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x78e2, 0x70e2,
   85:   0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
   86:   0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2, 0x70e2,
   87:   0x70e2, 0x70e2, 0x70e2, 0x41a0, 0x41a0, 0x41a0, 0x41a0, 0x4008,
   88:   0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
   89:   0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
   90:   0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
   91:   0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
   92:   0x0284, 0x34a2, 0x00a0, 0x34a2, 0x00a0, 0x34a2, 0x34a2, 0x00a0,
   93:   0x00a0, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x01a0, 0x34a2, 0x34a2,
   94:   0x00a0, 0x30e2, 0x00a0, 0x30e2, 0x00a0, 0x30e2, 0x30e2, 0x00a0,
   95:   0x00a0, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x00a0, 0x30e2, 0x30e2,
   96:   0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,
   97:   0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,
   98:   0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x00a0,
   99:   0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x30e2,
  100:   0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,
  101:   0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,
  102:   0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x00a0,
  103:   0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x00a0
  104: };
  105: 
  106: static int
  107: mbc_case_fold(OnigCaseFoldType flag,
  108:               const UChar** pp, const UChar* end, UChar* lower,
  109:               OnigEncoding enc)
  110: {
  111:   const UChar* p = *pp;
  112: 
  113:   if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
  114:     *lower++ = 's';
  115:     *lower   = 's';
  116:     (*pp)++;
  117:     return 2;
  118:   }
  119: 
  120:   *lower = ENC_ISO_8859_2_TO_LOWER_CASE(*p);
  121:   (*pp)++;
  122:   return 1; /* return byte length of converted char to lower */
  123: }
  124: 
  125: #if 0
  126: static int
  127: is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp, const UChar* end)
  128: {
  129:   int v;
  130:   const UChar* p = *pp;
  131: 
  132:   if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
  133:     (*pp)++;
  134:     return TRUE;
  135:   }
  136: 
  137:   (*pp)++;
  138:   v = (EncISO_8859_2_CtypeTable[*p] & (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER));
  139:   if ((v | BIT_CTYPE_LOWER) != 0) {
  140:     return TRUE;
  141:   }
  142: 
  143:   return (v != 0 ? TRUE : FALSE);
  144: }
  145: #endif
  146: 
  147: static const OnigPairCaseFoldCodes CaseFoldMap[] = {
  148:  { 0xa1, 0xb1 },
  149:  { 0xa3, 0xb3 },
  150:  { 0xa5, 0xb5 },
  151:  { 0xa6, 0xb6 },
  152:  { 0xa9, 0xb9 },
  153:  { 0xaa, 0xba },
  154:  { 0xab, 0xbb },
  155:  { 0xac, 0xbc },
  156:  { 0xae, 0xbe },
  157:  { 0xaf, 0xbf },
  158: 
  159:  { 0xc0, 0xe0 },
  160:  { 0xc1, 0xe1 },
  161:  { 0xc2, 0xe2 },
  162:  { 0xc3, 0xe3 },
  163:  { 0xc4, 0xe4 },
  164:  { 0xc5, 0xe5 },
  165:  { 0xc6, 0xe6 },
  166:  { 0xc7, 0xe7 },
  167:  { 0xc8, 0xe8 },
  168:  { 0xc9, 0xe9 },
  169:  { 0xca, 0xea },
  170:  { 0xcb, 0xeb },
  171:  { 0xcc, 0xec },
  172:  { 0xcd, 0xed },
  173:  { 0xce, 0xee },
  174:  { 0xcf, 0xef },
  175: 
  176:  { 0xd0, 0xf0 },
  177:  { 0xd1, 0xf1 },
  178:  { 0xd2, 0xf2 },
  179:  { 0xd3, 0xf3 },
  180:  { 0xd4, 0xf4 },
  181:  { 0xd5, 0xf5 },
  182:  { 0xd6, 0xf6 },
  183:  { 0xd8, 0xf8 },
  184:  { 0xd9, 0xf9 },
  185:  { 0xda, 0xfa },
  186:  { 0xdb, 0xfb },
  187:  { 0xdc, 0xfc },
  188:  { 0xdd, 0xfd },
  189:  { 0xde, 0xfe }
  190: };
  191: 
  192: static int
  193: apply_all_case_fold(OnigCaseFoldType flag,
  194:                     OnigApplyAllCaseFoldFunc f, void* arg,
  195:                     OnigEncoding enc)
  196: {
  197:   return onigenc_apply_all_case_fold_with_map(
  198:             sizeof(CaseFoldMap)/sizeof(OnigPairCaseFoldCodes), CaseFoldMap, 1,
  199:             flag, f, arg);
  200: }
  201: 
  202: static int
  203: get_case_fold_codes_by_str(OnigCaseFoldType flag,
  204:                            const OnigUChar* p, const OnigUChar* end,
  205:                            OnigCaseFoldCodeItem items[],
  206:                            OnigEncoding enc)
  207: {
  208:   return onigenc_get_case_fold_codes_by_str_with_map(
  209:              sizeof(CaseFoldMap)/sizeof(OnigPairCaseFoldCodes), CaseFoldMap, 1,
  210:              flag, p, end, items);
  211: }
  212: 
  213: static int
  214: is_code_ctype(OnigCodePoint code, unsigned int ctype, OnigEncoding enc)
  215: {
  216:   if (code < 256)
  217:     return ENC_IS_ISO_8859_2_CTYPE(code, ctype);
  218:   else
  219:     return FALSE;
  220: }
  221: 
/*
 * Encoding descriptor for ISO-8859-2.  The entries are positional, so
 * their order must match the encoding structure declared outside this
 * file.  Generic single-byte helpers are used throughout; only case
 * folding and the ctype test use the functions and tables defined in
 * this file.
 */
OnigEncodingDefine(iso_8859_2, ISO_8859_2) = {
  onigenc_single_byte_mbc_enc_len,
  "ISO-8859-2",  /* name */
  1,             /* max enc length */
  1,             /* min enc length */
  onigenc_is_mbc_newline_0x0a,
  onigenc_single_byte_mbc_to_code,
  onigenc_single_byte_code_to_mbclen,
  onigenc_single_byte_code_to_mbc,
  mbc_case_fold,                /* defined in this file */
  apply_all_case_fold,          /* defined in this file */
  get_case_fold_codes_by_str,   /* defined in this file */
  onigenc_minimum_property_name_to_ctype,
  is_code_ctype,                /* defined in this file */
  onigenc_not_support_get_ctype_code_range,
  onigenc_single_byte_left_adjust_char_head,
  onigenc_always_true_is_allowed_reverse_match
};
Syntax (Markdown)