(linenum→info "unix/slp.c:2238")

anthy/9100e/src-diclib/xchar.c

    1: /*
    2:  * 文字(xchar)のタイプなどを扱う
    3:  *
    4:  * Copyright (C) 2001-2006 TABATA Yusuke
    5:  */
    6: #include <string.h>
    7: #include "config.h"
    8: 
    9: #include <anthy/xstr.h>
   10: #include <anthy/xchar.h>
   11: 
   12: #include "diclib_inner.h"
   13: 
   /*
    * Geometry of the paged code-conversion tables consulted by search():
    * a code value v is looked up in page v / PAGE_SIZE at offset
    * v % PAGE_SIZE, and pages at or beyond NR_PAGES are treated as unmapped.
    * 128 * 512 covers the code values 0 .. 65535.
    */
   #define PAGE_SIZE 128
   #define NR_PAGES 512
   16: #include "e2u.h"
   17: #include "u2e.h"
   18: 
   19: /* this use UCS4 */
   20: static struct xchar_ent {
   21:   const xchar xc;
   22:   const int type;
   23:   struct xchar_ent *next;/* hash chain */
   24: } xchar_tab[] =
   25: {
   26:   {0x309b, XCT_CLOSE, 0}, /* ” */
   27:   {0xff08, XCT_OPEN, 0}, /* ( */
   28:   {0xff09, XCT_CLOSE, 0}, /* ) */
   29:   {0x3014, XCT_OPEN, 0},  /* 〔 */
   30:   {0x3015, XCT_CLOSE, 0}, /* 〕 */
   31:   {0xff3b, XCT_OPEN, 0}, /* [ */
   32:   {0xff3d, XCT_CLOSE, 0}, /* ] */
   33:   {0xff5b, XCT_OPEN, 0},  /* { */
   34:   {0xff5d, XCT_CLOSE, 0},  /* } */
   35:   {0x3008, XCT_OPEN, 0},  /* < */
   36:   {0x3009, XCT_CLOSE, 0},  /* > */
   37:   {0x300a, XCT_OPEN, 0},  /* 《 */
   38:   {0x300b, XCT_CLOSE, 0},  /* 》 */
   39:   {0x300c, XCT_OPEN, 0},  /* 「 */
   40:   {0x300d, XCT_CLOSE, 0},  /* 」 */
   41:   {0x300e, XCT_OPEN, 0},  /* 『 */
   42:   {0x300f, XCT_CLOSE, 0},  /* 』 */
   43:   {0x3010, XCT_OPEN, 0},  /* 【 */
   44:   {0x3011, XCT_CLOSE, 0},  /* 】 */
   45:   {0x3001, XCT_PUNCTUATION, 0},  /* 、 */
   46:   {0x3002, XCT_PUNCTUATION, 0},  /* 。 */
   47:   {0xff0c, XCT_PUNCTUATION, 0},  /* , */
   48:   {0xff0e, XCT_PUNCTUATION, 0},  /* . */
   49:   {0xff1f, XCT_PUNCTUATION, 0},  /* ? */
   50:   {0xff01, XCT_PUNCTUATION, 0},  /* ! */
   51: 
   52:   {28, XCT_OPEN, 0}, /* ( */
   53:   {133, XCT_OPEN, 0}, /* [ */
   54:   {29, XCT_CLOSE, 0}, /* ) */
   55:   {135, XCT_CLOSE, 0}, /* ] */
   56:   {HK_TO, XCT_DEP, 0},/* と */
   57:   {HK_HA, XCT_DEP, 0},/* は */
   58:   {HK_NO, XCT_DEP, 0},/* の */
   59:   {HK_NI, XCT_DEP, 0},/* に */
   60:   {HK_GA, XCT_DEP, 0},/* が */
   61:   {HK_WO, XCT_DEP, 0},/* を */
   62:   {WIDE_0, XCT_WIDENUM, 0},
   63:   {WIDE_1, XCT_WIDENUM, 0},
   64:   {WIDE_2, XCT_WIDENUM, 0},
   65:   {WIDE_3, XCT_WIDENUM, 0},
   66:   {WIDE_4, XCT_WIDENUM, 0},
   67:   {WIDE_5, XCT_WIDENUM, 0},
   68:   {WIDE_6, XCT_WIDENUM, 0},
   69:   {WIDE_7, XCT_WIDENUM, 0},
   70:   {WIDE_8, XCT_WIDENUM, 0},
   71:   {WIDE_9, XCT_WIDENUM, 0},
   72:   {HK_DDOT, XCT_PART, 0},
   73:   {HK_XA, XCT_PART, 0},
   74:   {HK_XI, XCT_PART, 0},
   75:   {HK_XU, XCT_PART, 0},
   76:   {HK_XE, XCT_PART, 0},
   77:   {HK_XO, XCT_PART, 0},
   78:   {HK_XYA, XCT_PART, 0},
   79:   {HK_XYU, XCT_PART, 0},
   80:   {HK_XYO, XCT_PART, 0},
   81:   {HK_TT, XCT_PART, 0},
   82:   {0, 0, 0},
   83: };
   84: 
   85: #define DDOT 0x8ede
   86: #define CIRCLE 0x8edf
   87: 
   88: static const struct half_kana_table half_kana_tab[] = {
   89:   {HK_A,0x8eb1,0},
   90:   {HK_I,0x8eb2,0},
   91:   {HK_U,0x8eb3,0},
   92:   {HK_E,0x8eb4,0},
   93:   {HK_O,0x8eb5,0},
   94:   {HK_KA,0x8eb6,0},
   95:   {HK_KI,0x8eb7,0},
   96:   {HK_KU,0x8eb8,0},
   97:   {HK_KE,0x8eb9,0},
   98:   {HK_KO,0x8eba,0},
   99:   {HK_SA,0x8ebb,0},
  100:   {HK_SI,0x8ebc,0},
  101:   {HK_SU,0x8ebd,0},
  102:   {HK_SE,0x8ebe,0},
  103:   {HK_SO,0x8ebf,0},
  104:   {HK_TA,0x8ec0,0},
  105:   {HK_TI,0x8ec1,0},
  106:   {HK_TU,0x8ec2,0},
  107:   {HK_TE,0x8ec3,0},
  108:   {HK_TO,0x8ec4,0},
  109:   {HK_NA,0x8ec5,0},
  110:   {HK_NI,0x8ec6,0},
  111:   {HK_NU,0x8ec7,0},
  112:   {HK_NE,0x8ec8,0},
  113:   {HK_NO,0x8ec9,0},
  114:   {HK_HA,0x8eca,0},
  115:   {HK_HI,0x8ecb,0},
  116:   {HK_HU,0x8ecc,0},
  117:   {HK_HE,0x8ecd,0},
  118:   {HK_HO,0x8ece,0},
  119:   {HK_MA,0x8ecf,0},
  120:   {HK_MI,0x8ed0,0},
  121:   {HK_MU,0x8ed1,0},
  122:   {HK_ME,0x8ed2,0},
  123:   {HK_MO,0x8ed3,0},
  124:   {HK_YA,0x8ed4,0},
  125:   {HK_YU,0x8ed5,0},
  126:   {HK_YO,0x8ed6,0},
  127:   {HK_RA,0x8ed7,0},
  128:   {HK_RI,0x8ed8,0},
  129:   {HK_RU,0x8ed9,0},
  130:   {HK_RE,0x8eda,0},
  131:   {HK_RO,0x8edb,0},
  132:   {HK_WA,0x8edc,0},
  133:   {HK_WI,0,0},
  134:   {HK_WE,0,0},
  135:   {HK_WO,0x8ea6,0},
  136:   {HK_N,0x8edd,0},
  137:   {HK_TT,0x8eaf,0},
  138:   {HK_XA,0x8ea7,0},
  139:   {HK_XI,0x8ea8,0},
  140:   {HK_XU,0x8ea9,0},
  141:   {HK_XE,0x8eaa,0},
  142:   {HK_XO,0x8eab,0},
  143:   {HK_GA,0x8eb6,DDOT},
  144:   {HK_GI,0x8eb7,DDOT},
  145:   {HK_GU,0x8eb8,DDOT},
  146:   {HK_GE,0x8eb9,DDOT},
  147:   {HK_GO,0x8eba,DDOT},
  148:   {HK_ZA,0x8ebb,DDOT},
  149:   {HK_ZI,0x8ebc,DDOT},
  150:   {HK_ZU,0x8ebd,DDOT},
  151:   {HK_ZE,0x8ebe,DDOT},
  152:   {HK_ZO,0x8ebf,DDOT},
  153:   {HK_DA,0x8ec0,DDOT},
  154:   {HK_DI,0x8ec1,DDOT},
  155:   {HK_DU,0x8ec2,DDOT},
  156:   {HK_DE,0x8ec3,DDOT},
  157:   {HK_DO,0x8ec4,DDOT},
  158:   {HK_BA,0x8eca,DDOT},
  159:   {HK_BI,0x8ecb,DDOT},
  160:   {HK_BU,0x8ecc,DDOT},
  161:   {HK_BE,0x8ecd,DDOT},
  162:   {HK_BO,0x8ece,DDOT},
  163:   {HK_PA,0x8eca,CIRCLE},
  164:   {HK_PI,0x8ecb,CIRCLE},
  165:   {HK_PU,0x8ecc,CIRCLE},
  166:   {HK_PE,0x8ecd,CIRCLE},
  167:   {HK_PO,0x8ece,CIRCLE},
  168:   {HK_XYA,0x8eac,0},
  169:   {HK_XYU,0x8ead,0},
  170:   {HK_XYO,0x8eae,0},
  171:   {HK_XWA,0,0},
  172:   {HK_DDOT,DDOT,0},
  173:   {HK_BAR,0x8eb0,0},
  174:   {0,0,0}
  175: };
  176: 
  177: static const struct half_wide_ent {
  178:   const xchar half;
  179:   const xchar wide;
  180: } half_wide_tab[] = {
  181:   {'!', 0xff01},
  182:   {'\"', 0x201d},
  183:   {'#', 0xff03},
  184:   {'$', 0xff04},
  185:   {'%', 0xff05},
  186:   {'&', 0xff06},
  187:   {'\'', 0x2019},
  188:   {'(', 0xff08},
  189:   {')', 0xff09},
  190:   {'*', 0xff0a},
  191:   {'+', 0xff0b},
  192:   {',', 0xff0c},
  193:   {'-', 0xff0d},
  194:   {'.', 0xff0e},
  195:   {'/', 0xff0f},
  196:   {':', 0xff1a},
  197:   {';', 0xff1b},
  198:   {'<', 0xff1c},
  199:   {'=', 0xff1d},
  200:   {'>', 0xff1e},
  201:   {'?', 0xff1f},
  202:   {'@', 0xff20},
  203:   {'[', 0xff3b},
  204:   {'\\', 0xff3c},
  205:   {']', 0xff3d},
  206:   {'^', 0xff3e},
  207:   {'_', 0xff3f},
  208:   {'`', 0xff40},
  209:   {'{', 0xff5b},
  210:   {'|', 0xff5c},
  211:   {'}', 0xff5d},
  212:   {'~', 0xff5e},
  213:   {0, 0}
  214: };
  215: 
  216: xchar
  217: anthy_lookup_half_wide(xchar xc)
  218: {
  219:   const struct half_wide_ent *hw;
  220:   for (hw = half_wide_tab; hw->half; hw ++) {
  221:     if (hw->half == xc) {
  222:       return hw->wide;
  223:     }
  224:     if (hw->wide == xc) {
  225:       return hw->half;
  226:     }
  227:   }
  228:   return 0;
  229: }
  230: 
  231: const struct half_kana_table *
  232: anthy_find_half_kana(xchar xc)
  233: {
  234:   const struct half_kana_table *tab;
  235:   for (tab = half_kana_tab; tab->src; tab ++) {
  236:     if (tab->src == xc && tab->dst) {
  237:       return tab;
  238:     }
  239:   }
  240:   return NULL;
  241: }
  242: 
  243: static int
  244: find_xchar_type(xchar xc)
  245: {
  246:   struct xchar_ent *xe = xchar_tab;
  247: 
  248:   for (; xe->xc; xe++) {
  249:     if (xe->xc == xc) {
  250:       return xe->type;
  251:     }
  252:   }
  253: 
  254:   return XCT_NONE;
  255: }
  256: 
  257: static int
  258: is_hira(xchar xc)
  259: {
  260:   if (xc == HK_DDOT) {
  261:     return 1;
  262:   }
  263:   if (xc == HK_BAR) {
  264:     return 1;
  265:   }
  266:   xc = anthy_ucs_to_euc(xc);
  267:   if ((xc & 0xff00) == 0xa400) {
  268:     return 1;
  269:   }
  270:   return 0;
  271: }
  272: 
  273: static int
  274: is_kata(xchar xc)
  275: {
  276:   if (xc == HK_BAR) {
  277:     return 1;
  278:   }
  279:   xc = anthy_ucs_to_euc(xc);
  280:   if ((xc & 0xff00) == 0xa500) {
  281:     return 1;
  282:   }
  283:   return 0;
  284: }
  285: 
  286: static int
  287: is_symbol(xchar xc)
  288: {
  289:   if (xc == UCS_GETA) {
  290:     return 1;
  291:   }
  292:   xc = anthy_ucs_to_euc(xc);
  293:   if (xc == EUC_GETA) {
  294:     return 0;
  295:   }
  296:   if ((xc & 0xff00) == 0xa100) {
  297:     return 1;
  298:   }
  299:   if ((xc & 0xff00) == 0xa200) {
  300:     return 1;
  301:   }
  302:   return 0;
  303: }
  304: 
  305: static int
  306: is_kanji(xchar xc)
  307: {
  308:   if (xc > 0x4e00 && xc < 0xa000) {
  309:     return 1;
  310:   }
  311:   return 0;
  312: }
  313: 
  314: static int
  315: search(const int *tab[], int v, int geta)
  316: {
  317:   int page = v / PAGE_SIZE;
  318:   int off = v % PAGE_SIZE;
  319:   const int *t;
  320:   if (page >= NR_PAGES) {
  321:     return geta;
  322:   }
  323:   t = tab[page];
  324:   if (!t) {
  325:     return geta;
  326:   }
  327:   if (!t[off] && v) {
  328:     return geta;
  329:   }
  330:   return t[off];
  331: }
  332: 
  333: int
  334: anthy_euc_to_ucs(int ec)
  335: {
  336:   return search(e2u_index, ec, UCS_GETA);
  337: }
  338: 
  339: int
  340: anthy_ucs_to_euc(int uc)
  341: {
  342:   int r = search(u2e_index, uc, EUC_GETA);
  343:   if (r > 65536) {
  344:     return EUC_GETA;
  345:   }
  346:   return r;
  347: }
  348: 
  349: int
  350: anthy_get_xchar_type(const xchar xc)
  351: {
  352:   int t = find_xchar_type(xc);
  353:   if (xc > 47 && xc < 58) {
  354:     t |= XCT_NUM;
  355:   }
  356:   if (xc < 128) {
  357:     t |= XCT_ASCII;
  358:   }
  359:   if (is_hira(xc)) {
  360:     t |= XCT_HIRA;
  361:   }
  362:   if (is_kata(xc)) {
  363:     t |= XCT_KATA;
  364:   }
  365:   if (is_symbol(xc)) {
  366:     if (!(t & XCT_OPEN) && !(t & XCT_CLOSE)) {
  367:       t |= XCT_SYMBOL;
  368:     }
  369:   }
  370:   if (is_kanji(xc)) {
  371:     t |= XCT_KANJI;
  372:   }
  373:   return t;
  374: }
  375: 
  376: int
  377: anthy_get_xstr_type(const xstr *xs)
  378: {
  379:   int i, t = XCT_ALL;
  380:   for (i = 0; i < xs->len; i++) {
  381:     t &= anthy_get_xchar_type(xs->str[i]);
  382:   }
  383:   return t;
  384: }
  385: 
  386: int
  387: anthy_xchar_to_num(xchar xc)
  388: {
  389:   switch (xc) {
  390:   case WIDE_0:return 0;
  391:   case WIDE_1:return 1;
  392:   case WIDE_2:return 2;
  393:   case WIDE_3:return 3;
  394:   case WIDE_4:return 4;
  395:   case WIDE_5:return 5;
  396:   case WIDE_6:return 6;
  397:   case WIDE_7:return 7;
  398:   case WIDE_8:return 8;
  399:   case WIDE_9:return 9;
  400:   }
  401:   if (xc >= '0' && xc <= '9') {
  402:     return xc - (int)'0';
  403:   }
  404:   return -1;
  405: }
  406: 
  407: xchar
  408: anthy_xchar_wide_num_to_num(xchar c)
  409: {
  410:   switch (c) {
  411:   case WIDE_0:return '0';
  412:   case WIDE_1:return '1';
  413:   case WIDE_2:return '2';
  414:   case WIDE_3:return '3';
  415:   case WIDE_4:return '4';
  416:   case WIDE_5:return '5';
  417:   case WIDE_6:return '6';
  418:   case WIDE_7:return '7';
  419:   case WIDE_8:return '8';
  420:   case WIDE_9:return '9';
  421:   default:return c;
  422:   }
  423: }
  424: 
  /*
   * Initialization hook for the character tables.
   *
   * Currently does nothing: the tables in this file are statically
   * initialized and the lookups scan them linearly, so there is no state
   * to build.
   */
  void
  anthy_init_xchar_tab(void)
  {
    return;
  }
Syntax (Markdown)