(linenum→info "unix/slp.c:2238")

anthy/9100e/src-diclib/xstr.c

    1: /*
    2:  * Anthy内部で使う文字列の処理
    3:  *  typedef struct xstr_ {
    4:  *    xchar *str; int len;
    5:  *  } xstr;
    6:  *
    7:  * malloc(0);の意味は考えないで0文字の文字列を扱えるような
    8:  * コーディングをする。free(0)は良い。
    9:  *
   10:  * デフォルトの設定では
   11:  *  cstrはCの普通のEUC文字列
   12:  *
   13:  * Copyright (C) 2000-2007 TABATA Yusuke
   14:  *
   15:  */
   16: /*
   17:   This library is free software; you can redistribute it and/or
   18:   modify it under the terms of the GNU Lesser General Public
   19:   License as published by the Free Software Foundation; either
   20:   version 2 of the License, or (at your option) any later version.
   21: 
   22:   This library is distributed in the hope that it will be useful,
   23:   but WITHOUT ANY WARRANTY; without even the implied warranty of
   24:   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   25:   Lesser General Public License for more details.
   26: 
   27:   You should have received a copy of the GNU Lesser General Public
   28:   License along with this library; if not, write to the Free Software
   29:   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
   30:  */
   31: #include <stdio.h>
   32: #include <stdlib.h>
   33: #include <string.h>
   34: 
   35: #include "config.h"
   36: /* for ANTHY_*_ENCODING */
   37: #include <anthy/anthy.h>
   38: 
   39: #include <anthy/xstr.h>
   40: #include <anthy/xchar.h>
   41: #include "diclib_inner.h"
   42: 
   43: /* Encoding used when printing to the screen */
   44: static int print_encoding;
   45: 
   46: #define MAX_BYTES_PER_XCHAR 10
   47: 
   48: static int
   49: xc_isprint(xchar xc)
   50: {
   51:   return xc > 0;
   52: }
   53: 
   54: /** Cの文字列に対応するxstrの長さを計算する
   55:  */
   56: static int
   57: xlengthofcstr(const char *c)
   58: {
   59:   int ll = 0;
   60:   int len = strlen(c);
   61:   int i;
   62:   for (i = 0; i < len; i++) {
   63:     ll ++;
   64:     if ((c[i] & 0x80)) {
   65:       i++;
   66:     }
   67:   }
   68:   return ll;
   69: }
   70: 
   71: const char *
   72: anthy_utf8_to_ucs4_xchar(const char *s, xchar *res)
   73: {
   74:   const unsigned char *str = (const unsigned char *)s;
   75:   int i, len;
   76:   xchar cur;
   77:   cur = str[0];
   78:   if (str[0] < 0x80) {
   79:     len = 1;
   80:   } else if (str[0] < 0xe0) {
   81:     cur &= 0x1f;
   82:     len = 2;
   83:   } else if (str[0] < 0xf0) {
   84:     cur &= 0x0f;
   85:     len = 3;
   86:   } else if (str[0] < 0xf8) {
   87:     cur &= 0x07;
   88:     len = 4;
   89:   } else if (str[0] < 0xfc) {
   90:     cur &= 0x03;
   91:     len = 5;
   92:   } else {
   93:     cur &= 0x01;
   94:     len = 6;
   95:   }
   96:   str ++;
   97:   for (i = 1; i < len; i++) {
   98:     cur <<= 6;
   99:     cur |= (str[0] & 0x3f);
  100:     str++;
  101:   }
  102:   *res = cur;
  103:   return (const char *)str;
  104: }
  105: 
  106: static xstr *
  107: utf8_to_ucs4_xstr(const char *s)
  108: {
  109:   const unsigned char *str = (const unsigned char *)s;
  110:   xstr res;
  111:   res.str = (xchar *)alloca(sizeof(xchar) * strlen(s));
  112:   res.len = 0;
  113: 
  114:   while (*str) {
  115:     xchar cur;
  116:     str = (const unsigned char *)anthy_utf8_to_ucs4_xchar((const char *)str,
  117:                                                           &cur);
  118:     res.str[res.len] = cur;
  119:     res.len ++;
  120:   }
  121:   return anthy_xstr_dup(&res);
  122: }
  123: 
  124: static int
  125: put_xchar_to_utf8_str(xchar xc, char *buf_)
  126: {
  127:   int i, len;
  128:   unsigned char *buf = (unsigned char *)buf_;
  129:   if (xc < 0x80) {
  130:     buf[0] = 0;
  131:     len = 1;
  132:   } else if (xc < 0x800) {
  133:     buf[0] = 0xc0;
  134:     len = 2;
  135:   } else if (xc < 0x10000) {
  136:     buf[0] = 0xe0;
  137:     len = 3;
  138:   } else if (xc < 0x200000) {
  139:     buf[0] = 0xf0;
  140:     len = 4;
  141:   } else if (xc < 0x400000) {
  142:     buf[0] = 0xf8;
  143:     len = 5;
  144:   } else {
  145:     buf[0] = 0xfc;
  146:     len = 6;
  147:   }
  148:   for (i = len - 1; i > 0; i--) {
  149:     buf[i] = (xc & 0x3f) | 0x80;
  150:     xc >>= 6;
  151:   }
  152:   buf[0] += xc;
  153:   buf[len] = 0;
  154:   return len;
  155: }
  156: 
  157: static char *
  158: ucs4_xstr_to_utf8(xstr *xs)
  159: {
  160:   char *buf = alloca(xs->len * 6 + 1);
  161:   int i, t = 0;
  162:   buf[0] = 0;
  163:   for (i = 0; i < xs->len; i++) {
  164:     xchar xc = xs->str[i];
  165:     put_xchar_to_utf8_str(xc, &buf[t]);
  166:     t = strlen(buf);
  167:   }
  168:   return strdup(buf);
  169: }
  170: 
  171: /** Cの文字列をxstrに変更する
  172:  */
  173: xstr *
  174: anthy_cstr_to_xstr(const char *c, int encoding)
  175: {
  176:   xstr *x;
  177:   int i, j, l;
  178:   if (encoding == ANTHY_UTF8_ENCODING) {
  179:     return utf8_to_ucs4_xstr(c);
  180:   }
  181:   l = xlengthofcstr(c);
  182:   x = (xstr *)malloc(sizeof(struct xstr_));
  183:   if (!x) {
  184:     return NULL;
  185:   }
  186:   x->len = l;
  187:   x->str = malloc(sizeof(xchar)*l);
  188:   for (i = 0, j = 0; i < l; i++) {
  189:     if (!(c[j] & 0x80)){
  190:       x->str[i] = c[j];
  191:       j++;
  192:     } else {
  193:       unsigned char *p = (unsigned char *)&c[j];
  194:       x->str[i] = (p[1] | (p[0]<<8)) | 0x8080;
  195:       x->str[i] = anthy_euc_to_ucs(x->str[i]);
  196:       j++;
  197:       j++;
  198:     }
  199:   }
  200:   return x;
  201: }
  202: 
  203: char *
  204: anthy_xstr_to_cstr(xstr *s, int encoding)
  205: {
  206:   int i, j, l;
  207:   char *p;
  208: 
  209:   if (encoding == ANTHY_UTF8_ENCODING) {
  210:     return ucs4_xstr_to_utf8(s);
  211:   }
  212: 
  213:   l = s->len;
  214:   for (i = 0; i < s->len; i++) {
  215:     int ec = anthy_ucs_to_euc(s->str[i]);
  216:     if (ec > 255) {
  217:       l++;
  218:     }
  219:   }
  220:   p = (char *)malloc(l + 1);
  221:   p[l] = 0;
  222:   j = 0;
  223:   for (i =  0; i < s->len; i++) {
  224:     int ec = anthy_ucs_to_euc(s->str[i]);
  225:     if (ec < 256) {
  226:       p[j] = ec;
  227:       j++;
  228:     }else{
  229:       p[j] = ec >> 8;
  230:       j++;
  231:       p[j] = ec & 255;
  232:       j++;
  233:     }
  234:   }
  235:   return p;
  236: }
  237: 
  238: xstr *
  239: anthy_xstr_dup(xstr *s)
  240: {
  241:   int i;
  242:   xstr *x = (xstr *)malloc(sizeof(xstr));
  243:   x->len = s->len;
  244:   if (s->len) {
  245:     x->str = malloc(sizeof(xchar)*s->len);
  246:   }else{
  247:     x->str = NULL;
  248:   }
  249:   for (i = 0; i < x->len; i++) {
  250:     x->str[i] = s->str[i];
  251:   }
  252:   return x;
  253: }
  254: 
  255: xchar *
  256: anthy_xstr_dup_str(xstr *s)
  257: {
  258:   xchar *c;
  259:   int i;
  260:   if (s->len) {
  261:     c = malloc(sizeof(xchar)*s->len);
  262:   }else{
  263:     c = 0;
  264:   }
  265:   for (i = 0; i < s->len; i++) {
  266:     c[i] = s->str[i];
  267:   }
  268:   return c;
  269: }
  270: 
  271: void
  272: anthy_free_xstr(xstr *x)
  273: {
  274:   if (!x) {
  275:     return ;
  276:   }
  277:   /**/
  278:   free(x->str);
  279:   free(x);
  280: }
  281: 
  282: void
  283: anthy_free_xstr_str(xstr *x)
  284: {
  285:   if (!x) {
  286:     return ;
  287:   }
  288:   free(x->str);
  289: }
  290: 
  291: int
  292: anthy_sputxchar(char *buf, xchar x, int encoding)
  293: {
  294:   if (!xc_isprint(x)) {
  295:     sprintf(buf, "??");
  296:     return 2;
  297:   }
  298:   if (encoding == ANTHY_UTF8_ENCODING) {
  299:     return put_xchar_to_utf8_str(x, buf);
  300:   }
  301:   x = anthy_ucs_to_euc(x);
  302:   if (x < 256) {
  303:     buf[0] = x;
  304:     buf[1] = 0;
  305:     return 1;
  306:   }
  307:   buf[2] = 0;
  308:   buf[1] = 0x80 | (x & 255);
  309:   buf[0] = 0x80 | ((x>>8) & 255);
  310:   return 2;
  311: }
  312: 
  313: int
  314: anthy_sputxstr(char *buf, xstr *x, int encoding)
  315: {
  316:   char b[MAX_BYTES_PER_XCHAR];
  317:   int i, l = 0;
  318:   for (i = 0; i < x->len; i++) {
  319:     anthy_sputxchar(b, x->str[i], encoding);
  320:     sprintf(&buf[l], "%s", b);
  321:     l += strlen(b);
  322:   }
  323:   return l;
  324: }
  325: 
  326: int
  327: anthy_snputxstr(char *buf, int n, xstr *x, int encoding)
  328: {
  329:   char b[MAX_BYTES_PER_XCHAR];
  330:   int i, l=0;
  331:   for (i = 0; i < x->len; i++) {
  332:     anthy_sputxchar(b, x->str[i], encoding);
  333:     if ((int)strlen(b) + l >= n) {
  334:       return l;
  335:     }
  336:     n -= sprintf(&buf[l], "%s", b);
  337:     l += strlen(b);
  338:   }
  339:   return l;
  340: }
  341: 
  342: void
  343: anthy_putxchar(xchar x)
  344: {
  345:   char buf[MAX_BYTES_PER_XCHAR];
  346:   if (!xc_isprint(x)) {
  347:     printf("\\%x", x);
  348:     return ;
  349:   }
  350:   anthy_sputxchar(buf, x, print_encoding);
  351:   printf("%s", buf);
  352: }
  353: 
  354: void
  355: anthy_putxstr(xstr *x)
  356: {
  357:   int i;
  358:   for (i = 0; i < x->len; i++) {
  359:     anthy_putxchar(x->str[i]);
  360:   }
  361: }
  362: 
  363: void
  364: anthy_putxstrln(xstr *x)
  365: {
  366:   anthy_putxstr(x);
  367:   printf("\n");
  368: }
  369: 
  370: xstr*
  371: anthy_xstrcpy(xstr *dest, xstr *src)
  372: {
  373:   int i;
  374:   /* 文字列をコピー */
  375:   dest->len = src->len;
  376:   for (i = 0; i < src->len; i++) {
  377:     dest->str[i] = src->str[i];
  378:   }
  379:   
  380:   return dest;
  381: }
  382: /* 返り値の符号はstrcmpと同じ */
  383: int
  384: anthy_xstrcmp(xstr *x1, xstr *x2)
  385: {
  386:   int i, m;
  387:   if (x1->len < x2->len) {
  388:     m = x1->len;
  389:   }else{
  390:     m = x2->len;
  391:   }
  392:   for (i = 0 ; i < m ; i++) {
  393:     if (x1->str[i] < x2->str[i]) {
  394:       return -1;
  395:     }
  396:     if (x1->str[i] > x2->str[i]) {
  397:       return 1;
  398:     }
  399:   }
  400:   if (x1->len < x2->len) {
  401:     return -1;
  402:   }
  403:   if (x1->len > x2->len) {
  404:     return 1;
  405:   }
  406:   return 0;
  407: }
  408: 
  409: /* 返り値の符号はstrncmpと同じ */
  410: int
  411: anthy_xstrncmp(xstr *x1, xstr *x2, int n)
  412: {
  413:   int i, m;
  414:   if (x1->len < x2->len) {
  415:     m = x1->len;
  416:   }else{
  417:     m = x2->len;
  418:   }
  419:   if (m > n) m = n;
  420:   for (i = 0 ; i < m ; i++) {
  421:     if (x1->str[i] < x2->str[i]) {
  422:       return -1;
  423:     }
  424:     if (x1->str[i] > x2->str[i]) {
  425:       return 1;
  426:     }
  427:   }
  428:   if (x2->len <= n && x1->len < x2->len) {
  429:     return -1;
  430:   }
  431:   if (x1->len <= n && x1->len > x2->len) {
  432:     return 1;
  433:   }
  434:   return 0;
  435: }
  436: 
  437: 
  438: xstr *
  439: anthy_xstrcat(xstr *s, xstr *a)
  440: {
  441:   int i, l;
  442:   if (!s) {
  443:     s = malloc(sizeof(xstr));
  444:     s->str = NULL;
  445:     s->len = 0;
  446:   }
  447:   l = s->len + a->len;
  448:   s->str = realloc(s->str, sizeof(xchar)*l);
  449:   for (i = 0; i < a->len; i ++) {
  450:     s->str[s->len+i] = a->str[i];
  451:   }
  452:   s->len = l;
  453:   return s;
  454: }
  455: 
  456: xstr *
  457: anthy_xstrappend(xstr *xs, xchar xc)
  458: {
  459:   xstr p;
  460:   xchar q[1];
  461:   p.len = 1;
  462:   p.str = q;
  463:   q[0] = xc;
  464:   return anthy_xstrcat(xs, &p);
  465: }
  466: 
  467: long long
  468: anthy_xstrtoll(xstr *x)
  469: {
  470:   xchar c;
  471:   int i;
  472:   long long n = 0;/* 数 */
  473:   if (!x->len || x->len > 16) {
  474:     return -1;
  475:   }
  476:   if (!anthy_get_xstr_type(x) & (XCT_NUM | XCT_WIDENUM)) {
  477:     return -1;
  478:   }
  479:   for (i = 0; i < x->len; i++) {
  480:     c = x->str[i];
  481:     n *= 10;
  482:     n += anthy_xchar_to_num(c);
  483:   }
  484:   return n;
  485: }
  486: 
  487: /** 全角の数字を半角にする
  488:  */
  489: xstr *
  490: anthy_xstr_wide_num_to_num(xstr* src_xs)
  491: {
  492:   int i;
  493:   xstr *dst_xs;
  494:   dst_xs = anthy_xstr_dup(src_xs);
  495:   for (i = 0; i < src_xs->len; ++i) {
  496:     dst_xs->str[i] = anthy_xchar_wide_num_to_num(src_xs->str[i]);
  497:   }
  498:   return dst_xs;
  499: }
  500: 
  501: /** 平仮名をカタカナに変換する
  502:  */
  503: xstr *
  504: anthy_xstr_hira_to_kata(xstr *src_xs)
  505: {
  506:   xstr *dst_xs;
  507:   int i, j;
  508:   dst_xs = anthy_xstr_dup(src_xs);
  509: 
  510:   for (i = 0 ,j = 0; i < dst_xs->len; i++, j++) {
  511:     /* 「う゛」のチェック */
  512:     if (i < dst_xs->len - 1 && dst_xs->str[i] == HK_U
  513:         && dst_xs->str[i+1] == HK_DDOT) {
  514:       dst_xs->str[j] = KK_VU;/* ヴ */
  515:       i++;
  516:       continue ;
  517:     }
  518:     /**/
  519:     dst_xs->str[j] = dst_xs->str[i];
  520:     if ((anthy_ucs_to_euc(dst_xs->str[j]) & 0xff00) == 0xa400) {
  521:       /* ひらがなだったら256足す */
  522:       dst_xs->str[j] = anthy_ucs_to_euc(dst_xs->str[j]);
  523:       dst_xs->str[j] += 256;
  524:       dst_xs->str[j] = anthy_euc_to_ucs(dst_xs->str[j]);
  525:     }
  526:   }
  527:   dst_xs->len = j;
  528:   return dst_xs;
  529: }
  530: 
  531: xstr *
  532: anthy_xstr_hira_to_half_kata(xstr *src_xs)
  533: {
  534:   int len = src_xs->len;
  535:   int i, j;
  536:   xstr *xs;
  537:   for (i = 0; i < src_xs->len; i++) {
  538:     const struct half_kana_table *tab = anthy_find_half_kana(src_xs->str[i]);
  539:     if (tab && tab->mod) {
  540:       len ++;
  541:     }
  542:   }
  543:   xs = malloc(sizeof(xstr));
  544:   xs->len = len;
  545:   xs->str = malloc(sizeof(xchar) * len);
  546:   j = 0;
  547:   for (i = 0; i < src_xs->len; i++) {
  548:     const struct half_kana_table *tab = anthy_find_half_kana(src_xs->str[i]);
  549:     if (tab) {
  550:       xs->str[j] = anthy_euc_to_ucs(tab->dst);
  551:       if (tab->mod) {
  552:         j++;
  553:         xs->str[j] = anthy_euc_to_ucs(tab->mod);
  554:       }
  555:     } else {
  556:       xs->str[j] = src_xs->str[i];
  557:     }
  558:     j++;
  559:   }
  560:   return xs;
  561: }
  562: 
  563: xstr *
  564: anthy_conv_half_wide(xstr *xs)
  565: {
  566:   int i;
  567:   xstr *res;
  568:   for (i = 0; i < xs->len; i++) {
  569:     if (!anthy_lookup_half_wide(xs->str[i])) {
  570:       return NULL;
  571:     }
  572:   }
  573:   res = anthy_xstr_dup(xs);
  574:   for (i = 0; i < xs->len; i++) {
  575:     res->str[i] = anthy_lookup_half_wide(xs->str[i]);
  576:   }
  577:   return res;
  578: }
  579: 
  580: int
  581: anthy_xstr_hash(xstr *xs)
  582: {
  583:   int h,i;
  584:   h = 0;
  585:   for (i = 0 ;i < xs->len ;i++) {
  586:     h *= 97;
  587:     h += xs->str[i]<<4;