(linenum→info "unix/slp.c:2238")

anthy/9100e/src-worddic/ext_ent.c

    1: /*
    2:  * "123" "ABC" のような辞書にのってない
    3:  * 文字列に対する問合せの場合は全ての候補をここで生成する
    4:  * 上記の他に郵便番号へのアクセスも行なう
    5:  *
    6:  * Copyright (C) 2001-2005 TABATA Yusuke
    7:  * Copyright (C) 2004-2005 YOSHIDA Yuichi
    8:  *
    9:  */
   10: /*
   11:   This library is free software; you can redistribute it and/or
   12:   modify it under the terms of the GNU Lesser General Public
   13:   License as published by the Free Software Foundation; either
   14:   version 2 of the License, or (at your option) any later version.
   15: 
   16:   This library is distributed in the hope that it will be useful,
   17:   but WITHOUT ANY WARRANTY; without even the implied warranty of
   18:   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   19:   Lesser General Public License for more details.
   20: 
   21:   You should have received a copy of the GNU Lesser General Public
   22:   License along with this library; if not, write to the Free Software
   23:   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
   24:  */
   25: #include <stdlib.h>
   26: #include <string.h>
   27: #include <stdio.h>
   28: #include <anthy/anthy.h> /* for ANTHY_*_ENCODING */
   29: #include <anthy/conf.h>
   30: #include <anthy/xstr.h>
   31: #include <anthy/xchar.h>
   32: #include "dic_main.h"
   33: #include "dic_ent.h"
   34: 
   35: /* ext entry */
   36: static struct seq_ent unkseq_ent;/*未知文字列たとえば英文字列とか*/
   37: static struct seq_ent num_ent;/*数字など*/
   38: static struct seq_ent sep_ent;/*セパレータなど。*/
   39: /* ext entryのwtype*/
   40: static wtype_t wt_num;
   41: 
   42: static xchar narrow_wide_tab[]= {WIDE_0, WIDE_1, WIDE_2,
   43:                                  WIDE_3, WIDE_4, WIDE_5,
   44:                                  WIDE_6, WIDE_7, WIDE_8, WIDE_9};
   45: static int kj_num_tab[]={KJ_0, KJ_1, KJ_2, KJ_3, KJ_4,
   46:                          KJ_5, KJ_6, KJ_7, KJ_8, KJ_9};
   47: 
   48: struct zipcode_line {
   49:   int nr;
   50:   xstr **strs;
   51: };
   52: 
   53: /* 地名を追加する */
   54: static void
   55: pushback_place_name(struct zipcode_line *zl, char *pn)
   56: {
   57:   if (pn[0] == '#') {
   58:     return ;
   59:   }
   60:   zl->strs = realloc(zl->strs, sizeof(xstr *) * (zl->nr + 1));
   61:   zl->strs[zl->nr] = anthy_cstr_to_xstr(pn, ANTHY_EUC_JP_ENCODING);
   62:   zl->nr++;
   63: }
   64: 
   65: /* 郵便番号辞書をパースしてスペース区切りを検出する */
   66: static void
   67: parse_zipcode_line(struct zipcode_line *zl, char *ln)
   68: {
   69:   char buf[1000];
   70:   int i = 0;
   71:   while (*ln) {
   72:     buf[i] = *ln;
   73:     if (*ln == '\\') {
   74:       buf[i] = ln[1];
   75:       i ++;
   76:       if (ln[1]) {
   77:         ln ++;
   78:       }
   79:     } else if (*ln == ' ') {
   80:       buf[i] = 0;
   81:       i = 0;
   82:       pushback_place_name(zl, buf);
   83:     } else {
   84:       i ++;
   85:     }
   86:     /**/
   87:     ln ++;
   88:   }
   89:   buf[i] = 0;
   90:   pushback_place_name(zl, buf);
   91: }
   92: 
   93: /* 郵便番号辞書から探す */
   94: static void
   95: search_zipcode_dict(struct zipcode_line *zl, xstr* xs)
   96: {
   97:   FILE *fp;
   98:   char buf[1000];
   99:   int len;
  100:   xstr *temp;
  101:   char *index;
  102: 
  103:   zl->nr = 0;
  104:   zl->strs = NULL;
  105:   fp = fopen(anthy_conf_get_str("ZIPDICT_EUC"), "r");
  106:   if (!fp) {
  107:     return ;
  108:   }
  109:   
  110:   /* 半角、全角を吸収する */
  111:   temp = anthy_xstr_wide_num_to_num(xs);
  112:   index = anthy_xstr_to_cstr(temp, 0);
  113:   len = strlen(index);
  114: 
  115:   /* 全部grepする */
  116:   while (fgets(buf, 1000, fp)) {
  117:     /* 3文字の郵便番号が7文字の郵便番号の頭にマッチしないように */
  118:     if (!strncmp(buf, index, len) && buf[len] == ' ') {
  119:       /* 改行を消す */
  120:       buf[strlen(buf) - 1] = 0;
  121:       parse_zipcode_line(zl, &buf[len + 1]);
  122:     }
  123:   }
  124:   free(temp);
  125:   free(index);
  126:   fclose(fp);
  127: }
  128: 
  129: /* 郵便番号辞書の情報を解放する */
  130: static void
  131: free_zipcode_line(struct zipcode_line *zl)
  132: {
  133:   int i;
  134:   for (i = 0; i < zl->nr; i++) {
  135:     anthy_free_xstr(zl->strs[i]);
  136:   }
  137:   free(zl->strs);
  138: }
  139: 
  140: static int
  141: gen_zipcode(xstr* xs, xstr *dest, int nth)
  142: {
  143:   struct zipcode_line zl;
  144: 
  145:   /* 郵便番号辞書から地名を読み取る */
  146:   search_zipcode_dict(&zl, xs);
  147: 
  148:   /* 候補を取得する */
  149:   if (zl.nr > nth) {
  150:     dest->len = zl.strs[nth]->len;
  151:     dest->str = anthy_xstr_dup_str(zl.strs[nth]);
  152:     free_zipcode_line(&zl);
  153:     return 0;
  154:   } else {
  155:     free_zipcode_line(&zl);
  156:     return -1;
  157:   }
  158: }
  159: 
  160: 
  161: 
  162: /* 半角の数字から全角の数字を求める */
  163: static xchar
  164: narrow_num_to_wide_num(xchar xc)
  165: {
  166:   if (xc > '9' || xc < '0') {
  167:     return WIDE_0;
  168:   }
  169:   return narrow_wide_tab[(int)(xc - '0')];
  170: }
  171: 
  172: /* 全角の数字から半角の数字を求める */
  173: static xchar
  174: wide_num_to_narrow_num(xchar xc)
  175: {
  176:   int i;
  177:   for (i = 0; i < 10; i++) {
  178:     if (xc == narrow_wide_tab[i]) {
  179:       return i + '0';
  180:     }
  181:   }
  182:   return '0';
  183: }
  184: /*
  185:  * 一桁の整数を漢数字に変換する
  186:  */
  187: static xchar
  188: get_kj_num(int n)
  189: {
  190:   if (n > 9 || n < 1) {
  191:     return KJ_10;
  192:   }
  193:   return kj_num_tab[n];
  194: }
  195: 
  196: /*
  197:  * 4桁分の整数を漢字文字列としてを生成する
  198:  */
  199: static void
  200: compose_num_component(xstr *xs, long long num)
  201: {
  202:   int n[4],i;
  203:   int a[4] = { 0 , KJ_10, KJ_100, KJ_1000};
  204:   for (i = 0; i < 4; i++) {
  205:     n[i] = num-(num/10)*10;
  206:     num /= 10;
  207:   }
  208:   /* 10,100,1000の位 */
  209:   for (i = 3; i > 0; i--) {
  210:     if (n[i] > 0) {
  211:       if (n[i] > 1) {
  212:         anthy_xstrappend(xs, get_kj_num(n[i]));
  213:       }
  214:       anthy_xstrappend(xs, a[i]);
  215:     }
  216:   }
  217:   /* 1の位 */
  218:   if (n[0]) {
  219:     anthy_xstrappend(xs, get_kj_num(n[0]));
  220:   }
  221: }
  222: 
  223: /** 漢数字の文字列を作る */
  224: static int
  225: gen_kanji_num(long long num, xstr *dest)
  226: {
  227:   int i;
  228:   int n[10];
  229:   if (num < 1 || num >= 10000000000000000LL) {
  230:     return -1;
  231:   }
  232:   /* 4桁ずつ配列nにつめる */
  233:   for (i = 0; i < 10; i ++) {
  234:     n[i] = num-(num/10000)*10000;
  235:     num = num/10000;
  236:   }
  237:   /**/
  238:   dest->len = 0;
  239:   dest->str = 0;
  240:   /* 京の位をつくる */
  241:   if (n[3]) {
  242:     compose_num_component(dest, n[3]);
  243:     anthy_xstrappend(dest, KJ_1000000000000);
  244:   }
  245:   /* 億の位をつくる */
  246:   if (n[2]) {
  247:     compose_num_component(dest, n[2]);
  248:     anthy_xstrappend(dest, KJ_100000000);
  249:   }
  250:   /* 万の位をつくる */
  251:   if (n[1]) {
  252:     compose_num_component(dest, n[1]);
  253:     anthy_xstrappend(dest, KJ_10000);
  254:   }
  255:   /**/
  256:   compose_num_component(dest, n[0]);
  257:   return 0;
  258: }
  259: 
  260: static int
  261: get_nr_zipcode(xstr* xs)
  262: {
  263:   struct zipcode_line zl;
  264:   int nr = 0;
  265:   if (xs->len != 3 && xs->len != 7) {
  266:     return 0;
  267:   }
  268:   /* 郵便番号辞書から地名を読み取る */
  269:   search_zipcode_dict(&zl, xs);
  270: 
  271:   nr = zl.nr;
  272:   free_zipcode_line(&zl);
  273:   return nr;
  274: }
  275: 
  276: static int
  277: get_nr_num_ents(long long num)
  278: {
  279:   if (num > 0 && num < 10000000000000000LL) {
  280:     if (num > 999) {
  281:       /* アラビア数字(そのまま)、アラビア数字(全角半角切替え)、
  282:          漢数字、3桁区切り(全角、半角) */
  283:       return 5;
  284:     } else {
  285:       /* アラビア数字(そのまま)、全角半角切替え、漢数字 */
  286:       return 3;
  287:     }
  288:   } else {
  289:     /* アラビア数字(そのまま)、全角半角切替え */
  290:     return 2;
  291:   }
  292: }
  293: 
  294: 
  295: /*
  296:  * いくつの合成のエントリーがあるか
  297:  */
  298: int
  299: anthy_get_nr_dic_ents_of_ext_ent(seq_ent_t se, xstr *xs)
  300: {
  301:   if (se == &unkseq_ent) {
  302:     return 1;
  303:   }
  304:   if (anthy_get_xstr_type(xs) & (XCT_NUM|XCT_WIDENUM)) {
  305:     long long num = anthy_xstrtoll(xs);
  306:     return get_nr_num_ents(num) + get_nr_zipcode(xs);
  307:   }
  308:   return 0;
  309: }
  310: 
  311: /* 文字列の全角半角を交換する */
  312: static void
  313: toggle_wide_narrow(xstr *dest, xstr *src)
  314: {
  315:   int f, i;
  316:   dest->len = src->len;
  317:   dest->str = anthy_xstr_dup_str(src);
  318:   f = anthy_get_xstr_type(src) & XCT_WIDENUM;
  319:   for (i = 0; i < dest->len; i++) {
  320:     if (f) {
  321:       dest->str[i] = wide_num_to_narrow_num(src->str[i]);
  322:     } else {
  323:       dest->str[i] = narrow_num_to_wide_num(src->str[i]);
  324:     }
  325:   }
  326: }
  327: 
  328: /* 3桁に区切った数字を生成する */
  329: static int
  330: gen_separated_num(long long num, xstr *dest, int full)
  331: {
  332:   int width = 0, dot_count;
  333:   long long tmp;
  334:   int i, pos;
  335: 
  336:   if (num < 1000) {
  337:     return -1;
  338:   }
  339: 
  340:   /* 桁数を数える */
  341:   for (tmp = num; tmp != 0; tmp /= 10) {
  342:     width ++;
  343:   }
  344:   /* 点の数 */
  345:   dot_count = (width - 1) / 3;
  346:   /* 格納するのに必要な文字列を用意する */
  347:   dest->len = dot_count + width;
  348:   dest->str = malloc(sizeof(xchar)*dest->len);
  349: 
  350:   /* 右の桁から順に決めていく */
  351:   for (i = 0, pos = dest->len - 1; i < width; i++, pos --) {
  352:     int n = num % 10;
  353:     /* カンマを追加 */
  354:     if (i > 0 && (i % 3) == 0) {
  355:       if (full) {
  356:         dest->str[pos] = WIDE_COMMA;
  357:       } else {
  358:         dest->str[pos] = ',';
  359:       }
  360:       pos --;
  361:     }
  362:     if (full) {
  363:       /* 全角数字 */
  364:       dest->str[pos] = narrow_wide_tab[n];
  365:     } else {
  366:       /* ASCII数字 */
  367:       dest->str[pos] = 48 + n;
  368:     }
  369:     num /= 10;
  370:   }
  371:   return 0;
  372: }
  373: 
  374: /*
  375:  * nth個めの候補を取り出す
  376:  */
  377: int
  378: anthy_get_nth_dic_ent_str_of_ext_ent(seq_ent_t se, xstr *xs,
  379:                                      int nth, xstr *dest)
  380: {
  381:   if (nth == 0) {
  382:     /* 無変換文字列 */
  383:     dest->len = xs->len;
  384:     dest->str = anthy_xstr_dup_str(xs);
  385:     return 0;
  386:   }
  387:   if (se == &unkseq_ent) {
  388:     switch(nth) {
  389:     case 1:
  390:       /* 全角、半角のトグル */
  391:       return 0;
  392:     }
  393:   }
  394:   if (anthy_get_xstr_type(xs) & (XCT_NUM|XCT_WIDENUM)) {
  395:     long long num = anthy_xstrtoll(xs);
  396:     /* 漢数字、アラビア数字、全角半角切替え */
  397:     switch(nth) {
  398:     case 1:
  399:       /* 全角半角を入れ換えたもの */
  400:       toggle_wide_narrow(dest, xs);
  401:       return 0;
  402:     case 2:
  403:       /* 漢数字 */
  404:       if (!gen_kanji_num(num, dest)) {
  405:         return 0;
  406:       }
  407:       /* break無し */
  408:     case 3:
  409:       /* 3桁で区切った数字 */
  410:       if (!gen_separated_num(num, dest, 0)) {
  411:         return 0;
  412:       }
  413:       /* break無し */
  414:     case 4:
  415:       /* 3桁で区切った数字(全角) */
  416:       if (!gen_separated_num(num, dest, 1)) {
  417:         return 0;
  418:       }
  419:       /* break無し */
  420:     default:
  421:       /* 郵便番号 */
  422:       if (nth >= 5) {
  423:         if (xs->len == 3 || xs->len == 7) {
  424:           if (!gen_zipcode(xs, dest, nth-5)) {
  425:             return 0;
  426:           }
  427:         }
  428:       }
  429:       break;
  430:     }
  431:     return -1;
  432:   }
  433:   return 0;
  434: }
  435: 
  436: int
  437: anthy_get_ext_seq_ent_indep(struct seq_ent *se)
  438: {
  439:   if (se == &num_ent || se == &unkseq_ent) {
  440:     return 1;
  441:   }
  442:   return 0;
  443: }
  444: 
  445: /* 活用形を得る */
  446: int
  447: anthy_get_ext_seq_ent_ct(struct seq_ent *se, int pos, int ct)
  448: {
  449:   if (anthy_get_ext_seq_ent_pos(se, pos) && ct == CT_NONE) {
  450:     /* 品詞が合っていてかつ無活用の場合 
  451:        (ext_entは活用しない) */
  452:     return 10;
  453:   }
  454:   return 0;
  455: }
  456: 
  457: /* 品詞を取得する */
  458: int
  459: anthy_get_ext_seq_ent_pos(struct seq_ent *se, int pos)
  460: {
  461:   /* ext_entは名詞のみ */
  462:   if (se == &num_ent && pos == POS_NOUN) {
  463:     return 10;
  464:   }
  465:   if ((se == &unkseq_ent) && pos == POS_NOUN) {
  466:     return 10;
  467:   }
  468:   return 0;
  469: }
  470: 
  471: /*
  472:  * 辞書にのっていないシーケンスを解析
  473:  */
  474: seq_ent_t
  475: anthy_get_ext_seq_ent_from_xstr(xstr *x, int is_reverse)
  476: {
  477:   int t = anthy_get_xstr_type(x);
  478: 
  479:   /* 数字のみで構成されていれば num_ent */
  480:   if (t & (XCT_NUM | XCT_WIDENUM)) {
  481:     return &num_ent;
  482:   }
  483:   /* 英数ならunkseq */
  484:   if (t & XCT_ASCII) {
  485:     return &unkseq_ent;
  486:   }
  487:   if (t & XCT_KATA) {
  488:     return &unkseq_ent;
  489:   }
  490:   if (!is_reverse) {
  491:     /* 逆変換中は漢字候補は作らない */
  492:     if (t & XCT_KANJI) {
  493:       return &unkseq_ent;
  494:     }
  495:   }
  496:   if (x->len == 1) {
  497:     /* 辞書にのってなくて1文字ならセパレータ */
  498:     return &sep_ent;
  499:   }
  500:   return 0;
  501: }
  502: 
  503: int
  504: anthy_get_nth_dic_ent_wtype_of_ext_ent(xstr *xs, int nth,
  505:                                        wtype_t *wt)
  506: {
  507:   int type;
  508:   (void)nth;
  509:   type = anthy_get_xstr_type(xs);
  510:   if (type & (XCT_NUM | XCT_WIDENUM)) {
  511:     *wt = wt_num;
  512:     return 0;
  513:   }
  514:   if (type & XCT_KATA) {
  515:     *wt = anthy_get_wtype(POS_NOUN, COS_NONE, SCOS_NONE, CC_NONE,
  516:                           CT_NONE, WF_INDEP);
  517:     return 0;
  518:   }
  519:   return -1;
  520: }
  521: 
  522: int
  523: anthy_get_nth_dic_ent_freq_of_ext_ent(struct seq_ent *se, int nth)
  524: {
  525:   (void)se;
  526:   (void)nth;
  527:   return 100;
  528: }
  529: 
  530: int
  531: anthy_get_ext_seq_ent_wtype(struct seq_ent *se, wtype_t w)
  532: {
  533:   if (se == &num_ent) {
  534:     if (anthy_wtype_include(w, wt_num)) {
  535:       /* 数字の場合 */
  536:       return 10;
  537:     }
  538:     return 0;
  539:   }
  540:   if (anthy_wtype_get_pos(w) == POS_NOUN &&
  541:       anthy_wtype_get_cos(w) == COS_NONE &&
  542:       anthy_wtype_get_scos(w) == SCOS_NONE) {
  543:     /* 名詞、副品詞なし、副々品詞無しにマッチ */
  544:     return 10;
  545:   }
  546:   return 0;
  547: }
  548: 
  549: void
  550: anthy_init_ext_ent(void)
  551: {
  552:   /**/
  553:   unkseq_ent.seq_type = 0;
  554:   unkseq_ent.nr_dic_ents = 0;
  555:   num_ent.seq_type = 0;
  556:   num_ent.nr_dic_ents = 0;
  557:   sep_ent.seq_type = 0;
  558:   sep_ent.nr_dic_ents = 0;
  559:   /**/
  560:   wt_num = anthy_init_wtype_by_name("数詞");
  561: }
1
Syntax (Markdown)