(linenum→info "unix/slp.c:2238")

anthy/9100e/src-splitter/wordlist.c

    1: /*
    2:  * 文節の最小単位であるwordlistを構成する
    3:  *
    4:  * anthy_make_word_list_all() 
    5:  * 文節の形式を満たす部分文字列を列挙する
    6:  *  いくつかの経路で列挙されたword_listは
    7:  *  anthy_commit_word_listでsplitter_contextに追加される
    8:  *
    9:  * Funded by IPA未踏ソフトウェア創造事業 2002 2/27
   10:  * Copyright (C) 2000-2006 TABATA Yusuke
   11:  * Copyright (C) 2004-2006 YOSHIDA Yuichi
   12:  * Copyright (C) 2000-2003 UGAWA Tomoharu
   13:  *
   14:  * $Id: wordlist.c,v 1.50 2002/11/17 14:45:47 yusuke Exp $
   15:  *
   16:  */
   17: 
   18: #include <stdlib.h>
   19: #include <stdio.h>
   20: #include <string.h>
   21: #include <arpa/inet.h>
   22: 
   23: #include <anthy/alloc.h>
   24: #include <anthy/record.h>
   25: #include <anthy/xstr.h>
   26: #include <anthy/diclib.h>
   27: #include <anthy/wtype.h>
   28: #include <anthy/ruleparser.h>
   29: #include <anthy/dic.h>
   30: #include <anthy/splitter.h>
   31: #include <anthy/feature_set.h>
   32: #include "wordborder.h"
   33: 
   34: #define HF_THRESH 784
   35: 
   36: static void *weak_word_array;
   37: 
   38: /* デバッグ用 */
   39: void
   40: anthy_print_word_list(struct splitter_context *sc,
   41:                       struct word_list *wl)
   42: {
   43:   xstr xs;
   44:   if (!wl) {
   45:     printf("--\n");
   46:     return ;
   47:   }
   48:   /* 接頭辞 */
   49:   xs.len = wl->part[PART_CORE].from - wl->from;
   50:   xs.str = sc->ce[wl->from].c;
   51:   anthy_putxstr(&xs);
   52:   printf(".");
   53:   /* 自立語 */
   54:   xs.len = wl->part[PART_CORE].len;
   55:   xs.str = sc->ce[wl->part[PART_CORE].from].c;
   56:   anthy_putxstr(&xs);
   57:   printf(".");
   58:   /* 接尾辞 */
   59:   xs.len = wl->part[PART_POSTFIX].len;
   60:   xs.str = sc->ce[wl->part[PART_CORE].from + wl->part[PART_CORE].len].c;
   61:   anthy_putxstr(&xs);
   62:   printf("-");
   63:   /* 付属語 */
   64:   xs.len = wl->part[PART_DEPWORD].len;
   65:   xs.str = sc->ce[wl->part[PART_CORE].from +
   66:                   wl->part[PART_CORE].len +
   67:                   wl->part[PART_POSTFIX].len].c;
   68:   anthy_putxstr(&xs);
   69:   anthy_print_wtype(wl->part[PART_CORE].wt);
   70:   printf(" %s%s\n", anthy_seg_class_name(wl->seg_class),
   71:          (wl->is_compound ? ",compound" : ""));
   72: }
   73: 
   74: int
   75: anthy_dep_word_hash(xstr *xs)
   76: {
   77:   return anthy_xstr_hash(xs) % WORD_HASH_MAX;
   78: }
   79: 
   80: /** word_listを比較する、枝刈りのためなので、
   81:     厳密な比較である必要は無い */
   82: static int
   83: word_list_same(struct word_list *wl1, struct word_list *wl2)
   84: {
   85:   if (wl1->node_id != wl2->node_id ||
   86:       wl1->from != wl2->from ||
   87:       wl1->len != wl2->len ||
   88:       wl1->mw_features != wl2->mw_features ||
   89:       wl1->tail_ct != wl2->tail_ct ||
   90:       wl1->part[PART_CORE].len != wl2->part[PART_CORE].len ||
   91:       wl1->is_compound != wl2->is_compound ||
   92:       !anthy_wtype_equal(wl1->part[PART_CORE].wt, wl2->part[PART_CORE].wt) ||
   93:       wl1->head_pos != wl2->head_pos) {
   94:     return 0;
   95:   }
   96:   if (wl1->part[PART_DEPWORD].dc != wl2->part[PART_DEPWORD].dc) {
   97:     return 0;
   98:   }
   99:   /* 同じと判断 */
  100:   return 1;
  101: }
  102: 
  103: static void
  104: set_features(struct word_list *wl)
  105: {
  106:   if (anthy_wtype_get_pos(wl->part[PART_CORE].wt) == POS_NOUN &&
  107:       anthy_wtype_get_sv(wl->part[PART_CORE].wt)) {
  108:     wl->mw_features |= MW_FEATURE_SV;
  109:   }
  110:   if (wl->part[PART_POSTFIX].len || wl->part[PART_PREFIX].len) {
  111:     wl->mw_features |= MW_FEATURE_SUFFIX;
  112:   }
  113:   if (anthy_wtype_get_pos(wl->part[PART_CORE].wt) == POS_NUMBER) {
  114:     wl->mw_features |= MW_FEATURE_NUM;
  115:   }
  116:   if (wl->part[PART_CORE].len == 1) {
  117:     wl->mw_features |= MW_FEATURE_CORE1;
  118:   }
  119:   if (wl->part[PART_CORE].len == 0) {
  120:     wl->mw_features |= MW_FEATURE_DEP_ONLY;
  121:   }
  122:   if (wl->part[PART_CORE].freq > HF_THRESH) {
  123:     wl->mw_features |= MW_FEATURE_HIGH_FREQ;
  124:   }
  125: }
  126: 
  127: /** 作ったword_listのスコアを計算してからコミットする */
  128: void 
  129: anthy_commit_word_list(struct splitter_context *sc,
  130:                        struct word_list *wl)
  131: {
  132:   struct word_list *tmp;
  133:   xstr xs;
  134: 
  135:   /* 付属語だけのword_listで、長さ0のもやってくるので */
  136:   if (wl->len == 0) return;
  137:   /**/
  138:   wl->last_part = PART_DEPWORD;
  139: 
  140:   /**/
  141:   set_features(wl);
  142:   /* 文節境界の検索で使用するクラスの設定 */
  143:   anthy_set_seg_class(wl);
  144:   /**/
  145:   xs.len = wl->part[PART_DEPWORD].len;
  146:   xs.str = sc->ce[wl->part[PART_POSTFIX].from + wl->part[PART_POSTFIX].len].c;
  147:   wl->dep_word_hash = anthy_dep_word_hash(&xs);
  148:   if (wl->part[PART_POSTFIX].len) {
  149:     xs.len = wl->part[PART_POSTFIX].len;
  150:     xs.str = sc->ce[wl->part[PART_POSTFIX].from].c;
  151:   }
  152: 
  153:   /* 同じ内容のword_listがないかを調べる */
  154:   for (tmp = sc->word_split_info->cnode[wl->from].wl; tmp; tmp = tmp->next) {
  155:     if (word_list_same(tmp, wl)) {
  156:       return ;
  157:     }
  158:   }
  159:   /* wordlistのリストに追加 */
  160:   wl->next = sc->word_split_info->cnode[wl->from].wl;
  161:   sc->word_split_info->cnode[wl->from].wl = wl;
  162: 
  163:   /* デバッグプリント */
  164:   if (anthy_splitter_debug_flags() & SPLITTER_DEBUG_WL) {
  165:     anthy_print_word_list(sc, wl);
  166:   }
  167: }
  168: 
  169: struct word_list *
  170: anthy_alloc_word_list(struct splitter_context *sc)
  171: {
  172:   return anthy_smalloc(sc->word_split_info->WlAllocator);
  173: }
  174: 
  175: /* 後続の活用語尾、助詞、助動詞を付ける */
  176: static void
  177: make_following_word_list(struct splitter_context *sc,
  178:                          struct word_list *tmpl)
  179: {
  180:   /* このxsは自立語部の後続の文字列 */
  181:   xstr xs;
  182:   xs.str = sc->ce[tmpl->from+tmpl->len].c;
  183:   xs.len = sc->char_count - tmpl->from - tmpl->len;
  184:   tmpl->part[PART_DEPWORD].from =
  185:     tmpl->part[PART_POSTFIX].from + tmpl->part[PART_POSTFIX].len;
  186: 
  187:   if (tmpl->node_id >= 0) {
  188:     /* 普通のword_list */
  189:     anthy_scan_node(sc, tmpl, &xs, tmpl->node_id);
  190:   } else {
  191:     /* 自立語がないword_list */
  192:     struct wordseq_rule rule;
  193:     struct word_list new_tmpl;
  194:     int i;
  195:     int nr_rule = anthy_get_nr_dep_rule();
  196:     new_tmpl = *tmpl;
  197:     /* 名詞35の後に続くルールに対して */
  198:     for (i = 0; i < nr_rule; ++i) {
  199:       anthy_get_nth_dep_rule(i, &rule);
  200:       if (anthy_wtype_get_pos(rule.wt) == POS_NOUN
  201:           && anthy_wtype_get_scos(rule.wt) == SCOS_T35) {
  202:         new_tmpl.part[PART_CORE].wt = rule.wt;
  203:         new_tmpl.node_id = rule.node_id;
  204:         new_tmpl.head_pos = anthy_wtype_get_pos(new_tmpl.part[PART_CORE].wt);
  205:         anthy_scan_node(sc, &new_tmpl, &xs, new_tmpl.node_id);
  206:       }
  207:     }
  208:   }
  209: }
  210: 
  211: static void
  212: push_part_back(struct word_list *tmpl, int len,
  213:                seq_ent_t se, wtype_t wt)
  214: {
  215:   tmpl->len += len;
  216:   tmpl->part[PART_POSTFIX].len += len;
  217:   tmpl->part[PART_POSTFIX].wt = wt;
  218:   tmpl->part[PART_POSTFIX].seq = se;
  219:   tmpl->last_part = PART_POSTFIX;
  220: }
  221: 
  222: /* 接尾辞をくっつける */
  223: static void 
  224: make_suc_words(struct splitter_context *sc,
  225:                struct word_list *tmpl)
  226: {
  227:   int i, right;
  228: 
  229:   wtype_t core_wt = tmpl->part[PART_CORE].wt;
  230:   /* 数詞、名前、サ変名詞のいずれかに付属語は付く */
  231:   int core_is_num = 0;
  232:   int core_is_name = 0;
  233:   int core_is_sv_noun = 0;
  234: 
  235:   /* まず、接尾辞が付く自立語かチェックする */
  236:   if (anthy_wtype_include(anthy_wtype_num_noun, core_wt)) {
  237:     core_is_num = 1;
  238:   }
  239:   if (anthy_wtype_include(anthy_wtype_name_noun, core_wt)) {
  240:     core_is_name = 1;
  241:   }
  242:   if (anthy_wtype_get_sv(core_wt)) {
  243:     core_is_sv_noun = 1;
  244:   }
  245:   if (!core_is_num && !core_is_name && !core_is_sv_noun) {
  246:     return ;
  247:   }
  248: 
  249:   right = tmpl->part[PART_CORE].from + tmpl->part[PART_CORE].len;
  250:   /* 自立語の右側の文字列に対して */
  251:   for (i = 1;
  252:        i <= sc->word_split_info->seq_len[right];
  253:        i++){
  254:     xstr xs;
  255:     seq_ent_t suc;
  256:     xs.str = sc->ce[right].c;
  257:     xs.len = i;
  258:     suc = anthy_get_seq_ent_from_xstr(&xs, sc->is_reverse);
  259:     if (anthy_get_seq_ent_pos(suc, POS_SUC)) {
  260:       /* 右側の文字列は付属語なので、自立語の品詞にあわせてチェック */
  261:       struct word_list new_tmpl;
  262:       if (core_is_num &&
  263:           anthy_get_seq_ent_wtype_freq(suc, anthy_wtype_num_postfix)) {
  264:         new_tmpl = *tmpl;
  265:         push_part_back(&new_tmpl, i, suc, anthy_wtype_num_postfix);
  266:         make_following_word_list(sc, &new_tmpl);
  267:       }
  268:       if (core_is_name &&
  269:           anthy_get_seq_ent_wtype_freq(suc, anthy_wtype_name_postfix)) {
  270:         new_tmpl = *tmpl;
  271:         push_part_back(&new_tmpl, i, suc, anthy_wtype_name_postfix);
  272:         make_following_word_list(sc, &new_tmpl);
  273:       }
  274:       if (core_is_sv_noun &&
  275:           anthy_get_seq_ent_wtype_freq(suc, anthy_wtype_sv_postfix)) {
  276:         new_tmpl = *tmpl;
  277:         push_part_back(&new_tmpl, i, suc, anthy_wtype_sv_postfix);
  278:         make_following_word_list(sc, &new_tmpl);
  279:       }
  280:     }
  281:   }
  282: }
  283: 
  284: static void
  285: push_part_front(struct word_list *tmpl, int len,
  286:                 seq_ent_t se, wtype_t wt)
  287: {
  288:   tmpl->from = tmpl->from - len;
  289:   tmpl->len = tmpl->len + len;
  290:   tmpl->part[PART_PREFIX].from = tmpl->from;
  291:   tmpl->part[PART_PREFIX].len += len;
  292:   tmpl->part[PART_PREFIX].wt = wt;
  293:   tmpl->part[PART_PREFIX].seq = se;
  294: }
  295: 
  296: /* 接頭辞をくっつけてから接尾辞をくっつける */
  297: static void
  298: make_pre_words(struct splitter_context *sc,
  299:                struct word_list *tmpl)
  300: {
  301:   int i;
  302:   wtype_t core_wt = tmpl->part[PART_CORE].wt;
  303:   int core_is_num = 0;
  304:   /* 自立語は数詞か? */
  305:   if (anthy_wtype_include(anthy_wtype_num_noun, core_wt)) {
  306:     core_is_num = 1;
  307:   }
  308:   /* 接頭辞を列挙する */
  309:   for (i = 1; 
  310:        i <= sc->word_split_info->rev_seq_len[tmpl->part[PART_CORE].from];
  311:        i++) {
  312:     seq_ent_t pre;
  313:     /* このxsは自立語部の前の文字列 */
  314:     xstr xs;
  315:     xs.str = sc->ce[tmpl->part[PART_CORE].from - i].c;
  316:     xs.len = i;
  317:     pre = anthy_get_seq_ent_from_xstr(&xs, sc->is_reverse);
  318:     if (anthy_get_seq_ent_pos(pre, POS_PRE)) {
  319:       struct word_list new_tmpl;
  320:       if (core_is_num &&
  321:           anthy_get_seq_ent_wtype_freq(pre, anthy_wtype_num_prefix)) {
  322:         new_tmpl = *tmpl;
  323:         push_part_front(&new_tmpl, i, pre, anthy_wtype_num_prefix);
  324:         make_following_word_list(sc, &new_tmpl);
  325:         /* 数の場合は接尾辞もくっつける */
  326:         make_suc_words(sc, &new_tmpl);
  327:       }/* else if (anthy_get_seq_ent_wtype_freq(pre, anthy_wtype_prefix)) {
  328:         new_tmpl = *tmpl;
  329:         push_part_front(&new_tmpl, i, pre, anthy_wtype_prefix);
  330:         make_following_word_list(sc, &new_tmpl);
  331:         }*/
  332:     }
  333:   }
  334: }
  335: 
  336: /* wordlistを初期化する */
  337: static void
  338: setup_word_list(struct word_list *wl, int from, int len,
  339:                 int is_compound, int is_weak)
  340: {
  341:   int i;
  342:   wl->from = from;
  343:   wl->len = len;
  344:   wl->is_compound = is_compound;
  345:   /* partの配列を初期化する */
  346:   for (i = 0; i < NR_PARTS; i++) {
  347:     wl->part[i].from = 0;
  348:     wl->part[i].len = 0;
  349:     wl->part[i].wt = anthy_wt_none;
  350:     wl->part[i].seq = 0;
  351:     wl->part[i].freq = 1;/* 頻度の低い単語としておく */
  352:     wl->part[i].dc = DEP_NONE;
  353:   }
  354:   /* 自立語のパートを設定 */
  355:   wl->part[PART_CORE].from = from;
  356:   wl->part[PART_CORE].len = len;
  357:   /**/
  358:   wl->mw_features = MW_FEATURE_NONE;
  359:   wl->node_id = -1;
  360:   wl->last_part = PART_CORE;
  361:   wl->head_pos = POS_NONE;
  362:   wl->tail_ct = CT_NONE;
  363:   if (is_weak) {
  364:     wl->mw_features |= MW_FEATURE_WEAK_SEQ;
  365:   }
  366: }
  367: 
  368: /*
  369:  * ある独立語に対して、接頭辞、接尾辞、付属語を付けたものを
  370:  * 文節の候補(=word_list)としてcacheに追加する
  371:  */
  372: static void
  373: make_word_list(struct splitter_context *sc,
  374:                seq_ent_t se,
  375:                int from, int len,
  376:                int is_compound,
  377:                int is_weak)
  378: {
  379:   struct word_list tmpl;
  380:   struct wordseq_rule rule;
  381:   int nr_rule = anthy_get_nr_dep_rule();
  382:   int i;
  383: 
  384:   /* テンプレートの初期化 */
  385:   setup_word_list(&tmpl, from, len, is_compound, is_weak);
  386:   tmpl.part[PART_CORE].seq = se;
  387: 
  388:   /* 各ルールにマッチするか比較 */
  389:   for (i = 0; i < nr_rule; ++i) {
  390:     int freq;
  391:     anthy_get_nth_dep_rule(i, &rule);
  392:     if (!is_compound) {
  393:       freq = anthy_get_seq_ent_wtype_freq(se, rule.wt);
  394:     } else {
  395:       freq = anthy_get_seq_ent_wtype_compound_freq(se, rule.wt);
  396:     }
  397: 
  398:     if (freq) {
  399:       /* 自立語の品詞はそのルールにあっている */
  400:       if (anthy_splitter_debug_flags() & SPLITTER_DEBUG_ID) {
  401:         /* 品詞表のデバッグ用*/
  402:         xstr xs;
  403:         xs.str = sc->ce[tmpl.part[PART_CORE].from].c;
  404:         xs.len = tmpl.part[PART_CORE].len;
  405:         anthy_putxstr(&xs);
  406:         printf(" freq=%d rule_id=%d node_id=%d\n",
  407:                freq, i, rule.node_id);
  408:       }
  409:       /* 遷移したルールの情報を転記する */
  410:       tmpl.part[PART_CORE].wt = rule.wt;
  411:       tmpl.part[PART_CORE].freq = freq;
  412:       tmpl.node_id = rule.node_id;
  413:       tmpl.head_pos = anthy_wtype_get_pos(tmpl.part[PART_CORE].wt);
  414: 
  415:       /**/
  416:       tmpl.part[PART_POSTFIX].from =
  417:         tmpl.part[PART_CORE].from +
  418:         tmpl.part[PART_CORE].len;
  419:       /**/
  420:       if (anthy_wtype_get_pos(rule.wt) == POS_NOUN ||
  421:           anthy_wtype_get_pos(rule.wt) == POS_NUMBER) {
  422:         /* 接頭辞、接尾辞は名詞、数詞にしか付かないことにしている */
  423:         make_pre_words(sc, &tmpl);
  424:         make_suc_words(sc, &tmpl);
  425:       }
  426:       /* 接頭辞、接尾辞無しで助詞助動詞をつける */
  427:       make_following_word_list(sc, &tmpl);
  428:     }
  429:   }
  430: }
  431: 
  432: static void
  433: make_dummy_head(struct splitter_context *sc)
  434: {
  435:   struct word_list tmpl;
  436:   setup_word_list(&tmpl, 0, 0, 0, 0);
  437:   tmpl.part[PART_CORE].seq = 0;
  438:   tmpl.part[PART_CORE].wt = anthy_wtype_noun;
  439: 
  440:   tmpl.head_pos = anthy_wtype_get_pos(tmpl.part[PART_CORE].wt);
  441:   make_suc_words(sc, &tmpl);
  442: }
  443: 
  444: static int
  445: compare_hash(const void *kp, const void *cp)
  446: {
  447:   const int *h = kp;
  448:   const int *c = cp;
  449:   return (*h) - ntohl(*c);
  450: }
  451: 
  452: static int
  453: check_weak(xstr *xs)
  454: {
  455:   const int *array = (int *)weak_word_array;
  456:   int nr;
  457:   int h;
  458:   if (!array) {
  459:     return 0;
  460:   }
  461:   nr = ntohl(array[1]);
  462:   h = anthy_xstr_hash(xs);
  463:   if (bsearch(&h, &array[16], nr,
  464:               sizeof(int), compare_hash)) {
  465:     return 1;
  466:   }
  467:   return 0;
  468: }
  469: 
  470: /* コンテキストに設定された文字列の部分文字列から全てのword_listを列挙する */
  471: void 
  472: anthy_make_word_list_all(struct splitter_context *sc)
  473: {
  474:   int i, j;
  475:   xstr xs;
  476:   seq_ent_t se;
  477:   struct depword_ent {
  478:     struct depword_ent *next;
  479:     int from, len;
  480:     int is_compound;
  481:     int is_weak;
  482:     seq_ent_t se;
  483:   } *head, *de;
  484:   struct word_split_info_cache *info;
  485:   allocator de_ator;
  486: 
  487:   weak_word_array = anthy_file_dic_get_section("weak_words");
  488: 
  489:   info = sc->word_split_info;
  490:   head = NULL;
  491:   de_ator = anthy_create_allocator(sizeof(struct depword_ent), 0);
  492: 
  493:   xs.str = sc->ce[0].c;
  494:   xs.len = sc->char_count;
  495:   anthy_gang_load_dic(&xs, sc->is_reverse);
  496: 
  497:   /* 全ての自立語を列挙 */
  498:   /* 開始地点のループ */
  499:   for (i = 0; i < sc->char_count ; i++) {
  500:     int search_len = sc->char_count - i;
  501:     int search_from = 0;
  502:     if (search_len > 30) {
  503:       search_len = 30;
  504:     }
  505: 
  506:     /* 文字列長のループ(長い方から) */
  507:     for (j = search_len; j > search_from; j--) {
  508:       /* seq_entを取得する */
  509:       xs.len = j;
  510:       xs.str = sc->ce[i].c;
  511:       se = anthy_get_seq_ent_from_xstr(&xs, sc->is_reverse);
  512: 
  513:       /* 単語として認識できない */
  514:       if (!se) {
  515:         continue;
  516:       }
  517: 
  518:       /* 各、部分文字列が単語ならば接頭辞、接尾辞の
  519:          最大長を調べてマークする */
  520:       if (j > info->seq_len[i] &&
  521:           anthy_get_seq_ent_pos(se, POS_SUC)) {
  522:         info->seq_len[i] = j;
  523:       }
  524:       if (j > info->rev_seq_len[i + j] &&
  525:           anthy_get_seq_ent_pos(se, POS_PRE)) {
  526:         info->rev_seq_len[i + j] = j;
  527:       }
  528: