(linenum→info "unix/slp.c:2238")

anthy/9100e/src-splitter/metaword.c

    1: /*
    2:  * 文節もしくは単語を一つ以上セットにしてmetawordとして扱う。
    3:  * ここでは各種のmetawordを生成する
    4:  *
    5:  * init_metaword_tab() metaword処理のための情報を構成する
    6:  * anthy_make_metaword_all() context中のmetawordを構成する
    7:  * anthy_print_metaword() 指定されたmetawordを表示する
    8:  *
    9:  * Funded by IPA未踏ソフトウェア創造事業 2001 10/29
   10:  * Copyright (C) 2000-2006 TABATA Yusuke
   11:  * Copyright (C) 2004-2006 YOSHIDA Yuichi
   12:  * Copyright (C) 2000-2003 UGAWA Tomoharu
   13:  */
   14: #include <stdlib.h>
   15: #include <stdio.h>
   16: #include <math.h>
   17: 
   18: #include <anthy/record.h>
   19: #include <anthy/splitter.h>
   20: #include <anthy/xstr.h>
   21: #include <anthy/segment.h>
   22: #include <anthy/segclass.h>
   23: #include "wordborder.h"
   24: 
   25: /* 各種meta_wordをどのように処理するか */
   26: struct metaword_type_tab_ anthy_metaword_type_tab[] = {
   27:   {MW_DUMMY,"dummy",MW_STATUS_NONE,MW_CHECK_SINGLE},
   28:   {MW_SINGLE,"single",MW_STATUS_NONE,MW_CHECK_SINGLE},
   29:   {MW_WRAP,"wrap",MW_STATUS_WRAPPED,MW_CHECK_WRAP},
   30:   {MW_COMPOUND_HEAD,"compound_head",MW_STATUS_NONE,MW_CHECK_COMPOUND},
   31:   {MW_COMPOUND,"compound",MW_STATUS_NONE,MW_CHECK_NONE},
   32:   {MW_COMPOUND_LEAF,"compound_leaf",MW_STATUS_COMPOUND,MW_CHECK_NONE},
   33:   {MW_COMPOUND_PART,"compound_part",MW_STATUS_COMPOUND_PART,MW_CHECK_SINGLE},
   34:   {MW_V_RENYOU_A,"v_renyou_a",MW_STATUS_COMBINED,MW_CHECK_BORDER},
   35:   {MW_V_RENYOU_NOUN,"v_renyou_noun",MW_STATUS_COMBINED,MW_CHECK_BORDER},
   36:   {MW_NUMBER,"number",MW_STATUS_COMBINED,MW_CHECK_NUMBER},
   37:   {MW_OCHAIRE,"ochaire",MW_STATUS_OCHAIRE,MW_CHECK_OCHAIRE},
   38:   /**/
   39:   {MW_END,"end",MW_STATUS_NONE,MW_CHECK_NONE}
   40: };
   41: 
   42: static void
   43: combine_metaword(struct splitter_context *sc, struct meta_word *mw);
   44: 
   45: /* コンテキスト中にmetawordを追加する */
   46: void
   47: anthy_commit_meta_word(struct splitter_context *sc,
   48:                        struct meta_word *mw)
   49: {
   50:   struct word_split_info_cache *info = sc->word_split_info;
   51:   /* 同じ開始点を持つノードのリスト */
   52:   mw->next = info->cnode[mw->from].mw;
   53:   info->cnode[mw->from].mw = mw;
   54:   /**/
   55:   if (anthy_splitter_debug_flags() & SPLITTER_DEBUG_MW) {
   56:     anthy_print_metaword(sc, mw);
   57:   }
   58: }
   59: 
   60: static void
   61: print_metaword_features(int features)
   62: {
   63:   if (features & MW_FEATURE_SV) {
   64:     printf(":sv");
   65:   }
   66:   if (features & MW_FEATURE_WEAK_CONN) {
   67:     printf(":weak");
   68:   }
   69:   if (features & MW_FEATURE_SUFFIX) {
   70:     printf(":suffix");
   71:   }
   72:   if (features & MW_FEATURE_NUM) {
   73:     printf(":num");
   74:   }
   75:   if (features & MW_FEATURE_CORE1) {
   76:     printf(":c1");
   77:   }
   78:   if (features & MW_FEATURE_HIGH_FREQ) {
   79:     printf(":hf");
   80:   }
   81: }
   82: 
   83: static void
   84: anthy_do_print_metaword(struct splitter_context *sc,
   85:                         struct meta_word *mw,
   86:                         int indent)
   87: {
   88:   int i;
   89:   for (i = 0; i < indent; i++) {
   90:     printf(" ");
   91:   }
   92:   printf("*meta word type=%s(%d-%d):score=%d:seg_class=%s",
   93:          anthy_metaword_type_tab[mw->type].name,
   94:          mw->from, mw->len, mw->score,
   95:          anthy_seg_class_name(mw->seg_class));
   96:   print_metaword_features(mw->mw_features);
   97:   printf(":can_use=%d*\n", mw->can_use);
   98:   if (mw->wl) {
   99:     anthy_print_word_list(sc, mw->wl);
  100:   }
  101:   if (mw->cand_hint.str) {
  102:     printf("(");
  103:     anthy_putxstr(&mw->cand_hint);
  104:     printf(")\n");
  105:   }
  106:   if (mw->mw1) {
  107:     anthy_do_print_metaword(sc, mw->mw1, indent + 1);
  108:   }    
  109:   if (mw->mw2) {
  110:     anthy_do_print_metaword(sc, mw->mw2, indent + 1);
  111:   }
  112: }
  113: 
  114: void
  115: anthy_print_metaword(struct splitter_context *sc,
  116:                      struct meta_word *mw)
  117: {
  118:   anthy_do_print_metaword(sc, mw, 0);
  119: }
  120: 
  121: static struct meta_word *
  122: alloc_metaword(struct splitter_context *sc)
  123: {
  124:   struct meta_word *mw;
  125:   mw = anthy_smalloc(sc->word_split_info->MwAllocator);
  126:   mw->type = MW_SINGLE;
  127:   mw->score = 0;
  128:   mw->struct_score = 0;
  129:   mw->dep_word_hash = 0;
  130:   mw->core_wt = anthy_wt_none;
  131:   mw->mw_features = 0;
  132:   mw->dep_class = DEP_NONE;
  133:   mw->wl = NULL;
  134:   mw->mw1 = NULL;
  135:   mw->mw2 = NULL;
  136:   mw->cand_hint.str = NULL;
  137:   mw->cand_hint.len = 0;
  138:   mw->seg_class = SEG_HEAD;
  139:   mw->can_use = ok;
  140:   return mw;
  141: }
  142: 
  143: 
  144: /*
  145:  * wlの接頭辞部分と接尾辞部分を文字列として取り出す
  146:  */
  147: static void
  148: get_surrounding_text(struct splitter_context* sc,
  149:                      struct word_list* wl,
  150:                      xstr* xs_pre, xstr* xs_post)
  151: {
  152:     int post_len = wl->part[PART_DEPWORD].len + wl->part[PART_POSTFIX].len;
  153:     int pre_len = wl->part[PART_PREFIX].len;
  154: 
  155:     xs_pre->str = sc->ce[wl->from].c;
  156:     xs_pre->len = pre_len;
  157:     xs_post->str = sc->ce[wl->from + wl->len - post_len].c;
  158:     xs_post->len = post_len;
  159: }
  160: 
  161: /*
  162:  * 複合語であるwlからn番めの部分を取り出してmwにする
  163:  */
  164: static struct meta_word*
  165: make_compound_nth_metaword(struct splitter_context* sc, 
  166:                            compound_ent_t ce, int nth,
  167:                            struct word_list* wl,
  168:                            enum metaword_type type)
  169: {
  170:   int i;
  171:   int len = 0;
  172:   int from = wl->from;
  173:   int seg_num = anthy_compound_get_nr_segments(ce);
  174:   struct meta_word* mw;
  175:   xstr xs_pre, xs_core, xs_post;
  176: 
  177:   get_surrounding_text(sc, wl, &xs_pre, &xs_post);
  178: 
  179:   for (i = 0; i <= nth; ++i) {
  180:     from += len;
  181:     len = anthy_compound_get_nth_segment_len(ce, i);
  182:     if (i == 0) {
  183:       len += xs_pre.len;
  184:     }
  185:     if (i == seg_num - 1) {
  186:       len += xs_post.len;
  187:     }
  188:   }
  189:   
  190:   mw = alloc_metaword(sc);
  191:   mw->from = from;
  192:   mw->len = len;
  193:   mw->type = type;
  194:   mw->score = 1000;
  195:   mw->seg_class = wl->seg_class;
  196: 
  197:   anthy_compound_get_nth_segment_xstr(ce, nth, &xs_core);
  198:   if (nth == 0) {
  199:     anthy_xstrcat(&mw->cand_hint, &xs_pre);
  200:   }
  201:   anthy_xstrcat(&mw->cand_hint, &xs_core);
  202:   if (nth == seg_num - 1) {
  203:     anthy_xstrcat(&mw->cand_hint, &xs_post);
  204:   }
  205:   return mw;
  206: }
  207: 
  208: 
  209: /*
  210:  * metawordを実際に結合する
  211:  */
  212: static struct meta_word *
  213: anthy_do_cons_metaword(struct splitter_context *sc,
  214:                        enum metaword_type type,
  215:                        struct meta_word *mw, struct meta_word *mw2)
  216: {
  217:   struct meta_word *n;
  218:  
  219:   n = alloc_metaword(sc);
  220:   n->from = mw->from;
  221:   n->len = mw->len + (mw2 ? mw2->len : 0);
  222: 
  223:   if (mw2) {
  224:     n->score = sqrt(mw->score) * sqrt(mw2->score);
  225:   } else {
  226:     n->score = mw->score;
  227:   }
  228:   n->type = type;
  229:   n->mw1 = mw;
  230:   n->mw2 = mw2;
  231:   if (mw2) {
  232:     n->seg_class = mw2->seg_class;
  233:     n->nr_parts = mw->nr_parts + mw2->nr_parts;
  234:     n->dep_word_hash = mw2->dep_word_hash;
  235:   } else {
  236:     n->seg_class = mw->seg_class;
  237:     n->nr_parts = mw->nr_parts;
  238:     n->dep_word_hash = mw->dep_word_hash;
  239:   }
  240:   anthy_commit_meta_word(sc, n);
  241:   return n;
  242: }
  243: 
  244: /*
  245:  * 複合語用のmeta_wordを作成する。
  246:  */
  247: static void
  248: make_compound_metaword(struct splitter_context* sc, struct word_list* wl)
  249: {
  250:   int i, j;
  251:   seq_ent_t se = wl->part[PART_CORE].seq;
  252:   int ent_num = anthy_get_nr_dic_ents(se, NULL);
  253: 
  254:   for (i = 0; i < ent_num; ++i) {
  255:     compound_ent_t ce;
  256:     int seg_num;
  257:     struct meta_word *mw = NULL;
  258:     struct meta_word *mw2 = NULL;
  259:     if (!anthy_get_nth_dic_ent_is_compound(se, i)) {
  260:       continue;
  261:     }
  262:     ce = anthy_get_nth_compound_ent(se, i);
  263:     seg_num = anthy_compound_get_nr_segments(ce);
  264: 
  265:     for (j = seg_num - 1; j >= 0; --j) {
  266:       enum metaword_type type;
  267:       mw = make_compound_nth_metaword(sc, ce, j, wl, MW_COMPOUND_LEAF);
  268:       anthy_commit_meta_word(sc, mw);
  269: 
  270:       type = j == 0 ? MW_COMPOUND_HEAD : MW_COMPOUND;
  271:       mw2 = anthy_do_cons_metaword(sc, type, mw, mw2);
  272:     }
  273:   }
  274: }
  275: 
  276: /*
  277:  * 複合語の中の個々の文節を結合したmeta_wordを作成する。
  278:  */
  279: static void
  280: make_compound_part_metaword(struct splitter_context* sc, struct word_list* wl)
  281: {
  282:   int i, j, k;
  283:   seq_ent_t se = wl->part[PART_CORE].seq;
  284:   int ent_num = anthy_get_nr_dic_ents(se, NULL);
  285: 
  286:   for (i = 0; i < ent_num; ++i) {
  287:     compound_ent_t ce;
  288:     int seg_num;
  289:     struct meta_word *mw = NULL;
  290:     struct meta_word *mw2 = NULL;
  291: 
  292:     if (!anthy_get_nth_dic_ent_is_compound(se, i)) {
  293:       continue;
  294:     }
  295: 
  296:     ce = anthy_get_nth_compound_ent(se, i);
  297:     seg_num = anthy_compound_get_nr_segments(ce);
  298: 
  299:     /* 後ろから */
  300:     for (j = seg_num - 1; j >= 0; --j) {
  301:       mw = make_compound_nth_metaword(sc, ce, j, wl, MW_COMPOUND_PART);
  302:       for (k = j - 1; k >= 0; --k) {
  303:         mw2 = make_compound_nth_metaword(sc, ce, k, wl, MW_COMPOUND_PART);
  304:         mw2->len += mw->len;
  305:         mw2->score += mw->score;
  306:         anthy_xstrcat(&mw2->cand_hint, &mw->cand_hint);
  307: 
  308:         anthy_commit_meta_word(sc, mw2);       
  309:         mw = mw2;
  310:       }
  311:     } 
  312:   }
  313: }
  314: 
  315: /*
  316:  * 単文節単語
  317:  */
  318: static void
  319: make_simple_metaword(struct splitter_context *sc, struct word_list* wl)
  320: {
  321:   struct meta_word *mw = alloc_metaword(sc);
  322:   mw->wl = wl;
  323:   mw->from = wl->from;
  324:   mw->len = wl->len;
  325:   mw->score = 1000;
  326:   mw->type = MW_SINGLE;
  327:   mw->dep_class = wl->part[PART_DEPWORD].dc;
  328:   mw->seg_class = wl->seg_class;
  329:   if (wl->part[PART_CORE].len) {
  330:     mw->core_wt = wl->part[PART_CORE].wt;
  331:   }
  332:   mw->nr_parts = NR_PARTS;
  333:   mw->dep_word_hash = wl->dep_word_hash;
  334:   mw->mw_features = wl->mw_features;
  335:   anthy_commit_meta_word(sc, mw);
  336: }
  337: 
  338: /*
  339:  * wordlist一個からなる、metawordを作成
  340:  */
  341: static void
  342: make_metaword_from_word_list(struct splitter_context *sc)
  343: {
  344:   int i;
  345:   for (i = 0; i < sc->char_count; i++) {
  346:     struct word_list *wl;
  347:     for (wl = sc->word_split_info->cnode[i].wl;
  348:          wl; wl = wl->next) {
  349:       if (wl->is_compound) {
  350:         make_compound_part_metaword(sc, wl);
  351:         make_compound_metaword(sc, wl);
  352:       } else {
  353:         make_simple_metaword(sc, wl);
  354:       }
  355:     }
  356:   }
  357: }
  358: 
  359: /*
  360:  * metawordをリスト風に結合する
  361:  */
  362: static struct meta_word *
  363: list_metaword(struct splitter_context *sc,
  364:               enum metaword_type type,
  365:               struct meta_word *mw, struct meta_word *mw2)
  366: {
  367:   struct meta_word *wrapped_mw = anthy_do_cons_metaword(sc, type, mw2, NULL);
  368:   struct meta_word *n = anthy_do_cons_metaword(sc, type, mw, wrapped_mw);
  369: 
  370:   n->mw_features = mw->mw_features | mw2->mw_features;
  371: 
  372:   return n;
  373: }
  374: 
  375: /*
  376:  * 動詞連用形 + 形容詞化接尾語 「〜しやすい」など
  377:  */
  378: static void
  379: try_combine_v_renyou_a(struct splitter_context *sc,
  380:                        struct meta_word *mw, struct meta_word *mw2)
  381: {
  382:   wtype_t w2;
  383:   if (!mw->wl || !mw2->wl) return;
  384: 
  385:   w2 = mw2->wl->part[PART_CORE].wt;
  386: 
  387:   if (mw->wl->head_pos == POS_V &&
  388:       mw->wl->tail_ct == CT_RENYOU &&
  389:       anthy_wtype_get_pos(w2) == POS_D2KY) {
  390:     /* 形容詞ではあるので次のチェック */
  391:     if (anthy_get_seq_ent_wtype_freq(mw2->wl->part[PART_CORE].seq, 
  392:                                      anthy_wtype_a_tail_of_v_renyou)) {
  393:       list_metaword(sc, MW_V_RENYOU_A, mw, mw2);
  394:     }
  395:   }
  396: }
  397: 
  398: /*
  399:  * 動詞連用形 + 名詞化接尾語(#D2T35) 「入れ たて(のお茶)」など
  400:  */
  401: static void
  402: try_combine_v_renyou_noun(struct splitter_context *sc,
  403:                           struct meta_word *mw, struct meta_word *mw2)
  404: {
  405:   wtype_t w2;
  406:   if (!mw->wl || !mw2->wl) return;
  407: 
  408:   w2 = mw2->wl->part[PART_CORE].wt;
  409:   if (mw->wl->head_pos == POS_V &&
  410:       mw->wl->tail_ct == CT_RENYOU &&
  411:       anthy_wtype_get_pos(w2) == POS_NOUN &&
  412:       anthy_wtype_get_scos(w2) == SCOS_T40) {
  413:     list_metaword(sc, MW_V_RENYOU_NOUN, mw, mw2);
  414:   }
  415: }
  416: 
  417: /*
  418:  * 数字を結合する
  419:  */
  420: static void
  421: try_combine_number(struct splitter_context *sc,
  422:                  struct meta_word *mw1, struct meta_word *mw2)
  423: {
  424:   struct word_list *wl1 = mw1->wl;
  425:   struct word_list *wl2 = mw2->wl;
  426:   struct meta_word *combined_mw;
  427:   int recursive = wl2 ? 0 : 1; /* combinedなmwを結合する場合1 */
  428: 
  429:   /* 左mwは数詞 */
  430: 
  431:   if (anthy_wtype_get_pos(wl1->part[PART_CORE].wt) != POS_NUMBER) return;  
  432:   if (recursive) {
  433:     /* 右mwは数字を結合したmw */
  434:     if (mw2->type != MW_NUMBER) return;
  435:     wl2 = mw2->mw1->wl;
  436:   } else {
  437:     /* 右mwは数詞 */
  438:     if (anthy_wtype_get_pos(wl2->part[PART_CORE].wt) != POS_NUMBER) return;    
  439:   }
  440:   /* 左mwの後ろに文字が付いていなければ */
  441:   if (wl1->part[PART_POSTFIX].len == 0 &&
  442:       wl1->part[PART_DEPWORD].len == 0) {
  443:     int scos1 = anthy_wtype_get_scos(wl1->part[PART_CORE].wt);
  444:     int scos2 = anthy_wtype_get_scos(wl2->part[PART_CORE].wt);
  445: 
  446:     /* #NNは対象外 */
  447:     if (scos2 == SCOS_NONE) return;
  448:     /* 
  449:        左mwの種類によって、後ろにつくことができる右mwの種類が変わる
  450:        例えば一〜九の後ろには万〜九万、億〜九億しかつくことができないが、
  451:        十〜九十の後ろには、あわせて一〜九などもつくことができる
  452:      */
  453:     switch (scos1) {
  454:     case SCOS_N1: 
  455:       if (scos2 == SCOS_N1) return; /* 後ろに一〜九がついてはいけない */
  456:     case SCOS_N10:
  457:       if (scos2 == SCOS_N10) return; /* 後ろに十〜九十がついてはいけない */
  458:     case SCOS_N100:
  459:       if (scos2 == SCOS_N100) return; /* 後ろに百〜九百がついてはいけない */
  460:     case SCOS_N1000:
  461:       if (scos2 == SCOS_N1000) return; /* 後ろに千〜九千がついてはいけない */
  462:     case SCOS_N10000:
  463:       /* 万〜九万、億〜九億…などは、
  464:          いつでも後ろにつくことができる */
  465:       break;
  466:     default:
  467:       return;
  468:     }
  469: 
  470:     if (recursive) {
  471:       combined_mw = anthy_do_cons_metaword(sc, MW_NUMBER, mw1, mw2);
  472:     } else {
  473:       /* 初めて結合する場合は後ろにnullをつけてlistにする */
  474:       combined_mw = list_metaword(sc, MW_NUMBER, mw1, mw2);
  475:     }
  476:     combine_metaword(sc, combined_mw);
  477:   }
  478: }
  479: 
  480: /* 右隣のmetawordと結合できるかチェック */
  481: static void
  482: try_combine_metaword(struct splitter_context *sc,
  483:                      struct meta_word *mw1, struct meta_word *mw2)
  484: {
  485:   if (!mw1->wl) return;
  486: 
  487:   /* metawordの結合を行うためには、後続の
  488:      metawordに接頭辞がないことが必要 */
  489:   if (mw2->wl && mw2->wl->part[PART_PREFIX].len > 0) {
  490:     return;
  491:   }
  492:   
  493:   try_combine_v_renyou_a(sc, mw1, mw2);
  494:   try_combine_v_renyou_noun(sc, mw1, mw2);
  495:   try_combine_number(sc, mw1, mw2);
  496: }
  497: 
  498: static void
  499: combine_metaword(struct splitter_context *sc, struct meta_word *mw)
  500: {
  501:   struct word_split_info_cache *info = sc->word_split_info;
  502:   int i;
  503: 
  504:   if (mw->mw_features & MW_FEATURE_DEP_ONLY) {
  505:     /* 付属語だけの文節とは結合しない */  
  506:     return;
  507:   }
  508: 
  509:   for (i = mw->from - 1; i >= 0; i--) {
  510:     struct meta_word *mw_left;
  511:     for (mw_left = info->cnode[i].mw; mw_left; mw_left = mw_left->next) {
  512:       if (mw_left->from + mw_left->len == mw->from) {
  513:         /* 結合できるかチェック */
  514:         try_combine_metaword(sc, mw_left, mw);
  515:       }
  516:     }
  517:   }
  518: }
  519: 
  520: static void
  521: combine_metaword_all(struct splitter_context *sc)
  522: {
  523:   int i;
  524: 
  525:   struct word_split_info_cache *info = sc->word_split_info;
  526:   /* metawordの左端によるループ */
  527:   for (i = sc->char_count - 1; i >= 0; i--){
  528:     struct meta_word *mw;
  529:     /* 各metawordのループ */
  530:     for (mw = info->cnode[i].mw;
  531:          mw; mw = mw->next) {
  532:       combine_metaword(sc, mw);
  533:     }
  534:   }
  535: }
  536: 
  537: static void
  538: make_dummy_metaword(struct splitter_context *sc, int from,