(linenum→info "unix/slp.c:2238")

anthy/9100e/src-ordering/relation.c

    1: /*
    2:  * 文節の関係を処理する
    3:  * Copyright (C) 2006 Higashiyama Masahiko (thanks google summer of code program)
    4:  * Copyright (C) 2002-2007 TABATA Yusuke
    5:  *
    6:  * anthy_reorder_candidates_by_relation()
    7:  *
    8:  */
    9: /*
   10:   This library is free software; you can redistribute it and/or
   11:   modify it under the terms of the GNU Lesser General Public
   12:   License as published by the Free Software Foundation; either
   13:   version 2 of the License, or (at your option) any later version.
   14: 
   15:   This library is distributed in the hope that it will be useful,
   16:   but WITHOUT ANY WARRANTY; without even the implied warranty of
   17:   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   18:   Lesser General Public License for more details.
   19: 
   20:   You should have received a copy of the GNU Lesser General Public
   21:   License along with this library; if not, write to the Free Software
   22:   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
   23:  */
   24: 
   25: #include <arpa/inet.h>
   26: #include <stdlib.h>
   27: 
   28: #include <anthy/segclass.h>
   29: #include <anthy/segment.h>
   30: #include <anthy/ordering.h>
   31: #include <anthy/dic.h>
   32: #include <anthy/diclib.h>
   33: #include <anthy/feature_set.h>
   34: #include <anthy/corpus.h>
   35: #include "sorter.h"
   36: 
   37: #define MAX_COLLISION 4
   38: #define SEARCH_LIMIT 100
   39: #define MAX_NEIGHBOR 10
   40: 
   41: 
   42: /* 全文検索用のコーパス */
   43: static struct corpus_ {
   44:   /* header */
   45:   void *corpus_bucket;
   46:   void *corpus_array;
   47:   /**/
   48:   int *bucket;
   49:   int *array;
   50:   /**/
   51:   int bucket_size;
   52:   int array_size;
   53: } corpus_info;
   54: 
   55: /* 検索用のiterator */
   56: struct iterator {
   57:   /* 検索のキーと現在の場所 */
   58:   int key;
   59:   int idx;
   60:   /* 検索回数の上限 */
   61:   int limit;
   62: };
   63: 
   64: struct neighbor {
   65:   int nr;
   66:   int id[MAX_NEIGHBOR];
   67: };
   68: 
   69: /** 文節@segの中に@from_word_idの単語と共起関係にある
   70:  *  候補があるかどうかを探し、あればスコアを上げる。
   71:  */
   72: static void
   73: reorder_candidate(int from_word_id, struct seg_ent *seg)
   74: {
   75:   int i, pos;
   76:   struct cand_ent *ce = seg->cands[0];
   77:   if (ce->core_elm_index == -1) {
   78:     return ;
   79:   }
   80:   /* 0番目の候補の品詞 */
   81:   pos = anthy_wtype_get_pos(ce->elm[ce->core_elm_index].wt);
   82: 
   83:   for (i = 0; i < seg->nr_cands; i++) {
   84:     int word_id;
   85:     ce = seg->cands[i];
   86:     if (ce->core_elm_index == -1) {
   87:       continue;
   88:     }
   89:     word_id = ce->elm[ce->core_elm_index].id;
   90:     if (anthy_dic_check_word_relation(from_word_id, word_id) &&
   91:         anthy_wtype_get_pos(ce->elm[ce->core_elm_index].wt) == pos) {
   92:       /* 用例にマッチしたので、候補のスコアを更新 */
   93:       ce->flag |= CEF_USEDICT;
   94:       ce->score *= 10;
   95:     }
   96:   }
   97: }
   98: 
   99: static int
  100: get_indep_word_id(struct seg_ent *seg, int nth)
  101: {
  102:   struct cand_ent *ce;
  103:   if (seg->cands[nth]->core_elm_index == -1) {
  104:     /* 一番目の候補がseq_entから作られた候補ではない */
  105:     return -1;
  106:   }
  107:   ce = seg->cands[nth];
  108:   /* 自立語のidを取り出す */
  109:   return ce->elm[ce->core_elm_index].id;
  110: }
  111: 
  112: /* 用例辞書を使って並び替えをする */
  113: static void
  114: reorder_by_use_dict(struct segment_list *sl, int nth)
  115: {
  116:   int i;
  117:   struct seg_ent *cur_seg;
  118:   int word_id;
  119: 
  120:   cur_seg = anthy_get_nth_segment(sl, nth);
  121:   word_id = get_indep_word_id(cur_seg, 0);
  122:   if (word_id == -1) {
  123:     /**/
  124:     return ;
  125:   }
  126:   /* 近所の文節を順に見ていく */
  127:   for (i = nth - 2; i < nth + 2 && i < sl->nr_segments; i++) {
  128:     struct seg_ent *target_seg;
  129:     if (i < 0 || i == nth) {
  130:       continue ;
  131:     }
  132:     /* i番目の文節と前後のj番目の文節に対して */
  133:     target_seg = anthy_get_nth_segment(sl, i);
  134:     reorder_candidate(word_id, target_seg);
  135:   }
  136: }
  137: 
  138: static int
  139: find_border_of_this_word(int idx)
  140: {
  141:   int val;
  142:   if (idx < 0) {
  143:     return 0;
  144:   }
  145:   val = ntohl(corpus_info.array[idx * 2]);
  146:   while (!(val & ELM_WORD_BORDER) &&
  147:          idx > -1) {
  148:     idx --;
  149:   }
  150:   return idx;
  151: }
  152: 
  153: static int
  154: find_left_word_border(int idx)
  155: {
  156:   int val;
  157:   if (idx == -1) {
  158:     return -1;
  159:   }
  160:   val = ntohl(corpus_info.array[idx * 2]);
  161:   if (val & ELM_BOS) {
  162:     return -1;
  163:   }
  164:   idx --;
  165:   return find_border_of_this_word(idx);
  166: }
  167: 
  168: static int
  169: find_right_word_border(int idx)
  170: {
  171:   if (idx == -1) {
  172:     return -1;
  173:   }
  174:   while (idx < corpus_info.array_size - 2) {
  175:     int val;
  176:     idx ++;
  177:     val = ntohl(corpus_info.array[idx * 2]);
  178:     if (val & ELM_BOS) {
  179:       return -1;
  180:     }
  181:     if (val & ELM_WORD_BORDER) {
  182:       return idx;
  183:     }
  184:   }
  185:   return -1;
  186: }
  187: 
  188: static void
  189: push_id(struct neighbor *ctx,
  190:         int id)
  191: {
  192:   if (ctx->nr < MAX_NEIGHBOR - 1) {
  193:     ctx->id[ctx->nr] = id;
  194:     ctx->nr++;
  195:   }
  196: }
  197: 
  198: static void
  199: collect_word_context(struct neighbor *ctx, int idx)
  200: {
  201:   int id = ntohl(corpus_info.array[idx * 2]) & CORPUS_KEY_MASK;
  202:   /*printf("  id=%d\n", id);*/
  203:   push_id(ctx, id);
  204: }
  205: 
  206: /* 例文中で周辺の情報を取得する */
  207: static void
  208: collect_corpus_context(struct neighbor *ctx,
  209:                        struct iterator *it)
  210: {
  211:   int i;
  212:   int this_idx, idx;
  213: 
  214:   this_idx = find_border_of_this_word(it->idx);
  215: 
  216:   /*printf(" key=%d\n", it->key);*/
  217:   /* 左へスキャン */
  218:   idx = this_idx;
  219:   for (i = 0; i < 2; i++) {
  220:     idx = find_left_word_border(idx);
  221:     if (idx == -1) {
  222:       break;
  223:     }
  224:     collect_word_context(ctx, idx);
  225:   }
  226:   /* 右へスキャン */
  227:   idx = this_idx;
  228:   for (i = 0; i < 2; i++) {
  229:     idx = find_right_word_border(idx);
  230:     if (idx == -1) {
  231:       break;
  232:     }
  233:     collect_word_context(ctx, idx);
  234:   }
  235: }
  236: 
  237: /* 変換対象の文字列の周辺の情報を取得する */
  238: static void
  239: collect_user_context(struct neighbor *ctx,
  240:                      struct segment_list *sl, int nth)
  241: {
  242:   int i;
  243:   ctx->nr = 0;
  244:   for (i = nth - 2; i <= nth + 2 && i < sl->nr_segments; i++) {
  245:     int id;
  246:     if ((i < 0) || (i == nth)) {
  247:       continue;
  248:     }
  249:     id = get_indep_word_id(anthy_get_nth_segment(sl, i), 0);
  250:     if (id > -1) {
  251:       id &= CORPUS_KEY_MASK;
  252:       /*printf("user_ctx=%d\n", id);*/
  253:       push_id(ctx, id);
  254:     }
  255:   }
  256: }
  257: 
  258: /* 隣接文節の情報を比較する */
  259: static int 
  260: do_compare_context(struct neighbor *n1,
  261:                    struct neighbor *n2)
  262: {
  263:   int i, j;
  264:   int m = 0;
  265:   for (i = 0; i < n1->nr; i++) {
  266:     for (j = 0; j < n2->nr; j++) {
  267:       if (n1->id[i] == n2->id[j]) {
  268:         m++;
  269:       }
  270:     }
  271:   }
  272:   return m;
  273: }
  274: 
  275: /* 隣接文節の情報を取得して比較する */
  276: static int
  277: compare_context(struct neighbor *user,
  278:                 struct iterator *it)
  279: {
  280:   struct neighbor sample;
  281:   int nr;
  282:   /**/
  283:   sample.nr = 0;
  284:   /* 例文中の周辺情報を集める */
  285:   collect_corpus_context(&sample, it);
  286:   if (sample.nr == 0) {
  287:     return 0;
  288:   }
  289:   /* 比較する */
  290:   nr = do_compare_context(user, &sample);
  291:   if (nr >= sample.nr / 2) {
  292:     return nr;
  293:   }
  294:   return 0;
  295: }
  296: 
  297: /* keyの最初の出現場所を見つける
  298:  * 見つからなかったら-1を返す
  299:  */
  300: static int
  301: find_first_pos(int key)
  302: {
  303:   int i;
  304:   for (i = 0; i < MAX_COLLISION; i++) {
  305:     int bkt = (key + i) % corpus_info.bucket_size;
  306:     if ((int)ntohl(corpus_info.bucket[bkt * 2]) == key) {
  307:       return ntohl(corpus_info.bucket[bkt * 2 + 1]);
  308:     }
  309:   }
  310:   return -1;
  311: }
  312: 
  313: /* keyの最初の出現場所でiteratorを初期化する
  314:  * 見つからなかったら-1を返す
  315:  */
  316: static int
  317: find_first_from_corpus(int key, struct iterator *it, int limit)
  318: {
  319:   key &= CORPUS_KEY_MASK;
  320:   it->idx = find_first_pos(key);
  321:   it->key = key;
  322:   it->limit = limit;
  323:   return it->idx;
  324: }
  325: 
  326: /* keyの次の出現場所のiteratorを設定する
  327:  */
  328: static int
  329: find_next_from_corpus(struct iterator *it)
  330: {
  331:   int idx = it->idx;
  332:   it->limit--;
  333:   if (it->limit < 1) {
  334:     it->idx = -1;
  335:     return -1;
  336:   }
  337:   it->idx = ntohl(corpus_info.array[it->idx * 2 + 1]);
  338:   if (it->idx < 0 || it->idx >= corpus_info.array_size ||
  339:       it->idx < idx) {
  340:     it->idx = -1;
  341:   }
  342:   return it->idx;
  343: }
  344: 
  345: static void
  346: check_candidate_context(struct seg_ent *cur_seg,
  347:                         int i,
  348:                         struct neighbor *user)
  349: {
  350:   struct iterator it;
  351:   int nr = 0;
  352:   int word_id;
  353:   word_id = get_indep_word_id(cur_seg, i);
  354:   if (word_id == -1) {
  355:     return ;
  356:   }
  357:   /* 各出現場所をスキャンする */
  358:   find_first_from_corpus(word_id, &it, SEARCH_LIMIT);
  359:   /*printf("word_id=%d %d\n", word_id, it.idx);*/
  360:   while (it.idx > -1) {
  361:     nr += compare_context(user, &it);
  362:     /**/
  363:     find_next_from_corpus(&it);
  364:   }
  365:   /**/
  366:   if (nr > 0) {
  367:     cur_seg->cands[i]->flag |= CEF_CONTEXT;
  368:   }
  369: }
  370: 
  371: /* 全文検索で候補を並び替える */
  372: static void
  373: reorder_by_corpus(struct segment_list *sl, int nth)
  374: {
  375:   struct seg_ent *cur_seg;
  376:   struct neighbor user;
  377:   int i;
  378:   /* 文節の周辺情報を集める */
  379:   collect_user_context(&user, sl, nth);
  380:   if (user.nr == 0) {
  381:     return ;
  382:   }
  383:   cur_seg = anthy_get_nth_segment(sl, nth);
  384:   /* 各候補について */
  385:   for (i = 0; i < cur_seg->nr_cands; i++) {
  386:     check_candidate_context(cur_seg, i, &user);
  387:   }
  388:   /* トップの候補に用例があれば、他の候補は見ない */
  389:   if (cur_seg->cands[0]->flag & CEF_CONTEXT) {
  390:     cur_seg->cands[0]->flag &= ~CEF_CONTEXT;
  391:     return ;
  392:   }
  393:   /* 用例によるスコア加算 */
  394:   for (i = 1; i < cur_seg->nr_cands; i++) {
  395:     if (cur_seg->cands[i]->flag & CEF_CONTEXT) {
  396:       cur_seg->cands[i]->score *= 2;
  397:     }
  398:   }
  399: }
  400: 
  401: /*
  402:  * 用例を用いて候補を並び替える
  403:  *  @nth番目以降の文節を対象とする
  404:  */
  405: void
  406: anthy_reorder_candidates_by_relation(struct segment_list *sl, int nth)
  407: {
  408:   int i;
  409:   for (i = nth; i < sl->nr_segments; i++) {
  410:     reorder_by_use_dict(sl, i);
  411:     reorder_by_corpus(sl, i);
  412:   }
  413: }
  414: 
  415: void
  416: anthy_relation_init(void)
  417: {
  418:   corpus_info.corpus_array = anthy_file_dic_get_section("corpus_array");
  419:   corpus_info.corpus_bucket = anthy_file_dic_get_section("corpus_bucket");
  420:   if (!corpus_info.corpus_array ||
  421:       !corpus_info.corpus_array) {
  422:     return ;
  423:   }
  424:   corpus_info.array_size = ntohl(((int *)corpus_info.corpus_array)[1]);
  425:   corpus_info.bucket_size = ntohl(((int *)corpus_info.corpus_bucket)[1]);
  426:   corpus_info.array = &(((int *)corpus_info.corpus_array)[16]);
  427:   corpus_info.bucket = &(((int *)corpus_info.corpus_bucket)[16]);
  428:   /*
  429:   {
  430:     int i;
  431:     for (i = 0; i < corpus_info.array_size; i++) {
  432:       int v = ntohl(corpus_info.array[i * 2]);
  433:       printf("%d: %d %d\n", i, v, v & CORPUS_KEY_MASK);
  434:     }
  435:   }
  436:   */
  437: }
Permalink to this note knok: anthy/9100e/src-ordering/relation.c:415-437 on Thu Feb 28 18:01:25 +0900 2008

コーパスの関連性情報を取り扱うらしい?

Syntax (Markdown)