(linenum→info "unix/slp.c:2238")

anthy/9100e/src-ordering/commit.c

    1: /*
    2:  * 確定(コミット)後の処理をする。
    3:  * 各種の学習処理を呼び出す
    4:  *
    5:  * anthy_proc_commit() が外部から呼ばれる
    6:  */
    7: #include <stdlib.h>
    8: #include <time.h>
    9: 
   10: #include <anthy/ordering.h>
   11: #include <anthy/record.h>
   12: #include <anthy/splitter.h>
   13: #include <anthy/segment.h>
   14: #include "sorter.h"
   15: 
   16: #define MAX_OCHAIRE_ENTRY_COUNT 100
   17: #define MAX_OCHAIRE_LEN 32
   18: #define MAX_PREDICTION_ENTRY 100
   19: 
   20: #define MAX_UNKNOWN_WORD 100
   21: 
   22: /* 交換された候補を探す */
   23: static void
   24: learn_swapped_candidates(struct segment_list *sl)
   25: {
   26:   int i;
   27:   struct seg_ent *seg;
   28:   for (i = 0; i < sl->nr_segments; i++) {
   29:     seg = anthy_get_nth_segment(sl, i);
   30:     if (seg->committed != 0) {
   31:       /* 最初の候補(0番目)でない候補(seg->committed番目)がコミットされた */
   32:       anthy_swap_cand_ent(seg->cands[0],
   33:                           seg->cands[seg->committed]);
   34:     }
   35:   }
   36:   anthy_cand_swap_ageup();
   37: }
   38: 
   39: /* 長さが変わった文節の変更後に対して */
   40: static void
   41: learn_resized_segment(struct splitter_context *sc,
   42:                       struct segment_list *sl)
   43:                       
   44: {
   45:   int i;
   46:   struct meta_word **mw
   47:     = alloca(sizeof(struct meta_word*) * sl->nr_segments);
   48:   int *len_array
   49:     = alloca(sizeof(int) * sl->nr_segments);
   50: 
   51:   /* 各文節の長さの配列とmeta_wordの配列を用意する */
   52:   for (i = 0; i < sl->nr_segments; i++) {
   53:     struct seg_ent *se = anthy_get_nth_segment(sl, i);
   54:     mw[i] = se->cands[se->committed]->mw;
   55:     len_array[i] = se->str.len;
   56:   }
   57: 
   58:   anthy_commit_border(sc, sl->nr_segments, mw, len_array);
   59: }
   60: 
   61: /* 長さが変わった文節の変更前に対して */
   62: static void
   63: clear_resized_segment(struct splitter_context *sc,
   64:                       struct segment_list *sl)
   65: {
   66:   int *mark, i, from;
   67:   struct seg_ent *seg;
   68:   mark = alloca(sizeof(int)*sc->char_count);
   69:   for (i = 0; i < sc->char_count; i++) {
   70:     mark[i] = 0;
   71:   }
   72:   /* 実際に確定された文節の長さをマークする */
   73:   from = 0;
   74:   for (i = 0; i < sl->nr_segments; i++) {
   75:     seg = anthy_get_nth_segment(sl, i);
   76:     mark[from] = seg->len;
   77:     from = from + seg->len;
   78:   }
   79:   for (i = 0; i < sc->char_count; i++) {
   80:     int len = sc->ce[i].initial_seg_len;
   81:     /* 最初の長さと確定された長さが異なれば、
   82:        使われなかった未知語の可能性がある */
   83:     if (len && len != mark[i]) {
   84:       xstr xs;
   85:       xs.str = sc->ce[i].c;
   86:       xs.len = len;
   87:       anthy_forget_unused_unknown_word(&xs);
   88:     }
   89:   }
   90:   if (!anthy_select_section("UNKNOWN_WORD", 0)) {
   91:     anthy_truncate_section(MAX_UNKNOWN_WORD);
   92:   }
   93: }
   94: 
   95: /* recordにお茶入れ学習の結果を書き込む */
   96: static void
   97: commit_ochaire(struct seg_ent *seg, int count, xstr* xs)
   98: {
   99:   int i;
  100:   if (xs->len >= MAX_OCHAIRE_LEN) {
  101:     return ;
  102:   }
  103:   if (anthy_select_row(xs, 1)) {
  104:     return ;
  105:   }
  106:   anthy_set_nth_value(0, count);
  107:   for (i = 0; i < count; i++, seg = seg->next) {
  108:     anthy_set_nth_value(i * 2 + 1, seg->len);
  109:     anthy_set_nth_xstr(i * 2 + 2, &seg->cands[seg->committed]->str);
  110:   }
  111: }
  112: 
  113: /* recordの領域を節約するために、お茶入れ学習のネガティブな
  114:    エントリを消す */
  115: static void
  116: release_negative_ochaire(struct splitter_context *sc,
  117:                          struct segment_list *sl)
  118: {
  119:   int start, len;
  120:   xstr xs;
  121:   (void)sl;
  122:   /* 変換前のひらがな文字列 */
  123:   xs.len = sc->char_count;
  124:   xs.str = sc->ce[0].c;
  125: 
  126:   /* xsの部分文字列に対して */
  127:   for (start = 0; start < xs.len; start ++) {
  128:     for (len = 1; len <= xs.len - start && len < MAX_OCHAIRE_LEN; len ++) {
  129:       xstr part;
  130:       part.str = &xs.str[start];
  131:       part.len = len;
  132:       if (anthy_select_row(&part, 0) == 0) {
  133:         anthy_release_row();
  134:       }
  135:     }
  136:   }
  137: }
  138: 
  139: /* お茶入れ学習を行う */
  140: static void
  141: learn_ochaire(struct splitter_context *sc,
  142:               struct segment_list *sl)
  143: {
  144:   int i;
  145:   int count;
  146: 
  147:   if (anthy_select_section("OCHAIRE", 1)) {
  148:     return ;
  149:   }
  150: 
  151:   /* お茶入れ学習のネガティブなエントリを消す */
  152:   release_negative_ochaire(sc, sl);
  153: 
  154:   /* お茶入れ学習をする */
  155:   for (count = 2; count <= sl->nr_segments && count < 5; count++) {
  156:     /* 2文節以上の長さの文節列に対して */
  157: 
  158:     for (i = 0; i <= sl->nr_segments - count; i++) {
  159:       struct seg_ent *head = anthy_get_nth_segment(sl, i);
  160:       struct seg_ent *s;
  161:       xstr xs;
  162:       int j;
  163:       xs = head->str;
  164:       if (xs.len < 2 && count < 3) {
  165:         /* 細切れの文節を学習することを避ける、
  166:          * いい加減なheuristics */
  167:         continue;
  168:       }
  169:       /* 文節列を構成する文字列を作る */
  170:       for (j = 1, s = head->next; j < count; j++, s = s->next) {
  171:         xs.len += s->str.len;
  172:       }
  173:       /**/
  174:       commit_ochaire(head, count, &xs);
  175:     }
  176:   }
  177:   if (anthy_select_section("OCHAIRE", 1)) {
  178:     return ;
  179:   }
  180:   anthy_truncate_section(MAX_OCHAIRE_ENTRY_COUNT);
  181: }
  182: 
  183: static int
  184: learn_prediction_str(xstr *idx, xstr *xs)
  185: {
  186:   int nr_predictions;
  187:   int i;
  188:   time_t t = time(NULL);
  189:   if (anthy_select_row(idx, 1)) {
  190:     return 0;
  191:   }
  192:   nr_predictions = anthy_get_nr_values();
  193: 
  194:   /* 既に履歴にある場合はタイムスタンプだけ更新 */
  195:   for (i = 0; i < nr_predictions; i += 2) {
  196:     xstr *log = anthy_get_nth_xstr(i + 1);
  197:     if (!log) {
  198:       continue;
  199:     }
  200:     if (anthy_xstrcmp(log, xs) == 0) {
  201:       anthy_set_nth_value(i, t);
  202:       break;
  203:     }
  204:   }
  205: 
  206:   /* ない場合は末尾に追加 */
  207:   if (i == nr_predictions) {
  208:     anthy_set_nth_value(nr_predictions, t);
  209:     anthy_set_nth_xstr(nr_predictions + 1, xs);      
  210:     anthy_mark_row_used();
  211:     return 1;
  212:   }
  213:   anthy_mark_row_used();
  214:   return 0;
  215: }
  216: 
  217: static void
  218: learn_prediction(struct segment_list *sl)
  219: {
  220:   int i;
  221:   int added = 0;
  222:   if (anthy_select_section("PREDICTION", 1)) {
  223:     return ;
  224:   }
  225:   for (i = 0; i < sl->nr_segments; i++) {
  226:     struct seg_ent *seg = anthy_get_nth_segment(sl, i);
  227:     xstr *xs = &seg->cands[seg->committed]->str;
  228: 
  229:     if (seg->committed < 0) {
  230:       continue;
  231:     }
  232:     if (learn_prediction_str(&seg->str, xs)) {
  233:       added = 1;
  234:     }
  235:   }
  236:   if (added) {
  237:     anthy_truncate_section(MAX_PREDICTION_ENTRY);
  238:   }
  239: }
  240: 
  241: static void
  242: learn_unknown(struct segment_list *sl)
  243: {
  244:   int i;
  245:   for (i = 0; i < sl->nr_segments; i++) {
  246:     struct seg_ent *seg = anthy_get_nth_segment(sl, i);
  247:     struct cand_ent *ce = seg->cands[seg->committed];
  248:     if (ce->nr_words == 0) {
  249:       anthy_add_unknown_word(&seg->str, &ce->str);
  250:     }
  251:   }
  252: }
  253: 
  254: void
  255: anthy_do_commit_prediction(xstr *src, xstr *xs)
  256: {
  257:   if (anthy_select_section("PREDICTION", 1)) {
  258:     return ;
  259:   }
  260:   learn_prediction_str(src, xs);
  261: }
  262: 
  263: void
  264: anthy_proc_commit(struct segment_list *sl,
  265:                   struct splitter_context *sc)
  266: {
  267:   /* 各種の学習を行う */
  268:   learn_swapped_candidates(sl);
  269:   learn_resized_segment(sc, sl);
  270:   clear_resized_segment(sc, sl);
  271:   learn_ochaire(sc, sl);
  272:   learn_prediction(sl);
  273:   learn_unknown(sl);
  274:   anthy_learn_cand_history(sl);
  275: }
Syntax (Markdown)