(linenum→info "unix/slp.c:2238")

anthy/9100e/calctrans/proccorpus.c

    1: /*
    2:  * コーパスとなる文章を読んで、文節の長さを調整して
    3:  * 形態素解析の結果を出力する
    4:  *
    5:  * 出力形式について
    6:  *  まず伸縮を行った文節が最初の長さで出力される
    7:  *  次に各文節毎に(あれば)誤った候補、正しい候補の順で情報を出力する
    8:  *
    9:  *
   10:  * Copyright (C) 2006-2007 TABATA Yusuke
   11:  *
   12:  */
   13: #include <stdio.h>
   14: #include <string.h>
   15: #include <stdlib.h>
   16: #include <anthy/convdb.h>
   17: 
   18: static int verbose;
   19: 
   20: /* 文節の長さを例文にあわせる */
   21: static int
   22: trim_segment(anthy_context_t ac, struct conv_res *cr,
   23:              int nth, char *seg)
   24: {
   25:   int len = strlen(seg);
   26:   int resized = 0;
   27:   (void)cr;
   28: 
   29:   while (1) {
   30:     char seg_buf[1024];
   31:     int cur_len;
   32: 
   33:     anthy_get_segment(ac, nth, NTH_UNCONVERTED_CANDIDATE, seg_buf, 1024);
   34:     cur_len = strlen(seg_buf);
   35:     if (len == cur_len) {
   36:       return 1;
   37:     }
   38:     if (!resized) {
   39:       resized = 1;
   40:       /* 伸縮前の文節の情報を表示する */
   41:       print_size_miss_segment_info(ac, nth);
   42:     }
   43:     if (len > cur_len) {
   44:       anthy_resize_segment(ac, nth, 1);
   45:     } else {
   46:       anthy_resize_segment(ac, nth, -1);
   47:     }
   48:   }
   49:   return 0;
   50: }
   51: 
   52: /*
   53:  * nth番目の文節で候補segを探して確定する
   54:  */
   55: static int
   56: find_candidate(anthy_context_t ac, struct conv_res *cr,
   57:                int nth, char *seg)
   58: {
   59:   char seg_buf[1024];
   60:   int i;
   61:   struct anthy_segment_stat ass;
   62: 
   63:   if (seg[0] == '~') {
   64:     /* 候補ミスのマーク「~」をスキップする */
   65:     seg++;
   66:     cr->cand_check[nth] = 1;
   67:   }
   68: 
   69:   anthy_get_segment_stat(ac, nth, &ass);
   70:   for (i = 0; i < ass.nr_candidate; i++) {
   71:     anthy_get_segment(ac, nth, i, seg_buf, 1024);
   72:     if (!strcmp(seg_buf, seg)) {
   73:       /* 一致する候補を見つけたので確定する */
   74:       anthy_commit_segment(ac, nth, i);
   75:       return 0;
   76:     }
   77:   }
   78:   return 0;
   79: }
   80: 
   81: /* '|' で文節に区切られた文字列の各文節を引数にfnを呼ぶ */
   82: static int
   83: for_each_segment(anthy_context_t ac, struct conv_res *cr,
   84:                  const char *res_str,
   85:                  int (*fn)(anthy_context_t ac, struct conv_res *cr,
   86:                            int nth, char *seg))
   87: {
   88:   char *str, *cur, *cur_seg;
   89:   int nth;
   90:   if (!res_str) {
   91:     return 0;
   92:   }
   93: 
   94:   str = strdup(res_str);
   95:   cur = str;
   96:   cur ++;
   97:   cur_seg = cur;
   98:   nth = 0;
   99:   while ((cur = strchr(cur, '|'))) {
  100:     *cur = 0;
  101:     /**/
  102:     if (fn) {
  103:       fn(ac, cr, nth, cur_seg);
  104:     }
  105:     /**/
  106:     nth ++;
  107:     cur ++;
  108:     cur_seg = cur;
  109:   }
  110: 
  111:   free(str);
  112:   
  113:   return 1;
  114: }
  115: 
  116: static void
  117: proc_sentence(anthy_context_t ac, struct conv_res *cr)
  118: {
  119:   int i;
  120:   struct anthy_conv_stat acs;
  121:   /*printf("(%s)\n", cr->src_str);*/
  122:   anthy_set_string(ac, cr->src_str);
  123:   /* 文節の長さを調節する */
  124:   if (!for_each_segment(ac, cr, cr->res_str, trim_segment)) {
  125:     return ;
  126:   }
  127:   /**/
  128:   if (anthy_get_stat(ac, &acs)) {
  129:     return ;
  130:   }
  131:   cr->cand_check = malloc(sizeof(int) * acs.nr_segment);
  132:   for (i = 0; i < acs.nr_segment; i++) {
  133:     cr->cand_check[i] = 0;
  134:   }
  135: 
  136:   /* 候補を選択する */
  137:   if (cr->cand_str) {
  138:     for_each_segment(ac, cr, cr->cand_str, find_candidate);
  139:   }
  140: 
  141:   if (verbose) {
  142:     anthy_print_context(ac);
  143:   }
  144:   /* 出力する */
  145:   print_context_info(ac, cr);
  146: }
  147: 
  148: int
  149: main(int argc, char **argv)
  150: {
  151:   struct res_db *db;
  152:   struct conv_res *cr;
  153:   anthy_context_t ac;
  154:   int i;
  155: 
  156:   db = create_db();
  157:   for (i = 1; i < argc; i++) {
  158:     if (!strcmp("-v", argv[i])) {
  159:       verbose = 1;
  160:     } else {
  161:       read_db(db, argv[i]);
  162:     }
  163:   }
  164: 
  165:   anthy_conf_override("CONFFILE", "../anthy-conf");
  166:   anthy_conf_override("DIC_FILE", "../mkanthydic/anthy.dic");
  167:   anthy_init();
  168:   anthy_set_personality("");
  169:   ac = anthy_create_context();
  170: 
  171:   /**/
  172:   for (cr = db->res_list.next; cr; cr = cr->next) {
  173:     proc_sentence(ac, cr);
  174:   }
  175:   return 0;
  176: }
Syntax (Markdown)