(linenum→info "unix/slp.c:2238")

anthy/9100e/src-util/convdb.c

    1: /*
    2:  * 変換エンジンの内部情報を使うため、意図的に
    3:  * layer violationを放置している。
    4:  *
    5:  */
    6: #include <stdio.h>
    7: #include <string.h>
    8: #include <stdlib.h>
    9: 
   10: #include <anthy/anthy.h>
   11: #include <anthy/convdb.h>
   12: #include <anthy/segment.h>
   13: #include <anthy/feature_set.h>
   14: /**/
   15: #include "../src-main/main.h"
   16: #include "../src-splitter/wordborder.h"
   17: #include "../src-worddic/dic_ent.h"
   18: 
   19: 
   20: /* 自立語部か付属語部か */
   21: #define WORD_INDEP 0
   22: #define WORD_DEP 1
   23: 
/* A word: either an independent word or a dependent (ancillary) word */
struct word {
  /* One of the WORD_* constants */
  int type;
  /*
   * For WORD_DEP: hash of the dependent word's string.
   * For WORD_INDEP: hash of the converted string, or 0 when no
   * dictionary entry could be resolved for the element.
   */
  int hash;
  /* Hash of the reading (yomi) string */
  int yomi_hash;
  /* String before conversion */
  xstr *raw_xs;
  /* String after conversion */
  xstr *conv_xs;
  /* Part-of-speech name of the converted word */
  const char *wt;
};
   39: 
   40: static struct cand_ent *
   41: selected_candidate(struct seg_ent *seg)
   42: {
   43:   if (seg->committed > -1) {
   44:     return seg->cands[seg->committed];
   45:   }
   46:   return seg->cands[0];
   47: }
   48: 
   49: static void
   50: get_res(anthy_context_t ac, char *res_buf, int conv)
   51: {
   52:   struct anthy_conv_stat acs;
   53:   int i;
   54: 
   55:   anthy_get_stat(ac, &acs);
   56:   res_buf[0] = 0;
   57:   if (!conv) {
   58:     strcat(res_buf, "|");
   59:   }
   60:   for (i = 0; i < acs.nr_segment; i++) {
   61:     char buf[1024];
   62:     if (conv) {
   63:       anthy_get_segment(ac, i, 0, buf, 1024);
   64:       strcat(res_buf, buf);
   65:     } else {
   66:       anthy_get_segment(ac, i, NTH_UNCONVERTED_CANDIDATE, buf, 1024);
   67:       strcat(res_buf, buf);
   68:       strcat(res_buf, "|");
   69:     }
   70:   }
   71: }
   72: 
   73: static struct conv_res *
   74: do_find_conv_res(struct res_db *db, const char *src, const char *res)
   75: {
   76:   struct conv_res *cr;
   77: 
   78:   for (cr = db->res_list.next; cr; cr = cr->next) {
   79:     if (((!cr->res_str && !res) ||
   80:          !strcmp(cr->res_str, res)) &&
   81:         !strcmp(cr->src_str, src)) {
   82:       return cr;
   83:     }
   84:   }
   85:   cr = (struct conv_res *)malloc(sizeof(struct conv_res));
   86:   cr->src_str = strdup(src);
   87:   if (res) {
   88:     cr->res_str = strdup(res);
   89:   } else {
   90:     cr->res_str = NULL;
   91:   }
   92:   cr->cand_str = NULL;
   93:   cr->check = CHK_UNKNOWN;
   94:   cr->used = 0;
   95:   cr->cand_check = NULL;
   96:   /**/
   97:   db->tail->next = cr;
   98:   cr->next = NULL;
   99:   db->tail = cr;
  100:   return cr;
  101: }
  102: 
  103: struct conv_res *
  104: find_conv_res(struct res_db *db, anthy_context_t ac,
  105:               const char *src, int conv)
  106: {
  107:   char res_buf[1024];
  108:   get_res(ac, res_buf, conv);
  109: 
  110:   return do_find_conv_res(db, src, res_buf);
  111: }
  112: 
  113: static void
  114: chomp_line(char *buf)
  115: {
  116:   int len = strlen(buf);
  117:   if (buf[len-1] == '\n') {
  118:     buf[len-1] = 0;
  119:   }
  120: }
  121: 
  122: struct res_db *
  123: create_db(void)
  124: {
  125:   struct res_db *db;
  126: 
  127:   db = malloc(sizeof(struct res_db));
  128:   db->res_list.next = NULL;
  129:   db->tail = &db->res_list;
  130:   db->total = 0;
  131:   db->res.unknown = 0;
  132:   db->res.ok = 0;
  133:   db->res.miss = 0;
  134:   db->res.dontcare = 0;
  135:   db->split.unknown = 0;
  136:   db->split.ok = 0;
  137:   db->split.miss = 0;
  138:   db->split.dontcare = 0;
  139: 
  140:   return db;
  141: }
  142: 
  143: static void
  144: strip_separator_vbar(char *buf, const char *str)
  145: {
  146:   const char *src = str;
  147:   char *dst = buf;
  148:   while (*src) {
  149:     if (*src != '|' && *src != '~') {
  150:       *dst = *src;
  151:       dst ++;
  152:     }
  153:     src ++;
  154:   }
  155:   *dst = 0;
  156: }
  157: 
  158: static void
  159: parse_line(struct res_db *db, char *line)
  160: {
  161:   char buf1[1024], buf2[1024], buf3[1024], buf4[1024];
  162:   char *src, *res;
  163:   const char *check;
  164:   struct conv_res *cr;
  165:   int nr;
  166:   chomp_line(line);
  167:   if (line[0] == '#' || line[0] == 0) {
  168:     return ;
  169:   }
  170:   nr = sscanf(line, "%s %s %s", buf1, buf2, buf3);
  171:   if (nr == 1) {
  172:     cr = do_find_conv_res(db, buf1, NULL);
  173:     cr->check = CHK_UNKNOWN;
  174:     return ;
  175:   }
  176:   if (nr < 2) {
  177:     return ;
  178:   }
  179:   if (buf1[0] != '|') {
  180:     /* buf1 buf2    buf3
  181:      * 平文 区切り文
  182:      * 平文 区切り文 変換後
  183:      * 平文 区切り文 check
  184:      */
  185:     src = buf1;
  186:     res = buf2;
  187:     if (nr == 3) {
  188:       check = buf3;
  189:     } else {
  190:       check = "?";
  191:     }
  192:   } else {
  193:     /* buf1    buf2  (buf3)
  194:      * 区切り文
  195:      * 区切り文 変換後
  196:      * 区切り文 check
  197:      */
  198:     strip_separator_vbar(buf4, buf1);
  199:     src = buf4;
  200:     res = buf1;
  201:     check = buf2;
  202:   }
  203:   cr = do_find_conv_res(db, src, res);
  204:   if (nr == 2 && check[0] != '|') {
  205:     cr->check = CHK_OK;
  206:     return ;
  207:   }
  208:   if (check[0] == 'O') {
  209:     cr->check = CHK_OK;
  210:   } else if (check[0] == 'X') {
  211:     cr->check = CHK_MISS;
  212:   } else if (check[0] == '*') {
  213:     cr->check = CHK_DONTCARE;
  214:   } else if (check[0] == '|') {
  215:     cr->check = CHK_UNKNOWN;
  216:     cr->cand_str = strdup(check);
  217:   } else {
  218:     cr->check = CHK_UNKNOWN;
  219:   }
  220: }
  221: 
  222: void
  223: read_db(struct res_db *db, const char *fn)
  224: {
  225:   FILE *fp;
  226:   char line[1024];
  227: 
  228:   if (!fn) {
  229:     return ;
  230:   }
  231:   fp = fopen(fn, "r");
  232:   if (!fp) {
  233:     return ;
  234:   }
  235:   while (fgets(line, 1024, fp)) {
  236:     parse_line(db, line);
  237:   }
  238: }
  239: 
  240: static void
  241: fill_conv_info(struct word *w, struct cand_elm *elm)
  242: {
  243:   /*w->conv_xs, w->wt*/
  244:   struct dic_ent *de;
  245:   if (elm->nth == -1 ||
  246:       elm->nth >= elm->se->nr_dic_ents) {
  247:     w->conv_xs = NULL;
  248:     w->wt = NULL;
  249:     return ;
  250:   }
  251:   if (!elm->se->dic_ents) {
  252:     w->conv_xs = NULL;
  253:     w->wt = NULL;
  254:     return ;
  255:   }
  256:   /**/
  257:   de = elm->se->dic_ents[elm->nth];
  258:   w->conv_xs = anthy_xstr_dup(&de->str);
  259:   w->wt = de->wt_name;
  260:   w->hash = anthy_xstr_hash(w->conv_xs);
  261: }
  262: 
  263: static void
  264: init_word(struct word *w, int type)
  265: {
  266:   w->type = type;
  267:   w->raw_xs = NULL;
  268:   w->conv_xs = NULL;
  269:   w->wt = NULL;
  270: }
  271: 
  272: static void
  273: free_word(struct word *w)
  274: {
  275:   anthy_free_xstr(w->raw_xs);
  276:   anthy_free_xstr(w->conv_xs);
  277: }
  278: 
  279: /* 自立語を作る */
  280: static void
  281: fill_indep_word(struct word *w, struct cand_elm *elm)
  282: {
  283:   init_word(w, WORD_INDEP);
  284:   /* 変換前の読みを取得する */
  285:   w->raw_xs = anthy_xstr_dup(&elm->str);
  286:   w->yomi_hash = anthy_xstr_hash(w->raw_xs);
  287:   w->hash = 0;
  288:   /**/
  289:   fill_conv_info(w, elm);
  290: }
  291: 
  292: /* 付属語を作る */
  293: static void
  294: fill_dep_word(struct word *w, struct cand_elm *elm)
  295: {
  296:   init_word(w, WORD_DEP);
  297:   /**/
  298:   w->hash = anthy_xstr_hash(&elm->str);
  299:   w->yomi_hash = w->hash;
  300:   w->raw_xs = anthy_xstr_dup(&elm->str);
  301: }
  302: 
  303: static void
  304: print_features(struct feature_list *fl)
  305: {
  306:   int i, nr;
  307:   if (!fl) {
  308:     return ;
  309:   }
  310:   nr = anthy_feature_list_nr(fl);
  311:   if (nr == 0) {
  312:     return ;
  313:   }
  314:   printf(" features=");
  315:   for (i = 0; i < nr; i++) {
  316:     if (i > 0) {
  317:       printf(",");
  318:     }
  319:     printf("%d", anthy_feature_list_nth(fl, i));
  320:   }
  321: }
  322: 
  323: static void
  324: print_word(const char *prefix, struct word *w, struct feature_list *fl)
  325: {
  326:   printf("%s", prefix);
  327:   if (w->type == WORD_DEP) {
  328:     /* 付属語 */
  329:     printf("dep_word hash=%d ", w->hash);
  330:     anthy_putxstrln(w->raw_xs);
  331:     return ;
  332:   }
  333:   /* 自立語 */
  334:   printf("indep_word hash=%d", w->hash);
  335:   /**/
  336:   if (fl) {
  337:     print_features(fl);
  338:   }
  339:   /* 品詞 */
  340:   if (w->wt) {
  341:     printf(" %s", w->wt);
  342:   } else {
  343:     printf(" null");
  344:   }
  345:   /* 文字列 */
  346:   if (w->conv_xs) {
  347:     printf(" ");
  348:     anthy_putxstr(w->conv_xs);
  349:   } else {
  350:     printf(" null");
  351:   }
  352:   printf(" ");
  353:   anthy_putxstrln(w->raw_xs);
  354: }
  355: 
  356: /** segの文節クラスを返す
  357:  * segがnullであれば、clをクラスとする
  358:  */
  359: static int
  360: get_seg_class(struct seg_ent *seg, int cl)
  361: {
  362:   struct cand_ent *ce;
  363:   if (!seg) {
  364:     return cl;
  365:   }
  366:   ce = selected_candidate(seg);
  367:   if (ce->mw) {
  368:     return ce->mw->seg_class;
  369:   }
  370:   return SEG_BUNSETSU;
  371: }
  372: 
  373: static void
  374: set_features(struct feature_list *fl,
  375:              struct seg_ent *prev_seg,
  376:              struct seg_ent *cur_seg)
  377: {
  378:   int cl, pc;
  379:   cl = get_seg_class(cur_seg, SEG_TAIL);
  380:   pc = get_seg_class(prev_seg, SEG_HEAD);
  381: 
  382:   anthy_feature_list_set_cur_class(fl, cl);
  383:   if (cur_seg) {
  384:     struct cand_ent *ce =  selected_candidate(cur_seg);
  385:     anthy_feature_list_set_dep_word(fl, ce->dep_word_hash);
  386:     if (ce->mw) {
  387:       anthy_feature_list_set_dep_class(fl, ce->mw->dep_class);
  388:       anthy_feature_list_set_mw_features(fl, ce->mw->mw_features);
  389:       anthy_feature_list_set_noun_cos(fl, ce->mw->core_wt);
  390:     }
  391:   }
  392:   anthy_feature_list_set_class_trans(fl, pc, cl);
  393:   /**/
  394:   anthy_feature_list_sort(fl);
  395: }
  396: 
  397: static void
  398: print_element(const char *prefix,
  399:               struct cand_elm *elm, struct feature_list *fl)
  400: {
  401:   struct word w;
  402: 
  403:   if (elm->str.len == 0) {
  404:     return ;
  405:   }
  406:   if (elm->id != -1) {
  407:     /* 自立語 */
  408:     fill_indep_word(&w, elm);
  409:     print_word(prefix, &w, fl);
  410:   } else {
  411:     /* 付属語 */
  412:     fill_dep_word(&w, elm);
  413:     print_word(prefix, &w, NULL);
  414:   }
  415:   free_word(&w);
  416: }
  417: 
  418: static void
  419: print_unconverted(struct cand_ent *ce)
  420: {
  421:   printf("unknown ");
  422:   anthy_putxstrln(&ce->str);
  423: }
  424: 
  425: static void
  426: print_eos(struct seg_ent *prev_seg)
  427: {
  428:   struct feature_list fl;
  429:   anthy_feature_list_init(&fl);
  430:   set_features(&fl, prev_seg, NULL);
  431:   printf("eos ");
  432:   print_features(&fl);
  433:   printf("\n");
  434:   anthy_feature_list_free(&fl);
  435: }
  436: 
  437: /* 候補のミスには '~'、文節長のミスには '!'を付ける
  438:  * 同じ文節内の二つめ以降の自立語には '^'を付ける
  439:  */
  440: static const char *
  441: get_prefix(int flag)
  442: {
  443:   if (flag & CONV_INVALID) {
  444:     return "^";
  445:   }
  446:   if (flag & CONV_SIZE_MISS) {
  447:     return "!";
  448:   }
  449:   if (flag & CONV_CAND_MISS) {
  450:     return "~";
  451:   }
  452:   return "";
  453: }
  454: 
  455: static void
  456: print_segment_info(int is_negative,
  457:                    struct seg_ent *prev_seg,
  458:                    struct seg_ent *seg)
  459: {
  460:   int i;
  461:   struct feature_list fl;
  462:   struct cand_ent *ce =  selected_candidate(seg);
  463:   int nr_indep = 0;
  464:   const char *prefix = get_prefix(is_negative);
  465: 
  466:   anthy_feature_list_init(&fl);
  467:   set_features(&fl, prev_seg, seg);
  468:   for (i = 0; i < ce->nr_words; i++) {
  469:     struct cand_elm *elm = &ce->elm[i];
  470:     prefix = get_prefix(is_negative);
  471:     if (nr_indep > 0 && elm->id != -1) {
  472:       prefix = get_prefix(is_negative | CONV_INVALID);
  473:     }
  474:     /* 出力する */
  475:     print_element(prefix, elm, &fl);
  476:     /* 自立語を数える */
  477:     if (elm->id != -1) {
  478:       nr_indep ++;
  479:     }
  480:   }
  481:   anthy_feature_list_free(&fl);
  482: }
  483: 
  484: void
  485: print_size_miss_segment_info(anthy_context_t ac, int nth)
  486: {
  487:   struct seg_ent *prev_seg = NULL;
  488:   struct seg_ent *seg = anthy_get_nth_segment(&ac->seg_list, nth);
  489:   if (nth > 0) {
  490:     prev_seg = anthy_get_nth_segment(&ac->seg_list, nth - 1);
  491:   }
  492:   print_segment_info(CONV_SIZE_MISS, prev_seg, seg);
  493: }
  494: 
  495: void
  496: print_cand_miss_segment_info(anthy_context_t ac, int nth)
  497: {
  498:   struct seg_ent *prev_seg = NULL;
  499:   struct seg_ent *seg = anthy_get_nth_segment(&ac->seg_list, nth);
  500:   if (nth > 0) {
  501:     prev_seg = anthy_get_nth_segment(&ac->seg_list, nth - 1);
  502:   }
  503:   print_segment_info(CONV_CAND_MISS, prev_seg, seg);
  504: }
  505: 
  506: void
  507: print_context_info(anthy_context_t ac, struct conv_res *cr)
  508: {
  509:   int i;
  510:   struct seg_ent *prev_seg = NULL;
  511: 
  512:   printf("segments: %d\n", ac->seg_list.nr_segments);
  513:   /* 各文節に対して */
  514:   for (i = 0; i < ac->seg_list.nr_segments; i++) {
  515:     struct seg_ent *seg = anthy_get_nth_segment(&ac->seg_list, i);
  516:     struct cand_ent *ce = selected_candidate(seg);
  517:     int is_negative = 0;
  518:     if (cr && cr->cand_check && cr->cand_check[i]) {
  519:       is_negative = CONV_CAND_MISS;
  520:     }
  521: 
  522:     /* 各要素に対して */
  523:     if (!ce->nr_words) {
  524:       /* 要素が無いものはそのまま表示 */
  525:       print_unconverted(ce);
  526:     } else {
  527:       /* 候補の変更があった場合はそれを表示 */
  528:       if (seg->committed > 0) {
  529:         int tmp = seg->committed;
  530:         seg->committed = 0;
  531:         print_cand_miss_segment_info(ac, i);
  532:         seg->committed = tmp;
  533:       }
  534:       /* 文節の構成を表示 */
  535:       print_segment_info(is_negative, prev_seg, seg);
  536:     }
  537:     /**/
  538:     prev_seg = seg;
  539:   }
  540:   print_eos(prev_seg);
  541:   printf("\n");
  542: }
1
Syntax (Markdown)