(linenum→info "unix/slp.c:2238")

anthy/9100e/calctrans/calctrans.c

    1: /*
    2:  * 文節の遷移行列を作成する
    3:  *
    4:  * このコマンドは二つの機能を持っている。(-cオプションで制御)
    5:  * (1) proccorpusの結果からテキスト形式で経験的確率の表を作る
    6:  * (2) テキスト形式の表からバイナリ形式に変換する
    7:  *
    8:  * morphological-analyzerの出力には下記のマークが付けてある
    9:  * ~ 候補の誤り
   10:  * ! 文節長の誤り
   11:  * ^ 複合文節の2つめ以降の要素
   12:  *
   13:  * generate transition matrix
   14:  *
   15:  * Copyright (C) 2006 HANAOKA Toshiyuki
   16:  * Copyright (C) 2006-2007 TABATA Yusuke
   17:  *
   18:  */
   19: /*
   20:   This library is free software; you can redistribute it and/or
   21:   modify it under the terms of the GNU Lesser General Public
   22:   License as published by the Free Software Foundation; either
   23:   version 2 of the License, or (at your option) any later version.
   24: 
   25:   This library is distributed in the hope that it will be useful,
   26:   but WITHOUT ANY WARRANTY; without even the implied warranty of
   27:   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   28:   Lesser General Public License for more details.
   29: 
   30:   You should have received a copy of the GNU Lesser General Public
   31:   License along with this library; if not, write to the Free Software
   32:   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
   33:  */
   34: #include <stdio.h>
   35: #include <string.h>
   36: #include <stdlib.h>
   37: #include <math.h>
   38: 
   39: #include <anthy/anthy.h>
   40: #include <anthy/xstr.h>
   41: #include <anthy/feature_set.h>
   42: #include <anthy/diclib.h>
   43: #include "input_set.h"
   44: #include <anthy/corpus.h>
   45: 
   46: #define FEATURE_SET_SIZE NR_EM_FEATURES
   47: 
   48: #define ARRAY_SIZE 16
   49: 
   50: struct array {
   51:   int len;
   52:   int f[ARRAY_SIZE];
   53: };
   54: 
   55: #define MAX_SEGMENT 64
   56: 
   57: struct segment_info {
   58:   int orig_hash;
   59:   int hash;
   60: };
   61: 
   62: struct sentence_info {
   63:   int nr_segments;
   64:   struct segment_info segs[MAX_SEGMENT];
   65: };
   66: 
   67: /* 確率のテーブル */
   68: struct input_info {
   69:   /* 候補全体の素性 */
   70:   struct input_set *cand_is;
   71:   /* 文節の素性 */
   72:   struct input_set *seg_is;
   73:   /* 自立語の全文検索用情報 */
   74:   struct corpus *indep_corpus;
   75: 
   76:   /**/
   77:   struct array missed_cand_features;
   78: 
   79:   /**/
   80:   int nth_input_file;
   81: 
   82:   /* 入力された例文の量に関する情報 */
   83:   int nr_sentences;
   84:   int nr_connections;
   85: };
   86: 
   87: static struct input_info *
   88: init_input_info(void)
   89: {
   90:   struct input_info *m;
   91:   m = malloc(sizeof(struct input_info));
   92:   m->seg_is = input_set_create();
   93:   m->cand_is = input_set_create();
   94:   m->indep_corpus = corpus_new();
   95:   m->missed_cand_features.len = 0;
   96:   m->nth_input_file = 0;
   97:   m->nr_sentences = 0;
   98:   m->nr_connections = 0;
   99:   return m;
  100: }
  101: 
  102: /* features=1,2,3,,の形式をparseする */
  103: static void
  104: parse_features(struct array *features, char *s)
  105: {
  106:   char *tok, *str = s;
  107:   tok = strtok(str, ",");
  108:   features->len = 0;
  109:   do {
  110:     features->f[features->len] = atoi(tok);
  111:     features->len++;
  112:     tok = strtok(NULL, ",");
  113:   } while(tok);
  114: }
  115: 
  116: static void
  117: add_seg_struct_info(struct input_info *m,
  118:                     struct array *features,
  119:                     int weight)
  120: {
  121:   input_set_set_features(m->cand_is, features->f, features->len, weight);
  122: }
  123: 
  124: static void
  125: set_hash(struct sentence_info *sinfo, int error_class,
  126:          char tag, int hash)
  127: {
  128:   if (tag == '~') {
  129:     sinfo->segs[sinfo->nr_segments].orig_hash = hash;
  130:   } else {
  131:     sinfo->segs[sinfo->nr_segments].hash = hash;
  132:   }
  133:   if (!error_class) {
  134:     sinfo->nr_segments++;
  135:   }
  136: }
  137: 
  138: static int
  139: compare_array(struct array *a1, struct array *a2)
  140: {
  141:   int i;
  142:   if (a1->len != a2->len) {
  143:     return 1;
  144:   }
  145:   for (i = 0; i < a1->len; i++) {
  146:     if (a1->f[i] != a2->f[i]) {
  147:       return 1;
  148:     }
  149:   }
  150:   return 0;
  151: }
  152: 
  153: /* 自立語の行をparseする */
  154: static void
  155: parse_indep(struct input_info *m, struct sentence_info *sinfo,
  156:             char *line, char *buf, int error_class)
  157: {
  158:   struct array features;
  159:   char *s;
  160:   int weight = 1;
  161:   /**/
  162:   s = strstr(buf, "features=");
  163:   if (s) {
  164:     s += 9;
  165:     parse_features(&features, s);
  166:     m->nr_connections ++;
  167:   }
  168:   s = strstr(buf, "hash=");
  169:   if (s) {
  170:     s += 5;
  171:     set_hash(sinfo, error_class, line[0], atoi(s));
  172:   }
  173: 
  174:   /* 加算する */
  175:   if (error_class) {
  176:     if (line[0] == '~') {
  177:       /* 誤った候補の構造を保存 */
  178:       m->missed_cand_features = features;
  179:     }
  180:     if (line[0] == '!') {
  181:       /* 文節長の誤り */
  182:       input_set_set_features(m->seg_is, features.f, features.len, -weight);
  183:     }
  184:   } else {
  185:     /* 接続行列 */
  186:     input_set_set_features(m->seg_is, features.f, features.len, weight);
  187:     /* 候補の構造 */
  188:     if (m->missed_cand_features.len != 0 &&
  189:         compare_array(&features, &m->missed_cand_features)) {
  190:       /* 正解と異なる構造なら分母に加算 */
  191:       add_seg_struct_info(m, &m->missed_cand_features, -weight);
  192:     }
  193:     m->missed_cand_features.len = 0;
  194:     add_seg_struct_info(m, &features, weight);
  195:   }
  196: }
  197: 
  198: static void
  199: init_sentence_info(struct sentence_info *sinfo)
  200: {
  201:   int i;
  202:   sinfo->nr_segments = 0;
  203:   for (i = 0; i < MAX_SEGMENT; i++) {
  204:     sinfo->segs[i].orig_hash = 0;
  205:     sinfo->segs[i].hash = 0;
  206:   }
  207: }
  208: 
  209: /* 一つの文を読んだときに全文検索用のデータを作る
  210:  */
  211: static void
  212: complete_sentence_info(struct input_info *m, struct sentence_info *sinfo)
  213: {
  214:   int i;
  215:   if (m->nth_input_file > 0) {
  216:     /* 二つめ以降の入力ファイルは使わない */
  217:     return ;
  218:   }
  219:   for (i = 0; i < sinfo->nr_segments; i++) {
  220:     int flags = ELM_NONE;
  221:     int nr = 1;
  222:     int buf[2];
  223:     if (i == 0) {
  224:       flags |= ELM_BOS;
  225:     }
  226:     /**/
  227:     buf[0] = sinfo->segs[i].hash;
  228:     if (sinfo->segs[i].orig_hash) {
  229:       /*
  230:       buf[1] = sinfo->segs[i].orig_hash;
  231:       nr ++;
  232:       */
  233:     }
  234:     corpus_push_back(m->indep_corpus, buf, nr, flags);
  235:   }
  236: }
  237: 
  238: static void
  239: do_read_file(struct input_info *m, FILE *fp)
  240: {
  241:   char line[1024];
  242:   struct sentence_info sinfo;
  243: 
  244:   init_sentence_info(&sinfo);
  245: 
  246:   while (fgets(line, 1024, fp)) {
  247:     char *buf = line;
  248:     int error_class = 0;
  249:     if (!strncmp(buf, "eos", 3)) {
  250:       m->nr_sentences ++;
  251:       complete_sentence_info(m, &sinfo);
  252:       init_sentence_info(&sinfo);
  253:     }
  254:     if (line[0] == '~' || line[0] == '!' ||
  255:         line[0] == '^') {
  256:       buf ++;
  257:       error_class = 1;
  258:     }
  259:     if (!strncmp(buf, "indep_word", 10) ||
  260:         !strncmp(buf, "eos", 3)) {
  261:       parse_indep(m, &sinfo, line, buf, error_class);
  262:     }
  263:   }
  264: }
  265: 
  266: static void
  267: read_file(struct input_info *m, char *fn)
  268: {
  269:   FILE *ifp;
  270:   ifp = fopen(fn, "r");
  271:   if (!ifp) {
  272:     return ;
  273:   }
  274:   do_read_file(m, ifp);
  275:   fclose(ifp);
  276: }
  277: 
  278: static void
  279: write_nl(FILE *fp, int i)
  280: {
  281:   i = anthy_dic_htonl(i);
  282:   fwrite(&i, sizeof(int), 1, fp);
  283: }
  284: 
  285: static void
  286: dump_line(FILE *ofp, struct input_line *il)
  287: {
  288:   int i;
  289:   for (i = 0; i < FEATURE_SET_SIZE || i < il->nr_features; i++) {
  290:     if (i) {
  291:       fprintf(ofp, ", ");
  292:     }
  293:     if (i < il->nr_features) {
  294:       fprintf(ofp, "%d", il->features[i]);
  295:     } else {
  296:       fprintf(ofp, "0");
  297:     }
  298:   }
  299:   fprintf(ofp,",%d,%d\n", (int)il->negative_weight, (int)il->weight);
  300: }
  301: 
  302: static int
  303: compare_line(const void *p1, const void *p2)
  304: {
  305:   const struct input_line *const *il1 = p1;
  306:   const struct input_line *const *il2 = p2;
  307:   int i;
  308:   for (i = 0; i < (*il1)->nr_features &&
  309:          i < (*il2)->nr_features; i++) {
  310:     if ((*il1)->features[i] !=
  311:         (*il2)->features[i]) {
  312:       return (*il1)->features[i] - (*il2)->features[i];
  313:     }
  314:   }
  315:   return (*il1)->nr_features - (*il2)->nr_features;
  316: }
  317: 
  318: static void
  319: dump_features(FILE *ofp, struct input_set *is)
  320: {
  321:   struct input_line *il, **lines;
  322:   int i, nr = 0;
  323:   int weight = 0;
  324: 
  325:   /* count lines */
  326:   for (il = input_set_get_input_line(is); il; il = il->next_line) {
  327:     nr ++;
  328:     weight += (int)il->weight;
  329:   }
  330:   /* copy lines */
  331:   lines = malloc(sizeof(struct input_line *) * nr);
  332:   for (il = input_set_get_input_line(is), i = 0; i < nr;
  333:        i++, il = il->next_line) {
  334:     lines[i] = il;
  335:   }
  336:   /* sort */
  337:   qsort(lines, nr, sizeof(struct input_line *), compare_line);
  338:   /* output */
  339:   fprintf(ofp, "%d %d total_line_weight,count\n", weight, nr);
  340:   /**/
  341:   for (i = 0; i < nr; i++) {
  342:     dump_line(ofp, lines[i]);
  343:   }
  344: }
  345: 
  346: static void
  347: dump_input_info(FILE *ofp, struct input_info *m)
  348: {
  349:   fprintf(ofp, "section anthy.trans_info ");
  350:   dump_features(ofp, m->seg_is);
  351:   fprintf(ofp, "section anthy.cand_info ");
  352:   dump_features(ofp, m->cand_is);
  353:   fprintf(ofp, "section anthy.corpus_bucket ");
  354:   corpus_write_bucket(ofp, m->indep_corpus);
  355:   fprintf(ofp, "section anthy.corpus_array ");
  356:   corpus_write_array(ofp, m->indep_corpus);
  357:   /**/
  358:   fprintf(ofp, "section anthy.feature_info ");
  359:   input_set_output_feature_freq(ofp, m->seg_is);
  360: }
  361: 
  362: static void
  363: convert_line(FILE *ofp, char *buf)
  364: {
  365:   char *tok;
  366:   tok = strtok(buf, ",");
  367:   do {
  368:     int n = atoi(tok);
  369:     write_nl(ofp, n);
  370:     tok = strtok(NULL, ",");
  371:   } while (tok);
  372: }
  373: 
  374: static void
  375: convert_file(FILE *ifp)
  376: {
  377:   char buf[1024];
  378:   FILE *ofp = NULL;
  379:   while (fgets(buf, 1024, ifp)) {
  380:     /**/
  381:     if (buf[0] == '#') {
  382:       continue;
  383:     }
  384:     if (!strncmp("section", buf, 7)) {
  385:       int w, n, i;
  386:       char fn[1024];
  387:       if (ofp) {
  388:         fclose(ofp);
  389:         ofp = NULL;
  390:       }
  391:       sscanf(buf, "section %s %d %d", fn, &w, &n);
  392:       ofp = fopen(fn, "w");
  393:       if (!ofp) {
  394:         fprintf(stderr, "failed to open (%s)\n", fn);
  395:         abort();
  396:       }
  397:       write_nl(ofp, w);
  398:       write_nl(ofp, n);
  399:       for (i = 0; i < NR_EM_FEATURES; i++) {
  400:         write_nl(ofp, 0);
  401:       }
  402:     } else {
  403:       convert_line(ofp, buf);
  404:     }
  405:   }
  406:   if (ofp) {
  407:     fclose(ofp);
  408:   }
  409: }
  410: 
  411: static void
  412: convert_data(int nr_fn, char **fns)
  413: {
  414:   FILE *ifp;
  415:   int i;
  416:   /**/
  417:   for (i = 0; i < nr_fn; i++) {
  418:     ifp = fopen(fns[i], "r");
  419:     if (!ifp) {
  420:       fprintf(stderr, "failed to open (%s)\n", fns[i]);
  421:       continue;
  422:     }
  423:     convert_file(ifp);
  424:     fclose(ifp);
  425:   }
  426: }
  427: 
  428: /**/
  429: #define STRING_HASH_SIZE 256
  430: struct string_node {
  431:   int key;
  432:   char *str;
  433:   struct string_node *next_hash;
  434: };
  435: struct string_pool {
  436:   int nr;
  437:   struct string_node hash[STRING_HASH_SIZE];
  438:   struct string_node **array;
  439: };
  440: struct resize_info {
  441:   char *indep;
  442:   int valid;
  443: };
  444: struct extract_stat {
  445:   int nr;
  446:   struct resize_info info[MAX_SEGMENT];
  447: };
  448: 
  449: static void
  450: string_pool_init(struct string_pool *sp)
  451: {
  452:   int i;
  453:   for (i = 0; i < STRING_HASH_SIZE; i++) {
  454:     sp->hash[i].next_hash = NULL;
  455:   }
  456:   sp->nr = 0;
  457: }
  458: 
  459: static int
  460: compare_string_node(const void *p1, const void *p2)
  461: {
  462:   const struct string_node *const *n1 = p1;
  463:   const struct string_node *const *n2 = p2;
  464:   return (*n1)->key -(*n2)->key;
  465: }
  466: 
  467: static void
  468: string_pool_sort(struct string_pool *sp)
  469: {
  470:   int idx, h;
  471:   sp->array = malloc(sizeof(struct string_node *) * sp->nr);
  472:   for (idx = 0, h = 0; h < STRING_HASH_SIZE; h++) {
  473:     struct string_node *node;
  474:     for (node = sp->hash[h].next_hash; node; node = node->next_hash) {
  475:       sp->array[idx] = node;
  476:       idx ++;
  477:     }
  478:   }
  479:   /**/
  480:   qsort(sp->array, sp->nr, sizeof(struct string_node *), compare_string_node);
  481: }
  482: 
  483: static void
  484: string_pool_dump(FILE *ofp, struct string_pool *sp)
  485: {
  486:   int i;
  487:   fprintf(ofp, "section anthy.weak_words 0 %d\n", sp->nr);
  488:   for (i = 0; i < sp->nr; i++) {
  489:     fprintf(ofp, "%d\n", sp->array[i]->key);
  490:   }
  491: }
  492: 
  493: static unsigned int
  494: string_hash(const unsigned char *str)
  495: {
  496:   unsigned int h = 0;
  497:   while (*str) {
  498:     h += *str;
  499:     h *= 13;
  500:     str ++;
  501:   }
  502:   return h % STRING_HASH_SIZE;
  503: }
  504: 
  505: static struct string_node *
  506: find_string_node(struct string_pool *sp, const char *str)
  507: {
  508:   int h = (int)string_hash((const unsigned char *)str);
  509:   struct string_node *node;
  510:   for (node = sp->hash[h].next_hash; node; node = node->next_hash) {
  511:     if (!strcmp(str, node->str)) {
  512:       return node;
  513:     }
  514:   }
  515:   /* allocate new */
  516:   node = malloc(sizeof(*node));
  517:   node->str = strdup(str);
  518:   node->key = 0;
  519:   node->next_hash = sp->hash[h].next_hash;
  520:   sp->hash[h].next_hash = node;
  521:   sp->nr ++;
  522:   return node;
  523: }
  524: 
  525: static void
  526: flush_extract_stat(struct extract_stat *es, struct string_pool *sp)
  527: {
  528:   int i;
  529:   for (i = 0; i < es->nr; i++) {
  530:     if (es->info[i].valid) {
  531:       struct string_node *node;
  532:       node = find_string_node(sp, es->info[i].indep);
  533:       if (node->key == 0) {
  534:         xstr *xs = anthy_cstr_to_xstr(node->str, ANTHY_EUC_JP_ENCODING);
  535:         node->key = anthy_xstr_hash(xs);
  536:         anthy_free_xstr(xs);
  537:       }
  538:       /* printf("(%s)%d\n", es->info[i].indep, node->key); */
  539:     }
  540:     free(es->info[i].indep);
  541:     es->info[i].indep = NULL;
  542:   }
  543:   es->nr = 0;
  544: }
  545: 
  546: static char *
  547: get_indep_part(char *buf)
  548: {
  549:   int len;
  550:   char *c = strchr(buf, '#');
  551:   if (!c) {
  552:     return NULL;
  553:   }
  554:   c = strchr(c, ' ');
  555:   if (!c) {
  556:     return NULL;
  557:   }
  558:   c++;
  559:   c = strchr(c, ' ');
  560:   if (!c) {
  561:     return NULL;
  562:   }
  563:   c++;
  564:   len = strlen(c);
  565:   c[len-1] = 0;
  566:   return c;
  567: }
  568: 
  569: static