(linenum→info "unix/slp.c:2238")

anthy/9100e/mkworddic/mkdic.c

    1: /*
    2:  * cannadic形式のファイルから辞書ファイルを作る
    3:  *
    4:  * Funded by IPA未踏ソフトウェア創造事業 2002 1/1
    5:  *
    6:  * Copyright (C) 2000-2007 TABATA Yusuke
    7:  * Copyright (C) 2005 YOSHIDA Yuichi
    8:  * Copyright (C) 2001-2002 TAKAI Kousuke
    9:  */
   10: /*
   11:  * 辞書は読みをindexとし、品詞や変換後の単語(=entry)を検索
   12:  * する構造になっている。
   13:  *
   14:  * 読み -> 単語、単語、、
   15:  *
   16:  * 辞書ファイルはネットワークバイトオーダーを用いる。
   17:  *
   18:  * 辞書ファイルは複数のセクションから構成されている
   19:  *  0 ヘッダ 16*4 bytes
   20:  *  2 読みのインデックス (読み512個ごと)
   21:  *  3 読み
   22:  *  4 ページ
   23:  *  5 ページのインデックス
   24:  *  6 用例辞書(?)
   25:  *  7 読み hash
   26:  *
   27:  * source 元の辞書ファイル
   28:  * file_dic 生成するファイル
   29:  *
   30:  * yomi_hash 辞書ファイルに出力されるhashのbitmap
   31:  * index_hash このソース中でstruct yomi_entryを検索するためのhash
   32:  *
   33:  */
   34: 
   35: #include <sys/types.h>
   36: #include <unistd.h>
   37: #include <stdio.h>
   38: #include <stdlib.h>
   39: #include <string.h>
   40: #include <errno.h>
   41: #include <ctype.h>
   42: 
   43: #include <config.h>
   44: 
   45: #include <anthy/anthy.h>
   46: #include <anthy/xstr.h>
   47: #include <anthy/wtype.h>
   48: #include <anthy/ruleparser.h>
   49: #include <anthy/word_dic.h>
   50: #include <anthy/diclib.h>
   51: #include "mkdic.h"
   52: 
   53: #define MAX_LINE_LEN 10240
   54: #define NR_HEADER_SECTIONS 16
   55: #define SECTION_ALIGNMENT 8
   56: #define MAX_WTYPE_LEN 20
   57: 
   58: #define DEFAULT_FN "anthy.wdic"
   59: 
   60: static const char *progname;
   61: 
   62: /* writewords.cからアクセスするために、global変数 */
   63: FILE *yomi_entry_index_out, *yomi_entry_out;
   64: FILE *page_out, *page_index_out;
   65: /**/
   66: static FILE *uc_out;
   67: static FILE *yomi_hash_out;
   68: /* ハッシュの衝突の数、統計情報 */
   69: static int yomi_hash_collision;
   70: 
   71: /* ファイル中の順序に従って並べる */
   72: struct file_section {
   73:   FILE **fpp;
   74:   char *fn;
   75: } file_array[] = {
   76:   {&yomi_entry_index_out, NULL},
   77:   {&yomi_entry_out, NULL},
   78:   {&page_out, NULL},
   79:   {&page_index_out, NULL},
   80:   {&uc_out, NULL},
   81:   {&yomi_hash_out, NULL},
   82:   {NULL, NULL},
   83: };
   84: 
   85: /* 辞書生成の状態 */
   86: struct mkdic_stat {
   87:   /* 単語のリスト */
   88:   struct yomi_entry_list yl;
   89:   /**/
   90:   struct adjust_command ac_list;
   91:   /* 用例辞書 */
   92:   struct uc_dict *ud;
   93:   /**/
   94:   const char *output_fn;
   95:   /**/
   96:   int input_encoding;
   97:   /**/
   98:   int nr_excluded;
   99:   char **excluded_wtypes;
  100: };
  101: 
  102: /* 辞書の出力先のファイルをオープンする */
  103: static void
  104: open_output_files(void)
  105: {
  106:   struct file_section *fs;
  107:   for (fs = file_array; fs->fpp; fs ++) {
  108:     char *tmpdir = getenv("TMPDIR");
  109:     fs->fn = NULL;
  110:     if (tmpdir) {
  111:       /* tmpfile()がTMPDIRを見ないため、TMPDIRを指定された場合mkstempを使う。*/
  112:       char buf[256];
  113:       int fd = -1;
  114:       snprintf(buf, sizeof(buf), "%s/mkanthydic.XXXXXX", tmpdir);
  115:       fd = mkstemp(buf);
  116:       if (fd == -1) {
  117:         *(fs->fpp) = NULL;
  118:       } else {
  119:         *(fs->fpp) = fdopen(fd, "w+");
  120:         fs->fn = strdup(buf);
  121:       }
  122:     } else {
  123:       *(fs->fpp) = tmpfile();
  124:     }
  125:     /**/
  126:     if (!(*(fs->fpp))) {
  127:       fprintf (stderr, "%s: cannot open temporary file: %s\n",
  128:                progname, strerror (errno));
  129:       exit (2);
  130:     }
  131:   }
  132: }
  133: 
  134: /* fflushする */
  135: static void
  136: flush_output_files (void)
  137: {
  138:   struct file_section *fs;
  139:   for (fs = file_array; fs->fpp; fs ++) {
  140:     if (ferror(*(fs->fpp))) {
  141:       fprintf (stderr, "%s: write error\n", progname);
  142:       exit (1);
  143:     }
  144:   }
  145:   for (fs = file_array; fs->fpp; fs ++) {
  146:     if (fflush(*(fs->fpp))) {
  147:       fprintf (stderr, "%s: write error: %s\n", progname, strerror (errno));
  148:       exit (1);
  149:     }
  150:   }
  151: }
  152: 
  153: /* ネットワークbyteorderで4bytes書き出す */
  154: void
  155: write_nl(FILE *fp, int i)
  156: {
  157:   i = anthy_dic_htonl(i);
  158:   fwrite(&i, sizeof(int), 1, fp);
  159: }
  160: 
  161: static void
  162: print_usage(void)
  163: {
  164:   printf("please do not use mkanthydic command directly.\n");
  165:   exit(0);
  166: }
  167: 
  168: static char *
  169: read_line(FILE *fp, char *buf)
  170: {
  171:   /* 長すぎる行を無視する */
  172:   int toolong = 0;
  173: 
  174:   while (fgets(buf, MAX_LINE_LEN, fp)) {
  175:     int len = strlen(buf);
  176:     if (buf[0] == '#') {
  177:       continue ;
  178:     }
  179:     if (buf[len - 1] != '\n') {
  180:       toolong = 1;
  181:       continue ;
  182:     }
  183: 
  184:     buf[len - 1] = 0;
  185:     if (toolong) {
  186:       toolong = 0;
  187:     } else {
  188:       return buf;
  189:     }
  190:   }
  191:   return NULL;
  192: }
  193: 
  194: /** cannadic形式の辞書の行からindexとなる部分を取り出す */
  195: static xstr *
  196: get_index_from_line(struct mkdic_stat *mds, char *buf)
  197: {
  198:   char *sp;
  199:   xstr *xs;
  200:   sp = strchr(buf, ' ');
  201:   if (!sp) {
  202:     /* 辞書のフォーマットがおかしい */
  203:     return NULL;
  204:   }
  205:   *sp = 0;
  206:   xs = anthy_cstr_to_xstr(buf, mds->input_encoding);
  207:   *sp = ' ';
  208:   return xs;
  209: }
  210: 
  211: /** cannadic形式の辞書の行からindex以外の部分を取り出す */
  212: static char *
  213: get_entry_from_line(char *buf)
  214: {
  215:   char *sp;
  216:   sp = strchr(buf, ' ');
  217:   while(*sp == ' ') {
  218:     sp ++;
  219:   }
  220:   return strdup(sp);
  221: }
  222: 
  223: static int
  224: index_hash(xstr *xs)
  225: {
  226:   int i;
  227:   unsigned int h = 0;
  228:   for (i = 0; i < xs->len; i++) {
  229:     h += xs->str[i] * 11;
  230:   }
  231:   return (int)(h % YOMI_HASH);
  232: }
  233: 
  234: const char *
  235: get_wt_name(const char *name)
  236: {
  237:   wtype_t dummy;
  238:   const char *res;
  239:   if (!strcmp(name, "#T35")) {
  240:     return "#T";
  241:   }
  242:   res = anthy_type_to_wtype(name, &dummy);
  243:   if (!res) {
  244:     return "unknown";
  245:   }
  246:   return res;
  247: }
  248: 
  249: /** 読みに対して、単語を一つを追加する */
  250: static void
  251: push_back_word_entry(struct mkdic_stat *mds,
  252:                      struct yomi_entry *ye, const char *wt_name,
  253:                      int freq, const char *word, int order)
  254: {
  255:   wtype_t wt;
  256:   char *s;
  257:   if (freq == 0) {
  258:     return ;
  259:   }
  260:   if (!anthy_type_to_wtype(wt_name, &wt)) {
  261:     /* anthyの知らない品詞 */
  262:     return ;
  263:   }
  264:   ye->entries = realloc(ye->entries,
  265:                         sizeof(struct word_entry) *
  266:                         (ye->nr_entries + 1));
  267:   ye->entries[ye->nr_entries].ye = ye;
  268:   ye->entries[ye->nr_entries].wt_name = get_wt_name(wt_name);
  269:   ye->entries[ye->nr_entries].raw_freq = freq;
  270:   ye->entries[ye->nr_entries].feature = 0;
  271:   ye->entries[ye->nr_entries].source_order = order;
  272:   if (mds->input_encoding == ANTHY_EUC_JP_ENCODING) {
  273:     s = anthy_conv_euc_to_utf8(word);
  274:   } else {
  275:     s = strdup(word);
  276:   }
  277:   ye->entries[ye->nr_entries].word_utf8 = s;
  278:   ye->nr_entries ++;
  279: }
  280: 
  281: static int
  282: parse_wtype(char *wtbuf, char *cur)
  283: {
  284:   /* 品詞 */
  285:   char *t;
  286:   int freq;
  287:   if (strlen(cur) >= MAX_WTYPE_LEN) {
  288:     return 0;
  289:   }
  290:   strcpy(wtbuf, cur);
  291:   /* 頻度 */
  292:   t = strchr(wtbuf, '*');
  293:   freq = 1;
  294:   if (t) {
  295:     int tmp_freq;
  296:     *t = 0;
  297:     t++;
  298:     tmp_freq = atoi(t);
  299:     if (tmp_freq) {
  300:       freq = tmp_freq;
  301:     }
  302:   }
  303:   return freq;
  304: }
  305: 
  306: /* 複合語の要素の長さは 1,2,3, ... 9,a,b,c */
  307: static int
  308: get_element_len(xchar xc)
  309: {
  310:   if (xc > '0' && xc <= '9') {
  311:     return xc - '0';
  312:   }
  313:   if (xc >= 'a' && xc <= 'z') {
  314:     return xc - 'a' + 10;
  315:   }
  316:   return 0;
  317: }
  318: 
  319: /** 複合候補の形式チェック */
  320: static int
  321: check_compound_candidate(struct mkdic_stat *mds, xstr *index, const char *cur)
  322: {
  323:   /* 読みの文字数の合計を数える */
  324:   xstr *xs = anthy_cstr_to_xstr(cur, mds->input_encoding);
  325:   int i, total = 0;
  326:   for (i = 0; i < xs->len - 1; i++) {
  327:     if (xs->str[i] == '_') {
  328:       total += get_element_len(xs->str[i+1]);
  329:     }
  330:   }
  331:   anthy_free_xstr(xs);
  332:   /* 比較する */
  333:   if (total != index->len) {
  334:     fprintf(stderr, "Invalid compound candidate (%s, length = %d).\n",
  335:             cur, total);
  336:     return 0;
  337:   }
  338:   return 1;
  339: }
  340: 
  341: static int
  342: is_excluded_wtype(struct mkdic_stat *mds, char *wt)
  343: {
  344:   int i;
  345:   for (i = 0; i < mds->nr_excluded; i++) {
  346:     if (!strcmp(mds->excluded_wtypes[i], wt)) {
  347:       return 1;
  348:     }
  349:   }
  350:   return 0;
  351: }
  352: 
  353: static char *
  354: find_token_end(char *cur)
  355: {
  356:   char *n;
  357:   for (n = cur; *n != ' ' && *n != '\0'; n++) {
  358:     if (*n == '\\') {
  359:       if (!n[1]) {
  360:         return NULL;
  361:       }
  362:       n++;
  363:     }
  364:   }
  365:   return n;
  366: }
  367: 
  368: /** 読みに対応する行を分割して、配列を構成する */
  369: static void
  370: push_back_word_entry_line(struct mkdic_stat *mds, struct yomi_entry *ye,
  371:                           const char *ent)
  372: {
  373:   char *buf = alloca(strlen(ent) + 1);
  374:   char *cur = buf;
  375:   char *n;
  376:   char wtbuf[MAX_WTYPE_LEN];
  377:   int freq = 0;
  378:   int order = 0;
  379: 
  380:   strcpy(buf, ent);
  381:   wtbuf[0] = 0;
  382: 
  383:   while (1) {
  384:     /* トークンを\0で切る。curの後の空白か\0を探す */
  385:     n = find_token_end(cur);
  386:     if (!n) {
  387:       fprintf(stderr, "invalid \\ at the end of line (%s).\n",
  388:               ent);
  389:       return ;
  390:     }
  391:     if (*n) {
  392:       *n = 0;
  393:     } else {
  394:       n = NULL;
  395:     }
  396:     /**/
  397:     if (cur[0] == '#') {
  398:       if (isalpha((unsigned char)cur[1])) {
  399:         /* #XX*?? をパース */
  400:         freq = parse_wtype(wtbuf, cur);
  401:       } else {
  402:         if (cur[1] == '_' &&
  403:             check_compound_candidate(mds, ye->index_xstr, &cur[1])) {
  404:           /* #_ 複合候補 */
  405:           push_back_word_entry(mds, ye, wtbuf, freq, cur, order);
  406:           order ++;
  407:         }
  408:       }
  409:     } else {
  410:       /* 品詞が除去リストに入っているかをチェック */
  411:       if (!is_excluded_wtype(mds, wtbuf)) {
  412:         /* 単語を追加 */
  413:         push_back_word_entry(mds, ye, wtbuf, freq, cur, order);
  414:         order ++;
  415:       }/* :to extract excluded words
  416:           else {
  417:           anthy_putxstr(ye->index_xstr);
  418:           printf(" %s*%d %s\n", wtbuf, freq, cur);
  419:           }*/
  420:     }
  421:     if (!n) {
  422:       /* 行末 */
  423:       return ;
  424:     }
  425:     cur = n;
  426:     cur ++;
  427:   }
  428: }
  429: 
  430: /** 同じ単語が無いかチェック */
  431: static int
  432: check_same_word(struct yomi_entry *ye, int idx)
  433: {
  434:   struct word_entry *base = &ye->entries[idx];
  435:   int i;
  436:   for (i = idx -1; i >= 0; i--) {
  437:     struct word_entry *cur = &ye->entries[i];
  438:     if (base->raw_freq != cur->raw_freq) {
  439:       return 0;
  440:     }
  441:     if (strcmp(base->wt_name, cur->wt_name)) {
  442:       return 0;
  443:     }
  444:     if (strcmp(base->word_utf8, cur->word_utf8)) {
  445:       return 0;
  446:     }
  447:     /* 同じだった */
  448:     return 1;
  449:   }
  450:   return 0;
  451: }
  452: 
  453: /** qsort用の比較関数 */
  454: static int
  455: compare_word_entry_by_freq(const void *p1, const void *p2)
  456: {
  457:   const struct word_entry *e1 = p1;
  458:   const struct word_entry *e2 = p2;
  459:   return e2->raw_freq - e1->raw_freq;
  460: }
  461: 
  462: /** qsort用の比較関数 */
  463: static int
  464: compare_word_entry_by_wtype(const void *p1, const void *p2)
  465: {
  466:   const struct word_entry *e1 = p1;
  467:   const struct word_entry *e2 = p2;
  468:   int ret = strcmp(e1->wt_name, e2->wt_name);
  469:   if (ret != 0) {
  470:     return ret;
  471:   } else {
  472:     return compare_word_entry_by_freq(p1, p2);
  473:   }
  474: }
  475: 
  476: /** 読みに対する単語を頻度順に並べ、いらない単語を消す */
  477: static int
  478: normalize_word_entry(struct yomi_entry *ye)
  479: {
  480:   int i, nr_dup = 0;
  481:   if (!ye) {
  482:     return 0;
  483:   }
  484:   /* 単語を並べる */
  485:   qsort(ye->entries, ye->nr_entries,
  486:         sizeof(struct word_entry),
  487:         compare_word_entry_by_freq);
  488:   /* ダブったら、0点 */
  489:   for (i = 0; i < ye->nr_entries; i++) {
  490:     if (check_same_word(ye, i)) {
  491:       ye->entries[i].raw_freq = 0;
  492:       nr_dup ++;
  493:     }
  494:   }
  495:   /* 再びソート */
  496:   qsort(ye->entries, ye->nr_entries,
  497:         sizeof(struct word_entry),
  498:         compare_word_entry_by_wtype);
  499:   return ye->nr_entries - nr_dup;
  500: }
  501: 
  502: /*その読みに対応するyomi_entryを返す
  503: **/
  504: struct yomi_entry *
  505: find_yomi_entry(struct yomi_entry_list *yl, xstr *index, int create)
  506: {
  507:   struct yomi_entry *ye;
  508:   int hash = index_hash(index);
  509:   int search = 0;
  510:   /* hash chainから探す */
  511:   for (ye = yl->hash[hash];ye ; ye = ye->hash_next) {
  512:     search ++;
  513:     if (!anthy_xstrcmp(ye->index_xstr, index)) {
  514:       return ye;
  515:     }
  516:   }
  517:   if (!create) {
  518:     return NULL;
  519:   }
  520: 
  521:   /* 無いので確保 */
  522:   ye = malloc(sizeof(struct yomi_entry));
  523:   ye->nr_entries = 0;
  524:   ye->entries = 0;
  525:   ye->next = NULL;
  526:   ye->index_xstr = anthy_xstr_dup(index);
  527:   ye->index_str = NULL;
  528: 
  529:   /* hash chainにつなぐ */
  530:   ye->hash_next = yl->hash[hash];
  531:   yl->hash[hash] = ye;
  532: 
  533:   /* リストにつなぐ */
  534: 
  535:   ye->next = yl->head;
  536:   yl->head = ye;
  537: 
  538:   yl->nr_entries ++;
  539: 
  540:   return ye;
  541: }
  542: 
  543: /* 辞書ファイル中のhash bitmapにマークを付ける */
  544: static void
  545: mark_hash_array(unsigned char *hash_array, xstr *xs)
  546: {
  547:   int val, idx, bit, mask;
  548:   val = anthy_xstr_hash(xs);
  549:   val &= (YOMI_HASH_ARRAY_SIZE*YOMI_HASH_ARRAY_BITS-1);
  550:   idx=(val>>YOMI_HASH_ARRAY_SHIFT)&(YOMI_HASH_ARRAY_SIZE-1);
  551:   bit= val & ((1<<YOMI_HASH_ARRAY_SHIFT)-1);
  552:   mask = (1<<bit);
  553:   if (hash_array[idx] & mask) {
  554:     yomi_hash_collision ++;
  555:   }
  556:   hash_array[idx] |= mask;
  557: }
  558: 
  559: /* 読みhashのビットマップを作る */
  560: static void
  561: mk_yomi_hash(FILE *yomi_hash_out, struct yomi_entry_list *yl)
  562: {
  563:   unsigned char *hash_array;
  564:   int i;
  565:   struct yomi_entry *ye;
  566:   hash_array = (unsigned char *)malloc(YOMI_HASH_ARRAY_SIZE);
  567:   for (i = 0; i < YOMI_HASH_ARRAY_SIZE; i++) {
  568:     hash_array[i] = 0;
  569:   }
  570:   for (i = 0; i < yl->nr_valid_entries; i++) {
  571:     ye = yl->ye_array[i];
  572:     mark_hash_array(hash_array, ye->index_xstr);
  573:   }
  574:   fwrite(hash_array, YOMI_HASH_ARRAY_SIZE, 1, yomi_hash_out);
  575:   printf("generated yomi hash bitmap (%d collisions/%d entries)\n",
  576:          yomi_hash_collision, yl->nr_valid_entries);
  577:          
  578: }
  579: 
  580: static struct adjust_command *
  581: parse_modify_freq_command(const char *buf)
  582: {
  583:   char *line = alloca(strlen(buf) + 1);
  584:   char *yomi, *wt, *word, *type_str;
  585:   struct adjust_command *cmd;
  586:   int type = 0;
  587:   strcpy(line, buf);
  588:   yomi = strtok(line, " ");
  589:   wt = strtok(NULL, " ");
  590:   word = strtok(NULL<