(linenum→info "unix/slp.c:2238")

anthy/9100e/depgraph/mkdepgraph.c

    1: /*
    2:  * Copyright (C) 2000-2007 TABATA Yusuke
    3:  * Copyright (C) 2004-2006 YOSHIDA Yuichi
    4:  */
    5: /*
    6:  * 付属語グラフをバイナリ化する
    7:  * init_word_seq_tab()
    8:  *   付属語テーブル中のノードへのポインタの初期化
    9:  */
   10: /*
   11:   This library is free software; you can redistribute it and/or
   12:   modify it under the terms of the GNU Lesser General Public
   13:   License as published by the Free Software Foundation; either
   14:   version 2 of the License, or (at your option) any later version.
   15: 
   16:   This library is distributed in the hope that it will be useful,
   17:   but WITHOUT ANY WARRANTY; without even the implied warranty of
   18:   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   19:   Lesser General Public License for more details.
   20: 
   21:   You should have received a copy of the GNU Lesser General Public
   22:   License along with this library; if not, write to the Free Software
   23:   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
   24:  */
   25: 
   26: #include <stdio.h>
   27: #include <string.h>
   28: #include <stdlib.h>
   29: 
   30: #include <anthy/alloc.h>
   31: #include <anthy/conf.h>
   32: #include <anthy/ruleparser.h>
   33: #include <anthy/xstr.h>
   34: #include <anthy/logger.h>
   35: #include <anthy/splitter.h>
   36: #include <anthy/anthy.h>
   37: #include <anthy/depgraph.h>
   38: #include <anthy/diclib.h>
   39: 
   40: #ifndef SRCDIR
   41: #define SRCDIR "."
   42: #endif
   43: 
/* When non-zero, parse_indep() prints each rule as it is registered. */
static int verbose;

/* Array of graph nodes; grown one entry at a time by get_node_id_by_name(). */
static struct dep_node* gNodes;
/* Node names; gNodeNames[i] is the name of gNodes[i]. */
static char** gNodeNames;
/* Number of entries in gNodes and gNodeNames. */
static int nrNodes;

/* Word connection rules */
static struct wordseq_rule *gRules;
/* Number of entries in gRules. */
static int nrRules;
   53: 
   54: static int 
   55: get_node_id_by_name(const char *name)
   56: {
   57:   int i;
   58:   /* 登録済みのものから探す */
   59:   for (i = 0; i < nrNodes; i++) {
   60:     if (!strcmp(name,gNodeNames[i])) {
   61:       return i;
   62:     }
   63:   }
   64:   /* なかったので作る */
   65:   gNodes = realloc(gNodes, sizeof(struct dep_node)*(nrNodes+1));
   66:   gNodeNames = realloc(gNodeNames, sizeof(char*)*(nrNodes+1));
   67:   gNodes[nrNodes].nr_branch = 0;
   68:   gNodes[nrNodes].branch = 0;
   69:   gNodeNames[nrNodes] = strdup(name);
   70:   nrNodes++;
   71:   return nrNodes-1;
   72: }
   73: 
   74: 
   75: /* 遷移条件からbranchを捜し出す */
   76: static struct dep_branch *
   77: find_branch(struct dep_node *node, xstr **strs, int nr_strs)
   78: {
   79:   struct dep_branch *db;
   80:   int i, j;
   81:   /* 同じ遷移条件のブランチを探す */
   82:   for (i = 0; i < node->nr_branch; i++) {
   83:     db = &node->branch[i];
   84:     if (nr_strs != db->nr_strs) {
   85:       continue ;
   86:     }
   87:     for (j = 0; j < nr_strs; j++) {
   88:       if (anthy_xstrcmp(db->str[j], strs[j])) {
   89:         goto fail;
   90:       }
   91:     }
   92:     /**/
   93:     return db;
   94:   fail:;
   95:   }
   96:   /* 新しいブランチを確保する */
   97:   node->branch = realloc(node->branch,
   98:                          sizeof(struct dep_branch)*(node->nr_branch+1));
   99:   db = &node->branch[node->nr_branch];
  100:   node->nr_branch++;
  101:   db->str = malloc(sizeof(xstr*)*nr_strs);
  102:   for (i = 0; i < nr_strs; i++) {
  103:     db->str[i] = strs[i];
  104:   }
  105:   db->nr_strs = nr_strs;
  106:   db->nr_transitions = 0;
  107:   db->transition = 0;
  108:   return db;
  109: }
  110: 
  111: /*
  112:  * 遷移をparseする
  113:  *  doc/SPLITTER参照
  114:  */
  115: static void
  116: parse_transition(char *token, struct dep_transition *tr)
  117: {
  118:   int ct = CT_NONE;
  119:   int pos = POS_NONE;
  120:   enum dep_class dc = DEP_NONE;
  121:   char *str = token;
  122:   tr->head_pos = POS_NONE;
  123:   tr->weak = 0;
  124:   /* 遷移の属性を解析*/
  125:   while (*token != '@') {
  126:     switch(*token){
  127:     case ':':
  128:     case '.':
  129:       tr->weak = 1;
  130:       break;
  131:     case 'C':
  132:       /* 活用形 */
  133:       switch (token[1]) {
  134:       case 'z': ct = CT_MIZEN; break;
  135:       case 'y': ct = CT_RENYOU; break;
  136:       case 's': ct = CT_SYUSI; break;
  137:       case 't': ct = CT_RENTAI; break;
  138:       case 'k': ct = CT_KATEI; break;
  139:       case 'm': ct = CT_MEIREI; break;
  140:       case 'g': ct = CT_HEAD; break;
  141:       }
  142:       token ++;
  143:       break;
  144:     case 'H':
  145:       /* 自立語部の品詞 */
  146:       switch (token[1]) {
  147:       case 'n': tr->head_pos = POS_NOUN; break;
  148:       case 'v': tr->head_pos = POS_V; break;
  149:       case 'j': tr->head_pos = POS_AJV; break;
  150:       }
  151:       token ++;
  152:       break;
  153:     case 'S':
  154:       /* 文節の属性 */
  155:       switch (token[1]) {
  156:         /*      case 'n': sc = DEP_NO; break;*/
  157:       case 'f': dc = DEP_FUZOKUGO; break;
  158:       case 'k': dc = DEP_KAKUJOSHI; break;
  159:       case 'y': dc = DEP_RENYOU; break;
  160:       case 't': dc = DEP_RENTAI; break;
  161:       case 'e': dc = DEP_END; break;
  162:       case 'r': dc = DEP_RAW; break;
  163:       default: printf("unknown (S%c)\n", token[1]);
  164:       }
  165:       token ++;
  166:       break;
  167:     default:
  168:       printf("Unknown (%c) %s\n", *token, str);
  169:       break;
  170:     }
  171:     token ++;
  172:   }
  173:   /* @から後はノードの名前 */
  174:   tr->next_node = get_node_id_by_name(token);
  175:   /**/
  176:   tr->pos = pos;
  177:   tr->ct = ct;
  178:   tr->dc = dc;
  179: }
  180: 
  181: /*
  182:  * ノード名 遷移条件+ 遷移先+
  183:  */
  184: static void
  185: parse_dep(char **tokens, int nr)
  186: {
  187:   int id, row = 0;
  188:   struct dep_branch *db;
  189:   struct dep_node *dn;
  190:   int nr_strs;
  191:   xstr **strs = alloca(sizeof(xstr*) * nr);
  192: 
  193:   /* ノードとそのidを確保 */
  194:   id = get_node_id_by_name(tokens[row]);
  195:   dn = &gNodes[id];
  196:   row ++;
  197: 
  198:   nr_strs = 0;
  199: 
  200:   /* 遷移条件の付属語の配列を作る */
  201:   for (; row < nr && tokens[row][0] == '\"'; row++) {
  202:     char *s;
  203:     s = strdup(&tokens[row][1]);
  204:     s[strlen(s)-1] =0;
  205:     strs[nr_strs] = anthy_cstr_to_xstr(s, ANTHY_EUC_JP_ENCODING);
  206:     nr_strs ++;
  207:     free(s);
  208:   }
  209: 
  210:   /* 遷移条件がない時は警告を出して、空の遷移条件を追加する */
  211:   if (nr_strs == 0) {
  212:     char *s;
  213:     anthy_log(0, "node %s has a branch without any transition condition.\n",
  214:               tokens[0]);
  215:     s = strdup("");
  216:     strs[0] = anthy_cstr_to_xstr(s, ANTHY_EUC_JP_ENCODING);
  217:     nr_strs = 1;
  218:     free(s);
  219:   }
  220: 
  221:   /* ブランチに遷移先のノードを追加する */
  222:   db = find_branch(dn, strs, nr_strs);
  223:   for ( ; row < nr; row++){
  224:     struct dep_transition *tr;
  225:     db->transition = realloc(db->transition,
  226:                              sizeof(struct dep_transition)*
  227:                              (db->nr_transitions+1));
  228:     tr = &db->transition[db->nr_transitions];
  229:     parse_transition(tokens[row], tr);
  230:     db->nr_transitions ++;
  231:   }
  232: }
  233: 
  234: /* 文法定義ファイル中に空のノードがあるかチェックする */
  235: static void
  236: check_nodes(void)
  237: {
  238:   int i;
  239:   for (i = 1; i < nrNodes; i++) {
  240:     if (gNodes[i].nr_branch == 0) {
  241:       anthy_log(0, "node %s has no branch.\n", gNodeNames);
  242:     }
  243:   }
  244: }
  245: 
  246: 
  247: static int
  248: init_depword_tab(void)
  249: {
  250:   const char *fn;
  251:   char **tokens;
  252:   int nr;
  253: 
  254:   /* id 0 を空ノードに割当てる */
  255:   get_node_id_by_name("@");
  256: 
  257:   /**/
  258:   fn = anthy_conf_get_str("DEPWORD");
  259:   if (!fn) {
  260:     anthy_log(0, "Dependent word dictionary is unspecified.\n");
  261:     return -1;
  262:   }
  263:   if (anthy_open_file(fn) == -1) {
  264:     anthy_log(0, "Failed to open dep word dict (%s).\n", fn);
  265:     return -1;
  266:   }
  267:   /* 一行ずつ付属語グラフを読む */
  268:   while (!anthy_read_line(&tokens, &nr)) {
  269:     parse_dep(tokens, nr);
  270:     anthy_free_line();
  271:   }
  272:   anthy_close_file();
  273:   check_nodes();
  274:   return 0;
  275: }
  276: 
  277: 
  278: static void
  279: parse_indep(char **tokens, int nr)
  280: {
  281:   if (nr < 2) {
  282:     printf("Syntex error in indepword defs"
  283:            " :%d.\n", anthy_get_line_number());
  284:     return ;
  285:   }
  286:   gRules = realloc(gRules, sizeof(struct wordseq_rule)*(nrRules+1));
  287: 
  288:   /* 行の先頭には品詞の名前が入っている */
  289:   gRules[nrRules].wt = anthy_init_wtype_by_name(tokens[0]);
  290: 
  291:   /* その次にはノード名が入っている */
  292:   gRules[nrRules].node_id = get_node_id_by_name(tokens[1]);
  293: 
  294:   if (verbose) {
  295:     printf("%d (%s)\n", nrRules, tokens[0]);
  296:   }
  297: 
  298:   nrRules ++;
  299: }
  300: 
  301: /** 自立語からの遷移表 */
  302: static int 
  303: init_indep_word_seq_tab(void)
  304: {
  305:   const char *fn;
  306:   char **tokens;
  307:   int nr;
  308: 
  309:   fn = anthy_conf_get_str("INDEPWORD");
  310:   if (!fn){
  311:     printf("independent word dict unspecified.\n");
  312:     return -1;
  313:   }
  314:   if (anthy_open_file(fn) == -1) {
  315:     printf("Failed to open indep word dict (%s).\n", fn);
  316:     return -1;
  317:   }
  318:   /* ファイルを一行ずつ読む */
  319:   while (!anthy_read_line(&tokens, &nr)) {
  320:     parse_indep(tokens, nr);
  321:     anthy_free_line();
  322:   }
  323:   anthy_close_file();
  324: 
  325:   return 0;
  326: }
  327: 
  328: /*  
  329:     ネットワークバイトオーダーで4byte書き出す
  330: */
  331: static void
  332: write_nl(FILE* fp, int i)
  333: {
  334:   i = anthy_dic_htonl(i);
  335:   fwrite(&i, sizeof(int), 1, fp);
  336: }
  337: 
  338: static void
  339: write_transition(FILE* fp, struct dep_transition* transition)
  340: {
  341:   write_nl(fp, transition->next_node); 
  342:   write_nl(fp, transition->pos); 
  343:   write_nl(fp, transition->ct); 
  344:   write_nl(fp, transition->dc); 
  345:   write_nl(fp, transition->head_pos); 
  346:   write_nl(fp, transition->weak); 
  347: }
  348: 
  349: static void
  350: write_xstr(FILE* fp, xstr* str)
  351: {
  352:   int i;
  353:   xchar c;
  354:   write_nl(fp, str->len);
  355: 
  356:   for (i = 0; i < str->len; i++) {
  357:     c = anthy_dic_htonl(str->str[i]);
  358:     fwrite(&c, sizeof(xchar), 1, fp);
  359:   }
  360: }
  361: 
  362: static void
  363: write_branch(FILE* fp, struct dep_branch* branch)
  364: {
  365:   int i;
  366: 
  367:   write_nl(fp, branch->nr_strs);
  368:   for (i = 0; i < branch->nr_strs; ++i) {
  369:     write_xstr(fp, branch->str[i]);
  370:   }
  371: 
  372:   write_nl(fp, branch->nr_transitions);
  373:   for (i = 0; i < branch->nr_transitions; ++i) {
  374:     write_transition(fp, &branch->transition[i]);
  375:   }
  376: }
  377: 
  378: static void
  379: write_node(FILE* fp, struct dep_node* node)
  380: {
  381:   int i;
  382:   write_nl(fp, node->nr_branch);
  383:   for (i = 0; i < node->nr_branch; ++i) {
  384:     write_branch(fp, &node->branch[i]);
  385:   }
  386: }
  387: 
  388: static void
  389: write_wtype(FILE *fp, wtype_t wt)
  390: {
  391:   fputc(anthy_wtype_get_pos(wt), fp);
  392:   fputc(anthy_wtype_get_cos(wt), fp);
  393:   fputc(anthy_wtype_get_scos(wt), fp);
  394:   fputc(anthy_wtype_get_cc(wt), fp);
  395:   fputc(anthy_wtype_get_ct(wt), fp);
  396:   fputc(anthy_wtype_get_wf(wt), fp);
  397:   fputc(0, fp);
  398:   fputc(0, fp);
  399: }
  400: 
  401: static void
  402: write_file(const char* file_name)
  403: {
  404:   int i;
  405:   FILE* fp = fopen(file_name, "w");
  406:   int* node_offset = malloc(sizeof(int) * nrNodes); /* gNodesのファイル上の位置 */
  407: 
  408:   /* 各ルール */
  409:   write_nl(fp, nrRules);
  410:   for (i = 0; i < nrRules; ++i) {
  411:     write_wtype(fp, gRules[i].wt);
  412:     write_nl(fp, gRules[i].node_id);
  413:   }
  414: 
  415:   write_nl(fp, nrNodes);
  416: 
  417:   for (i = 0; i < nrNodes; ++i) {
  418:     write_node(fp, &gNodes[i]);
  419:   }
  420: 
  421:   free(node_offset);
  422:   fclose(fp);
  423: }
  424: 
  425: int
  426: main(int argc, char* argv[])
  427: {
  428:   /* 付属語辞書を読み込んでファイルに書き出す */
  429:   anthy_conf_override("CONFFILE", "../anthy-conf");
  430:   anthy_conf_override("ANTHYDIR", SRCDIR "/../depgraph/");
  431: 
  432:   anthy_init_wtypes();
  433:   anthy_do_conf_init();
  434:   /* 付属語グラフ */
  435:   init_depword_tab();
  436:   /* 自立語からの遷移表 */
  437:   init_indep_word_seq_tab();
  438: 
  439:   write_file("anthy.dep");
  440: 
  441:   return 0;
  442: }
Syntax (Markdown)