(linenum→info "unix/slp.c:2238")

anthy/9100e/src-util/dic-tool.c

    1: /*
    2:  * 辞書操作用のユーティリティコマンド
    3:  *
    4:  * 辞書のライブラリ内部の形式と外部の形式の相互変換を行う
    5:  * 外部形式は
    6:  * *読み 頻度 単語
    7:  * *品詞の変数1 = 値1
    8:  * *品詞の変数2 = 値2
    9:  * *...
   10:  * *<空行>
   11:  * になる
   12:  */
   13: /*
   14:  * Funded by IPA未踏ソフトウェア創造事業 2001 9/22
   15:  *
   16:  * Copyright (C) 2000-2007 TABATA Yusuke
   17:  */
   18: /*
   19:   This library is free software; you can redistribute it and/or
   20:   modify it under the terms of the GNU Lesser General Public
   21:   License as published by the Free Software Foundation; either
   22:   version 2 of the License, or (at your option) any later version.
   23: 
   24:   This library is distributed in the hope that it will be useful,
   25:   but WITHOUT ANY WARRANTY; without even the implied warranty of
   26:   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   27:   Lesser General Public License for more details.
   28: 
   29:   You should have received a copy of the GNU Lesser General Public
   30:   License along with this library; if not, write to the Free Software
   31:   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
   32:  */
   33: #include <stdio.h>
   34: #include <stdlib.h>
   35: #include <string.h>
   36: 
   37: #include <anthy/anthy.h>
   38: #include <anthy/dicutil.h>
   39: /**/
   40: #include <anthy/xstr.h>
   41: #include "config.h"
   42: 
/* Values stored in `command`: the operation selected on the command line. */
#define UNSPEC 0
#define DUMP_DIC 1
#define LOAD_DIC 2
#define APPEND_DIC 3

/* File name of the word-type table opened by open_typetab(). */
#define TYPETAB "typetab"
/* File name of the usage/comment text opened by open_usage_file(). */
#define USAGE_TEXT "dic-tool-usage.txt"

/*
 * Option summary printed by print_usage().
 * Note: parse_args() also accepts --eucjp, which is not listed here.
 */
#define USAGE \
 "Anthy-dic-util [options]\n"\
 " --help: Show this usage text\n"\
 " --version: Show version\n"\
 " --dump: Dump dictionary\n"\
 " --load: Load dictionary\n"\
 " --append: Append dictionary\n"\
 " --utf8: Use utf8 encoding\n"\
 " --personality=NAME: use NAME as a name of personality\n"
   60: 
   61: 
/* Operation to perform; one of UNSPEC, DUMP_DIC, LOAD_DIC, APPEND_DIC. */
static int command = UNSPEC;
/* Text encoding handed to the library and used when reading the type table. */
static int encoding = ANTHY_EUC_JP_ENCODING;
/* Input stream for load/append; stdin unless a file name was given. */
static FILE *fp_in;
/* Input file name taken from the first non-option argument, or NULL. */
static char *fn;
/* Value of --personality=NAME; set by parse_args() but not read elsewhere
 * in this part of the file. */
static const char *personality = "";
   67: 
/* A (variable name, value) pair, kept in a singly linked list. */
struct var{
  struct var *next;   /* next pair in the list, or NULL at the end */
  char *var_name;     /* heap-allocated variable name */
  char *val;          /* heap-allocated value */
};
   74: 
/*
 * Table used to obtain a word-type name from word-type parameters.
 * trans_tab_list is the list head; only its `next` member is used.
 */
struct trans_tab {
  struct trans_tab *next;
  char *type_name; /* internal name of the type, e.g. T35 */
  struct var var_list; /* parameters that determine the type (list head) */
}trans_tab_list;
   81: 
   82: static void
   83: print_usage(void)
   84: {
   85:   printf(USAGE);
   86:   exit(0);
   87: }
   88: 
   89: static FILE *
   90: open_typetab(void)
   91: {
   92:   FILE *fp;
   93:   char *fn;
   94:   fp = fopen(TYPETAB, "r");
   95:   if (fp) {
   96:     return fp;
   97:   }
   98:   fn = strdup(anthy_dic_util_get_anthydir());
   99:   fn = realloc(fn, strlen(fn) + strlen(TYPETAB) + 4);
  100:   strcat(fn, "/");
  101:   strcat(fn, TYPETAB);
  102:   fp = fopen(fn, "r");
  103:   return fp;
  104: }
  105: 
  106: static FILE *
  107: open_usage_file(void)
  108: {
  109:   FILE *fp;
  110:   /* カレントディレクトリにある場合は、それを使用する */
  111:   fp = fopen(USAGE_TEXT, "r");
  112:   if (!fp) {
  113:     /* インストールされたものを使用 */
  114:     char *fn;
  115:     fn = strdup(anthy_dic_util_get_anthydir());
  116:     fn = realloc(fn, strlen(fn) + strlen(USAGE_TEXT) + 10);
  117:     strcat(fn, "/" USAGE_TEXT);
  118:     fp = fopen(fn, "r");
  119:   }
  120:   return fp;
  121: }
  122: 
  123: static void
  124: print_usage_text(void)
  125: {
  126:   char buf[256];
  127:   FILE *fp = open_usage_file();
  128:   if (!fp) {
  129:     printf("# Anthy-dic-tool\n#\n");
  130:     return ;
  131:   }
  132:   fprintf(stdout, "#" PACKAGE " " VERSION "\n");
  133:   if (encoding == ANTHY_UTF8_ENCODING) {
  134:   } else {
  135:   }
  136:   /* そのままファイルの内容を出力 */
  137:   while (fgets(buf, 256, fp)) {
  138:     if (encoding == ANTHY_UTF8_ENCODING) {
  139:       char *s;
  140:       s = anthy_conv_euc_to_utf8(buf);
  141:       printf("%s", s);
  142:       free(s);
  143:     } else {
  144:       printf("%s", buf);
  145:     }
  146:   }
  147:   fclose(fp);
  148: }
  149: 
  150: static char *
  151: read_line(char *buf, int len, FILE *fp)
  152: {
  153:   while (fgets(buf, len, fp)) {
  154:     if (buf[0] != '#') {
  155:       /* 改行を削除する */
  156:       int l = strlen(buf);
  157:       if (l > 0 && buf[l-1] == '\n') {
  158:         buf[l-1] = 0;
  159:       }
  160:       if (l > 1 && buf[l-2] == '\r') {
  161:         buf[l-1] = 0;
  162:       }
  163:       /**/
  164:       return buf;
  165:     }
  166:   }
  167:   return NULL;
  168: }
  169: 
  170: static int
  171: read_typetab_var(struct var *head, FILE *fp, int table)
  172: {
  173:   char buf[256];
  174:   char var[256], eq[256], val[256];
  175:   struct var *v;
  176:   if (!read_line(buf, 256, fp)) {
  177:     return -1;
  178:   }
  179:   if (sscanf(buf, "%s %s %s", var, eq, val) != 3) {
  180:     return -1;
  181:   }
  182: 
  183:   v = malloc(sizeof(struct var));
  184:   if (encoding == ANTHY_UTF8_ENCODING && table) {
  185:     /* UTF-8 */
  186:     v->var_name = anthy_conv_euc_to_utf8(var);
  187:     v->val = anthy_conv_euc_to_utf8(val);
  188:   } else {
  189:     /* do not change */
  190:     v->var_name = strdup(var);
  191:     v->val = strdup(val);
  192:   }
  193: 
  194:   /* リストにつなぐ */
  195:   v->next = head->next;
  196:   head->next = v;
  197: 
  198:   return 0;
  199: }
  200: 
  201: static int
  202: read_typetab_entry(FILE *fp)
  203: {
  204:   char buf[256], type_name[257];
  205:   char *res;
  206:   struct trans_tab *t;
  207:   /* 一行目の品詞名を読む */
  208:   do {
  209:     res = read_line(buf, 256, fp);
  210:     if (!res) {
  211:       return -1;
  212:     }
  213:   } while (res[0] == '#' || res[0] == 0);
  214:   t = malloc(sizeof(struct trans_tab));
  215:   sprintf(type_name, "#%s", buf);
  216:   t->type_name = strdup(type_name);
  217:   t->var_list.next = 0;
  218:   /* パラメータを読む */
  219:   while(!read_typetab_var(&t->var_list, fp, 1));
  220:   /* リストにつなぐ */
  221:   t->next = trans_tab_list.next;
  222:   trans_tab_list.next = t;
  223:   return 0;
  224: }
  225: 
  226: static void
  227: read_typetab(void)
  228: {
  229:   FILE *fp = open_typetab();
  230:   if (!fp) {
  231:     printf("Failed to open type table.\n");
  232:     exit(1);
  233:   }
  234:   while (!read_typetab_entry(fp));
  235: }
  236: 
  237: static struct trans_tab *
  238: find_trans_tab_by_name(char *name)
  239: {
  240:   struct trans_tab *t;
  241:   for (t = trans_tab_list.next; t; t = t->next) {
  242:     if (!strcmp(t->type_name, name)) {
  243:       return t;
  244:     }
  245:   }
  246:   return NULL;
  247: }
  248: 
  249: static void
  250: print_word_type(struct trans_tab *t)
  251: {
  252:   struct var *v;
  253:   for (v = t->var_list.next; v; v = v->next) {
  254:     printf("%s\t=\t%s\n", v->var_name, v->val);
  255:   }
  256: }
  257: 
  258: static void
  259: dump_dic(void)
  260: {
  261:   print_usage_text();
  262:   if (anthy_priv_dic_select_first_entry() == -1) {
  263:     printf("# Failed to read private dictionary\n"
  264:            "# There are no words or error occured?\n"
  265:            "#\n");
  266:     return ;
  267:   }
  268:   do {
  269:     char idx[100], wt[100], w[100];
  270:     int freq;
  271:     if (anthy_priv_dic_get_index(idx, 100) &&
  272:         anthy_priv_dic_get_wtype(wt, 100) &&
  273:         anthy_priv_dic_get_word(w, 100)) {
  274:       struct trans_tab *t;
  275:       freq = anthy_priv_dic_get_freq();
  276:       t = find_trans_tab_by_name(wt);
  277:       if (t) {
  278:         printf("%s %d %s\n", idx, freq, w);
  279:         print_word_type(t);
  280:         printf("\n");
  281:       } else {
  282:         printf("# Failed to determine word type of %s(%s).\n", w, wt);
  283:       }
  284:     }
  285:   } while (anthy_priv_dic_select_next_entry() == 0);
  286: }
  287: 
  288: static void
  289: open_input_file(void)
  290: {
  291:   if (!fn) {
  292:     fp_in = stdin;
  293:   } else {
  294:     fp_in = fopen(fn, "r");
  295:     if (!fp_in) {
  296:       exit(1);
  297:     }
  298:   }
  299: }
  300: 
  301: /* vが sの中にあるか */
  302: static int
  303: match_var(struct var *v, struct var *s)
  304: {
  305:   struct var *i;
  306:   for (i = s->next; i; i = i->next) {
  307:     if (!strcmp(v->var_name, i->var_name) &&
  308:         !strcmp(v->val, i->val)) {
  309:       return 1;
  310:     }
  311:   }
  312:   return 0;
  313: }
  314: 
  315: /* v1がv2の部分集合かどうか */
  316: static int
  317: var_list_subset_p(struct var *v1, struct var *v2)
  318: {
  319:   struct var *v;
  320:   for (v = v1->next; v; v = v->next) {
  321:     if (!match_var(v, v2)) {
  322:       return 0;
  323:     }
  324:   }
  325:   return 1;
  326: }
  327: 
  328: static char *
  329: find_wt(void)
  330: {
  331:   struct var v;
  332:   struct trans_tab *t;
  333:   v.next = 0;
  334:   while(!read_typetab_var(&v, fp_in, 0));
  335:   for (t = trans_tab_list.next; t; t = t->next) {
  336:     if (var_list_subset_p(&t->var_list, &v) &&
  337:         var_list_subset_p(&v, &t->var_list)) {
  338:       return t->type_name;
  339:     }
  340:   }
  341:   return NULL;
  342: }
  343: 
  344: static int
  345: find_head(char *yomi, char *freq, char *w)
  346: {
  347:   char buf[256];
  348:   do {
  349:     if (!read_line(buf, 256, fp_in)) {
  350:       return -1;
  351:     }
  352:   } while (sscanf(buf, "%s %s %[^\n]",yomi, freq, w) != 3);
  353:   return 0;
  354: }
  355: 
  356: static void
  357: load_dic(void)
  358: {
  359:   char yomi[256], freq[256], w[256];
  360:   while (!find_head(yomi, freq, w)) {
  361:     char *wt = find_wt();
  362:     if (wt) {
  363:       int ret;
  364:       ret = anthy_priv_dic_add_entry(yomi, w, wt, atoi(freq));
  365:       if (ret == -1) {
  366:         printf("Failed to register %s\n", yomi);
  367:       }else {
  368:         printf("Word %s is registered as %s\n", yomi, wt);
  369:       }
  370:     } else {
  371:       printf("Failed to find the type of %s.\n", yomi);
  372:     }
  373:   }
  374: }
  375: 
  376: static void
  377: print_version(void)
  378: {
  379:   printf("Anthy-dic-util "VERSION".\n");
  380:   exit(0);
  381: }
  382: 
  383: static void
  384: parse_args(int argc, char **argv)
  385: {
  386:   int i;
  387:   for (i = 1 ; i < argc ; i++) {
  388:     if (!strncmp(argv[i], "--", 2)) {
  389:       char *opt = &argv[i][2];
  390:       if (!strcmp(opt, "help")) {
  391:         print_usage();
  392:       } else if (!strcmp(opt, "version")){
  393:         print_version();
  394:       } else if (!strcmp(opt, "dump")) {
  395:         command = DUMP_DIC;
  396:       } else if (!strcmp(opt,"append") ){
  397:         command = APPEND_DIC;
  398:       } else if (!strncmp(opt, "personality=", 12)) {
  399:         personality = &opt[12];
  400:       } else if (!strcmp(opt, "utf8")) {
  401:         encoding = ANTHY_UTF8_ENCODING;
  402:       } else if (!strcmp(opt, "eucjp")) {
  403:         encoding = ANTHY_EUC_JP_ENCODING;
  404:       } else if (!strcmp(opt, "load")) {
  405:         command = LOAD_DIC;
  406:       }
  407:     }else{
  408:       fn = argv[i];
  409:     }
  410:   }
  411: }
  412: 
/*
 * Prepare the dictionary utility library for use: initialise it, pass it
 * the selected encoding, then load the word-type table.
 * read_typetab() exits the process if the table cannot be opened.
 */
static void
init_lib(void)
{
  anthy_dic_util_init();
  anthy_dic_util_set_encoding(encoding);
  read_typetab();
}
  420: 
  421: int
  422: main(int argc,char **argv)
  423: {
  424:   fp_in = stdin;
  425:   parse_args(argc, argv);
  426: 
  427:   switch (command) {
  428:   case DUMP_DIC:
  429:     init_lib();
  430:     dump_dic();
  431:     break;
  432:   case LOAD_DIC:
  433:     init_lib();
  434:     anthy_priv_dic_delete();
  435:     open_input_file();
  436:     load_dic();
  437:     break;
  438:   case APPEND_DIC:
  439:     init_lib();
  440:     open_input_file();
  441:     load_dic();
  442:     break;
  443:   case UNSPEC:
  444:   default:
  445:     print_usage();
  446:   }
  447:   return 0;
  448: }
Syntax (Markdown)