(linenum→info "unix/slp.c:2238")

anthy/9100e/src-worddic/dic_util.c

    1: /*
    2:  * 個人辞書管理用の関数群
    3:  *
    4:  * 互換性の都合で
    5:  *  utf8の辞書はtextdict
    6:  *  eucjpの辞書はtexttrie
    7:  *  およびrecordを使ってて混乱しまくり
    8:  * textdictへ移行する
    9:  *
   10:  * 開発予定
   11:  *
   12:  *  新規登録はtextdictに対して行うようにする <- todo
   13:  *  texttrieの単語は移行するようにする
   14:  *  record関係は消す
   15:  *
   16:  *
   17:  * Funded by IPA未踏ソフトウェア創造事業 2001 10/24
   18:  *
   19:  * Copyright (C) 2001-2007 TABATA Yusuke
   20:  *
   21:  */
   22: /*
   23:   This library is free software; you can redistribute it and/or
   24:   modify it under the terms of the GNU Lesser General Public
   25:   License as published by the Free Software Foundation; either
   26:   version 2 of the License, or (at your option) any later version.
   27: 
   28:   This library is distributed in the hope that it will be useful,
   29:   but WITHOUT ANY WARRANTY; without even the implied warranty of
   30:   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   31:   Lesser General Public License for more details.
   32: 
   33:   You should have received a copy of the GNU Lesser General Public
   34:   License along with this library; if not, write to the Free Software
   35:   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
   36:  */
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <strings.h>

#include <anthy/anthy.h>
#include <anthy/conf.h>
#include <anthy/dic.h>
#include <anthy/texttrie.h>
#include <anthy/textdict.h>
#include <anthy/dicutil.h>

#include "dic_main.h"
#include "dic_personality.h"
   50: 
   51: /*
   52:  * 個人辞書はtexttrie中に格納されるとき
   53:  * 「  見出し 数字」 -> 「#品詞*頻度 単語」という形式をとる
   54:  * (UTF8の場合は「 p見出し 数字」 -> 「#品詞*頻度 単語」)
   55:  * 最初の2文字の空白は単語情報のセクションであることを意味し、
   56:  * 数字の部分は同音語を区別するために用いられる。
   57:  *
   58:  */
   59: 
/* Maximum key length in bytes: 32 UTF-8 characters x 3 bytes each */
#define MAX_KEY_LEN 96

/* Set to 1 by anthy_dic_util_init() on success, cleared by
 * anthy_dic_util_quit() */
static int gIsInit;
/* Encoding used by this API; changed only through
 * anthy_dic_util_set_encoding() and reset to EUC-JP by
 * anthy_dic_util_init() */
static int dic_util_encoding;

/* Private dictionary handles (texttrie and textdict); defined outside
 * this file */
extern struct text_trie *anthy_private_tt_dic;
extern struct textdict *anthy_private_text_dic;
/* Iteration state: the entry (reading) that is currently selected */
static struct iterate_contex {
  /* non-zero while the selected entry is a texttrie key,
   * zero once iteration has moved on to the textdict */
  int in_tt;
  /* texttrie: key of the selected entry, or the bare two-character
   * prefix when iteration is about to start */
  char key_buf[MAX_KEY_LEN+32];
  /* textdict: offset handed to the next anthy_textdict_scan() call */
  int dicfile_offset;
  /* textdict: strdup()ed copies of the selected entry's index and line,
   * managed by set_current_line(); NULL when nothing is cached */
  char *current_index;
  char *current_line;
} word_iterator;
   79: /**/
/* Argument block passed to the textdict scan callbacks
 * (find_cb and order_cb) */
struct scan_context {
  /* reading, word and part-of-speech name being looked for */
  const char *yomi;
  const char *word;
  const char *wt_name;
  /* advanced to next_offset for every line the callback skips, so it
   * ends up as the offset of the line the scan stopped on */
  int offset;
  /* set to 1 by the callback when it stops on a line */
  int found_word;
};
   87: 
   88: static void
   89: set_current_line(const char *index, const char *line)
   90: {
   91:   if (word_iterator.current_line) {
   92:     free(word_iterator.current_line);
   93:     word_iterator.current_line = NULL;
   94:   }
   95:   if (line) {
   96:     word_iterator.current_line = strdup(line);
   97:   }
   98:   if (word_iterator.current_index) {
   99:     free(word_iterator.current_index);
  100:     word_iterator.current_index = NULL;
  101:   }
  102:   if (index) {
  103:     word_iterator.current_index = strdup(index);
  104:   }
  105: }
  106: 
  107: /** 個人辞書ライブラリを初期化する */
  108: void
  109: anthy_dic_util_init(void)
  110: {
  111:   if (gIsInit) {
  112:     return ;
  113:   }
  114:   if (anthy_init_dic() == -1) {
  115:     return ;
  116:   }
  117:   anthy_dic_set_personality("default");
  118:   gIsInit = 1;
  119:   dic_util_encoding = ANTHY_EUC_JP_ENCODING;
  120:   /**/
  121:   word_iterator.key_buf[0] = 0;
  122:   word_iterator.in_tt = 1;
  123: }
  124: 
  125: /** 辞書ライブラリを解放する */
  126: void
  127: anthy_dic_util_quit(void)
  128: {
  129:   if (gIsInit) {
  130:     anthy_quit_dic();
  131:   }
  132:   set_current_line(NULL, NULL);
  133:   gIsInit = 0;
  134: }
  135: 
  136: /** 辞書ユーティリティAPIのエンコーディングを設定する */
  137: int
  138: anthy_dic_util_set_encoding(int enc)
  139: {
  140:   if (enc == ANTHY_UTF8_ENCODING ||
  141:       enc == ANTHY_EUC_JP_ENCODING) {
  142:     dic_util_encoding = enc;
  143:   }
  144:   return dic_util_encoding;
  145: }
  146: 
/** Forward id unchanged to anthy_dic_set_personality(). */
void
anthy_dic_util_set_personality(const char *id)
{
  anthy_dic_set_personality(id);
}
  152: 
  153: static char *
  154: find_next_key(const char *prefix)
  155: {
  156:   char *v;
  157:   v = anthy_trie_find_next_key(anthy_private_tt_dic,
  158:                                word_iterator.key_buf, MAX_KEY_LEN+32);
  159: 
  160:   if (v && v[0] == prefix[0] && v[1] == prefix[1]) {
  161:     /* 次のkeyも指定されたprefixを持っている */
  162:     return v;
  163:   }
  164:   /**/
  165:   sprintf(word_iterator.key_buf, prefix);
  166:   return NULL;
  167: }
  168: 
  169: static void
  170: delete_prefix(const char *prefix)
  171: {
  172:   sprintf(word_iterator.key_buf, prefix);
  173:   anthy_priv_dic_lock();
  174:   /* word_iterator.key_bufがprefixの文字列であれば、find_next_key()は
  175:      最初の単語を返す */
  176:   while (find_next_key(prefix)) {
  177:     anthy_trie_delete(anthy_private_tt_dic, word_iterator.key_buf);
  178:     sprintf(word_iterator.key_buf, prefix);
  179:   }
  180:   anthy_priv_dic_unlock();
  181: }
  182: 
  183: static const char *
  184: encoding_prefix(int encoding)
  185: {
  186:   if (encoding == ANTHY_UTF8_ENCODING) {
  187:     return " p";
  188:   }
  189:   /* EUC-JP */
  190:   return "  ";
  191: }
  192: 
  193: /** (API) 個人辞書を全部消す */
  194: void
  195: anthy_priv_dic_delete(void)
  196: {
  197:   delete_prefix(encoding_prefix(ANTHY_EUC_JP_ENCODING));
  198:   /**/
  199:   while (!anthy_textdict_delete_line(anthy_private_text_dic, 0)) {
  200:     /**/
  201:   }
  202: }
  203: 
  204: static int
  205: scan_one_word_cb(void *p, int next_offset, const char *key, const char *n)
  206: {
  207:   (void)p;
  208:   set_current_line(key, n);
  209:   word_iterator.dicfile_offset = next_offset;
  210:   return -1;
  211: }
  212: 
  213: static int
  214: select_first_entry_in_textdict(void)
  215: {
  216:   word_iterator.dicfile_offset = 0;
  217:   set_current_line(NULL, NULL);
  218:   anthy_textdict_scan(anthy_private_text_dic,
  219:                       word_iterator.dicfile_offset, NULL,
  220:                       scan_one_word_cb);
  221:   if (word_iterator.current_line) {
  222:     word_iterator.in_tt = 0;
  223:     return 0;
  224:   }
  225:   /* 単語が無い */
  226:   return ANTHY_DIC_UTIL_ERROR;
  227: }
  228: 
  229: /** (API) 最初の単語を選択する */
  230: int
  231: anthy_priv_dic_select_first_entry(void)
  232: {
  233:   if (dic_util_encoding == ANTHY_UTF8_ENCODING) {
  234:     return select_first_entry_in_textdict();
  235:   }
  236:   if (anthy_private_tt_dic) {
  237:     sprintf(word_iterator.key_buf, encoding_prefix(dic_util_encoding));
  238:     /* prefixの次のエントリが最初のエントリ */
  239:     if (find_next_key(encoding_prefix(dic_util_encoding))) {
  240:       word_iterator.in_tt = 1;
  241:       return 0;
  242:     }
  243:   }
  244:   /* 単語が無いのでtextdictに移動を試みる */
  245:   return select_first_entry_in_textdict();
  246: }
  247: 
  248: /** (API) 現在選択されている単語の次の単語を選択する */
  249: int
  250: anthy_priv_dic_select_next_entry(void)
  251: {
  252:   if (!word_iterator.in_tt) {
  253:     set_current_line(NULL, NULL);
  254:     anthy_textdict_scan(anthy_private_text_dic, word_iterator.dicfile_offset,
  255:                         NULL,
  256:                         scan_one_word_cb);
  257:     if (word_iterator.current_line) {
  258:       return 0;
  259:     }
  260:     return ANTHY_DIC_UTIL_ERROR;
  261:   }
  262:   if (find_next_key(encoding_prefix(dic_util_encoding))) {
  263:     return 0;
  264:   }
  265:   /* 単語が無いのでtextdictに移動を試みる */
  266:   return select_first_entry_in_textdict();
  267: }
  268: 
/** Not implemented: the index argument is ignored and 0 is always
 * returned. */
int
anthy_priv_dic_select_entry(const char *index)
{
  (void)index;
  return 0;
}
  276: 
  277: /** 現在選択されている単語の読みをを取得する */
  278: char *
  279: anthy_priv_dic_get_index(char *buf, int len)
  280: {
  281:   int i;
  282:   char *src_buf;
  283:   if (word_iterator.in_tt) {
  284:     src_buf = &word_iterator.key_buf[2];
  285:   } else {
  286:     src_buf = word_iterator.current_index;
  287:   }
  288:   if (!word_iterator.in_tt && dic_util_encoding == ANTHY_EUC_JP_ENCODING) {
  289:     /**/
  290:     src_buf = anthy_conv_utf8_to_euc(src_buf);
  291:   } else {
  292:     src_buf = strdup(src_buf);
  293:   }
  294:   /* 最初の空白か\0までをコピーする */
  295:   for (i = 0; src_buf[i] && src_buf[i] != ' '; i++) {
  296:     if (i >= len - 1) {
  297:       free(src_buf);
  298:       return NULL;
  299:     }
  300:     buf[i] = src_buf[i];
  301:   }
  302:   buf[i] = 0;
  303:   free(src_buf);
  304:   return buf;
  305: }
  306: 
  307: /** 現在選択されている単語の頻度を取得する */
  308: int
  309: anthy_priv_dic_get_freq(void)
  310: {
  311:   struct word_line res;
  312:   char *v;
  313:   if (word_iterator.in_tt) {
  314:     v = anthy_trie_find(anthy_private_tt_dic, word_iterator.key_buf);
  315:     anthy_parse_word_line(v, &res);
  316:     free(v);
  317:   } else {
  318:     anthy_parse_word_line(word_iterator.current_line, &res);
  319:   }
  320:   return res.freq;
  321: }
  322: 
  323: /** 現在選択されている単語の品詞を取得する */
  324: char *
  325: anthy_priv_dic_get_wtype(char *buf, int len)
  326: {
  327:   struct word_line res;
  328:   char *v;
  329:   if (word_iterator.in_tt) {
  330:     v = anthy_trie_find(anthy_private_tt_dic, word_iterator.key_buf);
  331:     anthy_parse_word_line(v, &res);
  332:     free(v);
  333:   } else {
  334:     anthy_parse_word_line(word_iterator.current_line, &res);
  335:   }
  336:   if (len - 1 < (int)strlen(res.wt)) {
  337:     return NULL;
  338:   }
  339:   sprintf(buf, "%s", res.wt);
  340:   return buf;
  341: }
  342: 
  343: /** 現在選択されている単語を取得する */
  344: char *
  345: anthy_priv_dic_get_word(char *buf, int len)
  346: {
  347:   char *v;
  348:   char *s;
  349:   if (word_iterator.in_tt) {
  350:     v = anthy_trie_find(anthy_private_tt_dic, word_iterator.key_buf);
  351:   } else {
  352:     v = word_iterator.current_line;
  353:   }
  354:   if (!v) {
  355:     return NULL;
  356:   }
  357:   /* 品詞の後ろにある単語を取り出す */
  358:   s = strchr(v, ' ');
  359:   s++;
  360:   if (!word_iterator.in_tt && dic_util_encoding == ANTHY_EUC_JP_ENCODING) {
  361:     s = anthy_conv_utf8_to_euc(s);
  362:     snprintf(buf, len, "%s", s);
  363:     free(s);
  364:   } else {
  365:     snprintf(buf, len, "%s", s);
  366:   }
  367:   if (word_iterator.in_tt) {
  368:     free(v);
  369:   }
  370:   return buf;
  371: }
  372: 
  373: static int
  374: find_cb(void *p, int next_offset, const char *key, const char *n)
  375: {
  376:   struct scan_context *sc = p;
  377:   struct word_line res;
  378:   if (strcmp(key, sc->yomi)) {
  379:     sc->offset = next_offset;
  380:     return 0;
  381:   }
  382:   anthy_parse_word_line(n, &res);
  383:   if (!strcmp(res.wt, sc->wt_name) &&
  384:       !strcmp(res.word, sc->word)) {
  385:     sc->found_word = 1;
  386:     return -1;
  387:   }
  388:   sc->offset = next_offset;
  389:   return 0;
  390: }
  391: 
  392: static int
  393: order_cb(void *p, int next_offset, const char *key, const char *n)
  394: {
  395:   struct scan_context *sc = p;
  396:   (void)n;
  397:   if (strcmp(key, sc->yomi) >= 0) {
  398:     sc->found_word = 1;
  399:     return -1;
  400:   }
  401:   sc->offset = next_offset;
  402:   return 0;
  403: }
  404: 
  405: /* 引数はutf8 */
  406: static int
  407: do_add_word_to_textdict(struct textdict *td, int offset,
  408:                         const char *yomi, const char *word,
  409:                         const char *wt_name, int freq)
  410: {
  411:   char *buf = malloc(strlen(yomi) + strlen(word) + strlen(wt_name) + 20);
  412:   int rv;
  413:   if (!buf) {
  414:     return -1;
  415:   }
  416:   sprintf(buf, "%s %s*%d %s\n", yomi, wt_name, freq, word);
  417:   rv = anthy_textdict_insert_line(td, offset, buf);
  418:   free(buf);
  419:   return rv;
  420: }
  421: 
  422: static int
  423: dup_word_check(const char *v, const char *word, const char *wt)
  424: {
  425:   struct word_line res;
  426: 
  427:   if (anthy_parse_word_line(v, &res)) {
  428:     return 0;
  429:   }
  430: 
  431:   /* 読みと単語を比較する */
  432:   if (!strcmp(res.wt, wt) &&
  433:       !strcmp(res.word, word)) {
  434:     return 1;
  435:   }
  436:   return 0;
  437: }
  438: 
  439: static int
  440: find_same_word(char *idx_buf, const char *yomi,
  441:                const char *word, const char *wt_name, int yomi_len)
  442: {
  443:   int found = 0;
  444:   sprintf(idx_buf, "%s%s ",
  445:           encoding_prefix(dic_util_encoding),
  446:           yomi);
  447:   anthy_trie_find_next_key(anthy_private_tt_dic,
  448:                            idx_buf, yomi_len + 12);
  449: 
  450:   /* trieのインデックスを探す */
  451:   do {
  452:     char *v;
  453:     if (strncmp(&idx_buf[2], yomi, yomi_len) ||
  454:         idx_buf[yomi_len+2] != ' ') {
  455:       /* 見出語が異なるのでループ終了 */
  456:       break;
  457:     }
  458:     /* texttrieにアクセスして、見出語以外も一致しているかをチェック */
  459:     v = anthy_trie_find(anthy_private_tt_dic, idx_buf);
  460:     if (v) {
  461:       found = dup_word_check(v, word, wt_name);
  462:       free(v);
  463:       if (found) {
  464:         break;
  465:       }
  466:     }
  467:   } while (anthy_trie_find_next_key(anthy_private_tt_dic,
  468:                                     idx_buf, yomi_len + 12));
  469: 
  470:   return found;
  471: }
  472: 
  473: static int
  474: add_word_to_textdict(const char *yomi, const char *word,
  475:                      const char *wt_name, int freq)
  476: {
  477:   struct scan_context sc;
  478:   int rv;
  479:   int yomi_len = strlen(yomi);
  480: 
  481:   if (yomi_len > MAX_KEY_LEN || yomi_len == 0) {
  482:     return ANTHY_DIC_UTIL_ERROR;
  483:   }
  484: 
  485:   if (wt_name[0] != '#') {
  486:     return ANTHY_DIC_UTIL_ERROR;
  487:   }
  488: 
  489:   /* texttrieにあれば消す */
  490:   if (anthy_private_tt_dic) {
  491:     char *idx_buf = malloc(yomi_len + 12);
  492:     if (find_same_word(idx_buf, yomi, word, wt_name, yomi_len)) {
  493:       anthy_trie_delete(anthy_private_tt_dic, idx_buf);
  494:     }
  495:     free(idx_buf);
  496:   }
  497: 
  498:   /* 同じ物があったら消す */
  499:   sc.yomi = yomi;
  500:   sc.word = word;
  501:   sc.wt_name = wt_name;
  502:   /**/
  503:   sc.offset = 0;
  504:   sc.found_word = 0;
  505:   anthy_textdict_scan(anthy_private_text_dic, 0, &sc,
  506:                       find_cb);
  507:   if (sc.found_word == 1) {
  508:     anthy_textdict_delete_line(anthy_private_text_dic, sc.offset);
  509:   }
  510:   if (freq == 0) {
  511:     return ANTHY_DIC_UTIL_OK;
  512:   }
  513:   /* 追加する場所を探す */
  514:   sc.offset = 0;
  515:   sc.found_word = 0;
  516:   anthy_textdict_scan(anthy_private_text_dic, 0, &sc,
  517:                       order_cb);
  518:   /* 追加する */
  519:   rv = do_add_word_to_textdict(anthy_private_text_dic, sc.offset,
  520:                                yomi, word, wt_name, freq);
  521:   if (!rv) {
  522:     return ANTHY_DIC_UTIL_OK;
  523:   }
  524:   return ANTHY_DIC_UTIL_ERROR;
  525: }
  526: 
  527: /** 単語を登録する
  528:  * 頻度が0の場合は削除
  529:  */
  530: int
  531: anthy_priv_dic_add_entry(const char *yomi, const char *word,
  532:                          const char *wt_name, int freq)
  533: {
  534:   if (dic_util_encoding == ANTHY_UTF8_ENCODING) {
  535:     return add_word_to_textdict(yomi, word, wt_name, freq);
  536:   } else {
  537:     int rv;
  538:     char *yomi_utf8 = anthy_conv_euc_to_utf8(yomi);
  539:     char *word_utf8 = anthy_conv_euc_to_utf8(word);
  540:     rv = add_word_to_textdict(yomi_utf8, word_utf8, wt_name, freq);
  541:     free(yomi_utf8);
  542:     free(word_utf8);
  543:     return rv;
  544:   }
  545: }
  546: 
/** Return whatever anthy_conf_get_str() reports for the "ANTHYDIR"
 * configuration key. */
const char *
anthy_dic_util_get_anthydir(void)
{
  return anthy_conf_get_str("ANTHYDIR");
}
  552: 
  553: /* lookコマンドの辞書を検索するための関数 */
  554: static char *
  555: do_search(FILE *fp, const char *word)
  556: {
  557:   char buf[32];
  558:   char *res = NULL;
  559:   int word_len = strlen(word);
  560:   while (fgets(buf, 32, fp)) {
  561:     int len = strlen(buf);
  562:     buf[len - 1] = 0;
  563:     len --;
  564:     if (len > word_len) {
  565:       continue;
  566:     }
  567:     if (!strncasecmp(buf, word, len)) {
  568:       if (res) {
  569:         free(res);
  570:       }
  571:       res = strdup(buf);
  572:     }
  573:   }
  574:   return res;
  575: }
  576: 
  577: /* lookコマンドの辞書を検索するAPI */
  578: char *
  579: anthy_dic_search_words_file(const char *word)
  580: {
  581:   FILE *fp;
  582:   char *res;
  583:   const char *words_dict_fn = anthy_conf_get_str("WORDS_FILE");
  584:   if (!words_dict_fn) {
  585:     return NULL;
  586:   }
  587:   fp = fopen(words_dict_fn, "r");
  588:   if (!fp) {
  589:     return NULL;
  590:   }
  591:   res = do_search(fp, word);
  592:   fcl