(linenum→info "unix/slp.c:2238")

anthy/9100e/src-worddic/word_dic.c

    1: /*
    2:  * Anthyの辞書ライブラリの中心
    3:  *
    4:  * anthy_get_seq_ent_from_xstr()で辞書をひく
    5:  *
    6:  * Copyright (C) 2000-2007 TABATA Yusuke
    7:  * Copyright (C) 2005-2006 YOSHIDA Yuichi
    8:  *
    9:  */
   10: /*
   11:   This library is free software; you can redistribute it and/or
   12:   modify it under the terms of the GNU Lesser General Public
   13:   License as published by the Free Software Foundation; either
   14:   version 2 of the License, or (at your option) any later version.
   15: 
   16:   This library is distributed in the hope that it will be useful,
   17:   but WITHOUT ANY WARRANTY; without even the implied warranty of
   18:   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   19:   Lesser General Public License for more details.
   20: 
   21:   You should have received a copy of the GNU Lesser General Public
   22:   License along with this library; if not, write to the Free Software
   23:   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
   24:  */
   25: #include <stdlib.h>
   26: #include <string.h>
   27: 
   28: #include <anthy/anthy.h>
   29: #include <anthy/dic.h>
   30: #include <anthy/conf.h>
   31: #include <anthy/record.h>
   32: #include <anthy/alloc.h>
   33: #include <anthy/logger.h>
   34: #include <anthy/xchar.h>
   35: #include <anthy/feature_set.h>
   36: #include <anthy/textdict.h>
   37: 
   38: #include <anthy/diclib.h>
   39: 
   40: #include "dic_ent.h"
   41: #include "dic_personality.h"
   42: #include "dic_main.h"
   43: 
/* NOTE(review): presumably counts nested initializations of this library;
 * its users are outside this part of the file -- confirm. */
static int dic_init_count;

/* Dictionaries */
/* File dictionary shared by all personalities */
static struct word_dic *master_dic_file;

/* Per-personality dictionary */
struct mem_dic *anthy_current_personal_dic_cache;/* cache */
/* NOTE(review): not referenced in the visible part of the file; presumably
 * the record state of the current personality -- confirm. */
struct record_stat *anthy_current_record;
   55: 
   56: struct seq_ent *
   57: anthy_validate_seq_ent(struct seq_ent *seq, xstr *xs, int is_reverse)
   58: {
   59:   if (!seq) {
   60:     return NULL;
   61:   }
   62:   if (seq->nr_dic_ents == 0 && seq->nr_compound_ents == 0) {
   63:     /* 無効なエントリを作成したのでcacheから削除 */
   64:     anthy_mem_dic_release_seq_ent(anthy_current_personal_dic_cache,
   65:                                   xs, is_reverse);
   66:     return NULL;
   67:   }
   68: 
   69:   return seq;
   70: }
   71: 
   72: struct seq_ent *
   73: anthy_cache_get_seq_ent(xstr *xs, int is_reverse)
   74: {
   75:   struct seq_ent *seq;
   76: 
   77:   /* キャッシュ中に既にあればそれを返す */
   78:   seq = anthy_mem_dic_find_seq_ent_by_xstr(anthy_current_personal_dic_cache,
   79:                                            xs, is_reverse);
   80:   if (seq) {
   81:     return seq;
   82:   }
   83: 
   84:   /* キャッシュ中に無いので確保 */
   85:   return anthy_mem_dic_alloc_seq_ent_by_xstr(anthy_current_personal_dic_cache,
   86:                                              xs, is_reverse);
   87: }
   88: 
   89: int
   90: anthy_dic_check_word_relation(int from, int to)
   91: {
   92:   return anthy_word_dic_check_word_relation(master_dic_file, from, to);
   93: }
   94: 
   95: static seq_ent_t
   96: do_get_seq_ent_from_xstr(xstr *xs, int is_reverse)
   97: {
   98:   struct seq_ent *seq;
   99:   /* キャッシュから取り出す */
  100:   seq = anthy_cache_get_seq_ent(xs, is_reverse);
  101:   seq = anthy_validate_seq_ent(seq, xs, is_reverse);
  102:   if (!seq) {
  103:     /* 数字などの辞書に無い文字列を検索する */
  104:     return anthy_get_ext_seq_ent_from_xstr(xs, is_reverse);
  105:   }
  106:   return seq;
  107: }
  108: 
  109: static xstr *
  110: convert_vu(xstr *xs)
  111: {
  112:   int i, v = 0;
  113:   int j;
  114: 
  115:     /* 「ヴ」の出現を数える */
  116:   for (i = 0; i < xs->len; i++) {
  117:     if (xs->str[i] == KK_VU) {
  118:       v++;
  119:     }
  120:   }
  121:   if (v > 0) {
  122:     xstr *nx = malloc(sizeof(xstr));
  123:     nx->len = xs->len + v;
  124:     nx->str = malloc(sizeof(xchar)*nx->len);
  125:     j = 0;
  126:     /* 「ヴ」を「う゛」に変換しつつコピーする */
  127:     for (i = 0; i < xs->len; i++) {
  128:       if (xs->str[i] == KK_VU) {
  129:         nx->str[j] = HK_U;
  130:         j++;
  131:         nx->str[j] = HK_DDOT;
  132:         j++;
  133:       } else {
  134:         nx->str[j] = xs->str[i];
  135:         j++;
  136:       }
  137:     }
  138:     return nx;
  139:   }
  140:   return NULL;
  141: }
  142: 
  143: seq_ent_t
  144: anthy_get_seq_ent_from_xstr(xstr *xs, int is_reverse)
  145: {
  146:   struct seq_ent *se;
  147: 
  148:   if (!xs) {
  149:     return NULL;
  150:   }
  151:   if (!is_reverse) {
  152:     xstr *nx = convert_vu(xs);
  153:     /* 「ヴ」の混ざった順変換の場合、「う゛」に直して検索する
  154:      *   上位のレイヤーではユーザの与えた文字列をそのまま保持することが
  155:      *   期待されるので、変換はここで行なう。
  156:      */
  157:     if (nx) {
  158:       se = do_get_seq_ent_from_xstr(nx, 0);
  159:       anthy_free_xstr(nx);
  160:       return se;
  161:     }
  162:   }
  163:   /* 「ヴ」が出現しない、もしくは逆変換の場合 */
  164:   return do_get_seq_ent_from_xstr(xs, is_reverse);
  165: }
  166: 
  167: static void
  168: gang_elm_dtor(void *p)
  169: {
  170:   struct gang_elm *ge = p;
  171:   free(ge->key);
  172: }
  173: 
  174: static int
  175: find_gang_elm(allocator ator, struct gang_elm *head, xstr *xs)
  176: {
  177:   char *str = anthy_xstr_to_cstr(xs, ANTHY_UTF8_ENCODING);
  178:   struct gang_elm *ge;
  179:   for (ge = head->tmp.next; ge; ge = ge->tmp.next) {
  180:     if (!strcmp(ge->key, str)) {
  181:       free(str);
  182:       return 0;
  183:     }
  184:   }
  185:   ge = anthy_smalloc(ator);
  186:   ge->xs = *xs;
  187:   ge->key = str;
  188:   ge->tmp.next = head->tmp.next;
  189:   head->tmp.next = ge;
  190:   return 1;
  191: }
  192: 
  193: static int
  194: gang_elm_compare_func(const void *p1, const void *p2)
  195: {
  196:   const struct gang_elm * const *s1 = p1;
  197:   const struct gang_elm * const *s2 = p2;
  198:   return strcmp((*s1)->key, (*s2)->key);
  199: }
  200: 
/* State handed to gang_scan() while a text dictionary is being scanned. */
struct gang_scan_context {
  /* input: number of elements in array, and the elements themselves
   * (sorted by key before the scan starts) */
  int nr;
  struct gang_elm **array;
  /* cursor: index of the next element of array to compare against */
  int nth;
};
  208: 
  209: static int
  210: is_ext_ent(struct seq_ent *seq)
  211: {
  212:   if (!seq->md) {
  213:     return 1;
  214:   }
  215:   return 0;
  216: }
  217: 
  218: static void
  219: scan_misc_dic(struct gang_elm **array, int nr, int is_reverse)
  220: {
  221:   int i;
  222:   for (i = 0; i < nr; i++) {
  223:     xstr *xs = &array[i]->xs;
  224:     struct seq_ent *seq;
  225:     seq = anthy_cache_get_seq_ent(xs, is_reverse);
  226:     /* 個人辞書からの取得(texttrie(旧形式)と未知語辞書) */
  227:     if (seq) {
  228:       anthy_copy_words_from_private_dic(seq, xs, is_reverse);
  229:       anthy_validate_seq_ent(seq, xs, is_reverse);
  230:     }
  231:   }
  232: }
  233: 
  234: static void
  235: load_word(xstr *xs, const char *n, int is_reverse)
  236: {
  237:   struct seq_ent *seq = anthy_get_seq_ent_from_xstr(xs, 0);
  238:   xstr *word_xs;
  239:   wtype_t wt;  
  240:   struct word_line wl;
  241:   if (!seq || is_ext_ent(seq)) {
  242:     seq = anthy_mem_dic_alloc_seq_ent_by_xstr(anthy_current_personal_dic_cache,
  243:                                               xs, is_reverse);
  244:   }
  245:   if (anthy_parse_word_line(n, &wl)) {
  246:     return ;
  247:   }
  248:   word_xs = anthy_cstr_to_xstr(wl.word, ANTHY_UTF8_ENCODING);
  249:   if (anthy_type_to_wtype(wl.wt, &wt)) {
  250:     anthy_mem_dic_push_back_dic_ent(seq, 0, word_xs, wt,
  251:                                     NULL, wl.freq, 0);
  252:   }
  253: 
  254:   anthy_free_xstr(word_xs);
  255: }
  256: 
  257: static int
  258: gang_scan(void *p, int offset, const char *key, const char *n)
  259: {
  260:   struct gang_scan_context *gsc = p;
  261:   struct gang_elm *elm;
  262:   int r;
  263:   (void)offset;
  264:   while (1) {
  265:     if (gsc->nth >= gsc->nr) {
  266:       return 0;
  267:     }
  268:     elm = gsc->array[gsc->nth];
  269:     r = strcmp(elm->key, key);
  270:     if (r == 0) {
  271:       /* find it */
  272:       load_word(&elm->xs, n, 0);
  273:       /* go next in dictionary */
  274:       return 0;
  275:     } else if (r > 0) {
  276:       /* go next in dictionary */
  277:       return 0;
  278:     } else {
  279:       /* go next in lookup */
  280:       gsc->nth ++;
  281:     }
  282:   }
  283:   return 0;
  284: }
  285: 
  286: static void
  287: scan_dict(struct textdict *td, int nr, struct gang_elm **array)
  288: {
  289:   struct gang_scan_context gsc;
  290:   gsc.nr = nr;
  291:   gsc.array = array;
  292:   gsc.nth = 0;
  293:   anthy_textdict_scan(td, 0, &gsc, gang_scan);
  294: }
  295: 
/* Argument bundle passed through anthy_ask_scan() to request_scan(). */
struct scan_arg {
  /* lookup elements to match against the dictionary */
  struct gang_elm **array;
  /* number of elements in array */
  int nr;
};
  300: 
  301: static void
  302: request_scan(struct textdict *td, void *arg)
  303: {
  304:   struct scan_arg *sarg = (struct scan_arg *)arg;
  305:   scan_dict(td, sarg->nr, sarg->array);
  306: }
  307: 
  308: static void
  309: do_gang_load_dic(xstr *sentence, int is_reverse)
  310: {
  311:   allocator ator = anthy_create_allocator(sizeof(struct gang_elm),
  312:                                           gang_elm_dtor);
  313:   int from, len;
  314:   xstr xs;
  315:   int i, nr;
  316:   struct gang_elm head;
  317:   struct gang_elm **array, *cur;
  318:   struct scan_arg sarg;
  319:   head.tmp.next = NULL;
  320:   nr = 0;
  321:   for (from = 0; from < sentence->len ; from ++) {
  322:     for (len = 1; len < 32 && from + len <= sentence->len; len ++) {
  323:       xs.str = &sentence->str[from];
  324:       xs.len = len;
  325:       nr += find_gang_elm(ator, &head, &xs);
  326:     }
  327:   }
  328:   array = malloc(sizeof(struct gang_elm *) * nr);
  329:   cur = head.tmp.next;
  330:   for (i = 0; i < nr; i++) {
  331:     array[i] = cur;
  332:     cur = cur->tmp.next;
  333:   }
  334:   qsort(array, nr, sizeof(struct gang_elm *), gang_elm_compare_func);
  335:   /**/
  336:   anthy_gang_fill_seq_ent(master_dic_file, array, nr, is_reverse);
  337:   /**/
  338:   scan_misc_dic(array, nr, is_reverse);
  339:   /* 個人辞書から読む */
  340:   sarg.nr = nr;
  341:   sarg.array = array;
  342:   anthy_ask_scan(request_scan, (void *)&sarg);
  343:   /**/
  344:   free(array);
  345:   anthy_free_allocator(ator);
  346: }
  347: 
  348: void
  349: anthy_gang_load_dic(xstr *sentence, int is_reverse)
  350: {
  351:   xstr *nx;
  352:   if (!is_reverse && (nx = convert_vu(sentence))) {
  353:     do_gang_load_dic(nx, is_reverse);
  354:     anthy_free_xstr(nx);
  355:   } else {
  356:     do_gang_load_dic(sentence, is_reverse);
  357:   }
  358: }
  359: 
  360: /*
  361:  * seq_entの取得
  362:  ************************
  363:  * seq_entの各種情報の取得
  364:  */
  365: int
  366: anthy_get_nr_dic_ents(seq_ent_t se, xstr *xs)
  367: {
  368:   struct seq_ent *s = se;
  369:   if (!s) {
  370:     return 0;
  371:   }
  372:   if (!xs) {
  373:     return s->nr_dic_ents;
  374:   }
  375:   return s->nr_dic_ents + anthy_get_nr_dic_ents_of_ext_ent(se, xs);
  376: }
  377: 
  378: int
  379: anthy_get_nth_dic_ent_str(seq_ent_t se, xstr *orig,
  380:                           int n, xstr *x)
  381: {
  382:   if (!se) {
  383:     return -1;
  384:   }
  385:   if (n >= se->nr_dic_ents) {
  386:     return anthy_get_nth_dic_ent_str_of_ext_ent(se, orig,
  387:                                                 n - se->nr_dic_ents, x);
  388:   }
  389:   x->len = se->dic_ents[n]->str.len;
  390:   x->str = anthy_xstr_dup_str(&se->dic_ents[n]->str);
  391:   return 0;
  392: }
  393: 
  394: int
  395: anthy_get_nth_dic_ent_is_compound(seq_ent_t se, int nth)
  396: {
  397:   if (!se) {
  398:     return 0;
  399:   }
  400:   if (nth >= se->nr_dic_ents) {
  401:     return 0;
  402:   }
  403:   return se->dic_ents[nth]->is_compound;
  404: }
  405: 
  406: int
  407: anthy_get_nth_dic_ent_freq(seq_ent_t se, int nth)
  408: {
  409:   struct seq_ent *s = se;
  410:   if (!s) {
  411:     return 0;
  412:   }
  413:   if (!s->dic_ents) {
  414:     return anthy_get_nth_dic_ent_freq_of_ext_ent(se, nth);
  415:   }
  416:   if (s->nr_dic_ents <= nth) {
  417:     return anthy_get_nth_dic_ent_freq_of_ext_ent(se, nth - se->nr_dic_ents);
  418:   }
  419:   return s->dic_ents[nth]->freq;
  420: }
  421: 
  422: int
  423: anthy_get_nth_dic_ent_wtype(seq_ent_t se, xstr *xs,
  424:                             int n, wtype_t *w)
  425: {
  426:   struct seq_ent *s = se;
  427:   if (!s) {
  428:     *w = anthy_wt_none;
  429:     return -1;
  430:   }
  431:   if (s->nr_dic_ents <= n) {
  432:     int r;
  433:     r = anthy_get_nth_dic_ent_wtype_of_ext_ent(xs, n - s->nr_dic_ents, w);
  434:     if (r == -1) {
  435:       *w = anthy_wt_none;
  436:     }
  437:     return r;
  438:   }
  439:   *w =  s->dic_ents[n]->type;
  440:   return 0;
  441: }
  442: 
  443: int
  444: anthy_get_seq_ent_pos(seq_ent_t se, int pos)
  445: {
  446:   int i, v=0;
  447:   struct seq_ent *s = se;
  448:   if (!s) {
  449:     return 0;
  450:   }
  451:   if (s->nr_dic_ents == 0) {
  452:     return anthy_get_ext_seq_ent_pos(se, pos);
  453:   }
  454:   for (i = 0; i < s->nr_dic_ents; i++) {
  455:     if (anthy_wtype_get_pos(s->dic_ents[i]->type) == pos) {
  456:       v += s->dic_ents[i]->freq;
  457:       if (v == 0) {
  458:         v = 1;
  459:       }
  460:     }
  461:   }
  462:   return v;
  463: }
  464: 
  465: int
  466: anthy_get_seq_ent_ct(seq_ent_t se, int pos, int ct)
  467: {
  468:   int i, v=0;
  469:   struct seq_ent *s = se;
  470:   if (!s) {
  471:     return 0;
  472:   }
  473:   if (s->nr_dic_ents == 0) {
  474:     return anthy_get_ext_seq_ent_ct(s, pos, ct);
  475:   }
  476:   for (i = 0; i < s->nr_dic_ents; i++) {
  477:     if (anthy_wtype_get_pos(s->dic_ents[i]->type)== pos &&
  478:         anthy_wtype_get_ct(s->dic_ents[i]->type)==ct) {
  479:       v += s->dic_ents[i]->freq;
  480:       if (v == 0) {
  481:         v = 1;
  482:       }
  483:     }
  484:   }
  485:   return v;
  486: }
  487: 
  488: /*
  489:  * wtの品詞を持つ単語の中で最大の頻度を持つものを返す
  490:  */
  491: int
  492: anthy_get_seq_ent_wtype_freq(seq_ent_t seq, wtype_t wt)
  493: {
  494:   int i, f;
  495: 
  496:   if (!seq) {
  497:     return 0;
  498:   }
  499:   /**/
  500:   if (seq->nr_dic_ents == 0) {
  501:     return anthy_get_ext_seq_ent_wtype(seq, wt);
  502:   }
  503: 
  504:   f = 0;
  505:   /* 単語 */
  506:   for (i = 0; i < seq->nr_dic_ents; i++) {
  507:     if (seq->dic_ents[i]->order == 0 &&
  508:         anthy_wtype_include(wt, seq->dic_ents[i]->type)) {
  509:       if (f < seq->dic_ents[i]->freq) {
  510:         f = seq->dic_ents[i]->freq;
  511:       }
  512:     }
  513:   }
  514:   return f;
  515: }
  516: 
  517: /*
  518:  * wtの品詞を持つ複合語の中で最大の頻度を持つものを返す
  519:  */
  520: int
  521: anthy_get_seq_ent_wtype_compound_freq(seq_ent_t se, wtype_t wt)
  522: {
  523:   int i,f;
  524:   struct seq_ent *s = se;
  525:   if (!s) {
  526:     return 0;
  527:   }
  528:   /**/
  529:   f = 0;
  530:   for (i = 0; i < s->nr_dic_ents; i++) {
  531:     if (!anthy_get_nth_dic_ent_is_compound(se, i)) {
  532:       continue;
  533:     }
  534:     if (anthy_wtype_include(wt, s->dic_ents[i]->type)) {
  535:       if (f < s->dic_ents[i]->freq) {
  536:         f = s->dic_ents[i]->freq;
  537:       }
  538:     }
  539:   }
  540:   return f;
  541: }
  542: 
  543: int
  544: anthy_get_seq_ent_indep(seq_ent_t se)
  545: {
  546:   int i;
  547:   struct seq_ent *s = se;
  548:   if (!s) {
  549:     return 0;
  550:   }
  551:   if (s->nr_dic_ents == 0) {
  552:     return anthy_get_ext_seq_ent_indep(s);
  553:   }
  554:   for (i = 0; i < s->nr_dic_ents; i++) {
  555:     if (anthy_wtype_get_indep(s->dic_ents[i]->type)) {
  556:       return 1;
  557:     }
  558:   }
  559:   return 0;
  560: }
  561: 
  562: int
  563: anthy_has_compound_ents(seq_ent_t se)
  564: {
  565:   if (!se) {
  566:     return 0;
  567:   }
  568:   return se->nr_compound_ents;
  569: }
  570: 
  571: /* compundでない候補を持っているか */
  572: int
  573: anthy_has_non_compound_ents(seq_ent_t se)
  574: {