(linenum→info "unix/slp.c:2238")

anthy/9100e/src-splitter/splitter.c

    1: /*
    2:  * 文を文節にsplitするsplitter
    3:  *
    4:  * 文節の境界を検出する
    5:  *  anthy_init_split_context() 分割用のコンテキストを作って
    6:  *  anthy_mark_border() 分割をして
    7:  *  anthy_release_split_context() コンテキストを解放する
    8:  *
    9:  *  anthy_commit_border() コミットされた内容に対して学習をする
   10:  *
   11:  * Funded by IPA未踏ソフトウェア創造事業 2001 9/22
   12:  *
   13:  * Copyright (C) 2004 YOSHIDA Yuichi
   14:  * Copyright (C) 2000-2004 TABATA Yusuke
   15:  * Copyright (C) 2000-2001 UGAWA Tomoharu
   16:  *
   17:  * $Id: splitter.c,v 1.48 2002/11/18 11:39:18 yusuke Exp $
   18:  */
   19: /*
   20:   This library is free software; you can redistribute it and/or
   21:   modify it under the terms of the GNU Lesser General Public
   22:   License as published by the Free Software Foundation; either
   23:   version 2 of the License, or (at your option) any later version.
   24: 
   25:   This library is distributed in the hope that it will be useful,
   26:   but WITHOUT ANY WARRANTY; without even the implied warranty of
   27:   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   28:   Lesser General Public License for more details.
   29: 
   30:   You should have received a copy of the GNU Lesser General Public
   31:   License along with this library; if not, write to the Free Software
   32:   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
   33:  */
   34: #include <stdlib.h>
   35: #include <string.h>
   36: 
   37: #include <anthy/alloc.h>
   38: #include <anthy/record.h>
   39: #include <anthy/splitter.h>
   40: #include <anthy/logger.h>
   41: #include "wordborder.h"
   42: 
/* Limit passed to anthy_truncate_section() after a new entry is added to
 * the "EXPANDPAIR" record section. */
#define MAX_EXPAND_PAIR_ENTRY_COUNT 1000

/* Debug-print flag bits; assigned in anthy_init_splitter() and returned by
 * anthy_splitter_debug_flags(). */
static int splitter_debug_flags;

/* Word types looked up by name in anthy_init_splitter(). */
wtype_t anthy_wtype_noun;
wtype_t anthy_wtype_name_noun;
wtype_t anthy_wtype_num_noun;
wtype_t anthy_wtype_prefix;
wtype_t anthy_wtype_num_prefix;
wtype_t anthy_wtype_num_postfix;
wtype_t anthy_wtype_name_postfix;
wtype_t anthy_wtype_sv_postfix;
wtype_t anthy_wtype_a_tail_of_v_renyou;
wtype_t anthy_wtype_v_renyou;
wtype_t anthy_wtype_noun_tail;/* e.g. the "tate" in "ire-tate" */
wtype_t anthy_wtype_n1;
wtype_t anthy_wtype_n10;
   61: 
   62: 
   63: /** make_word_cacheで作成した文節情報を解放する
   64:  */
   65: static void
   66: release_info_cache(struct splitter_context *sc)
   67: {
   68:   struct word_split_info_cache *info = sc->word_split_info;
   69: 
   70:   anthy_free_allocator(info->MwAllocator);
   71:   anthy_free_allocator(info->WlAllocator);
   72:   free(info->cnode);
   73:   free(info->seq_len);
   74:   free(info->rev_seq_len);
   75:   free(info);
   76: }
   77: 
   78: static void
   79: metaword_dtor(void *p)
   80: {
   81:   struct meta_word *mw = (struct meta_word*)p;
   82:   if (mw->cand_hint.str) {
   83:     free(mw->cand_hint.str);
   84:   }
   85: }
   86: 
   87: 
   88: static void
   89: alloc_char_ent(xstr *xs, struct splitter_context *sc)
   90: {
   91:   int i;
   92:  
   93:   sc->char_count = xs->len;
   94:   sc->ce = (struct char_ent*)
   95:     malloc(sizeof(struct char_ent)*(xs->len + 1));
   96:   for (i = 0; i <= xs->len; i++) {
   97:     sc->ce[i].c = &xs->str[i];
   98:     sc->ce[i].seg_border = 0;
   99:     sc->ce[i].initial_seg_len = 0;
  100:     sc->ce[i].best_seg_class = SEG_HEAD;
  101:     sc->ce[i].best_mw = NULL;
  102:   }
  103:  
  104:   /* 左右両端は文節の境界である */
  105:   sc->ce[0].seg_border = 1;
  106:   sc->ce[xs->len].seg_border = 1;
  107: }
  108: 
  109: /*  ここで確保した内容はrelease_info_cacheで解放される 
  110:  */
  111: static void
  112: alloc_info_cache(struct splitter_context *sc)
  113: {
  114:   int i;
  115:   struct word_split_info_cache *info;
  116: 
  117:   /* キャッシュのデータを確保 */
  118:   sc->word_split_info = malloc(sizeof(struct word_split_info_cache));
  119:   info = sc->word_split_info;
  120:   info->MwAllocator = anthy_create_allocator(sizeof(struct meta_word), metaword_dtor);
  121:   info->WlAllocator = anthy_create_allocator(sizeof(struct word_list), 0);
  122:   info->cnode =
  123:     malloc(sizeof(struct char_node) * (sc->char_count + 1));
  124: 
  125:   info->seq_len = malloc(sizeof(int) * (sc->char_count + 1));
  126:   info->rev_seq_len = malloc(sizeof(int) * (sc->char_count + 1));
  127: 
  128:   /* 各文字インデックスに対して初期化を行う */
  129:   for (i = 0; i <= sc->char_count; i++) {
  130:     info->seq_len[i] = 0;
  131:     info->rev_seq_len[i] = 0;
  132:     info->cnode[i].wl = NULL;
  133:     info->cnode[i].mw = NULL;
  134:     info->cnode[i].max_len = 0;
  135:   }
  136: }
  137: 
  138: /** 外から呼び出されるwordsplitterのトップレベルの関数 */
  139: void
  140: anthy_mark_border(struct splitter_context *sc,
  141:                   int from, int from2, int to)
  142: {
  143:   int i;
  144:   struct word_split_info_cache *info;
  145: 
  146:   /* sanity check */
  147:   if ((to - from) <= 0) {
  148:     return ;
  149:   }
  150: 
  151:   /* 境界マーク用とlatticeの検索で用いられるクラス用の領域を確保 */
  152:   info = sc->word_split_info;
  153:   info->seg_border = alloca(sizeof(int)*(sc->char_count + 1));
  154:   info->best_seg_class = alloca(sizeof(enum seg_class)*(sc->char_count + 1));
  155:   info->best_mw = alloca(sizeof(struct meta_word*)*(sc->char_count + 1));
  156:   for (i = 0; i < sc->char_count + 1; ++i) {
  157:     info->seg_border[i] = sc->ce[i].seg_border;
  158:     info->best_seg_class[i] = sc->ce[i].best_seg_class;
  159:     info->best_mw[i] = sc->ce[i].best_mw;
  160:   }
  161: 
  162:   /* 境界を決定する */
  163:   anthy_eval_border(sc, from, from2, to);
  164: 
  165:   for (i = from; i < to; ++i) {
  166:     sc->ce[i].seg_border = info->seg_border[i];
  167:     sc->ce[i].best_seg_class = info->best_seg_class[i];
  168:     sc->ce[i].best_mw = info->best_mw[i];
  169:   }
  170: }
  171: 
  172: /* 文節が拡大されたので,それを学習する */
  173: static void
  174: proc_expanded_segment(struct splitter_context *sc,
  175:                       int from, int len)
  176: {
  177:   int initial_len = sc->ce[from].initial_seg_len;
  178:   int i, nr;
  179:   xstr from_xs, to_xs, *xs;
  180: 
  181:   from_xs.str = sc->ce[from].c;
  182:   from_xs.len = initial_len;
  183:   to_xs.str = sc->ce[from].c;
  184:   to_xs.len = len;
  185:   if (anthy_select_section("EXPANDPAIR", 1) == -1) {
  186:     return ;
  187:   }
  188:   if (anthy_select_row(&from_xs, 1) == -1) {
  189:     return ;
  190:   }
  191:   nr = anthy_get_nr_values();
  192:   for (i = 0; i < nr; i ++) {
  193:     xs = anthy_get_nth_xstr(i);
  194:     if (!xs || !anthy_xstrcmp(xs, &to_xs)) {
  195:       /* 既にある */
  196:       return ;
  197:     }
  198:   }
  199:   anthy_set_nth_xstr(nr, &to_xs);
  200:   anthy_truncate_section(MAX_EXPAND_PAIR_ENTRY_COUNT);
  201: }
  202: 
  203: /* 文節のマージと語尾を学習する */
  204: void
  205: anthy_commit_border(struct splitter_context *sc, int nr_segments,
  206:                     struct meta_word **mw, int *seg_len)
  207: {
  208:   int i, from = 0;
  209: 
  210:   /* 伸ばした文節 */
  211:   for (i = 0; i < nr_segments; i++) {
  212:     /* それぞれの文節に対して */
  213: 
  214:     int len = seg_len[i];
  215:     int initial_len = sc->ce[from].initial_seg_len;
  216:     int real_len = 0;
  217:     int l2;
  218: 
  219:     if (!initial_len || from + initial_len == sc->char_count) {
  220:       /* そこは境界ではない */
  221:       goto tail;
  222:     }
  223:     l2 = sc->ce[from + initial_len].initial_seg_len;
  224:     if (initial_len + l2 > len) {
  225:       /* 隣の文節を含むほど拡大されたわけではない */
  226:       goto tail;
  227:     }
  228:     if (mw[i]) {
  229:       real_len = mw[i]->len;
  230:     }
  231:     if (real_len <= initial_len) {
  232:       goto tail;
  233:     }
  234:     /* 右の文節を含む長さに拡張された文節がコミットされた */
  235:     proc_expanded_segment(sc, from, real_len);
  236:   tail:
  237:     from += len;
  238:   }
  239: }
  240: 
  241: int
  242: anthy_splitter_debug_flags(void)
  243: {
  244:   return splitter_debug_flags;
  245: }
  246: 
  247: void
  248: anthy_init_split_context(xstr *xs, struct splitter_context *sc, int is_reverse)
  249: {
  250:   alloc_char_ent(xs, sc);
  251:   alloc_info_cache(sc);
  252:   sc->is_reverse = is_reverse;
  253:   /* 全ての部分文字列をチェックして、文節の候補を列挙する
  254:      word_listを構成してからmetawordを構成する */
  255:   anthy_lock_dic();
  256:   anthy_make_word_list_all(sc);
  257:   anthy_unlock_dic();
  258:   anthy_make_metaword_all(sc);
  259: 
  260: }
  261: 
  262: void
  263: anthy_release_split_context(struct splitter_context *sc)
  264: {
  265:   if (sc->word_split_info) {
  266:     release_info_cache(sc);
  267:     sc->word_split_info = 0;
  268:   }
  269:   if (sc->ce) {
  270:     free(sc->ce);
  271:     sc->ce = 0;
  272:   }
  273: }
  274: 
  275: /** splitter全体の初期化を行う */
  276: int
  277: anthy_init_splitter(void)
  278: {
  279:   /* デバッグプリントの設定 */
  280:   char *en = getenv("ANTHY_ENABLE_DEBUG_PRINT");
  281:   char *dis = getenv("ANTHY_DISABLE_DEBUG_PRINT");
  282:   splitter_debug_flags = SPLITTER_DEBUG_NONE;
  283:   if (!dis && en && strlen(en)) {
  284:     char *fs = getenv("ANTHY_SPLITTER_PRINT");
  285:     if (fs) {
  286:       if (strchr(fs, 'w')) {
  287:         splitter_debug_flags |= SPLITTER_DEBUG_WL;
  288:       }
  289:       if (strchr(fs, 'm')) {
  290:         splitter_debug_flags |= SPLITTER_DEBUG_MW;
  291:       }
  292:       if (strchr(fs, 'l')) {
  293:         splitter_debug_flags |= SPLITTER_DEBUG_LN;
  294:       }
  295:       if (strchr(fs, 'i')) {
  296:         splitter_debug_flags |= SPLITTER_DEBUG_ID;
  297:       }
  298:       if (strchr(fs, 'c')) {
  299:         splitter_debug_flags |= SPLITTER_DEBUG_CAND;
  300:       }
  301:     }
  302:   }
  303:   /* 付属語グラフの初期化 */
  304:   if (anthy_init_depword_tab()) {
  305:     anthy_log(0, "Failed to init dependent word table.\n");
  306:     return -1;
  307:   }
  308:   /**/
  309:   anthy_wtype_noun = anthy_init_wtype_by_name("名詞35");
  310:   anthy_wtype_name_noun = anthy_init_wtype_by_name("人名");
  311:   anthy_wtype_num_noun = anthy_init_wtype_by_name("数詞");
  312:   anthy_wtype_a_tail_of_v_renyou = anthy_init_wtype_by_name("形容詞化接尾語");
  313:   anthy_wtype_v_renyou = anthy_init_wtype_by_name("動詞連用形");
  314:   anthy_wtype_noun_tail = anthy_init_wtype_by_name("名詞化接尾語");
  315:   anthy_wtype_prefix = anthy_init_wtype_by_name("名詞接頭辞");
  316:   anthy_wtype_num_prefix = anthy_init_wtype_by_name("数接頭辞");
  317:   anthy_wtype_num_postfix = anthy_init_wtype_by_name("数接尾辞");
  318:   anthy_wtype_name_postfix = anthy_init_wtype_by_name("人名接尾辞");
  319:   anthy_wtype_sv_postfix = anthy_init_wtype_by_name("サ変接尾辞");
  320:   anthy_wtype_n1 = anthy_init_wtype_by_name("数詞1");
  321:   anthy_wtype_n10 = anthy_init_wtype_by_name("数詞10");
  322:   return 0;
  323: }
Permalink to this note knok: anthy/9100e/src-splitter/splitter.c:276-323 on Thu Feb 28 18:01:59 +0900 2008

splitterサブシステムの初期化

/** Shut down the splitter; this only calls anthy_quit_depword_tab(). */
void
anthy_quit_splitter(void)
{
  anthy_quit_depword_tab();
}
Syntax (Markdown)