
1: /* 2: * 文を文節にsplitするsplitter 3: * 4: * 文節の境界を検出する 5: * anthy_init_split_context() 分割用のコンテキストを作って 6: * anthy_mark_border() 分割をして 7: * anthy_release_split_context() コンテキストを解放する 8: * 9: * anthy_commit_border() コミットされた内容に対して学習をする 10: * 11: * Funded by IPA未踏ソフトウェア創造事業 2001 9/22 12: * 13: * Copyright (C) 2004 YOSHIDA Yuichi 14: * Copyright (C) 2000-2004 TABATA Yusuke 15: * Copyright (C) 2000-2001 UGAWA Tomoharu 16: * 17: * $Id: splitter.c,v 1.48 2002/11/18 11:39:18 yusuke Exp $ 18: */ 19: /* 20: This library is free software; you can redistribute it and/or 21: modify it under the terms of the GNU Lesser General Public 22: License as published by the Free Software Foundation; either 23: version 2 of the License, or (at your option) any later version. 24: 25: This library is distributed in the hope that it will be useful, 26: but WITHOUT ANY WARRANTY; without even the implied warranty of 27: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 28: Lesser General Public License for more details. 29: 30: You should have received a copy of the GNU Lesser General Public 31: License along with this library; if not, write to the Free Software 32: Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 33: */ 34: #include <stdlib.h> 35: #include <string.h> 36: 37: #include <anthy/alloc.h> 38: #include <anthy/record.h> 39: #include <anthy/splitter.h> 40: #include <anthy/logger.h> 41: #include "wordborder.h" 42: 43: #define MAX_EXPAND_PAIR_ENTRY_COUNT 1000 44: 45: static int splitter_debug_flags; 46: 47: /**/ 48: wtype_t anthy_wtype_noun; 49: wtype_t anthy_wtype_name_noun; 50: wtype_t anthy_wtype_num_noun; 51: wtype_t anthy_wtype_prefix; 52: wtype_t anthy_wtype_num_prefix; 53: wtype_t anthy_wtype_num_postfix; 54: wtype_t anthy_wtype_name_postfix; 55: wtype_t anthy_wtype_sv_postfix; 56: wtype_t anthy_wtype_a_tail_of_v_renyou; 57: wtype_t anthy_wtype_v_renyou; 58: wtype_t anthy_wtype_noun_tail;/* いれ「たて」とか */ 59: wtype_t anthy_wtype_n1; 60: wtype_t anthy_wtype_n10; 61: 62: 63: /** make_word_cacheで作成した文節情報を解放する 64: */ 65: static void 66: release_info_cache(struct splitter_context *sc) 67: { 68: struct word_split_info_cache *info = sc->word_split_info; 69: 70: anthy_free_allocator(info->MwAllocator); 71: anthy_free_allocator(info->WlAllocator); 72: free(info->cnode); 73: free(info->seq_len); 74: free(info->rev_seq_len); 75: free(info); 76: } 77: 78: static void 79: metaword_dtor(void *p) 80: { 81: struct meta_word *mw = (struct meta_word*)p; 82: if (mw->cand_hint.str) { 83: free(mw->cand_hint.str); 84: } 85: } 86: 87: 88: static void 89: alloc_char_ent(xstr *xs, struct splitter_context *sc) 90: { 91: int i; 92: 93: sc->char_count = xs->len; 94: sc->ce = (struct char_ent*) 95: malloc(sizeof(struct char_ent)*(xs->len + 1)); 96: for (i = 0; i <= xs->len; i++) { 97: sc->ce[i].c = &xs->str[i]; 98: sc->ce[i].seg_border = 0; 99: sc->ce[i].initial_seg_len = 0; 100: sc->ce[i].best_seg_class = SEG_HEAD; 101: sc->ce[i].best_mw = NULL; 102: } 103: 104: /* 左右両端は文節の境界である */ 105: sc->ce[0].seg_border = 1; 106: sc->ce[xs->len].seg_border = 1; 107: } 108: 109: /* ここで確保した内容はrelease_info_cacheで解放される 110: */ 111: static void 112: alloc_info_cache(struct splitter_context *sc) 113: { 114: int i; 115: struct word_split_info_cache *info; 116: 117: /* キャッシュのデータを確保 */ 118: sc->word_split_info = malloc(sizeof(struct word_split_info_cache)); 119: info = sc->word_split_info; 120: info->MwAllocator = anthy_create_allocator(sizeof(struct meta_word), metaword_dtor); 121: info->WlAllocator = anthy_create_allocator(sizeof(struct word_list), 0); 122: info->cnode = 123: malloc(sizeof(struct char_node) * (sc->char_count + 1)); 124: 125: info->seq_len = malloc(sizeof(int) * (sc->char_count + 1)); 126: info->rev_seq_len = malloc(sizeof(int) * (sc->char_count + 1)); 127: 128: /* 各文字インデックスに対して初期化を行う */ 129: for (i = 0; i <= sc->char_count; i++) { 130: info->seq_len[i] = 0; 131: info->rev_seq_len[i] = 0; 132: info->cnode[i].wl = NULL; 133: info->cnode[i].mw = NULL; 134: info->cnode[i].max_len = 0; 135: } 136: } 137: 138: /** 外から呼び出されるwordsplitterのトップレベルの関数 */ 139: void 140: anthy_mark_border(struct splitter_context *sc, 141: int from, int from2, int to) 142: { 143: int i; 144: struct word_split_info_cache *info; 145: 146: /* sanity check */ 147: if ((to - from) <= 0) { 148: return ; 149: } 150: 151: /* 境界マーク用とlatticeの検索で用いられるクラス用の領域を確保 */ 152: info = sc->word_split_info; 153: info->seg_border = alloca(sizeof(int)*(sc->char_count + 1)); 154: info->best_seg_class = alloca(sizeof(enum seg_class)*(sc->char_count + 1)); 155: info->best_mw = alloca(sizeof(struct meta_word*)*(sc->char_count + 1)); 156: for (i = 0; i < sc->char_count + 1; ++i) { 157: info->seg_border[i] = sc->ce[i].seg_border; 158: info->best_seg_class[i] = sc->ce[i].best_seg_class; 159: info->best_mw[i] = sc->ce[i].best_mw; 160: } 161: 162: /* 境界を決定する */ 163: anthy_eval_border(sc, from, from2, to); 164: 165: for (i = from; i < to; ++i) { 166: sc->ce[i].seg_border = info->seg_border[i]; 167: sc->ce[i].best_seg_class = info->best_seg_class[i]; 168: sc->ce[i].best_mw = info->best_mw[i]; 169: } 170: } 171: 172: /* 文節が拡大されたので,それを学習する */ 173: static void 174: proc_expanded_segment(struct splitter_context *sc, 175: int from, int len) 176: { 177: int initial_len = sc->ce[from].initial_seg_len; 178: int i, nr; 179: xstr from_xs, to_xs, *xs; 180: 181: from_xs.str = sc->ce[from].c; 182: from_xs.len = initial_len; 183: to_xs.str = sc->ce[from].c; 184: to_xs.len = len; 185: if (anthy_select_section("EXPANDPAIR", 1) == -1) { 186: return ; 187: } 188: if (anthy_select_row(&from_xs, 1) == -1) { 189: return ; 190: } 191: nr = anthy_get_nr_values(); 192: for (i = 0; i < nr; i ++) { 193: xs = anthy_get_nth_xstr(i); 194: if (!xs || !anthy_xstrcmp(xs, &to_xs)) { 195: /* 既にある */ 196: return ; 197: } 198: } 199: anthy_set_nth_xstr(nr, &to_xs); 200: anthy_truncate_section(MAX_EXPAND_PAIR_ENTRY_COUNT); 201: } 202: 203: /* 文節のマージと語尾を学習する */ 204: void 205: anthy_commit_border(struct splitter_context *sc, int nr_segments, 206: struct meta_word **mw, int *seg_len) 207: { 208: int i, from = 0; 209: 210: /* 伸ばした文節 */ 211: for (i = 0; i < nr_segments; i++) { 212: /* それぞれの文節に対して */ 213: 214: int len = seg_len[i]; 215: int initial_len = sc->ce[from].initial_seg_len; 216: int real_len = 0; 217: int l2; 218: 219: if (!initial_len || from + initial_len == sc->char_count) { 220: /* そこは境界ではない */ 221: goto tail; 222: } 223: l2 = sc->ce[from + initial_len].initial_seg_len; 224: if (initial_len + l2 > len) { 225: /* 隣の文節を含むほど拡大されたわけではない */ 226: goto tail; 227: } 228: if (mw[i]) { 229: real_len = mw[i]->len; 230: } 231: if (real_len <= initial_len) { 232: goto tail; 233: } 234: /* 右の文節を含む長さに拡張された文節がコミットされた */ 235: proc_expanded_segment(sc, from, real_len); 236: tail: 237: from += len; 238: } 239: } 240: 241: int 242: anthy_splitter_debug_flags(void) 243: { 244: return splitter_debug_flags; 245: } 246: 247: void 248: anthy_init_split_context(xstr *xs, struct splitter_context *sc, int is_reverse) 249: { 250: alloc_char_ent(xs, sc); 251: alloc_info_cache(sc); 252: sc->is_reverse = is_reverse; 253: /* 全ての部分文字列をチェックして、文節の候補を列挙する 254: word_listを構成してからmetawordを構成する */ 255: anthy_lock_dic(); 256: anthy_make_word_list_all(sc); 257: anthy_unlock_dic(); 258: anthy_make_metaword_all(sc); 259: 260: } 261: 262: void 263: anthy_release_split_context(struct splitter_context *sc) 264: { 265: if (sc->word_split_info) { 266: release_info_cache(sc); 267: sc->word_split_info = 0; 268: } 269: if (sc->ce) { 270: free(sc->ce); 271: sc->ce = 0; 272: } 273: } 274: 275: /** splitter全体の初期化を行う */ 276: int 277: anthy_init_splitter(void) 278: { 279: /* デバッグプリントの設定 */ 280: char *en = getenv("ANTHY_ENABLE_DEBUG_PRINT"); 281: char *dis = getenv("ANTHY_DISABLE_DEBUG_PRINT"); 282: splitter_debug_flags = SPLITTER_DEBUG_NONE; 283: if (!dis && en && strlen(en)) { 284: char *fs = getenv("ANTHY_SPLITTER_PRINT"); 285: if (fs) { 286: if (strchr(fs, 'w')) { 287: splitter_debug_flags |= SPLITTER_DEBUG_WL; 288: } 289: if (strchr(fs, 'm')) { 290: splitter_debug_flags |= SPLITTER_DEBUG_MW; 291: } 292: if (strchr(fs, 'l')) { 293: splitter_debug_flags |= SPLITTER_DEBUG_LN; 294: } 295: if (strchr(fs, 'i')) { 296: splitter_debug_flags |= SPLITTER_DEBUG_ID; 297: } 298: if (strchr(fs, 'c')) { 299: splitter_debug_flags |= SPLITTER_DEBUG_CAND; 300: } 301: } 302: } 303: /* 付属語グラフの初期化 */ 304: if (anthy_init_depword_tab()) { 305: anthy_log(0, "Failed to init dependent word table.\n"); 306: return -1; 307: } 308: /**/ 309: anthy_wtype_noun = anthy_init_wtype_by_name("名詞35"); 310: anthy_wtype_name_noun = anthy_init_wtype_by_name("人名"); 311: anthy_wtype_num_noun = anthy_init_wtype_by_name("数詞"); 312: anthy_wtype_a_tail_of_v_renyou = anthy_init_wtype_by_name("形容詞化接尾語"); 313: anthy_wtype_v_renyou = anthy_init_wtype_by_name("動詞連用形"); 314: anthy_wtype_noun_tail = anthy_init_wtype_by_name("名詞化接尾語"); 315: anthy_wtype_prefix = anthy_init_wtype_by_name("名詞接頭辞"); 316: anthy_wtype_num_prefix = anthy_init_wtype_by_name("数接頭辞"); 317: anthy_wtype_num_postfix = anthy_init_wtype_by_name("数接尾辞"); 318: anthy_wtype_name_postfix = anthy_init_wtype_by_name("人名接尾辞"); 319: anthy_wtype_sv_postfix = anthy_init_wtype_by_name("サ変接尾辞"); 320: anthy_wtype_n1 = anthy_init_wtype_by_name("数詞1"); 321: anthy_wtype_n10 = anthy_init_wtype_by_name("数詞10"); 322: return 0; 323: }knok: anthy/9100e/src-splitter/splitter.c:276-323 on Thu Feb 28 18:01:59 +0900 2008324: 325: void 326: anthy_quit_splitter(void) 327: { 328: anthy_quit_depword_tab(); 329: }splitterサブシステムの初期化