
1: /* 2: * 文節の関係を処理する 3: * Copyright (C) 2006 Higashiyama Masahiko (thanks google summer of code program) 4: * Copyright (C) 2002-2007 TABATA Yusuke 5: * 6: * anthy_reorder_candidates_by_relation() 7: * 8: */ 9: /* 10: This library is free software; you can redistribute it and/or 11: modify it under the terms of the GNU Lesser General Public 12: License as published by the Free Software Foundation; either 13: version 2 of the License, or (at your option) any later version. 14: 15: This library is distributed in the hope that it will be useful, 16: but WITHOUT ANY WARRANTY; without even the implied warranty of 17: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 18: Lesser General Public License for more details. 19: 20: You should have received a copy of the GNU Lesser General Public 21: License along with this library; if not, write to the Free Software 22: Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 23: */ 24: 25: #include <arpa/inet.h> 26: #include <stdlib.h> 27: 28: #include <anthy/segclass.h> 29: #include <anthy/segment.h> 30: #include <anthy/ordering.h> 31: #include <anthy/dic.h> 32: #include <anthy/diclib.h> 33: #include <anthy/feature_set.h> 34: #include <anthy/corpus.h> 35: #include "sorter.h" 36: 37: #define MAX_COLLISION 4 38: #define SEARCH_LIMIT 100 39: #define MAX_NEIGHBOR 10 40: 41: 42: /* 全文検索用のコーパス */ 43: static struct corpus_ { 44: /* header */ 45: void *corpus_bucket; 46: void *corpus_array; 47: /**/ 48: int *bucket; 49: int *array; 50: /**/ 51: int bucket_size; 52: int array_size; 53: } corpus_info; 54: 55: /* 検索用のiterator */ 56: struct iterator { 57: /* 検索のキーと現在の場所 */ 58: int key; 59: int idx; 60: /* 検索回数の上限 */ 61: int limit; 62: }; 63: 64: struct neighbor { 65: int nr; 66: int id[MAX_NEIGHBOR]; 67: }; 68: 69: /** 文節@segの中に@from_word_idの単語と共起関係にある 70: * 候補があるかどうかを探し、あればスコアを上げる。 71: */ 72: static void 73: reorder_candidate(int from_word_id, struct seg_ent *seg) 74: { 75: int i, pos; 76: struct cand_ent *ce = seg->cands[0]; 77: if (ce->core_elm_index == -1) { 78: return ; 79: } 80: /* 0番目の候補の品詞 */ 81: pos = anthy_wtype_get_pos(ce->elm[ce->core_elm_index].wt); 82: 83: for (i = 0; i < seg->nr_cands; i++) { 84: int word_id; 85: ce = seg->cands[i]; 86: if (ce->core_elm_index == -1) { 87: continue; 88: } 89: word_id = ce->elm[ce->core_elm_index].id; 90: if (anthy_dic_check_word_relation(from_word_id, word_id) && 91: anthy_wtype_get_pos(ce->elm[ce->core_elm_index].wt) == pos) { 92: /* 用例にマッチしたので、候補のスコアを更新 */ 93: ce->flag |= CEF_USEDICT; 94: ce->score *= 10; 95: } 96: } 97: } 98: 99: static int 100: get_indep_word_id(struct seg_ent *seg, int nth) 101: { 102: struct cand_ent *ce; 103: if (seg->cands[nth]->core_elm_index == -1) { 104: /* 一番目の候補がseq_entから作られた候補ではない */ 105: return -1; 106: } 107: ce = seg->cands[nth]; 108: /* 自立語のidを取り出す */ 109: return ce->elm[ce->core_elm_index].id; 110: } 111: 112: /* 用例辞書を使って並び替えをする */ 113: static void 114: reorder_by_use_dict(struct segment_list *sl, int nth) 115: { 116: int i; 117: struct seg_ent *cur_seg; 118: int word_id; 119: 120: cur_seg = anthy_get_nth_segment(sl, nth); 121: word_id = get_indep_word_id(cur_seg, 0); 122: if (word_id == -1) { 123: /**/ 124: return ; 125: } 126: /* 近所の文節を順に見ていく */ 127: for (i = nth - 2; i < nth + 2 && i < sl->nr_segments; i++) { 128: struct seg_ent *target_seg; 129: if (i < 0 || i == nth) { 130: continue ; 131: } 132: /* i番目の文節と前後のj番目の文節に対して */ 133: target_seg = anthy_get_nth_segment(sl, i); 134: reorder_candidate(word_id, target_seg); 135: } 136: } 137: 138: static int 139: find_border_of_this_word(int idx) 140: { 141: int val; 142: if (idx < 0) { 143: return 0; 144: } 145: val = ntohl(corpus_info.array[idx * 2]); 146: while (!(val & ELM_WORD_BORDER) && 147: idx > -1) { 148: idx --; 149: } 150: return idx; 151: } 152: 153: static int 154: find_left_word_border(int idx) 155: { 156: int val; 157: if (idx == -1) { 158: return -1; 159: } 160: val = ntohl(corpus_info.array[idx * 2]); 161: if (val & ELM_BOS) { 162: return -1; 163: } 164: idx --; 165: return find_border_of_this_word(idx); 166: } 167: 168: static int 169: find_right_word_border(int idx) 170: { 171: if (idx == -1) { 172: return -1; 173: } 174: while (idx < corpus_info.array_size - 2) { 175: int val; 176: idx ++; 177: val = ntohl(corpus_info.array[idx * 2]); 178: if (val & ELM_BOS) { 179: return -1; 180: } 181: if (val & ELM_WORD_BORDER) { 182: return idx; 183: } 184: } 185: return -1; 186: } 187: 188: static void 189: push_id(struct neighbor *ctx, 190: int id) 191: { 192: if (ctx->nr < MAX_NEIGHBOR - 1) { 193: ctx->id[ctx->nr] = id; 194: ctx->nr++; 195: } 196: } 197: 198: static void 199: collect_word_context(struct neighbor *ctx, int idx) 200: { 201: int id = ntohl(corpus_info.array[idx * 2]) & CORPUS_KEY_MASK; 202: /*printf(" id=%d\n", id);*/ 203: push_id(ctx, id); 204: } 205: 206: /* 例文中で周辺の情報を取得する */ 207: static void 208: collect_corpus_context(struct neighbor *ctx, 209: struct iterator *it) 210: { 211: int i; 212: int this_idx, idx; 213: 214: this_idx = find_border_of_this_word(it->idx); 215: 216: /*printf(" key=%d\n", it->key);*/ 217: /* 左へスキャン */ 218: idx = this_idx; 219: for (i = 0; i < 2; i++) { 220: idx = find_left_word_border(idx); 221: if (idx == -1) { 222: break; 223: } 224: collect_word_context(ctx, idx); 225: } 226: /* 右へスキャン */ 227: idx = this_idx; 228: for (i = 0; i < 2; i++) { 229: idx = find_right_word_border(idx); 230: if (idx == -1) { 231: break; 232: } 233: collect_word_context(ctx, idx); 234: } 235: } 236: 237: /* 変換対象の文字列の周辺の情報を取得する */ 238: static void 239: collect_user_context(struct neighbor *ctx, 240: struct segment_list *sl, int nth) 241: { 242: int i; 243: ctx->nr = 0; 244: for (i = nth - 2; i <= nth + 2 && i < sl->nr_segments; i++) { 245: int id; 246: if ((i < 0) || (i == nth)) { 247: continue; 248: } 249: id = get_indep_word_id(anthy_get_nth_segment(sl, i), 0); 250: if (id > -1) { 251: id &= CORPUS_KEY_MASK; 252: /*printf("user_ctx=%d\n", id);*/ 253: push_id(ctx, id); 254: } 255: } 256: } 257: 258: /* 隣接文節の情報を比較する */ 259: static int 260: do_compare_context(struct neighbor *n1, 261: struct neighbor *n2) 262: { 263: int i, j; 264: int m = 0; 265: for (i = 0; i < n1->nr; i++) { 266: for (j = 0; j < n2->nr; j++) { 267: if (n1->id[i] == n2->id[j]) { 268: m++; 269: } 270: } 271: } 272: return m; 273: } 274: 275: /* 隣接文節の情報を取得して比較する */ 276: static int 277: compare_context(struct neighbor *user, 278: struct iterator *it) 279: { 280: struct neighbor sample; 281: int nr; 282: /**/ 283: sample.nr = 0; 284: /* 例文中の周辺情報を集める */ 285: collect_corpus_context(&sample, it); 286: if (sample.nr == 0) { 287: return 0; 288: } 289: /* 比較する */ 290: nr = do_compare_context(user, &sample); 291: if (nr >= sample.nr / 2) { 292: return nr; 293: } 294: return 0; 295: } 296: 297: /* keyの最初の出現場所を見つける 298: * 見つからなかったら-1を返す 299: */ 300: static int 301: find_first_pos(int key) 302: { 303: int i; 304: for (i = 0; i < MAX_COLLISION; i++) { 305: int bkt = (key + i) % corpus_info.bucket_size; 306: if ((int)ntohl(corpus_info.bucket[bkt * 2]) == key) { 307: return ntohl(corpus_info.bucket[bkt * 2 + 1]); 308: } 309: } 310: return -1; 311: } 312: 313: /* keyの最初の出現場所でiteratorを初期化する 314: * 見つからなかったら-1を返す 315: */ 316: static int 317: find_first_from_corpus(int key, struct iterator *it, int limit) 318: { 319: key &= CORPUS_KEY_MASK; 320: it->idx = find_first_pos(key); 321: it->key = key; 322: it->limit = limit; 323: return it->idx; 324: } 325: 326: /* keyの次の出現場所のiteratorを設定する 327: */ 328: static int 329: find_next_from_corpus(struct iterator *it) 330: { 331: int idx = it->idx; 332: it->limit--; 333: if (it->limit < 1) { 334: it->idx = -1; 335: return -1; 336: } 337: it->idx = ntohl(corpus_info.array[it->idx * 2 + 1]); 338: if (it->idx < 0 || it->idx >= corpus_info.array_size || 339: it->idx < idx) { 340: it->idx = -1; 341: } 342: return it->idx; 343: } 344: 345: static void 346: check_candidate_context(struct seg_ent *cur_seg, 347: int i, 348: struct neighbor *user) 349: { 350: struct iterator it; 351: int nr = 0; 352: int word_id; 353: word_id = get_indep_word_id(cur_seg, i); 354: if (word_id == -1) { 355: return ; 356: } 357: /* 各出現場所をスキャンする */ 358: find_first_from_corpus(word_id, &it, SEARCH_LIMIT); 359: /*printf("word_id=%d %d\n", word_id, it.idx);*/ 360: while (it.idx > -1) { 361: nr += compare_context(user, &it); 362: /**/ 363: find_next_from_corpus(&it); 364: } 365: /**/ 366: if (nr > 0) { 367: cur_seg->cands[i]->flag |= CEF_CONTEXT; 368: } 369: } 370: 371: /* 全文検索で候補を並び替える */ 372: static void 373: reorder_by_corpus(struct segment_list *sl, int nth) 374: { 375: struct seg_ent *cur_seg; 376: struct neighbor user; 377: int i; 378: /* 文節の周辺情報を集める */ 379: collect_user_context(&user, sl, nth); 380: if (user.nr == 0) { 381: return ; 382: } 383: cur_seg = anthy_get_nth_segment(sl, nth); 384: /* 各候補について */ 385: for (i = 0; i < cur_seg->nr_cands; i++) { 386: check_candidate_context(cur_seg, i, &user); 387: } 388: /* トップの候補に用例があれば、他の候補は見ない */ 389: if (cur_seg->cands[0]->flag & CEF_CONTEXT) { 390: cur_seg->cands[0]->flag &= ~CEF_CONTEXT; 391: return ; 392: } 393: /* 用例によるスコア加算 */ 394: for (i = 1; i < cur_seg->nr_cands; i++) { 395: if (cur_seg->cands[i]->flag & CEF_CONTEXT) { 396: cur_seg->cands[i]->score *= 2; 397: } 398: } 399: } 400: 401: /* 402: * 用例を用いて候補を並び替える 403: * @nth番目以降の文節を対象とする 404: */ 405: void 406: anthy_reorder_candidates_by_relation(struct segment_list *sl, int nth) 407: { 408: int i; 409: for (i = nth; i < sl->nr_segments; i++) { 410: reorder_by_use_dict(sl, i); 411: reorder_by_corpus(sl, i); 412: } 413: } 414: 415: void 416: anthy_relation_init(void) 417: { 418: corpus_info.corpus_array = anthy_file_dic_get_section("corpus_array"); 419: corpus_info.corpus_bucket = anthy_file_dic_get_section("corpus_bucket"); 420: if (!corpus_info.corpus_array || 421: !corpus_info.corpus_array) { 422: return ; 423: } 424: corpus_info.array_size = ntohl(((int *)corpus_info.corpus_array)[1]); 425: corpus_info.bucket_size = ntohl(((int *)corpus_info.corpus_bucket)[1]); 426: corpus_info.array = &(((int *)corpus_info.corpus_array)[16]); 427: corpus_info.bucket = &(((int *)corpus_info.corpus_bucket)[16]); 428: /* 429: { 430: int i; 431: for (i = 0; i < corpus_info.array_size; i++) { 432: int v = ntohl(corpus_info.array[i * 2]); 433: printf("%d: %d %d\n", i, v, v & CORPUS_KEY_MASK); 434: } 435: } 436: */ 437: }knok: anthy/9100e/src-ordering/relation.c:415-437 on Thu Feb 28 18:01:25 +0900 2008コーパスの関連性情報を取り扱うらしい?