1:
2:
3:
4:
5:
6:
7:
8:
9:
10:
11:
12:
13:
14: #include <stdlib.h>
15: #include <stdio.h>
16: #include <math.h>
17:
18: #include <anthy/record.h>
19: #include <anthy/splitter.h>
20: #include <anthy/xstr.h>
21: #include <anthy/segment.h>
22: #include <anthy/segclass.h>
23: #include "wordborder.h"
24:
25:
26: struct metaword_type_tab_ anthy_metaword_type_tab[] = {
27: {MW_DUMMY,"dummy",MW_STATUS_NONE,MW_CHECK_SINGLE},
28: {MW_SINGLE,"single",MW_STATUS_NONE,MW_CHECK_SINGLE},
29: {MW_WRAP,"wrap",MW_STATUS_WRAPPED,MW_CHECK_WRAP},
30: {MW_COMPOUND_HEAD,"compound_head",MW_STATUS_NONE,MW_CHECK_COMPOUND},
31: {MW_COMPOUND,"compound",MW_STATUS_NONE,MW_CHECK_NONE},
32: {MW_COMPOUND_LEAF,"compound_leaf",MW_STATUS_COMPOUND,MW_CHECK_NONE},
33: {MW_COMPOUND_PART,"compound_part",MW_STATUS_COMPOUND_PART,MW_CHECK_SINGLE},
34: {MW_V_RENYOU_A,"v_renyou_a",MW_STATUS_COMBINED,MW_CHECK_BORDER},
35: {MW_V_RENYOU_NOUN,"v_renyou_noun",MW_STATUS_COMBINED,MW_CHECK_BORDER},
36: {MW_NUMBER,"number",MW_STATUS_COMBINED,MW_CHECK_NUMBER},
37: {MW_OCHAIRE,"ochaire",MW_STATUS_OCHAIRE,MW_CHECK_OCHAIRE},
38:
39: {MW_END,"end",MW_STATUS_NONE,MW_CHECK_NONE}
40: };
41:
/*
 * Forward declaration: try_combine_number() hands every number meta word it
 * builds back to combine_metaword(), which is defined later in this file.
 */
static void combine_metaword(struct splitter_context *sc, struct meta_word *mw);
44:
45:
46: void
47: anthy_commit_meta_word(struct splitter_context *sc,
48: struct meta_word *mw)
49: {
50: struct word_split_info_cache *info = sc->word_split_info;
51:
52: mw->next = info->cnode[mw->from].mw;
53: info->cnode[mw->from].mw = mw;
54:
55: if (anthy_splitter_debug_flags() & SPLITTER_DEBUG_MW) {
56: anthy_print_metaword(sc, mw);
57: }
58: }
59:
60: static void
61: print_metaword_features(int features)
62: {
63: if (features & MW_FEATURE_SV) {
64: printf(":sv");
65: }
66: if (features & MW_FEATURE_WEAK_CONN) {
67: printf(":weak");
68: }
69: if (features & MW_FEATURE_SUFFIX) {
70: printf(":suffix");
71: }
72: if (features & MW_FEATURE_NUM) {
73: printf(":num");
74: }
75: if (features & MW_FEATURE_CORE1) {
76: printf(":c1");
77: }
78: if (features & MW_FEATURE_HIGH_FREQ) {
79: printf(":hf");
80: }
81: }
82:
83: static void
84: anthy_do_print_metaword(struct splitter_context *sc,
85: struct meta_word *mw,
86: int indent)
87: {
88: int i;
89: for (i = 0; i < indent; i++) {
90: printf(" ");
91: }
92: printf("*meta word type=%s(%d-%d):score=%d:seg_class=%s",
93: anthy_metaword_type_tab[mw->type].name,
94: mw->from, mw->len, mw->score,
95: anthy_seg_class_name(mw->seg_class));
96: print_metaword_features(mw->mw_features);
97: printf(":can_use=%d*\n", mw->can_use);
98: if (mw->wl) {
99: anthy_print_word_list(sc, mw->wl);
100: }
101: if (mw->cand_hint.str) {
102: printf("(");
103: anthy_putxstr(&mw->cand_hint);
104: printf(")\n");
105: }
106: if (mw->mw1) {
107: anthy_do_print_metaword(sc, mw->mw1, indent + 1);
108: }
109: if (mw->mw2) {
110: anthy_do_print_metaword(sc, mw->mw2, indent + 1);
111: }
112: }
113:
/*
 * Public debug entry point: print `mw` and all of its component meta words
 * to stdout, starting with no indentation.
 */
void
anthy_print_metaword(struct splitter_context *sc,
                     struct meta_word *mw)
{
  const int top_level_indent = 0;

  anthy_do_print_metaword(sc, mw, top_level_indent);
}
120:
121: static struct meta_word *
122: alloc_metaword(struct splitter_context *sc)
123: {
124: struct meta_word *mw;
125: mw = anthy_smalloc(sc->word_split_info->MwAllocator);
126: mw->type = MW_SINGLE;
127: mw->score = 0;
128: mw->struct_score = 0;
129: mw->dep_word_hash = 0;
130: mw->core_wt = anthy_wt_none;
131: mw->mw_features = 0;
132: mw->dep_class = DEP_NONE;
133: mw->wl = NULL;
134: mw->mw1 = NULL;
135: mw->mw2 = NULL;
136: mw->cand_hint.str = NULL;
137: mw->cand_hint.len = 0;
138: mw->seg_class = SEG_HEAD;
139: mw->can_use = ok;
140: return mw;
141: }
142:
143:
144:
145:
146:
147: static void
148: get_surrounding_text(struct splitter_context* sc,
149: struct word_list* wl,
150: xstr* xs_pre, xstr* xs_post)
151: {
152: int post_len = wl->part[PART_DEPWORD].len + wl->part[PART_POSTFIX].len;
153: int pre_len = wl->part[PART_PREFIX].len;
154:
155: xs_pre->str = sc->ce[wl->from].c;
156: xs_pre->len = pre_len;
157: xs_post->str = sc->ce[wl->from + wl->len - post_len].c;
158: xs_post->len = post_len;
159: }
160:
161:
162:
163:
164: static struct meta_word*
165: make_compound_nth_metaword(struct splitter_context* sc,
166: compound_ent_t ce, int nth,
167: struct word_list* wl,
168: enum metaword_type type)
169: {
170: int i;
171: int len = 0;
172: int from = wl->from;
173: int seg_num = anthy_compound_get_nr_segments(ce);
174: struct meta_word* mw;
175: xstr xs_pre, xs_core, xs_post;
176:
177: get_surrounding_text(sc, wl, &xs_pre, &xs_post);
178:
179: for (i = 0; i <= nth; ++i) {
180: from += len;
181: len = anthy_compound_get_nth_segment_len(ce, i);
182: if (i == 0) {
183: len += xs_pre.len;
184: }
185: if (i == seg_num - 1) {
186: len += xs_post.len;
187: }
188: }
189:
190: mw = alloc_metaword(sc);
191: mw->from = from;
192: mw->len = len;
193: mw->type = type;
194: mw->score = 1000;
195: mw->seg_class = wl->seg_class;
196:
197: anthy_compound_get_nth_segment_xstr(ce, nth, &xs_core);
198: if (nth == 0) {
199: anthy_xstrcat(&mw->cand_hint, &xs_pre);
200: }
201: anthy_xstrcat(&mw->cand_hint, &xs_core);
202: if (nth == seg_num - 1) {
203: anthy_xstrcat(&mw->cand_hint, &xs_post);
204: }
205: return mw;
206: }
207:
208:
209:
210:
211:
212: static struct meta_word *
213: anthy_do_cons_metaword(struct splitter_context *sc,
214: enum metaword_type type,
215: struct meta_word *mw, struct meta_word *mw2)
216: {
217: struct meta_word *n;
218:
219: n = alloc_metaword(sc);
220: n->from = mw->from;
221: n->len = mw->len + (mw2 ? mw2->len : 0);
222:
223: if (mw2) {
224: n->score = sqrt(mw->score) * sqrt(mw2->score);
225: } else {
226: n->score = mw->score;
227: }
228: n->type = type;
229: n->mw1 = mw;
230: n->mw2 = mw2;
231: if (mw2) {
232: n->seg_class = mw2->seg_class;
233: n->nr_parts = mw->nr_parts + mw2->nr_parts;
234: n->dep_word_hash = mw2->dep_word_hash;
235: } else {
236: n->seg_class = mw->seg_class;
237: n->nr_parts = mw->nr_parts;
238: n->dep_word_hash = mw->dep_word_hash;
239: }
240: anthy_commit_meta_word(sc, n);
241: return n;
242: }
243:
244:
245:
246:
247: static void
248: make_compound_metaword(struct splitter_context* sc, struct word_list* wl)
249: {
250: int i, j;
251: seq_ent_t se = wl->part[PART_CORE].seq;
252: int ent_num = anthy_get_nr_dic_ents(se, NULL);
253:
254: for (i = 0; i < ent_num; ++i) {
255: compound_ent_t ce;
256: int seg_num;
257: struct meta_word *mw = NULL;
258: struct meta_word *mw2 = NULL;
259: if (!anthy_get_nth_dic_ent_is_compound(se, i)) {
260: continue;
261: }
262: ce = anthy_get_nth_compound_ent(se, i);
263: seg_num = anthy_compound_get_nr_segments(ce);
264:
265: for (j = seg_num - 1; j >= 0; --j) {
266: enum metaword_type type;
267: mw = make_compound_nth_metaword(sc, ce, j, wl, MW_COMPOUND_LEAF);
268: anthy_commit_meta_word(sc, mw);
269:
270: type = j == 0 ? MW_COMPOUND_HEAD : MW_COMPOUND;
271: mw2 = anthy_do_cons_metaword(sc, type, mw, mw2);
272: }
273: }
274: }
275:
276:
277:
278:
279: static void
280: make_compound_part_metaword(struct splitter_context* sc, struct word_list* wl)
281: {
282: int i, j, k;
283: seq_ent_t se = wl->part[PART_CORE].seq;
284: int ent_num = anthy_get_nr_dic_ents(se, NULL);
285:
286: for (i = 0; i < ent_num; ++i) {
287: compound_ent_t ce;
288: int seg_num;
289: struct meta_word *mw = NULL;
290: struct meta_word *mw2 = NULL;
291:
292: if (!anthy_get_nth_dic_ent_is_compound(se, i)) {
293: continue;
294: }
295:
296: ce = anthy_get_nth_compound_ent(se, i);
297: seg_num = anthy_compound_get_nr_segments(ce);
298:
299:
300: for (j = seg_num - 1; j >= 0; --j) {
301: mw = make_compound_nth_metaword(sc, ce, j, wl, MW_COMPOUND_PART);
302: for (k = j - 1; k >= 0; --k) {
303: mw2 = make_compound_nth_metaword(sc, ce, k, wl, MW_COMPOUND_PART);
304: mw2->len += mw->len;
305: mw2->score += mw->score;
306: anthy_xstrcat(&mw2->cand_hint, &mw->cand_hint);
307:
308: anthy_commit_meta_word(sc, mw2);
309: mw = mw2;
310: }
311: }
312: }
313: }
314:
315:
316:
317:
318: static void
319: make_simple_metaword(struct splitter_context *sc, struct word_list* wl)
320: {
321: struct meta_word *mw = alloc_metaword(sc);
322: mw->wl = wl;
323: mw->from = wl->from;
324: mw->len = wl->len;
325: mw->score = 1000;
326: mw->type = MW_SINGLE;
327: mw->dep_class = wl->part[PART_DEPWORD].dc;
328: mw->seg_class = wl->seg_class;
329: if (wl->part[PART_CORE].len) {
330: mw->core_wt = wl->part[PART_CORE].wt;
331: }
332: mw->nr_parts = NR_PARTS;
333: mw->dep_word_hash = wl->dep_word_hash;
334: mw->mw_features = wl->mw_features;
335: anthy_commit_meta_word(sc, mw);
336: }
337:
338:
339:
340:
341: static void
342: make_metaword_from_word_list(struct splitter_context *sc)
343: {
344: int i;
345: for (i = 0; i < sc->char_count; i++) {
346: struct word_list *wl;
347: for (wl = sc->word_split_info->cnode[i].wl;
348: wl; wl = wl->next) {
349: if (wl->is_compound) {
350: make_compound_part_metaword(sc, wl);
351: make_compound_metaword(sc, wl);
352: } else {
353: make_simple_metaword(sc, wl);
354: }
355: }
356: }
357: }
358:
359:
360:
361:
362: static struct meta_word *
363: list_metaword(struct splitter_context *sc,
364: enum metaword_type type,
365: struct meta_word *mw, struct meta_word *mw2)
366: {
367: struct meta_word *wrapped_mw = anthy_do_cons_metaword(sc, type, mw2, NULL);
368: struct meta_word *n = anthy_do_cons_metaword(sc, type, mw, wrapped_mw);
369:
370: n->mw_features = mw->mw_features | mw2->mw_features;
371:
372: return n;
373: }
374:
375:
376:
377:
378: static void
379: try_combine_v_renyou_a(struct splitter_context *sc,
380: struct meta_word *mw, struct meta_word *mw2)
381: {
382: wtype_t w2;
383: if (!mw->wl || !mw2->wl) return;
384:
385: w2 = mw2->wl->part[PART_CORE].wt;
386:
387: if (mw->wl->head_pos == POS_V &&
388: mw->wl->tail_ct == CT_RENYOU &&
389: anthy_wtype_get_pos(w2) == POS_D2KY) {
390:
391: if (anthy_get_seq_ent_wtype_freq(mw2->wl->part[PART_CORE].seq,
392: anthy_wtype_a_tail_of_v_renyou)) {
393: list_metaword(sc, MW_V_RENYOU_A, mw, mw2);
394: }
395: }
396: }
397:
398:
399:
400:
401: static void
402: try_combine_v_renyou_noun(struct splitter_context *sc,
403: struct meta_word *mw, struct meta_word *mw2)
404: {
405: wtype_t w2;
406: if (!mw->wl || !mw2->wl) return;
407:
408: w2 = mw2->wl->part[PART_CORE].wt;
409: if (mw->wl->head_pos == POS_V &&
410: mw->wl->tail_ct == CT_RENYOU &&
411: anthy_wtype_get_pos(w2) == POS_NOUN &&
412: anthy_wtype_get_scos(w2) == SCOS_T40) {
413: list_metaword(sc, MW_V_RENYOU_NOUN, mw, mw2);
414: }
415: }
416:
417:
418:
419:
420: static void
421: try_combine_number(struct splitter_context *sc,
422: struct meta_word *mw1, struct meta_word *mw2)
423: {
424: struct word_list *wl1 = mw1->wl;
425: struct word_list *wl2 = mw2->wl;
426: struct meta_word *combined_mw;
427: int recursive = wl2 ? 0 : 1;
428:
429:
430:
431: if (anthy_wtype_get_pos(wl1->part[PART_CORE].wt) != POS_NUMBER) return;
432: if (recursive) {
433:
434: if (mw2->type != MW_NUMBER) return;
435: wl2 = mw2->mw1->wl;
436: } else {
437:
438: if (anthy_wtype_get_pos(wl2->part[PART_CORE].wt) != POS_NUMBER) return;
439: }
440:
441: if (wl1->part[PART_POSTFIX].len == 0 &&
442: wl1->part[PART_DEPWORD].len == 0) {
443: int scos1 = anthy_wtype_get_scos(wl1->part[PART_CORE].wt);
444: int scos2 = anthy_wtype_get_scos(wl2->part[PART_CORE].wt);
445:
446:
447: if (scos2 == SCOS_NONE) return;
448:
449:
450:
451:
452:
453: switch (scos1) {
454: case SCOS_N1:
455: if (scos2 == SCOS_N1) return;
456: case SCOS_N10:
457: if (scos2 == SCOS_N10) return;
458: case SCOS_N100:
459: if (scos2 == SCOS_N100) return;
460: case SCOS_N1000:
461: if (scos2 == SCOS_N1000) return;
462: case SCOS_N10000:
463:
464:
465: break;
466: default:
467: return;
468: }
469:
470: if (recursive) {
471: combined_mw = anthy_do_cons_metaword(sc, MW_NUMBER, mw1, mw2);
472: } else {
473:
474: combined_mw = list_metaword(sc, MW_NUMBER, mw1, mw2);
475: }
476: combine_metaword(sc, combined_mw);
477: }
478: }
479:
480:
481: static void
482: try_combine_metaword(struct splitter_context *sc,
483: struct meta_word *mw1, struct meta_word *mw2)
484: {
485: if (!mw1->wl) return;
486:
487:
488:
489: if (mw2->wl && mw2->wl->part[PART_PREFIX].len > 0) {
490: return;
491: }
492:
493: try_combine_v_renyou_a(sc, mw1, mw2);
494: try_combine_v_renyou_noun(sc, mw1, mw2);
495: try_combine_number(sc, mw1, mw2);
496: }
497:
498: static void
499: combine_metaword(struct splitter_context *sc, struct meta_word *mw)
500: {
501: struct word_split_info_cache *info = sc->word_split_info;
502: int i;
503:
504: if (mw->mw_features & MW_FEATURE_DEP_ONLY) {
505:
506: return;
507: }
508:
509: for (i = mw->from - 1; i >= 0; i--) {
510: struct meta_word *mw_left;
511: for (mw_left = info->cnode[i].mw; mw_left; mw_left = mw_left->next) {
512: if (mw_left->from + mw_left->len == mw->from) {
513:
514: try_combine_metaword(sc, mw_left, mw);
515: }
516: }
517: }
518: }
519:
520: static void
521: combine_metaword_all(struct splitter_context *sc)
522: {
523: int i;
524:
525: struct word_split_info_cache *info = sc->word_split_info;
526:
527: for (i = sc->char_count - 1; i >= 0; i--){
528: struct meta_word *mw;
529:
530: for (mw = info->cnode[i].mw;
531: mw; mw = mw->next) {
532: combine_metaword(sc, mw);
533: }
534: }
535: }
536:
537: static void
538: make_dummy_metaword(struct splitter_context *sc, int from,