1:
2:
3:
4:
5:
6:
7:
8:
9:
10:
11:
12:
13:
14:
15:
16:
17:
18:
19:
20:
21:
22:
23:
24:
25:
26:
27:
28:
29:
30:
31:
32:
33:
34: #include <stdio.h>
35: #include <stdlib.h>
36: #include <string.h>
37:
38: #include <anthy/dic.h>
39: #include <anthy/splitter.h>
40: #include <anthy/segment.h>
41: #include "wordborder.h"
42:
43:
44: static struct cand_ent *
45: alloc_cand_ent(void)
46: {
47: struct cand_ent *ce;
48: ce = (struct cand_ent *)malloc(sizeof(struct cand_ent));
49: ce->nr_words = 0;
50: ce->elm = NULL;
51: ce->mw = NULL;
52: ce->core_elm_index = -1;
53: ce->dep_word_hash = 0;
54: return ce;
55: }
56:
57:
58:
59:
60: static struct cand_ent *
61: dup_candidate(struct cand_ent *ce)
62: {
63: struct cand_ent *ce_new;
64: int i;
65: ce_new = alloc_cand_ent();
66: ce_new->nr_words = ce->nr_words;
67: ce_new->str.len = ce->str.len;
68: ce_new->str.str = anthy_xstr_dup_str(&ce->str);
69: ce_new->elm = malloc(sizeof(struct cand_elm)*ce->nr_words);
70: ce_new->flag = ce->flag;
71: ce_new->core_elm_index = ce->core_elm_index;
72: ce_new->mw = ce->mw;
73: ce_new->score = ce->score;
74: ce_new->dep_word_hash = ce->dep_word_hash;
75:
76: for (i = 0 ; i < ce->nr_words ; i++) {
77: ce_new->elm[i] = ce->elm[i];
78: }
79: return ce_new;
80: }
81:
82:
83: static void
84: push_back_candidate(struct seg_ent *seg, struct cand_ent *ce)
85: {
86:
87: seg->nr_cands++;
88: seg->cands = (struct cand_ent **)
89: realloc(seg->cands, sizeof(struct cand_ent *) * seg->nr_cands);
90: seg->cands[seg->nr_cands - 1] = ce;
91:
92: if (anthy_splitter_debug_flags() & SPLITTER_DEBUG_CAND) {
93: anthy_print_candidate(ce);
94: printf("\n");
95: }
96: }
97:
98: static void
99: push_back_guessed_candidate(struct seg_ent *seg)
100: {
101: xchar xc;
102: xstr *xs;
103: struct cand_ent *ce;
104: if (seg->str.len < 2) {
105: return ;
106: }
107:
108: xc = seg->str.str[seg->str.len - 1];
109: if (!(anthy_get_xchar_type(xc) & XCT_DEP)) {
110: return ;
111: }
112:
113: ce = alloc_cand_ent();
114: xs = anthy_xstr_hira_to_kata(&seg->str);
115: xs->str[xs->len-1] = xc;
116: ce->str.str = anthy_xstr_dup_str(xs);
117: ce->str.len = xs->len;
118: ce->flag = CEF_GUESS;
119: anthy_free_xstr(xs);
120: push_back_candidate(seg, ce);
121: }
122:
123:
124: static int
125: enum_candidates(struct seg_ent *seg,
126: struct cand_ent *ce,
127: int from, int n)
128: {
129: int i, p;
130: struct cand_ent *cand;
131: int nr_cands = 0;
132: int pos;
133:
134: if (n == ce->mw->nr_parts) {
135:
136:
137: xstr tail;
138: tail.len = seg->len - from;
139: tail.str = &seg->str.str[from];
140: anthy_xstrcat(&ce->str, &tail);
141: push_back_candidate(seg, dup_candidate(ce));
142: return 1;
143: }
144:
145: p = anthy_get_nr_dic_ents(ce->elm[n].se, &ce->elm[n].str);
146:
147:
148: for (i = 0; i < p; i++) {
149: wtype_t wt;
150: if (anthy_get_nth_dic_ent_is_compound(ce->elm[n].se, i)) {
151: continue;
152: }
153: anthy_get_nth_dic_ent_wtype(ce->elm[n].se, &ce->elm[n].str, i, &wt);
154:
155: ce->elm[n].wt = anthy_get_wtype_with_ct(ce->elm[n].wt, CT_NONE);
156: if (anthy_wtype_include(ce->elm[n].wt, wt)) {
157: xstr word, yomi;
158:
159: yomi.len = ce->elm[n].str.len;
160: yomi.str = &seg->str.str[from];
161: cand = dup_candidate(ce);
162: anthy_get_nth_dic_ent_str(cand->elm[n].se,
163: &yomi, i, &word);
164: cand->elm[n].nth = i;
165: cand->elm[n].id = anthy_xstr_hash(&word);
166:
167:
168: anthy_xstrcat(&cand->str, &word);
169: free(word.str);
170:
171: nr_cands += enum_candidates(seg, cand,
172: from + yomi.len,
173: n+1);
174: anthy_release_cand_ent(cand);
175: }
176: }
177:
178:
179: pos = anthy_wtype_get_pos(ce->elm[n].wt);
180: if (nr_cands == 0 || pos == POS_INVAL || pos == POS_NONE) {
181: xstr xs;
182: xs.len = ce->elm[n].str.len;
183: xs.str = &seg->str.str[from];
184: cand = dup_candidate(ce);
185: cand->elm[n].nth = -1;
186: cand->elm[n].id = -1;
187: anthy_xstrcat(&cand->str, &xs);
188: nr_cands = enum_candidates(seg,cand,
189: from + xs.len,
190: n + 1);
191: anthy_release_cand_ent(cand);
192: return nr_cands;
193: }
194:
195: return nr_cands;
196: }
197:
198:
199:
200:
201: static void
202: push_back_singleword_candidate(struct seg_ent *seg,
203: int is_reverse)
204: {
205: seq_ent_t se;
206: struct cand_ent *ce;
207: wtype_t wt;
208: int i, n;
209: xstr xs;
210:
211: se = anthy_get_seq_ent_from_xstr(&seg->str, is_reverse);
212: n = anthy_get_nr_dic_ents(se, &seg->str);
213:
214: for (i = 0; i < n; i++) {
215: int ct;
216: if (anthy_get_nth_dic_ent_is_compound(se, i)) {
217: continue;
218: }
219:
220: anthy_get_nth_dic_ent_wtype(se, &seg->str, i, &wt);
221: ct = anthy_wtype_get_ct(wt);
222:
223: if (ct == CT_SYUSI || ct == CT_NONE) {
224: ce = alloc_cand_ent();
225: anthy_get_nth_dic_ent_str(se,&seg->str, i, &xs);
226: ce->str.str = xs.str;
227: ce->str.len = xs.len;
228: ce->flag = CEF_SINGLEWORD;
229: push_back_candidate(seg, ce);
230: }
231: }
232: }
233:
234: static void
235: push_back_noconv_candidate(struct seg_ent *seg)
236: {
237:
238: struct cand_ent *ce;
239: xstr *xs;
240:
241:
242: ce = alloc_cand_ent();
243: ce->str.str = anthy_xstr_dup_str(&seg->str);
244: ce->str.len = seg->str.len;
245: ce->flag = CEF_HIRAGANA;
246: push_back_candidate(seg, ce);
247:
248:
249: ce = alloc_cand_ent();
250: xs = anthy_xstr_hira_to_kata(&seg->str);
251: ce->str.str = anthy_xstr_dup_str(xs);
252: ce->str.len = xs->len;
253: ce->flag = CEF_KATAKANA;
254: anthy_free_xstr(xs);
255: push_back_candidate(seg, ce);
256:
257:
258: xs = anthy_conv_half_wide(&seg->str);
259: if (xs) {
260: ce = alloc_cand_ent();
261: ce->str.str = anthy_xstr_dup_str(xs);
262: ce->str.len = xs->len;
263: ce->flag = CEF_NONE;
264: anthy_free_xstr(xs);
265: push_back_candidate(seg, ce);
266: }
267: }
268:
269:
270: static void
271: make_cand_elem_from_word_list(struct seg_ent *se,
272: struct cand_ent *ce,
273: struct word_list *wl,
274: int index,
275: int is_reverse)
276: {
277: int i;
278: int from = wl->from - se->from;
279:
280: for (i = 0; i < NR_PARTS; ++i) {
281: struct part_info *part = &wl->part[i];
282: xstr core_xs;
283: if (part->len == 0) {
284:
285: continue;
286: }
287: if (i == PART_CORE) {
288: ce->core_elm_index = i + index;
289: }
290: core_xs.str = &se->str.str[from];
291: core_xs.len = part->len;
292: if (i == PART_DEPWORD) {
293: ce->dep_word_hash = anthy_dep_word_hash(&core_xs);
294: }
295: ce->elm[i + index].se = anthy_get_seq_ent_from_xstr(&core_xs, is_reverse);
296: ce->elm[i + index].str.str = core_xs.str;
297: ce->elm[i + index].str.len = core_xs.len;
298: ce->elm[i + index].wt = part->wt;
299: ce->elm[i + index].ratio = RATIO_BASE * wl->len;
300: from += part->len;
301: }
302: }
303:
304:
305:
306: static void
307: make_candidate_from_simple_metaword(struct seg_ent *se,
308: struct meta_word *mw,
309: struct meta_word *top_mw,
310: int is_reverse)
311: {
312:
313:
314:
315: struct cand_ent *ce;
316:
317:
318: ce = alloc_cand_ent();
319: ce->nr_words = mw->nr_parts;
320: ce->str.str = NULL;
321: ce->str.len = 0;
322: ce->elm = calloc(sizeof(struct cand_elm),ce->nr_words);
323: ce->mw = mw;
324: ce->score = 0;
325:
326:
327: make_cand_elem_from_word_list(se, ce, mw->wl, 0, is_reverse);
328:
329:
330: if (anthy_metaword_type_tab[top_mw->type].status != MW_STATUS_WRAPPED) {
331: ce->flag = (se->best_mw == mw) ? CEF_BEST : CEF_NONE;
332: } else {
333: ce->flag = CEF_GUESS;
334: }
335:
336: enum_candidates(se, ce, 0, 0);
337: anthy_release_cand_ent(ce);
338: }
339:
340:
341: static void
342: make_candidate_from_combined_metaword(struct seg_ent *se,
343: struct meta_word *mw,
344: struct meta_word *top_mw,
345: int is_reverse)
346: {
347:
348:
349:
350: struct cand_ent *ce;
351:
352:
353: ce = alloc_cand_ent();
354: ce->nr_words = mw->nr_parts;
355: ce->score = 0;
356: ce->str.str = NULL;
357: ce->str.len = 0;
358: ce->elm = calloc(sizeof(struct cand_elm),ce->nr_words);
359: ce->mw = top_mw;
360:
361:
362: make_cand_elem_from_word_list(se, ce, mw->mw1->wl, 0, is_reverse);
363: if (mw->mw2) {
364: make_cand_elem_from_word_list(se, ce, mw->mw2->mw1->wl, NR_PARTS, is_reverse);
365: }
366:
367:
368: if (anthy_metaword_type_tab[top_mw->type].status != MW_STATUS_WRAPPED) {
369: ce->flag = (se->best_mw == mw) ? CEF_BEST : CEF_NONE;
370: } else {
371: ce->flag = CEF_GUESS;
372: }
373:
374: enum_candidates(se, ce, 0, 0);
375: anthy_release_cand_ent(ce);
376: }
377:
378:
379:
380:
381: static void
382: proc_splitter_info(struct seg_ent *se,
383: struct meta_word *mw,
384:
385: struct meta_word *top_mw,
386: int is_reverse)
387: {
388: enum mw_status st;
389: if (!mw) return;
390:
391:
392: if (mw->wl && mw->wl->len) {
393: make_candidate_from_simple_metaword(se, mw, top_mw, is_reverse);
394: return;
395: }
396:
397: st = anthy_metaword_type_tab[mw->type].status;
398: switch (st) {
399: case MW_STATUS_WRAPPED:
400:
401: proc_splitter_info(se, mw->mw1, top_mw, is_reverse);
402: break;
403: case MW_STATUS_COMBINED:
404: make_candidate_from_combined_metaword(se, mw, top_mw, is_reverse);
405: break;
406: case MW_STATUS_COMPOUND:
407:
408: {
409: struct cand_ent *ce;
410: ce = alloc_cand_ent();
411: ce->str.str = anthy_xstr_dup_str(&mw->cand_hint);
412: ce->str.len = mw->cand_hint.len;
413: ce->flag = CEF_COMPOUND;
414: ce->mw = top_mw;
415: push_back_candidate(se, ce);
416: }
417: break;
418: case MW_STATUS_COMPOUND_PART:
419:
420:
421: case MW_STATUS_OCHAIRE:
422: {
423:
424:
425: struct cand_ent *ce;
426: ce = alloc_cand_ent();
427: ce->str.str = anthy_xstr_dup_str(&mw->cand_hint);
428: ce->str.len = mw->cand_hint.len;
429: ce->mw = top_mw;
430: ce->flag = (st == MW_STATUS_OCHAIRE) ? CEF_OCHAIRE : CEF_COMPOUND_PART;
431:
432: if (mw->len < se->len) {
433:
434: xstr xs;
435: xs.str = &se->str.str[mw->len];
436: xs.len = se->len - mw->len;
437: anthy_xstrcat(&ce->str ,&xs);
438: }
439: push_back_candidate(se, ce);
440: }
441: break;
442: case MW_STATUS_NONE:
443: break;
444: default:
445: break;
446: }
447: }
448:
449:
450:
451:
452: void
453: anthy_do_make_candidates(struct splitter_context *sc,
454: struct seg_ent *se, int is_reverse)
455: {
456: int i;
457:
458:
459: for (i = 0; i < se->nr_metaword; i++) {
460: struct meta_word *mw = se->mw_array[i];
461: if (anthy_splitter_debug_flags() & SPLITTER_DEBUG_CAND) {
462: anthy_print_metaword(sc, mw);
463: }
464: proc_splitter_info(se, mw, mw, is_reverse);
465: }
466: if (anthy_splitter_debug_flags() & SPLITTER_DEBUG_CAND) {
467: printf("#done\n");
468: }
469:
470: push_back_singleword_candidate(se, is_reverse);
471:
472:
473: push_back_noconv_candidate(se);
474:
475:
476: push_back_guessed_candidate(se);
477: }