1:
2:
3:
4:
5:
6: #include <stdio.h>
7: #include <string.h>
8: #include <stdlib.h>
9:
10: #include <anthy/anthy.h>
11: #include <anthy/convdb.h>
12: #include <anthy/segment.h>
13: #include <anthy/feature_set.h>
14:
15: #include "../src-main/main.h"
16: #include "../src-splitter/wordborder.h"
17: #include "../src-worddic/dic_ent.h"
18:
19:
20:
/*
 * Values stored in struct word.type.
 * WORD_INDEP: built by fill_indep_word() from a candidate element whose id
 *             is not -1; printed with the "indep_word" label.
 * WORD_DEP:   built by fill_dep_word() from a candidate element whose id
 *             is -1; printed with the "dep_word" label.
 */
#define WORD_INDEP 0
#define WORD_DEP 1
23:
24:
/*
 * Printable description of one element of a candidate.
 * Filled by fill_indep_word() / fill_dep_word(), released by free_word().
 */
struct word {
  /* WORD_INDEP or WORD_DEP, set by init_word() */
  int type;
  /*
   * WORD_DEP: hash of the element's string.
   * WORD_INDEP: 0, or the hash of conv_xs when fill_conv_info() found a
   * dictionary entry for the element.
   */
  int hash;
  /* hash of the element's own string (the same text as raw_xs) */
  int yomi_hash;
  /* copy of the element's string; freed by free_word() */
  xstr *raw_xs;
  /* copy of the chosen dictionary entry's string, or NULL; freed by free_word() */
  xstr *conv_xs;
  /* wt_name of the chosen dictionary entry, or NULL; not freed by free_word() */
  const char *wt;
};
39:
40: static struct cand_ent *
41: selected_candidate(struct seg_ent *seg)
42: {
43: if (seg->committed > -1) {
44: return seg->cands[seg->committed];
45: }
46: return seg->cands[0];
47: }
48:
49: static void
50: get_res(anthy_context_t ac, char *res_buf, int conv)
51: {
52: struct anthy_conv_stat acs;
53: int i;
54:
55: anthy_get_stat(ac, &acs);
56: res_buf[0] = 0;
57: if (!conv) {
58: strcat(res_buf, "|");
59: }
60: for (i = 0; i < acs.nr_segment; i++) {
61: char buf[1024];
62: if (conv) {
63: anthy_get_segment(ac, i, 0, buf, 1024);
64: strcat(res_buf, buf);
65: } else {
66: anthy_get_segment(ac, i, NTH_UNCONVERTED_CANDIDATE, buf, 1024);
67: strcat(res_buf, buf);
68: strcat(res_buf, "|");
69: }
70: }
71: }
72:
73: static struct conv_res *
74: do_find_conv_res(struct res_db *db, const char *src, const char *res)
75: {
76: struct conv_res *cr;
77:
78: for (cr = db->res_list.next; cr; cr = cr->next) {
79: if (((!cr->res_str && !res) ||
80: !strcmp(cr->res_str, res)) &&
81: !strcmp(cr->src_str, src)) {
82: return cr;
83: }
84: }
85: cr = (struct conv_res *)malloc(sizeof(struct conv_res));
86: cr->src_str = strdup(src);
87: if (res) {
88: cr->res_str = strdup(res);
89: } else {
90: cr->res_str = NULL;
91: }
92: cr->cand_str = NULL;
93: cr->check = CHK_UNKNOWN;
94: cr->used = 0;
95: cr->cand_check = NULL;
96:
97: db->tail->next = cr;
98: cr->next = NULL;
99: db->tail = cr;
100: return cr;
101: }
102:
103: struct conv_res *
104: find_conv_res(struct res_db *db, anthy_context_t ac,
105: const char *src, int conv)
106: {
107: char res_buf[1024];
108: get_res(ac, res_buf, conv);
109:
110: return do_find_conv_res(db, src, res_buf);
111: }
112:
/*
 * Remove one trailing '\n' from buf, in place, if there is one.
 * An empty string is left untouched (there is no last character to test).
 */
static void
chomp_line(char *buf)
{
  size_t len = strlen(buf);

  if (len > 0 && buf[len - 1] == '\n') {
    buf[len - 1] = 0;
  }
}
121:
122: struct res_db *
123: create_db(void)
124: {
125: struct res_db *db;
126:
127: db = malloc(sizeof(struct res_db));
128: db->res_list.next = NULL;
129: db->tail = &db->res_list;
130: db->total = 0;
131: db->res.unknown = 0;
132: db->res.ok = 0;
133: db->res.miss = 0;
134: db->res.dontcare = 0;
135: db->split.unknown = 0;
136: db->split.ok = 0;
137: db->split.miss = 0;
138: db->split.dontcare = 0;
139:
140: return db;
141: }
142:
/*
 * Copy str into buf, leaving out every '|' and '~' character.
 * buf must be able to hold strlen(str) + 1 bytes.
 */
static void
strip_separator_vbar(char *buf, const char *str)
{
  const char *in;
  char *out = buf;

  for (in = str; *in != 0; in++) {
    if (*in == '|' || *in == '~') {
      continue;
    }
    *out++ = *in;
  }
  *out = 0;
}
157:
158: static void
159: parse_line(struct res_db *db, char *line)
160: {
161: char buf1[1024], buf2[1024], buf3[1024], buf4[1024];
162: char *src, *res;
163: const char *check;
164: struct conv_res *cr;
165: int nr;
166: chomp_line(line);
167: if (line[0] == '#' || line[0] == 0) {
168: return ;
169: }
170: nr = sscanf(line, "%s %s %s", buf1, buf2, buf3);
171: if (nr == 1) {
172: cr = do_find_conv_res(db, buf1, NULL);
173: cr->check = CHK_UNKNOWN;
174: return ;
175: }
176: if (nr < 2) {
177: return ;
178: }
179: if (buf1[0] != '|') {
180:
181:
182:
183:
184:
185: src = buf1;
186: res = buf2;
187: if (nr == 3) {
188: check = buf3;
189: } else {
190: check = "?";
191: }
192: } else {
193:
194:
195:
196:
197:
198: strip_separator_vbar(buf4, buf1);
199: src = buf4;
200: res = buf1;
201: check = buf2;
202: }
203: cr = do_find_conv_res(db, src, res);
204: if (nr == 2 && check[0] != '|') {
205: cr->check = CHK_OK;
206: return ;
207: }
208: if (check[0] == 'O') {
209: cr->check = CHK_OK;
210: } else if (check[0] == 'X') {
211: cr->check = CHK_MISS;
212: } else if (check[0] == '*') {
213: cr->check = CHK_DONTCARE;
214: } else if (check[0] == '|') {
215: cr->check = CHK_UNKNOWN;
216: cr->cand_str = strdup(check);
217: } else {
218: cr->check = CHK_UNKNOWN;
219: }
220: }
221:
/*
 * Load the result file fn into db, one parse_line() call per line read.
 *
 * Does nothing when fn is NULL or the file cannot be opened.  Lines are
 * read in pieces of at most 1023 characters, so a longer line reaches
 * parse_line() as several pieces.
 */
void
read_db(struct res_db *db, const char *fn)
{
  FILE *fp;
  char line[1024];

  if (!fn) {
    return ;
  }
  fp = fopen(fn, "r");
  if (!fp) {
    return ;
  }
  while (fgets(line, (int)sizeof line, fp)) {
    parse_line(db, line);
  }
  /* release the stream; it was previously leaked on every call */
  fclose(fp);
}
239:
240: static void
241: fill_conv_info(struct word *w, struct cand_elm *elm)
242: {
243:
244: struct dic_ent *de;
245: if (elm->nth == -1 ||
246: elm->nth >= elm->se->nr_dic_ents) {
247: w->conv_xs = NULL;
248: w->wt = NULL;
249: return ;
250: }
251: if (!elm->se->dic_ents) {
252: w->conv_xs = NULL;
253: w->wt = NULL;
254: return ;
255: }
256:
257: de = elm->se->dic_ents[elm->nth];
258: w->conv_xs = anthy_xstr_dup(&de->str);
259: w->wt = de->wt_name;
260: w->hash = anthy_xstr_hash(w->conv_xs);
261: }
262:
263: static void
264: init_word(struct word *w, int type)
265: {
266: w->type = type;
267: w->raw_xs = NULL;
268: w->conv_xs = NULL;
269: w->wt = NULL;
270: }
271:
272: static void
273: free_word(struct word *w)
274: {
275: anthy_free_xstr(w->raw_xs);
276: anthy_free_xstr(w->conv_xs);
277: }
278:
279:
280: static void
281: fill_indep_word(struct word *w, struct cand_elm *elm)
282: {
283: init_word(w, WORD_INDEP);
284:
285: w->raw_xs = anthy_xstr_dup(&elm->str);
286: w->yomi_hash = anthy_xstr_hash(w->raw_xs);
287: w->hash = 0;
288:
289: fill_conv_info(w, elm);
290: }
291:
292:
293: static void
294: fill_dep_word(struct word *w, struct cand_elm *elm)
295: {
296: init_word(w, WORD_DEP);
297:
298: w->hash = anthy_xstr_hash(&elm->str);
299: w->yomi_hash = w->hash;
300: w->raw_xs = anthy_xstr_dup(&elm->str);
301: }
302:
/*
 * Print " features=a,b,c" for the features held by fl.
 * Prints nothing when fl is NULL or holds no features.
 */
static void
print_features(struct feature_list *fl)
{
  const char *sep = "";
  int count;
  int idx;

  if (!fl) {
    return ;
  }
  count = anthy_feature_list_nr(fl);
  if (count == 0) {
    return ;
  }

  printf(" features=");
  for (idx = 0; idx < count; idx++) {
    /* the separator is empty for the first item and "," afterwards */
    printf("%s%d", sep, anthy_feature_list_nth(fl, idx));
    sep = ",";
  }
}
322:
323: static void
324: print_word(const char *prefix, struct word *w, struct feature_list *fl)
325: {
326: printf("%s", prefix);
327: if (w->type == WORD_DEP) {
328:
329: printf("dep_word hash=%d ", w->hash);
330: anthy_putxstrln(w->raw_xs);
331: return ;
332: }
333:
334: printf("indep_word hash=%d", w->hash);
335:
336: if (fl) {
337: print_features(fl);
338: }
339:
340: if (w->wt) {
341: printf(" %s", w->wt);
342: } else {
343: printf(" null");
344: }
345:
346: if (w->conv_xs) {
347: printf(" ");
348: anthy_putxstr(w->conv_xs);
349: } else {
350: printf(" null");
351: }
352: printf(" ");
353: anthy_putxstrln(w->raw_xs);
354: }
355:
356:
357:
358:
359: static int
360: get_seg_class(struct seg_ent *seg, int cl)
361: {
362: struct cand_ent *ce;
363: if (!seg) {
364: return cl;
365: }
366: ce = selected_candidate(seg);
367: if (ce->mw) {
368: return ce->mw->seg_class;
369: }
370: return SEG_BUNSETSU;
371: }
372:
373: static void
374: set_features(struct feature_list *fl,
375: struct seg_ent *prev_seg,
376: struct seg_ent *cur_seg)
377: {
378: int cl, pc;
379: cl = get_seg_class(cur_seg, SEG_TAIL);
380: pc = get_seg_class(prev_seg, SEG_HEAD);
381:
382: anthy_feature_list_set_cur_class(fl, cl);
383: if (cur_seg) {
384: struct cand_ent *ce = selected_candidate(cur_seg);
385: anthy_feature_list_set_dep_word(fl, ce->dep_word_hash);
386: if (ce->mw) {
387: anthy_feature_list_set_dep_class(fl, ce->mw->dep_class);
388: anthy_feature_list_set_mw_features(fl, ce->mw->mw_features);
389: anthy_feature_list_set_noun_cos(fl, ce->mw->core_wt);
390: }
391: }
392: anthy_feature_list_set_class_trans(fl, pc, cl);
393:
394: anthy_feature_list_sort(fl);
395: }
396:
397: static void
398: print_element(const char *prefix,
399: struct cand_elm *elm, struct feature_list *fl)
400: {
401: struct word w;
402:
403: if (elm->str.len == 0) {
404: return ;
405: }
406: if (elm->id != -1) {
407:
408: fill_indep_word(&w, elm);
409: print_word(prefix, &w, fl);
410: } else {
411:
412: fill_dep_word(&w, elm);
413: print_word(prefix, &w, NULL);
414: }
415: free_word(&w);
416: }
417:
418: static void
419: print_unconverted(struct cand_ent *ce)
420: {
421: printf("unknown ");
422: anthy_putxstrln(&ce->str);
423: }
424:
425: static void
426: print_eos(struct seg_ent *prev_seg)
427: {
428: struct feature_list fl;
429: anthy_feature_list_init(&fl);
430: set_features(&fl, prev_seg, NULL);
431: printf("eos ");
432: print_features(&fl);
433: printf("\n");
434: anthy_feature_list_free(&fl);
435: }
436:
437:
438:
439:
440: static const char *
441: get_prefix(int flag)
442: {
443: if (flag & CONV_INVALID) {
444: return "^";
445: }
446: if (flag & CONV_SIZE_MISS) {
447: return "!";
448: }
449: if (flag & CONV_CAND_MISS) {
450: return "~";
451: }
452: return "";
453: }
454:
455: static void
456: print_segment_info(int is_negative,
457: struct seg_ent *prev_seg,
458: struct seg_ent *seg)
459: {
460: int i;
461: struct feature_list fl;
462: struct cand_ent *ce = selected_candidate(seg);
463: int nr_indep = 0;
464: const char *prefix = get_prefix(is_negative);
465:
466: anthy_feature_list_init(&fl);
467: set_features(&fl, prev_seg, seg);
468: for (i = 0; i < ce->nr_words; i++) {
469: struct cand_elm *elm = &ce->elm[i];
470: prefix = get_prefix(is_negative);
471: if (nr_indep > 0 && elm->id != -1) {
472: prefix = get_prefix(is_negative | CONV_INVALID);
473: }
474:
475: print_element(prefix, elm, &fl);
476:
477: if (elm->id != -1) {
478: nr_indep ++;
479: }
480: }
481: anthy_feature_list_free(&fl);
482: }
483:
484: void
485: print_size_miss_segment_info(anthy_context_t ac, int nth)
486: {
487: struct seg_ent *prev_seg = NULL;
488: struct seg_ent *seg = anthy_get_nth_segment(&ac->seg_list, nth);
489: if (nth > 0) {
490: prev_seg = anthy_get_nth_segment(&ac->seg_list, nth - 1);
491: }
492: print_segment_info(CONV_SIZE_MISS, prev_seg, seg);
493: }
494:
495: void
496: print_cand_miss_segment_info(anthy_context_t ac, int nth)
497: {
498: struct seg_ent *prev_seg = NULL;
499: struct seg_ent *seg = anthy_get_nth_segment(&ac->seg_list, nth);
500: if (nth > 0) {
501: prev_seg = anthy_get_nth_segment(&ac->seg_list, nth - 1);
502: }
503: print_segment_info(CONV_CAND_MISS, prev_seg, seg);
504: }
505:
506: void
507: print_context_info(anthy_context_t ac, struct conv_res *cr)
508: {
509: int i;
510: struct seg_ent *prev_seg = NULL;
511:
512: printf("segments: %d\n", ac->seg_list.nr_segments);
513:
514: for (i = 0; i < ac->seg_list.nr_segments; i++) {
515: struct seg_ent *seg = anthy_get_nth_segment(&ac->seg_list, i);
516: struct cand_ent *ce = selected_candidate(seg);
517: int is_negative = 0;
518: if (cr && cr->cand_check && cr->cand_check[i]) {
519: is_negative = CONV_CAND_MISS;
520: }
521:
522:
523: if (!ce->nr_words) {
524:
525: print_unconverted(ce);
526: } else {
527:
528: if (seg->committed > 0) {
529: int tmp = seg->committed;
530: seg->committed = 0;
531: print_cand_miss_segment_info(ac, i);
532: seg->committed = tmp;
533: }
534:
535: print_segment_info(is_negative, prev_seg, seg);
536: }
537:
538: prev_seg = seg;
539: }
540: print_eos(prev_seg);
541: printf("\n");
542: }