1:
2:
3:
4:
5:
6:
7:
8:
9:
10:
11:
12:
13:
14:
15:
16:
17:
18:
19:
20:
21:
22:
23:
24:
25:
26:
27:
28:
29:
30:
31:
32:
33:
34:
35:
36:
37:
38:
39:
40: #include <stdio.h>
41: #include <stdlib.h>
42: #include <string.h>
43: #include <math.h>
44:
45: #include <anthy/alloc.h>
46: #include <anthy/xstr.h>
47: #include <anthy/segclass.h>
48: #include <anthy/splitter.h>
49: #include <anthy/feature_set.h>
50: #include <anthy/diclib.h>
51: #include "wordborder.h"
52:
53: static float anthy_normal_length = 20.0;
54: static void *trans_info_array;
55:
56: #define NODE_MAX_SIZE 50
57:
58:
59: struct lattice_node {
60: int border;
61: enum seg_class seg_class;
62:
63:
64: double real_probability;
65: double adjusted_probability;
66:
67:
68: struct lattice_node* before_node;
69: struct meta_word* mw;
70:
71: struct lattice_node* next;
72: };
73:
74: struct node_list_head {
75: struct lattice_node *head;
76: int nr_nodes;
77: };
78:
79: struct lattice_info {
80:
81: struct node_list_head *lattice_node_list;
82: struct splitter_context *sc;
83:
84: allocator node_allocator;
85: };
86:
87:
88:
89: static void
90: print_lattice_node(struct lattice_info *info, struct lattice_node *node)
91: {
92: if (!node) {
93: printf("**lattice_node (null)*\n");
94: return ;
95: }
96: printf("**lattice_node probability=%.128f\n", node->real_probability);
97: if (node->mw) {
98: anthy_print_metaword(info->sc, node->mw);
99: }
100: }
101:
102: static double
103: get_poisson(double lambda, int r)
104: {
105: int i;
106: double result;
107:
108:
109: result = pow(lambda, r) * exp(-lambda);
110: for (i = 2; i <= r; ++i) {
111: result /= i;
112: }
113:
114: return result;
115: }
116:
117:
118: static double
119: get_form_bias(struct meta_word *mw)
120: {
121: double bias;
122: int r;
123:
124: while (mw->type == MW_WRAP) {
125: mw = mw->mw1;
126: }
127:
128: r = mw->len;
129: if (r > 6) {
130: r = 6;
131: }
132: if (r < 2) {
133: r = 2;
134: }
135: if (mw->seg_class == SEG_RENTAI_SHUSHOKU &&
136: r < 3) {
137:
138: r = 3;
139: }
140: bias = get_poisson(anthy_normal_length, r);
141: return bias;
142: }
143:
144: static void
145: build_feature_list(struct lattice_node *node,
146: struct feature_list *features)
147: {
148: int pc, cc;
149: if (node) {
150: cc = node->seg_class;
151: } else {
152: cc = SEG_TAIL;
153: }
154: anthy_feature_list_set_cur_class(features, cc);
155: if (node && node->before_node) {
156: pc = node->before_node->seg_class;
157: } else {
158: pc = SEG_HEAD;
159: }
160: anthy_feature_list_set_class_trans(features, pc, cc);
161:
162: if (node && node->mw) {
163: struct meta_word *mw = node->mw;
164: anthy_feature_list_set_dep_class(features, mw->dep_class);
165: anthy_feature_list_set_dep_word(features,
166: mw->dep_word_hash);
167: anthy_feature_list_set_mw_features(features, mw->mw_features);
168: anthy_feature_list_set_noun_cos(features, mw->core_wt);
169:
170: }
171: anthy_feature_list_sort(features);
172: }
173:
174: static double
175: calc_probability(int cc, struct feature_list *fl)
176: {
177: struct feature_freq *res, arg;
178: double prob;
179:
180:
181: res = anthy_find_feature_freq(trans_info_array,
182: fl, &arg);
183: prob = 0;
184: if (res) {
185: double pos = res->f[15];
186: double neg = res->f[14];
187: prob = 1 - (neg) / (double) (pos + neg);
188: }
189: if (prob <= 0) {
190:
191: prob = 1.0f / (double)(10000 * 100);
192: }
193:
194: if (anthy_splitter_debug_flags() & SPLITTER_DEBUG_LN) {
195: anthy_feature_list_print(fl);
196: printf(" cc=%d(%s), P=%f\n", cc, anthy_seg_class_name(cc), prob);
197: }
198: return prob;
199: }
200:
201: static double
202: get_transition_probability(struct lattice_node *node)
203: {
204: struct feature_list features;
205: double probability;
206:
207:
208: anthy_feature_list_init(&features);
209: build_feature_list(node, &features);
210: probability = calc_probability(node->seg_class, &features);
211: anthy_feature_list_free(&features);
212:
213:
214: probability *= get_form_bias(node->mw);
215: return probability;
216: }
217:
218: static struct lattice_info*
219: alloc_lattice_info(struct splitter_context *sc, int size)
220: {
221: int i;
222: struct lattice_info* info = (struct lattice_info*)malloc(sizeof(struct lattice_info));
223: info->sc = sc;
224: info->lattice_node_list = (struct node_list_head*)
225: malloc((size + 1) * sizeof(struct node_list_head));
226: for (i = 0; i < size + 1; i++) {
227: info->lattice_node_list[i].head = NULL;
228: info->lattice_node_list[i].nr_nodes = 0;
229: }
230: info->node_allocator = anthy_create_allocator(sizeof(struct lattice_node),
231: NULL);
232: return info;
233: }
234:
235: static void
236: calc_node_parameters(struct lattice_node *node)
237: {
238:
239: node->seg_class = node->mw ? node->mw->seg_class : SEG_HEAD;
240:
241: if (node->before_node) {
242:
243: node->real_probability = node->before_node->real_probability *
244: get_transition_probability(node);
245: node->adjusted_probability = node->real_probability *
246: (node->mw ? node->mw->score : 1000);
247: } else {
248:
249: node->real_probability = 1.0;
250: node->adjusted_probability = node->real_probability;
251: }
252: }
253:
254: static struct lattice_node*
255: alloc_lattice_node(struct lattice_info *info,
256: struct lattice_node* before_node,
257: struct meta_word* mw, int border)
258: {
259: struct lattice_node* node;
260: node = anthy_smalloc(info->node_allocator);
261: node->before_node = before_node;
262: node->border = border;
263: node->next = NULL;
264: node->mw = mw;
265:
266: calc_node_parameters(node);
267:
268: return node;
269: }
270:
271: static void
272: release_lattice_node(struct lattice_info *info, struct lattice_node* node)
273: {
274: anthy_sfree(info->node_allocator, node);
275: }
276:
277: static void
278: release_lattice_info(struct lattice_info* info)
279: {
280: anthy_free_allocator(info->node_allocator);
281: free(info->lattice_node_list);
282: free(info);
283: }
284:
285: static int
286: cmp_node_by_type(struct lattice_node *lhs, struct lattice_node *rhs,
287: enum metaword_type type)
288: {
289: if (lhs->mw->type == type && rhs->mw->type != type) {
290: return 1;
291: } else if (lhs->mw->type != type && rhs->mw->type == type) {
292: return -1;
293: } else {
294: return 0;
295: }
296: }
297:
298: static int
299: cmp_node_by_type_to_type(struct lattice_node *lhs, struct lattice_node *rhs,
300: enum metaword_type type1, enum metaword_type type2)
301: {
302: if (lhs->mw->type == type1 && rhs->mw->type == type2) {
303: return 1;
304: } else if (lhs->mw->type == type2 && rhs->mw->type == type1) {
305: return -1;
306: } else {
307: return 0;
308: }
309: }
310:
311:
312:
313:
314:
315:
316:
317:
318:
319: static int
320: cmp_node(struct lattice_node *lhs, struct lattice_node *rhs)
321: {
322: struct lattice_node *lhs_before = lhs;
323: struct lattice_node *rhs_before = rhs;
324: int ret;
325:
326: if (lhs && !rhs) return 1;
327: if (!lhs && rhs) return -1;
328: if (!lhs && !rhs) return 0;
329:
330: while (lhs_before && rhs_before) {
331: if (lhs_before->mw && rhs_before->mw &&
332: lhs_before->mw->from + lhs_before->mw->len == rhs_before->mw->from + rhs_before->mw->len) {
333:
334: ret = cmp_node_by_type(lhs_before, rhs_before, MW_OCHAIRE);
335: if (ret != 0) return ret;
336:
337:
338: ret = cmp_node_by_type_to_type(lhs_before, rhs_before, MW_COMPOUND_HEAD, MW_COMPOUND_PART);
339: if (ret != 0) return ret;
340: } else {
341: break;
342: }
343: lhs_before = lhs_before->before_node;
344: rhs_before = rhs_before->before_node;
345: }
346:
347:
348: if (lhs->adjusted_probability > rhs->adjusted_probability) {
349: return 1;
350: } else if (lhs->adjusted_probability < rhs->adjusted_probability) {
351: return -1;
352: } else {
353: return 0;
354: }
355: }
356:
357:
358:
359:
360: static void
361: push_node(struct lattice_info* info, struct lattice_node* new_node,
362: int position)
363: {
364: struct lattice_node* node;
365: struct lattice_node* previous_node = NULL;
366:
367: if (anthy_splitter_debug_flags() & SPLITTER_DEBUG_LN) {
368: print_lattice_node(info, new_node);
369: }
370:
371:
372: node = info->lattice_node_list[position].head;
373: if (!node) {
374: info->lattice_node_list[position].head = new_node;
375: info->lattice_node_list[position].nr_nodes ++;
376: return;
377: }
378:
379: while (node->next) {
380:
381: if (new_node->seg_class == node->seg_class &&
382: new_node->border == node->border) {
383:
384: switch (cmp_node(new_node, node)) {
385: case 0:
386: case 1:
387:
388: if (previous_node) {
389: previous_node->next = new_node;
390: } else {
391: info->lattice_node_list[position].head = new_node;
392: }
393: new_node->next = node->next;
394: release_lattice_node(info, node);
395: break;
396: case -1:
397:
398: release_lattice_node(info, new_node);
399: break;
400: }
401: return;
402: }
403: previous_node = node;
404: node = node->next;
405: }
406:
407:
408: node->next = new_node;
409: info->lattice_node_list[position].nr_nodes ++;
410: }
411:
412:
413: static void
414: remove_min_node(struct lattice_info *info, struct node_list_head *node_list)
415: {
416: struct lattice_node* node = node_list->head;
417: struct lattice_node* previous_node = NULL;
418: struct lattice_node* min_node = node;
419: struct lattice_node* previous_min_node = NULL;
420:
421:
422: while (node) {
423: if (cmp_node(node, min_node) < 0) {
424: previous_min_node = previous_node;
425: min_node = node;
426: }
427: previous_node = node;
428: node = node->next;
429: }
430:
431:
432: if (previous_min_node) {
433: previous_min_node->next = min_node->next;
434: } else {
435: node_list->head = min_node->next;
436: }
437: release_lattice_node(info, min_node);
438: node_list->nr_nodes --;
439: }
440:
441:
442: static void
443: choose_path(struct lattice_info* info, int to)
444: {
445:
446: struct lattice_node* node;
447: struct lattice_node* best_node = NULL;
448: int last = to;
449: while (!info->lattice_node_list[last].head) {
450:
451: --last;
452: }
453: for (node = info->lattice_node_list[last].head; node; node = node->next) {
454: if (cmp_node(node, best_node) > 0) {
455: best_node = node;
456: }
457: }
458: if (!best_node) {
459: return;
460: }
461:
462:
463: node = best_node;
464: if (anthy_splitter_debug_flags() & SPLITTER_DEBUG_LN) {
465: printf("choose_path()\n");
466: }
467: while (node->before_node) {
468: info->sc->word_split_info->best_seg_class[node->border] =
469: node->seg_class;
470: anthy_mark_border_by_metaword(info->sc, node->mw);
471:
472: if (anthy_splitter_debug_flags() & SPLITTER_DEBUG_LN) {
473: print_lattice_node(info, node);
474: }
475:
476: node = node->before_node;
477: }
478: }
479:
480: static void
481: build_graph(struct lattice_info* info, int from, int to)
482: {
483: int i;
484: struct lattice_node* node;
485: struct lattice_node* left_node;
486:
487:
488: node = alloc_lattice_node(info, NULL, NULL, from);
489: push_node(info, node, from);
490:
491:
492:
493:
494:
495:
496: for (i = from; i < to; ++i) {
497: for (left_node = info->lattice_node_list[i].head; left_node;
498: left_node = left_node->next) {
499: struct meta_word *mw;
500:
501:
502: for (mw = info->sc->word_split_info->cnode[i].mw; mw; mw = mw->next) {
503: int position;
504: struct lattice_node* new_node;
505:
506:
507: if (mw->can_use != ok) {
508: continue;
509: }
510: position = i + mw->len;
511: new_node = alloc_lattice_node(info, left_node, mw, i);
512: push_node(info, new_node, position);
513:
514:
515: if (info->lattice_node_list[position].nr_nodes >= NODE_MAX_SIZE) {
516: remove_min_node(info, &info->lattice_node_list[position]);
517: }
518: }
519: }
520: }
521:
522:
523: for (node = info->lattice_node_list[to].head; node; node = node->next) {
524: struct feature_list features;
525: anthy_feature_list_init(&features);
526: build_feature_list(NULL, &features);
527: node->adjusted_probability = node->adjusted_probability *
528: calc_probability(SEG_TAIL, &features);
529: anthy_feature_list_free(&features);
530: }
531: }
532:
533: void
534: anthy_mark_borders(struct splitter_context *sc, int from, int to)
535: {
536: struct lattice_info* info = alloc_lattice_info(sc, to);
537: trans_info_array = anthy_file_dic_get_section("trans_info");
538: build_graph(info, from, to);
539: choose_path(info, to);
540: release_lattice_info(info);
541: }