1:
2:
3:
4:
5:
6:
7:
8:
9:
10:
11:
12:
13:
14:
15:
16:
17:
18: #include <stdlib.h>
19: #include <stdio.h>
20: #include <string.h>
21: #include <arpa/inet.h>
22:
23: #include <anthy/alloc.h>
24: #include <anthy/record.h>
25: #include <anthy/xstr.h>
26: #include <anthy/diclib.h>
27: #include <anthy/wtype.h>
28: #include <anthy/ruleparser.h>
29: #include <anthy/dic.h>
30: #include <anthy/splitter.h>
31: #include <anthy/feature_set.h>
32: #include "wordborder.h"
33:
/* Core-part frequency above which set_features() marks a word list with
 * MW_FEATURE_HIGH_FREQ. */
34: #define HF_THRESH 784
35:
/* "weak_words" section of the file dictionary.  Assigned by
 * anthy_make_word_list_all() and searched by check_weak(), which treats a
 * NULL value as "no weak words". */
36: static void *weak_word_array;
37:
38:
39: void
40: anthy_print_word_list(struct splitter_context *sc,
41: struct word_list *wl)
42: {
43: xstr xs;
44: if (!wl) {
45: printf("--\n");
46: return ;
47: }
48:
49: xs.len = wl->part[PART_CORE].from - wl->from;
50: xs.str = sc->ce[wl->from].c;
51: anthy_putxstr(&xs);
52: printf(".");
53:
54: xs.len = wl->part[PART_CORE].len;
55: xs.str = sc->ce[wl->part[PART_CORE].from].c;
56: anthy_putxstr(&xs);
57: printf(".");
58:
59: xs.len = wl->part[PART_POSTFIX].len;
60: xs.str = sc->ce[wl->part[PART_CORE].from + wl->part[PART_CORE].len].c;
61: anthy_putxstr(&xs);
62: printf("-");
63:
64: xs.len = wl->part[PART_DEPWORD].len;
65: xs.str = sc->ce[wl->part[PART_CORE].from +
66: wl->part[PART_CORE].len +
67: wl->part[PART_POSTFIX].len].c;
68: anthy_putxstr(&xs);
69: anthy_print_wtype(wl->part[PART_CORE].wt);
70: printf(" %s%s\n", anthy_seg_class_name(wl->seg_class),
71: (wl->is_compound ? ",compound" : ""));
72: }
73:
74: int
75: anthy_dep_word_hash(xstr *xs)
76: {
77: return anthy_xstr_hash(xs) % WORD_HASH_MAX;
78: }
79:
80:
81:
82: static int
83: word_list_same(struct word_list *wl1, struct word_list *wl2)
84: {
85: if (wl1->node_id != wl2->node_id ||
86: wl1->from != wl2->from ||
87: wl1->len != wl2->len ||
88: wl1->mw_features != wl2->mw_features ||
89: wl1->tail_ct != wl2->tail_ct ||
90: wl1->part[PART_CORE].len != wl2->part[PART_CORE].len ||
91: wl1->is_compound != wl2->is_compound ||
92: !anthy_wtype_equal(wl1->part[PART_CORE].wt, wl2->part[PART_CORE].wt) ||
93: wl1->head_pos != wl2->head_pos) {
94: return 0;
95: }
96: if (wl1->part[PART_DEPWORD].dc != wl2->part[PART_DEPWORD].dc) {
97: return 0;
98: }
99:
100: return 1;
101: }
102:
103: static void
104: set_features(struct word_list *wl)
105: {
106: if (anthy_wtype_get_pos(wl->part[PART_CORE].wt) == POS_NOUN &&
107: anthy_wtype_get_sv(wl->part[PART_CORE].wt)) {
108: wl->mw_features |= MW_FEATURE_SV;
109: }
110: if (wl->part[PART_POSTFIX].len || wl->part[PART_PREFIX].len) {
111: wl->mw_features |= MW_FEATURE_SUFFIX;
112: }
113: if (anthy_wtype_get_pos(wl->part[PART_CORE].wt) == POS_NUMBER) {
114: wl->mw_features |= MW_FEATURE_NUM;
115: }
116: if (wl->part[PART_CORE].len == 1) {
117: wl->mw_features |= MW_FEATURE_CORE1;
118: }
119: if (wl->part[PART_CORE].len == 0) {
120: wl->mw_features |= MW_FEATURE_DEP_ONLY;
121: }
122: if (wl->part[PART_CORE].freq > HF_THRESH) {
123: wl->mw_features |= MW_FEATURE_HIGH_FREQ;
124: }
125: }
126:
127:
128: void
129: anthy_commit_word_list(struct splitter_context *sc,
130: struct word_list *wl)
131: {
132: struct word_list *tmp;
133: xstr xs;
134:
135:
136: if (wl->len == 0) return;
137:
138: wl->last_part = PART_DEPWORD;
139:
140:
141: set_features(wl);
142:
143: anthy_set_seg_class(wl);
144:
145: xs.len = wl->part[PART_DEPWORD].len;
146: xs.str = sc->ce[wl->part[PART_POSTFIX].from + wl->part[PART_POSTFIX].len].c;
147: wl->dep_word_hash = anthy_dep_word_hash(&xs);
148: if (wl->part[PART_POSTFIX].len) {
149: xs.len = wl->part[PART_POSTFIX].len;
150: xs.str = sc->ce[wl->part[PART_POSTFIX].from].c;
151: }
152:
153:
154: for (tmp = sc->word_split_info->cnode[wl->from].wl; tmp; tmp = tmp->next) {
155: if (word_list_same(tmp, wl)) {
156: return ;
157: }
158: }
159:
160: wl->next = sc->word_split_info->cnode[wl->from].wl;
161: sc->word_split_info->cnode[wl->from].wl = wl;
162:
163:
164: if (anthy_splitter_debug_flags() & SPLITTER_DEBUG_WL) {
165: anthy_print_word_list(sc, wl);
166: }
167: }
168:
169: struct word_list *
170: anthy_alloc_word_list(struct splitter_context *sc)
171: {
172: return anthy_smalloc(sc->word_split_info->WlAllocator);
173: }
174:
175:
176: static void
177: make_following_word_list(struct splitter_context *sc,
178: struct word_list *tmpl)
179: {
180:
181: xstr xs;
182: xs.str = sc->ce[tmpl->from+tmpl->len].c;
183: xs.len = sc->char_count - tmpl->from - tmpl->len;
184: tmpl->part[PART_DEPWORD].from =
185: tmpl->part[PART_POSTFIX].from + tmpl->part[PART_POSTFIX].len;
186:
187: if (tmpl->node_id >= 0) {
188:
189: anthy_scan_node(sc, tmpl, &xs, tmpl->node_id);
190: } else {
191:
192: struct wordseq_rule rule;
193: struct word_list new_tmpl;
194: int i;
195: int nr_rule = anthy_get_nr_dep_rule();
196: new_tmpl = *tmpl;
197:
198: for (i = 0; i < nr_rule; ++i) {
199: anthy_get_nth_dep_rule(i, &rule);
200: if (anthy_wtype_get_pos(rule.wt) == POS_NOUN
201: && anthy_wtype_get_scos(rule.wt) == SCOS_T35) {
202: new_tmpl.part[PART_CORE].wt = rule.wt;
203: new_tmpl.node_id = rule.node_id;
204: new_tmpl.head_pos = anthy_wtype_get_pos(new_tmpl.part[PART_CORE].wt);
205: anthy_scan_node(sc, &new_tmpl, &xs, new_tmpl.node_id);
206: }
207: }
208: }
209: }
210:
211: static void
212: push_part_back(struct word_list *tmpl, int len,
213: seq_ent_t se, wtype_t wt)
214: {
215: tmpl->len += len;
216: tmpl->part[PART_POSTFIX].len += len;
217: tmpl->part[PART_POSTFIX].wt = wt;
218: tmpl->part[PART_POSTFIX].seq = se;
219: tmpl->last_part = PART_POSTFIX;
220: }
221:
222:
223: static void
224: make_suc_words(struct splitter_context *sc,
225: struct word_list *tmpl)
226: {
227: int i, right;
228:
229: wtype_t core_wt = tmpl->part[PART_CORE].wt;
230:
231: int core_is_num = 0;
232: int core_is_name = 0;
233: int core_is_sv_noun = 0;
234:
235:
236: if (anthy_wtype_include(anthy_wtype_num_noun, core_wt)) {
237: core_is_num = 1;
238: }
239: if (anthy_wtype_include(anthy_wtype_name_noun, core_wt)) {
240: core_is_name = 1;
241: }
242: if (anthy_wtype_get_sv(core_wt)) {
243: core_is_sv_noun = 1;
244: }
245: if (!core_is_num && !core_is_name && !core_is_sv_noun) {
246: return ;
247: }
248:
249: right = tmpl->part[PART_CORE].from + tmpl->part[PART_CORE].len;
250:
251: for (i = 1;
252: i <= sc->word_split_info->seq_len[right];
253: i++){
254: xstr xs;
255: seq_ent_t suc;
256: xs.str = sc->ce[right].c;
257: xs.len = i;
258: suc = anthy_get_seq_ent_from_xstr(&xs, sc->is_reverse);
259: if (anthy_get_seq_ent_pos(suc, POS_SUC)) {
260:
261: struct word_list new_tmpl;
262: if (core_is_num &&
263: anthy_get_seq_ent_wtype_freq(suc, anthy_wtype_num_postfix)) {
264: new_tmpl = *tmpl;
265: push_part_back(&new_tmpl, i, suc, anthy_wtype_num_postfix);
266: make_following_word_list(sc, &new_tmpl);
267: }
268: if (core_is_name &&
269: anthy_get_seq_ent_wtype_freq(suc, anthy_wtype_name_postfix)) {
270: new_tmpl = *tmpl;
271: push_part_back(&new_tmpl, i, suc, anthy_wtype_name_postfix);
272: make_following_word_list(sc, &new_tmpl);
273: }
274: if (core_is_sv_noun &&
275: anthy_get_seq_ent_wtype_freq(suc, anthy_wtype_sv_postfix)) {
276: new_tmpl = *tmpl;
277: push_part_back(&new_tmpl, i, suc, anthy_wtype_sv_postfix);
278: make_following_word_list(sc, &new_tmpl);
279: }
280: }
281: }
282: }
283:
284: static void
285: push_part_front(struct word_list *tmpl, int len,
286: seq_ent_t se, wtype_t wt)
287: {
288: tmpl->from = tmpl->from - len;
289: tmpl->len = tmpl->len + len;
290: tmpl->part[PART_PREFIX].from = tmpl->from;
291: tmpl->part[PART_PREFIX].len += len;
292: tmpl->part[PART_PREFIX].wt = wt;
293: tmpl->part[PART_PREFIX].seq = se;
294: }
295:
296:
297: static void
298: make_pre_words(struct splitter_context *sc,
299: struct word_list *tmpl)
300: {
301: int i;
302: wtype_t core_wt = tmpl->part[PART_CORE].wt;
303: int core_is_num = 0;
304:
305: if (anthy_wtype_include(anthy_wtype_num_noun, core_wt)) {
306: core_is_num = 1;
307: }
308:
309: for (i = 1;
310: i <= sc->word_split_info->rev_seq_len[tmpl->part[PART_CORE].from];
311: i++) {
312: seq_ent_t pre;
313:
314: xstr xs;
315: xs.str = sc->ce[tmpl->part[PART_CORE].from - i].c;
316: xs.len = i;
317: pre = anthy_get_seq_ent_from_xstr(&xs, sc->is_reverse);
318: if (anthy_get_seq_ent_pos(pre, POS_PRE)) {
319: struct word_list new_tmpl;
320: if (core_is_num &&
321: anthy_get_seq_ent_wtype_freq(pre, anthy_wtype_num_prefix)) {
322: new_tmpl = *tmpl;
323: push_part_front(&new_tmpl, i, pre, anthy_wtype_num_prefix);
324: make_following_word_list(sc, &new_tmpl);
325:
326: make_suc_words(sc, &new_tmpl);
327: }
328:
329:
330:
331:
332: }
333: }
334: }
335:
336:
337: static void
338: setup_word_list(struct word_list *wl, int from, int len,
339: int is_compound, int is_weak)
340: {
341: int i;
342: wl->from = from;
343: wl->len = len;
344: wl->is_compound = is_compound;
345:
346: for (i = 0; i < NR_PARTS; i++) {
347: wl->part[i].from = 0;
348: wl->part[i].len = 0;
349: wl->part[i].wt = anthy_wt_none;
350: wl->part[i].seq = 0;
351: wl->part[i].freq = 1;
352: wl->part[i].dc = DEP_NONE;
353: }
354:
355: wl->part[PART_CORE].from = from;
356: wl->part[PART_CORE].len = len;
357:
358: wl->mw_features = MW_FEATURE_NONE;
359: wl->node_id = -1;
360: wl->last_part = PART_CORE;
361: wl->head_pos = POS_NONE;
362: wl->tail_ct = CT_NONE;
363: if (is_weak) {
364: wl->mw_features |= MW_FEATURE_WEAK_SEQ;
365: }
366: }
367:
368:
369:
370:
371:
372: static void
373: make_word_list(struct splitter_context *sc,
374: seq_ent_t se,
375: int from, int len,
376: int is_compound,
377: int is_weak)
378: {
379: struct word_list tmpl;
380: struct wordseq_rule rule;
381: int nr_rule = anthy_get_nr_dep_rule();
382: int i;
383:
384:
385: setup_word_list(&tmpl, from, len, is_compound, is_weak);
386: tmpl.part[PART_CORE].seq = se;
387:
388:
389: for (i = 0; i < nr_rule; ++i) {
390: int freq;
391: anthy_get_nth_dep_rule(i, &rule);
392: if (!is_compound) {
393: freq = anthy_get_seq_ent_wtype_freq(se, rule.wt);
394: } else {
395: freq = anthy_get_seq_ent_wtype_compound_freq(se, rule.wt);
396: }
397:
398: if (freq) {
399:
400: if (anthy_splitter_debug_flags() & SPLITTER_DEBUG_ID) {
401:
402: xstr xs;
403: xs.str = sc->ce[tmpl.part[PART_CORE].from].c;
404: xs.len = tmpl.part[PART_CORE].len;
405: anthy_putxstr(&xs);
406: printf(" freq=%d rule_id=%d node_id=%d\n",
407: freq, i, rule.node_id);
408: }
409:
410: tmpl.part[PART_CORE].wt = rule.wt;
411: tmpl.part[PART_CORE].freq = freq;
412: tmpl.node_id = rule.node_id;
413: tmpl.head_pos = anthy_wtype_get_pos(tmpl.part[PART_CORE].wt);
414:
415:
416: tmpl.part[PART_POSTFIX].from =
417: tmpl.part[PART_CORE].from +
418: tmpl.part[PART_CORE].len;
419:
420: if (anthy_wtype_get_pos(rule.wt) == POS_NOUN ||
421: anthy_wtype_get_pos(rule.wt) == POS_NUMBER) {
422:
423: make_pre_words(sc, &tmpl);
424: make_suc_words(sc, &tmpl);
425: }
426:
427: make_following_word_list(sc, &tmpl);
428: }
429: }
430: }
431:
432: static void
433: make_dummy_head(struct splitter_context *sc)
434: {
435: struct word_list tmpl;
436: setup_word_list(&tmpl, 0, 0, 0, 0);
437: tmpl.part[PART_CORE].seq = 0;
438: tmpl.part[PART_CORE].wt = anthy_wtype_noun;
439:
440: tmpl.head_pos = anthy_wtype_get_pos(tmpl.part[PART_CORE].wt);
441: make_suc_words(sc, &tmpl);
442: }
443:
/*
 * bsearch() comparator used by check_weak().
 *
 * kp points to the key hash in host byte order; cp points to an array
 * element stored in network byte order.  Returns a negative value, zero or
 * a positive value when the key is less than, equal to or greater than the
 * element.
 *
 * The ordering is produced by explicit comparison rather than subtraction:
 * "int - uint32_t" is evaluated in unsigned arithmetic and wraps, so the
 * sign of the difference is wrong for operands that are far apart.
 */
static int
compare_hash(const void *kp, const void *cp)
{
  const int *key = kp;
  const int *elem = cp;
  /* Convert to host order, then back to int so both sides compare as
   * signed values of the same type. */
  int stored = (int)ntohl(*elem);

  return (*key > stored) - (*key < stored);
}
451:
452: static int
453: check_weak(xstr *xs)
454: {
455: const int *array = (int *)weak_word_array;
456: int nr;
457: int h;
458: if (!array) {
459: return 0;
460: }
461: nr = ntohl(array[1]);
462: h = anthy_xstr_hash(xs);
463: if (bsearch(&h, &array[16], nr,
464: sizeof(int), compare_hash)) {
465: return 1;
466: }
467: return 0;
468: }
469:
470:
471: void
472: anthy_make_word_list_all(struct splitter_context *sc)
473: {
474: int i, j;
475: xstr xs;
476: seq_ent_t se;
477: struct depword_ent {
478: struct depword_ent *next;
479: int from, len;
480: int is_compound;
481: int is_weak;
482: seq_ent_t se;
483: } *head, *de;
484: struct word_split_info_cache *info;
485: allocator de_ator;
486:
487: weak_word_array = anthy_file_dic_get_section("weak_words");
488:
489: info = sc->word_split_info;
490: head = NULL;
491: de_ator = anthy_create_allocator(sizeof(struct depword_ent), 0);
492:
493: xs.str = sc->ce[0].c;
494: xs.len = sc->char_count;
495: anthy_gang_load_dic(&xs, sc->is_reverse);
496:
497:
498:
499: for (i = 0; i < sc->char_count ; i++) {
500: int search_len = sc->char_count - i;
501: int search_from = 0;
502: if (search_len > 30) {
503: search_len = 30;
504: }
505:
506:
507: for (j = search_len; j > search_from; j--) {
508:
509: xs.len = j;
510: xs.str = sc->ce[i].c;
511: se = anthy_get_seq_ent_from_xstr(&xs, sc->is_reverse);
512:
513:
514: if (!se) {
515: continue;
516: }
517:
518:
519:
520: if (j > info->seq_len[i] &&
521: anthy_get_seq_ent_pos(se, POS_SUC)) {
522: info->seq_len[i] = j;
523: }
524: if (j > info->rev_seq_len[i + j] &&
525: anthy_get_seq_ent_pos(se, POS_PRE)) {
526: info->rev_seq_len[i + j] = j;
527: }
528: