1:
2:
3:
4:
5:
6:
7:
8:
9:
10:
11:
12:
13:
14:
15:
16:
17:
18:
19:
20:
21:
22:
23:
24:
25: #include <stdlib.h>
26: #include <string.h>
27:
28: #include <anthy/anthy.h>
29: #include <anthy/dic.h>
30: #include <anthy/conf.h>
31: #include <anthy/record.h>
32: #include <anthy/alloc.h>
33: #include <anthy/logger.h>
34: #include <anthy/xchar.h>
35: #include <anthy/feature_set.h>
36: #include <anthy/textdict.h>
37:
38: #include <anthy/diclib.h>
39:
40: #include "dic_ent.h"
41: #include "dic_personality.h"
42: #include "dic_main.h"
43:
44:
/*
 * NOTE(review): not referenced in the part of the file shown here;
 * presumably counts how many times the dictionary layer was initialised -
 * confirm against the init/quit code.
 */
static int dic_init_count;

/*
 * Word dictionary handle that this file passes to
 * anthy_word_dic_check_word_relation() and anthy_gang_fill_seq_ent().
 */
static struct word_dic *master_dic_file;

/*
 * Memory dictionary in which this file looks up, allocates and releases
 * seq_ent objects.
 */
struct mem_dic *anthy_current_personal_dic_cache;

/*
 * NOTE(review): not referenced in the part of the file shown here;
 * presumably the record state of the active personality - confirm.
 */
struct record_stat *anthy_current_record;
55:
56: struct seq_ent *
57: anthy_validate_seq_ent(struct seq_ent *seq, xstr *xs, int is_reverse)
58: {
59: if (!seq) {
60: return NULL;
61: }
62: if (seq->nr_dic_ents == 0 && seq->nr_compound_ents == 0) {
63:
64: anthy_mem_dic_release_seq_ent(anthy_current_personal_dic_cache,
65: xs, is_reverse);
66: return NULL;
67: }
68:
69: return seq;
70: }
71:
72: struct seq_ent *
73: anthy_cache_get_seq_ent(xstr *xs, int is_reverse)
74: {
75: struct seq_ent *seq;
76:
77:
78: seq = anthy_mem_dic_find_seq_ent_by_xstr(anthy_current_personal_dic_cache,
79: xs, is_reverse);
80: if (seq) {
81: return seq;
82: }
83:
84:
85: return anthy_mem_dic_alloc_seq_ent_by_xstr(anthy_current_personal_dic_cache,
86: xs, is_reverse);
87: }
88:
89: int
90: anthy_dic_check_word_relation(int from, int to)
91: {
92: return anthy_word_dic_check_word_relation(master_dic_file, from, to);
93: }
94:
95: static seq_ent_t
96: do_get_seq_ent_from_xstr(xstr *xs, int is_reverse)
97: {
98: struct seq_ent *seq;
99:
100: seq = anthy_cache_get_seq_ent(xs, is_reverse);
101: seq = anthy_validate_seq_ent(seq, xs, is_reverse);
102: if (!seq) {
103:
104: return anthy_get_ext_seq_ent_from_xstr(xs, is_reverse);
105: }
106: return seq;
107: }
108:
109: static xstr *
110: convert_vu(xstr *xs)
111: {
112: int i, v = 0;
113: int j;
114:
115:
116: for (i = 0; i < xs->len; i++) {
117: if (xs->str[i] == KK_VU) {
118: v++;
119: }
120: }
121: if (v > 0) {
122: xstr *nx = malloc(sizeof(xstr));
123: nx->len = xs->len + v;
124: nx->str = malloc(sizeof(xchar)*nx->len);
125: j = 0;
126:
127: for (i = 0; i < xs->len; i++) {
128: if (xs->str[i] == KK_VU) {
129: nx->str[j] = HK_U;
130: j++;
131: nx->str[j] = HK_DDOT;
132: j++;
133: } else {
134: nx->str[j] = xs->str[i];
135: j++;
136: }
137: }
138: return nx;
139: }
140: return NULL;
141: }
142:
143: seq_ent_t
144: anthy_get_seq_ent_from_xstr(xstr *xs, int is_reverse)
145: {
146: struct seq_ent *se;
147:
148: if (!xs) {
149: return NULL;
150: }
151: if (!is_reverse) {
152: xstr *nx = convert_vu(xs);
153:
154:
155:
156:
157: if (nx) {
158: se = do_get_seq_ent_from_xstr(nx, 0);
159: anthy_free_xstr(nx);
160: return se;
161: }
162: }
163:
164: return do_get_seq_ent_from_xstr(xs, is_reverse);
165: }
166:
167: static void
168: gang_elm_dtor(void *p)
169: {
170: struct gang_elm *ge = p;
171: free(ge->key);
172: }
173:
174: static int
175: find_gang_elm(allocator ator, struct gang_elm *head, xstr *xs)
176: {
177: char *str = anthy_xstr_to_cstr(xs, ANTHY_UTF8_ENCODING);
178: struct gang_elm *ge;
179: for (ge = head->tmp.next; ge; ge = ge->tmp.next) {
180: if (!strcmp(ge->key, str)) {
181: free(str);
182: return 0;
183: }
184: }
185: ge = anthy_smalloc(ator);
186: ge->xs = *xs;
187: ge->key = str;
188: ge->tmp.next = head->tmp.next;
189: head->tmp.next = ge;
190: return 1;
191: }
192:
193: static int
194: gang_elm_compare_func(const void *p1, const void *p2)
195: {
196: const struct gang_elm * const *s1 = p1;
197: const struct gang_elm * const *s2 = p2;
198: return strcmp((*s1)->key, (*s2)->key);
199: }
200:
/* Cursor state that scan_dict() sets up for the gang_scan() callback. */
struct gang_scan_context {
  /* number of elements in array */
  int nr;
  /*
   * elements to match against the scanned keys; gang_scan() only moves
   * forward through it, and do_gang_load_dic() sorts it by key beforehand
   */
  struct gang_elm **array;
  /* index of the element currently being compared */
  int nth;
};
208:
209: static int
210: is_ext_ent(struct seq_ent *seq)
211: {
212: if (!seq->md) {
213: return 1;
214: }
215: return 0;
216: }
217:
218: static void
219: scan_misc_dic(struct gang_elm **array, int nr, int is_reverse)
220: {
221: int i;
222: for (i = 0; i < nr; i++) {
223: xstr *xs = &array[i]->xs;
224: struct seq_ent *seq;
225: seq = anthy_cache_get_seq_ent(xs, is_reverse);
226:
227: if (seq) {
228: anthy_copy_words_from_private_dic(seq, xs, is_reverse);
229: anthy_validate_seq_ent(seq, xs, is_reverse);
230: }
231: }
232: }
233:
234: static void
235: load_word(xstr *xs, const char *n, int is_reverse)
236: {
237: struct seq_ent *seq = anthy_get_seq_ent_from_xstr(xs, 0);
238: xstr *word_xs;
239: wtype_t wt;
240: struct word_line wl;
241: if (!seq || is_ext_ent(seq)) {
242: seq = anthy_mem_dic_alloc_seq_ent_by_xstr(anthy_current_personal_dic_cache,
243: xs, is_reverse);
244: }
245: if (anthy_parse_word_line(n, &wl)) {
246: return ;
247: }
248: word_xs = anthy_cstr_to_xstr(wl.word, ANTHY_UTF8_ENCODING);
249: if (anthy_type_to_wtype(wl.wt, &wt)) {
250: anthy_mem_dic_push_back_dic_ent(seq, 0, word_xs, wt,
251: NULL, wl.freq, 0);
252: }
253:
254: anthy_free_xstr(word_xs);
255: }
256:
257: static int
258: gang_scan(void *p, int offset, const char *key, const char *n)
259: {
260: struct gang_scan_context *gsc = p;
261: struct gang_elm *elm;
262: int r;
263: (void)offset;
264: while (1) {
265: if (gsc->nth >= gsc->nr) {
266: return 0;
267: }
268: elm = gsc->array[gsc->nth];
269: r = strcmp(elm->key, key);
270: if (r == 0) {
271:
272: load_word(&elm->xs, n, 0);
273:
274: return 0;
275: } else if (r > 0) {
276:
277: return 0;
278: } else {
279:
280: gsc->nth ++;
281: }
282: }
283: return 0;
284: }
285:
286: static void
287: scan_dict(struct textdict *td, int nr, struct gang_elm **array)
288: {
289: struct gang_scan_context gsc;
290: gsc.nr = nr;
291: gsc.array = array;
292: gsc.nth = 0;
293: anthy_textdict_scan(td, 0, &gsc, gang_scan);
294: }
295:
/* Argument bundle handed through anthy_ask_scan() to request_scan(). */
struct scan_arg {
  /* elements to look up; forwarded to scan_dict() */
  struct gang_elm **array;
  /* number of elements in array */
  int nr;
};
300:
301: static void
302: request_scan(struct textdict *td, void *arg)
303: {
304: struct scan_arg *sarg = (struct scan_arg *)arg;
305: scan_dict(td, sarg->nr, sarg->array);
306: }
307:
308: static void
309: do_gang_load_dic(xstr *sentence, int is_reverse)
310: {
311: allocator ator = anthy_create_allocator(sizeof(struct gang_elm),
312: gang_elm_dtor);
313: int from, len;
314: xstr xs;
315: int i, nr;
316: struct gang_elm head;
317: struct gang_elm **array, *cur;
318: struct scan_arg sarg;
319: head.tmp.next = NULL;
320: nr = 0;
321: for (from = 0; from < sentence->len ; from ++) {
322: for (len = 1; len < 32 && from + len <= sentence->len; len ++) {
323: xs.str = &sentence->str[from];
324: xs.len = len;
325: nr += find_gang_elm(ator, &head, &xs);
326: }
327: }
328: array = malloc(sizeof(struct gang_elm *) * nr);
329: cur = head.tmp.next;
330: for (i = 0; i < nr; i++) {
331: array[i] = cur;
332: cur = cur->tmp.next;
333: }
334: qsort(array, nr, sizeof(struct gang_elm *), gang_elm_compare_func);
335:
336: anthy_gang_fill_seq_ent(master_dic_file, array, nr, is_reverse);
337:
338: scan_misc_dic(array, nr, is_reverse);
339:
340: sarg.nr = nr;
341: sarg.array = array;
342: anthy_ask_scan(request_scan, (void *)&sarg);
343:
344: free(array);
345: anthy_free_allocator(ator);
346: }
347:
348: void
349: anthy_gang_load_dic(xstr *sentence, int is_reverse)
350: {
351: xstr *nx;
352: if (!is_reverse && (nx = convert_vu(sentence))) {
353: do_gang_load_dic(nx, is_reverse);
354: anthy_free_xstr(nx);
355: } else {
356: do_gang_load_dic(sentence, is_reverse);
357: }
358: }
359:
360:
361:
362:
363:
364:
365: int
366: anthy_get_nr_dic_ents(seq_ent_t se, xstr *xs)
367: {
368: struct seq_ent *s = se;
369: if (!s) {
370: return 0;
371: }
372: if (!xs) {
373: return s->nr_dic_ents;
374: }
375: return s->nr_dic_ents + anthy_get_nr_dic_ents_of_ext_ent(se, xs);
376: }
377:
378: int
379: anthy_get_nth_dic_ent_str(seq_ent_t se, xstr *orig,
380: int n, xstr *x)
381: {
382: if (!se) {
383: return -1;
384: }
385: if (n >= se->nr_dic_ents) {
386: return anthy_get_nth_dic_ent_str_of_ext_ent(se, orig,
387: n - se->nr_dic_ents, x);
388: }
389: x->len = se->dic_ents[n]->str.len;
390: x->str = anthy_xstr_dup_str(&se->dic_ents[n]->str);
391: return 0;
392: }
393:
394: int
395: anthy_get_nth_dic_ent_is_compound(seq_ent_t se, int nth)
396: {
397: if (!se) {
398: return 0;
399: }
400: if (nth >= se->nr_dic_ents) {
401: return 0;
402: }
403: return se->dic_ents[nth]->is_compound;
404: }
405:
406: int
407: anthy_get_nth_dic_ent_freq(seq_ent_t se, int nth)
408: {
409: struct seq_ent *s = se;
410: if (!s) {
411: return 0;
412: }
413: if (!s->dic_ents) {
414: return anthy_get_nth_dic_ent_freq_of_ext_ent(se, nth);
415: }
416: if (s->nr_dic_ents <= nth) {
417: return anthy_get_nth_dic_ent_freq_of_ext_ent(se, nth - se->nr_dic_ents);
418: }
419: return s->dic_ents[nth]->freq;
420: }
421:
422: int
423: anthy_get_nth_dic_ent_wtype(seq_ent_t se, xstr *xs,
424: int n, wtype_t *w)
425: {
426: struct seq_ent *s = se;
427: if (!s) {
428: *w = anthy_wt_none;
429: return -1;
430: }
431: if (s->nr_dic_ents <= n) {
432: int r;
433: r = anthy_get_nth_dic_ent_wtype_of_ext_ent(xs, n - s->nr_dic_ents, w);
434: if (r == -1) {
435: *w = anthy_wt_none;
436: }
437: return r;
438: }
439: *w = s->dic_ents[n]->type;
440: return 0;
441: }
442:
443: int
444: anthy_get_seq_ent_pos(seq_ent_t se, int pos)
445: {
446: int i, v=0;
447: struct seq_ent *s = se;
448: if (!s) {
449: return 0;
450: }
451: if (s->nr_dic_ents == 0) {
452: return anthy_get_ext_seq_ent_pos(se, pos);
453: }
454: for (i = 0; i < s->nr_dic_ents; i++) {
455: if (anthy_wtype_get_pos(s->dic_ents[i]->type) == pos) {
456: v += s->dic_ents[i]->freq;
457: if (v == 0) {
458: v = 1;
459: }
460: }
461: }
462: return v;
463: }
464:
465: int
466: anthy_get_seq_ent_ct(seq_ent_t se, int pos, int ct)
467: {
468: int i, v=0;
469: struct seq_ent *s = se;
470: if (!s) {
471: return 0;
472: }
473: if (s->nr_dic_ents == 0) {
474: return anthy_get_ext_seq_ent_ct(s, pos, ct);
475: }
476: for (i = 0; i < s->nr_dic_ents; i++) {
477: if (anthy_wtype_get_pos(s->dic_ents[i]->type)== pos &&
478: anthy_wtype_get_ct(s->dic_ents[i]->type)==ct) {
479: v += s->dic_ents[i]->freq;
480: if (v == 0) {
481: v = 1;
482: }
483: }
484: }
485: return v;
486: }
487:
488:
489:
490:
491: int
492: anthy_get_seq_ent_wtype_freq(seq_ent_t seq, wtype_t wt)
493: {
494: int i, f;
495:
496: if (!seq) {
497: return 0;
498: }
499:
500: if (seq->nr_dic_ents == 0) {
501: return anthy_get_ext_seq_ent_wtype(seq, wt);
502: }
503:
504: f = 0;
505:
506: for (i = 0; i < seq->nr_dic_ents; i++) {
507: if (seq->dic_ents[i]->order == 0 &&
508: anthy_wtype_include(wt, seq->dic_ents[i]->type)) {
509: if (f < seq->dic_ents[i]->freq) {
510: f = seq->dic_ents[i]->freq;
511: }
512: }
513: }
514: return f;
515: }
516:
517:
518:
519:
520: int
521: anthy_get_seq_ent_wtype_compound_freq(seq_ent_t se, wtype_t wt)
522: {
523: int i,f;
524: struct seq_ent *s = se;
525: if (!s) {
526: return 0;
527: }
528:
529: f = 0;
530: for (i = 0; i < s->nr_dic_ents; i++) {
531: if (!anthy_get_nth_dic_ent_is_compound(se, i)) {
532: continue;
533: }
534: if (anthy_wtype_include(wt, s->dic_ents[i]->type)) {
535: if (f < s->dic_ents[i]->freq) {
536: f = s->dic_ents[i]->freq;
537: }
538: }
539: }
540: return f;
541: }
542:
543: int
544: anthy_get_seq_ent_indep(seq_ent_t se)
545: {
546: int i;
547: struct seq_ent *s = se;
548: if (!s) {
549: return 0;
550: }
551: if (s->nr_dic_ents == 0) {
552: return anthy_get_ext_seq_ent_indep(s);
553: }
554: for (i = 0; i < s->nr_dic_ents; i++) {
555: if (anthy_wtype_get_indep(s->dic_ents[i]->type)) {
556: return 1;
557: }
558: }
559: return 0;
560: }
561:
562: int
563: anthy_has_compound_ents(seq_ent_t se)
564: {
565: if (!se) {
566: return 0;
567: }
568: return se->nr_compound_ents;
569: }
570:
571:
572: int
573: anthy_has_non_compound_ents(seq_ent_t se)
574: {