1:
2:
3:
4:
5:
6:
7:
8:
9:
10:
11:
12:
13:
14:
15:
16:
17:
18:
19:
20:
21:
22:
23:
24:
25:
26:
27:
28:
29:
30:
31:
32:
33:
34:
35: #include <stdio.h>
36: #include <stdlib.h>
37: #include <string.h>
38: #include <ctype.h>
39:
40: #include "config.h"
41: #include <anthy/anthy.h>
42: #include <anthy/alloc.h>
43: #include <anthy/dic.h>
44: #include <anthy/word_dic.h>
45: #include <anthy/logger.h>
46: #include <anthy/xstr.h>
47: #include <anthy/diclib.h>
48:
49: #include "dic_main.h"
50: #include "dic_ent.h"
51:
52: #define NO_WORD -1
53:
54: static allocator word_dic_ator;
55:
/*
 * Cursor over the set of keys that is being looked up in one pass.
 */
struct lookup_context {
  /* the elements to resolve; each one carries its key and a result slot */
  struct gang_elm **array;
  /* number of elements in array */
  int nr;
  /* index of the element currently being searched for */
  int nth;
  /* NOTE(review): presumably non-zero for reverse lookup -- confirm */
  int is_reverse;
};
62:
63:
/*
 * Number of bytes in the multi-byte sequence introduced by the first
 * byte of str, decided from that byte alone.
 *
 *   0x00-0x7f -> 1,  0x80-0xdf -> 2,  0xe0-0xef -> 3,
 *   0xf0-0xf7 -> 4,  0xf8-0xfb -> 5,  0xfc-0xff -> 6
 *
 * The following bytes are not inspected.
 */
static int
mb_fragment_len(const char *str)
{
  const unsigned char lead = *((const unsigned char *)str);

  if (lead >= 0xfc) {
    return 6;
  }
  if (lead >= 0xf8) {
    return 5;
  }
  if (lead >= 0xf0) {
    return 4;
  }
  if (lead >= 0xe0) {
    return 3;
  }
  if (lead >= 0x80) {
    return 2;
  }
  return 1;
}
85:
/*
 * Tell whether the byte at str belongs to a word.
 * Returns 1 for an ASCII byte in the range 32..126 and for any byte
 * that mb_fragment_len() treats as the start of a multi-byte sequence;
 * returns 0 for everything else (control bytes, NUL and 127).
 */
static int
is_printable(char *str)
{
  const unsigned char c = *(unsigned char *)str;
  const int is_visible_ascii = (c > 31 && c < 127);

  if (is_visible_ascii || mb_fragment_len(str) > 1) {
    return 1;
  }
  return 0;
}
98:
99:
100: static xchar
101: form_mb_char(const char *str)
102: {
103: xchar xc;
104: anthy_utf8_to_ucs4_xchar(str, &xc);
105: return xc;
106: }
107:
108: static int
109: hash(xstr *x)
110: {
111: return anthy_xstr_hash(x)&
112: (YOMI_HASH_ARRAY_SIZE*YOMI_HASH_ARRAY_BITS-1);
113: }
114:
115: static int
116: check_hash_ent(struct word_dic *wdic, xstr *xs)
117: {
118: int val = hash(xs);
119: int idx = (val>>YOMI_HASH_ARRAY_SHIFT)&(YOMI_HASH_ARRAY_SIZE-1);
120: int bit = val & ((1<<YOMI_HASH_ARRAY_SHIFT)-1);
121: return wdic->hash_ent[idx] & (1<<bit);
122: }
123:
/*
 * Length in bytes of the token at the start of str, that is, the
 * number of bytes before the first space or the terminating NUL.
 */
static int
wtype_str_len(const char *str)
{
  int n = 0;

  while (str[n] != '\0' && str[n] != ' ') {
    n++;
  }
  return n;
}
131:
132:
133: struct wt_stat {
134: wtype_t wt;
135: const char *wt_name;
136: int feature;
137: int freq;
138: int order_bonus;
139: int offset;
140: const char *line;
141: int encoding;
142: };
143:
144:
145:
146:
147:
148:
149: static const char *
150: parse_wtype_str(struct wt_stat *ws)
151: {
152: int len;
153: char *buf;
154: char *freq_part;
155: char *feature_part;
156: const char *wt_name;
157:
158: len = wtype_str_len(&ws->line[ws->offset]);
159: buf = alloca(len + 1);
160: strncpy(buf, &ws->line[ws->offset], len);
161: buf[len] = 0;
162:
163:
164: feature_part = strchr(buf, ',');
165: if (feature_part) {
166: ws->feature = 1;
167: } else {
168: ws->feature = 0;
169: }
170:
171:
172: freq_part = strchr(buf, '*');
173: if (freq_part) {
174: *freq_part = 0;
175: freq_part ++;
176: ws->freq = atoi(freq_part) * FREQ_RATIO;
177: } else {
178: ws->freq = FREQ_RATIO - 2;
179: }
180:
181:
182: wt_name = anthy_type_to_wtype(buf, &ws->wt);
183: if (!wt_name) {
184: ws->wt = anthy_wt_none;
185: }
186: ws->offset += len;
187: return wt_name;
188: }
189:
190:
191: static int
192: normalize_freq(struct wt_stat* ws)
193: {
194: if (ws->freq < 0) {
195: ws->freq *= -1;
196: }
197: return ws->freq + ws->order_bonus;
198: }
199:
200:
/*
 * Copy the first char_count bytes of src to buf, dropping the
 * backslash of the escape sequences "\ " and "\\", and terminate buf
 * with NUL.  A backslash followed by any other byte is copied as is.
 *
 * buf needs room for char_count + 1 bytes.
 */
static void
copy_to_buf(char *buf, const char *src, int char_count)
{
  char *out = buf;
  int idx = 0;

  while (idx < char_count) {
    if (src[idx] == '\\' &&
        (src[idx + 1] == ' ' || src[idx + 1] == '\\')) {
      /* step over the backslash, keep the escaped byte */
      idx++;
    }
    *out++ = src[idx];
    idx++;
  }
  *out = 0;
}
220:
221:
222: static int
223: add_dic_ent(struct seq_ent *seq, struct wt_stat *ws,
224: xstr* yomi, int is_reverse)
225: {
226: int i;
227:
228: int char_count;
229: char *buf;
230: xstr *xs;
231: int freq;
232: wtype_t w = ws->wt;
233: const char *s = &ws->line[ws->offset];
234:
235:
236: for (i = 0, char_count = 0;
237: s[i] && (s[i] != ' ') && (s[i] != '#'); i++) {
238: char_count ++;
239: if (s[i] == '\\') {
240: char_count++;
241: i++;
242: }
243: }
244:
245:
246: if (!ws->wt_name) {
247: return char_count;
248: }
249:
250:
251: if (!is_reverse && ws->freq < 0) {
252: return char_count;
253: }
254:
255:
256: buf = alloca(char_count+1);
257: copy_to_buf(buf, s, char_count);
258:
259: xs = anthy_cstr_to_xstr(buf, ws->encoding);
260:
261:
262: if (is_reverse && ws->freq > 0) {
263:
264:
265:
266: if (anthy_get_xstr_type(yomi) & XCT_HIRA) {
267: freq = normalize_freq(ws);
268: anthy_mem_dic_push_back_dic_ent(seq, 0, yomi, w,
269: ws->wt_name, freq, 0);
270: }
271: anthy_free_xstr(xs);
272: return char_count;
273: }
274:
275: freq = normalize_freq(ws);
276:
277: anthy_mem_dic_push_back_dic_ent(seq, 0, xs, w, ws->wt_name, freq, 0);
278: if (anthy_wtype_get_meisi(w)) {
279:
280: w = anthy_get_wtype_with_ct(w, CT_MEISIKA);
281: anthy_mem_dic_push_back_dic_ent(seq, 0, xs, w, ws->wt_name, freq, 0);
282: }
283: anthy_free_xstr(xs);
284: return char_count;
285: }
286:
287: static int
288: add_compound_ent(struct seq_ent *seq, struct wt_stat *ws,
289: xstr* yomi,
290: int is_reverse)
291: {
292: int len = wtype_str_len(&ws->line[ws->offset]);
293: char *buf = alloca(len);
294: xstr *xs;
295: int freq;
296:
297: (void)yomi;
298:
299:
300: if (!is_reverse && ws->freq < 0) {
301:
302: return len;
303: }
304:
305:
306: if (is_reverse && ws->freq > 0) {
307:
308:
309:
310:
311:
312:
313:
314:
315:
316:
317:
318: return len;
319: }
320:
321: strncpy(buf, &ws->line[ws->offset + 1], len - 1);
322: buf[len - 1] = 0;
323: xs = anthy_cstr_to_xstr(buf, ws->encoding);
324:
325: freq = normalize_freq(ws);
326: anthy_mem_dic_push_back_dic_ent(seq, 1, xs, ws->wt,
327: ws->wt_name, freq, 0);
328: anthy_free_xstr(xs);
329:
330: return len;
331: }
332:
333: static void
334: init_wt_stat(struct wt_stat *ws, char *line)
335: {
336: ws->wt_name = NULL;
337: ws->freq = 0;
338: ws->feature = 0;
339: ws->order_bonus = 0;
340: ws->offset = 0;
341: ws->line = line;
342: ws->encoding = ANTHY_EUC_JP_ENCODING;
343: if (*(ws->line) == 'u') {
344: ws->encoding = ANTHY_UTF8_ENCODING;
345: ws->line ++;
346: }
347: }
348:
349:
350: static void
351: fill_dic_ent(char *line, struct seq_ent *seq,
352: xstr* yomi, int is_reverse)
353: {
354: struct wt_stat ws;
355: init_wt_stat(&ws, line);
356:
357: while (ws.line[ws.offset]) {
358: if (ws.line[ws.offset] == '#') {
359: if (isalpha(ws.line[ws.offset + 1])) {
360:
361: ws.wt_name = parse_wtype_str(&ws);
362:
363: ws.order_bonus = FREQ_RATIO - 1;
364: } else {
365:
366: ws.offset += add_compound_ent(seq, &ws,
367: yomi,
368: is_reverse);
369: }
370: } else {
371:
372: ws.offset += add_dic_ent(seq, &ws, yomi,
373: is_reverse);
374: if (ws.order_bonus > 0) {
375: ws.order_bonus --;
376: }
377: }
378: if (ws.line[ws.offset] == ' ') {
379: ws.offset++;
380: }
381: }
382: }
383:
384:
385:
386:
387:
388: static int
389: mkxstr(char *s, xstr *x)
390: {
391: int i, len;
392:
393: x->len -= (s[0] - 1);
394: for (i = 1; is_printable(&s[i]); i ++) {
395: len = mb_fragment_len(&s[i]);
396: if (len > 1) {
397:
398: x->str[x->len] = form_mb_char(&s[i]);
399: x->len ++;
400: i += (len - 1);
401: } else {
402:
403: x->str[x->len] = s[i];
404: x->len ++;
405: }
406: }
407: return i;
408: }
409:
410: static int
411: set_next_idx(struct lookup_context *lc)
412: {
413: lc->nth ++;
414: while (lc->nth < lc->nr) {
415: if (lc->array[lc->nth]->tmp.idx != NO_WORD) {
416: return 1;
417: }
418: lc->nth ++;
419: }
420: return 0;
421: }
422:
423:
424: static void
425: search_words_in_page(struct lookup_context *lc, int page, char *s)
426: {
427: int o = 0;
428: xchar *buf;
429: xstr xs;
430: int nr = 0;
431:
432: buf = alloca(sizeof(xchar)*strlen(s)/2);
433: xs.str = buf;
434: xs.len = 0;
435:
436: while (*s) {
437: int r;
438: s += mkxstr(s, &xs);
439: r = anthy_xstrcmp(&xs, &lc->array[lc->nth]->xs);
440: if (!r) {
441: lc->array[lc->nth]->tmp.idx = o + page * WORDS_PER_PAGE;
442: nr ++;
443: if (!set_next_idx(lc)) {
444: return ;
445: }
446:
447: }
448: o ++;
449: }
450: if (nr == 0) {
451:
452: lc->array[lc->nth]->tmp.idx = NO_WORD;
453: set_next_idx(lc);
454: }
455:
456: }
457:
458:
459: static int
460: compare_page_index(struct word_dic *wdic, const char *key, int page)
461: {
462: char buf[100];
463: char *s = &wdic->page[anthy_dic_ntohl(wdic->page_index[page])];
464: int i;
465: s++;
466: for (i = 0; is_printable(&s[i]);) {
467: int j, l = mb_fragment_len(&s[i]);
468: for (j = 0; j < l; j++) {
469: buf[i+j] = s[i+j];
470: }
471: i += l;
472: }
473: buf[i] = 0;
474: return strcmp(key ,buf);
475: }
476:
477:
/*
 * Binary search over the pages f .. t-1 for the last page whose first
 * word is not greater than key.
 *
 * The caller has to make sure that page f is not greater than key and
 * that f < t.  Returns the page number.
 */
static int
get_page_index_search(struct word_dic *wdic, const char *key, int f, int t)
{
  /* shrink [f, t) until a single page is left */
  while (f + 1 != t) {
    const int mid = (f + t) / 2;

    if (compare_page_index(wdic, key, mid) < 0) {
      /* key sorts before page mid: look in the lower half */
      t = mid;
    } else {
      /* key is at or after the start of page mid */
      f = mid;
    }
  }
  return (f + t) / 2;
}
496:
497:
498:
499:
500: static int
501: get_page_index(struct word_dic *wdic, struct lookup_context *lc)
502: {
503: int page;
504: const char *key = lc->array[lc->nth]->key;
505:
506: if (compare_page_index(wdic, key, 0) < 0) {
507: return -1;
508: }
509:
510: if (compare_page_index(wdic, key, wdic->nr_pages-1) >= 0) {
511: return wdic->nr_pages-1;
512: }
513:
514: page = get_page_index_search(wdic, key, 0, wdic->nr_pages);
515: return page;
516: }
517:
518: static int
519: get_nr_page(struct word_dic *h)
520: {
521: int i;
522: for (i = 1; anthy_dic_ntohl(h->page_index[i]); i++);
523: return i;
524: }
525:
526: static char *
527: get_section(struct word_dic *wdic, int section)
528: {
529: int *p = (int *)wdic->dic_file;
530: int offset = anthy_dic_ntohl(p[section]);
531: return &wdic->dic_file[offset];
532: }
533:
534:
535: static int
536: get_word_dic_sections(struct word_dic *wdic)
537: {
538: wdic->entry_index = (int *)get_section(wdic, 2);
539: wdic->entry = (char *)get_section(wdic, 3);
540: wdic->page = (char *)get_section(wdic, 4);
541: wdic->page_index = (int *)get_section(wdic, 5);
542: wdic->uc_section = (char *)get_section(wdic, 6);
543: wdic->hash_ent = (unsigned char *)get_section(wdic, 7);
544:
545: return 0;
546: }
547:
548:
549: static void
550: search_yomi_index(struct word_dic *wdic, struct lookup_context *lc)
551: {
552: int p;
553: int page_number;
554:
555:
556: if (lc->array[lc->nth]->tmp.idx == NO_WORD) {
557: set_next_idx(lc);
558: return ;
559: }
560:
561: p = get_page_index(wdic, lc);
562: if (