1:
2:
3:
4:
5:
6:
7:
8:
9:
10:
11:
12:
13:
14:
15:
16:
17:
18:
19:
20:
21:
22:
23:
24:
25:
26:
27:
28:
29:
30:
31:
32:
33:
34:
35: #include <sys/types.h>
36: #include <unistd.h>
37: #include <stdio.h>
38: #include <stdlib.h>
39: #include <string.h>
40: #include <errno.h>
41: #include <ctype.h>
42:
43: #include <config.h>
44:
45: #include <anthy/anthy.h>
46: #include <anthy/xstr.h>
47: #include <anthy/wtype.h>
48: #include <anthy/ruleparser.h>
49: #include <anthy/word_dic.h>
50: #include <anthy/diclib.h>
51: #include "mkdic.h"
52:
/* Size of the buffer that read_line() hands to fgets(). */
#define MAX_LINE_LEN 10240
/* NOTE(review): not referenced in this part of the file; presumably the
 * number of section slots in the output header -- confirm at the use site. */
#define NR_HEADER_SECTIONS 16
/* NOTE(review): not referenced in this part of the file; presumably the
 * byte alignment applied to each output section -- confirm. */
#define SECTION_ALIGNMENT 8
/* Size of the word-type scratch buffer; parse_wtype() rejects any token
 * whose length is MAX_WTYPE_LEN or more. */
#define MAX_WTYPE_LEN 20

/* NOTE(review): presumably the default output file name -- confirm where
 * output_fn is initialised. */
#define DEFAULT_FN "anthy.wdic"
59:
/* Program name, printed as the prefix of diagnostics written to stderr. */
static const char *progname;


/* Temporary output streams with external linkage.  They are listed in
 * file_array and opened by open_output_files(). */
FILE *yomi_entry_index_out, *yomi_entry_out;
FILE *page_out, *page_index_out;

/* Temporary output streams private to this file (also in file_array). */
static FILE *uc_out;
static FILE *yomi_hash_out;

/* Incremented by mark_hash_array() whenever the bit it is about to set in
 * the bitmap is already set. */
static int yomi_hash_collision;
70:
71:
/*
 * One temporary output stream.
 *   fpp - address of the FILE* variable that open_output_files() fills in.
 *   fn  - path of the temporary file when it was created with mkstemp()
 *         (heap copy made with strdup()); NULL otherwise.
 * file_array lists every stream and ends with an entry whose fpp is NULL;
 * the loops over the array stop at that entry.
 */
struct file_section {
  FILE **fpp;
  char *fn;
} file_array[] = {
  {&yomi_entry_index_out, NULL},
  {&yomi_entry_out, NULL},
  {&page_out, NULL},
  {&page_index_out, NULL},
  {&uc_out, NULL},
  {&yomi_hash_out, NULL},
  {NULL, NULL},
};
84:
85:
/* State shared by the dictionary building steps. */
struct mkdic_stat {

  /* List of reading (yomi) entries; see find_yomi_entry(). */
  struct yomi_entry_list yl;

  /* NOTE(review): presumably the head of the frequency adjustment commands
   * -- confirm against the code that fills it. */
  struct adjust_command ac_list;

  /* NOTE(review): presumably the use-case dictionary written to uc_out --
   * confirm. */
  struct uc_dict *ud;

  /* NOTE(review): presumably the name of the file to generate -- confirm. */
  const char *output_fn;

  /* Encoding of the input text; passed to anthy_cstr_to_xstr() and compared
   * with ANTHY_EUC_JP_ENCODING when words are stored. */
  int input_encoding;

  /* Word types that is_excluded_wtype() reports as excluded:
   * nr_excluded strings stored in excluded_wtypes. */
  int nr_excluded;
  char **excluded_wtypes;
};
101:
102:
103: static void
104: open_output_files(void)
105: {
106: struct file_section *fs;
107: for (fs = file_array; fs->fpp; fs ++) {
108: char *tmpdir = getenv("TMPDIR");
109: fs->fn = NULL;
110: if (tmpdir) {
111:
112: char buf[256];
113: int fd = -1;
114: snprintf(buf, sizeof(buf), "%s/mkanthydic.XXXXXX", tmpdir);
115: fd = mkstemp(buf);
116: if (fd == -1) {
117: *(fs->fpp) = NULL;
118: } else {
119: *(fs->fpp) = fdopen(fd, "w+");
120: fs->fn = strdup(buf);
121: }
122: } else {
123: *(fs->fpp) = tmpfile();
124: }
125:
126: if (!(*(fs->fpp))) {
127: fprintf (stderr, "%s: cannot open temporary file: %s\n",
128: progname, strerror (errno));
129: exit (2);
130: }
131: }
132: }
133:
134:
135: static void
136: flush_output_files (void)
137: {
138: struct file_section *fs;
139: for (fs = file_array; fs->fpp; fs ++) {
140: if (ferror(*(fs->fpp))) {
141: fprintf (stderr, "%s: write error\n", progname);
142: exit (1);
143: }
144: }
145: for (fs = file_array; fs->fpp; fs ++) {
146: if (fflush(*(fs->fpp))) {
147: fprintf (stderr, "%s: write error: %s\n", progname, strerror (errno));
148: exit (1);
149: }
150: }
151: }
152:
153:
/*
 * Write one int to fp after passing it through anthy_dic_htonl().
 * The result of fwrite() is not examined here; a failed write only shows
 * up in the stream's error indicator.
 */
void
write_nl(FILE *fp, int i)
{
  int converted = anthy_dic_htonl(i);

  fwrite(&converted, sizeof(int), 1, fp);
}
160:
/* Print a one-line notice on stdout and exit with status 0. */
static void
print_usage(void)
{
  fputs("please do not use mkanthydic command directly.\n", stdout);
  exit(0);
}
167:
168: static char *
169: read_line(FILE *fp, char *buf)
170: {
171:
172: int toolong = 0;
173:
174: while (fgets(buf, MAX_LINE_LEN, fp)) {
175: int len = strlen(buf);
176: if (buf[0] == '#') {
177: continue ;
178: }
179: if (buf[len - 1] != '\n') {
180: toolong = 1;
181: continue ;
182: }
183:
184: buf[len - 1] = 0;
185: if (toolong) {
186: toolong = 0;
187: } else {
188: return buf;
189: }
190: }
191: return NULL;
192: }
193:
194:
195: static xstr *
196: get_index_from_line(struct mkdic_stat *mds, char *buf)
197: {
198: char *sp;
199: xstr *xs;
200: sp = strchr(buf, ' ');
201: if (!sp) {
202:
203: return NULL;
204: }
205: *sp = 0;
206: xs = anthy_cstr_to_xstr(buf, mds->input_encoding);
207: *sp = ' ';
208: return xs;
209: }
210:
211:
/*
 * Return a heap copy of what follows the first run of spaces in buf.
 *
 * The caller releases the result with free().  Returns NULL when buf
 * contains no space at all or when memory cannot be allocated.
 */
static char *
get_entry_from_line(char *buf)
{
  const char *rest = strchr(buf, ' ');
  size_t size;
  char *copy;

  if (!rest) {
    /* no separator, hence no entry part */
    return NULL;
  }
  rest += strspn(rest, " ");

  size = strlen(rest) + 1;
  copy = malloc(size);
  if (copy) {
    memcpy(copy, rest, size);
  }
  return copy;
}
222:
223: static int
224: index_hash(xstr *xs)
225: {
226: int i;
227: unsigned int h = 0;
228: for (i = 0; i < xs->len; i++) {
229: h += xs->str[i] * 11;
230: }
231: return (int)(h % YOMI_HASH);
232: }
233:
234: const char *
235: get_wt_name(const char *name)
236: {
237: wtype_t dummy;
238: const char *res;
239: if (!strcmp(name, "#T35")) {
240: return "#T";
241: }
242: res = anthy_type_to_wtype(name, &dummy);
243: if (!res) {
244: return "unknown";
245: }
246: return res;
247: }
248:
249:
250: static void
251: push_back_word_entry(struct mkdic_stat *mds,
252: struct yomi_entry *ye, const char *wt_name,
253: int freq, const char *word, int order)
254: {
255: wtype_t wt;
256: char *s;
257: if (freq == 0) {
258: return ;
259: }
260: if (!anthy_type_to_wtype(wt_name, &wt)) {
261:
262: return ;
263: }
264: ye->entries = realloc(ye->entries,
265: sizeof(struct word_entry) *
266: (ye->nr_entries + 1));
267: ye->entries[ye->nr_entries].ye = ye;
268: ye->entries[ye->nr_entries].wt_name = get_wt_name(wt_name);
269: ye->entries[ye->nr_entries].raw_freq = freq;
270: ye->entries[ye->nr_entries].feature = 0;
271: ye->entries[ye->nr_entries].source_order = order;
272: if (mds->input_encoding == ANTHY_EUC_JP_ENCODING) {
273: s = anthy_conv_euc_to_utf8(word);
274: } else {
275: s = strdup(word);
276: }
277: ye->entries[ye->nr_entries].word_utf8 = s;
278: ye->nr_entries ++;
279: }
280:
281: static int
282: parse_wtype(char *wtbuf, char *cur)
283: {
284:
285: char *t;
286: int freq;
287: if (strlen(cur) >= MAX_WTYPE_LEN) {
288: return 0;
289: }
290: strcpy(wtbuf, cur);
291:
292: t = strchr(wtbuf, '*');
293: freq = 1;
294: if (t) {
295: int tmp_freq;
296: *t = 0;
297: t++;
298: tmp_freq = atoi(t);
299: if (tmp_freq) {
300: freq = tmp_freq;
301: }
302: }
303: return freq;
304: }
305:
306:
307: static int
308: get_element_len(xchar xc)
309: {
310: if (xc > '0' && xc <= '9') {
311: return xc - '0';
312: }
313: if (xc >= 'a' && xc <= 'z') {
314: return xc - 'a' + 10;
315: }
316: return 0;
317: }
318:
319:
320: static int
321: check_compound_candidate(struct mkdic_stat *mds, xstr *index, const char *cur)
322: {
323:
324: xstr *xs = anthy_cstr_to_xstr(cur, mds->input_encoding);
325: int i, total = 0;
326: for (i = 0; i < xs->len - 1; i++) {
327: if (xs->str[i] == '_') {
328: total += get_element_len(xs->str[i+1]);
329: }
330: }
331: anthy_free_xstr(xs);
332:
333: if (total != index->len) {
334: fprintf(stderr, "Invalid compound candidate (%s, length = %d).\n",
335: cur, total);
336: return 0;
337: }
338: return 1;
339: }
340:
341: static int
342: is_excluded_wtype(struct mkdic_stat *mds, char *wt)
343: {
344: int i;
345: for (i = 0; i < mds->nr_excluded; i++) {
346: if (!strcmp(mds->excluded_wtypes[i], wt)) {
347: return 1;
348: }
349: }
350: return 0;
351: }
352:
/*
 * Find where the token starting at cur ends.
 *
 * The token ends at the first space or at the terminating NUL.  A
 * backslash hides the character after it, so an escaped space does not
 * end the token.  Returns a pointer to the terminating character, or NULL
 * when a backslash is the last character of the string.
 */
static char *
find_token_end(char *cur)
{
  char *p = cur;

  while (*p != '\0' && *p != ' ') {
    if (*p != '\\') {
      p++;
      continue;
    }
    if (p[1] == '\0') {
      /* nothing left for the backslash to escape */
      return NULL;
    }
    /* step over the backslash and the character it escapes */
    p += 2;
  }
  return p;
}
367:
368:
369: static void
370: push_back_word_entry_line(struct mkdic_stat *mds, struct yomi_entry *ye,
371: const char *ent)
372: {
373: char *buf = alloca(strlen(ent) + 1);
374: char *cur = buf;
375: char *n;
376: char wtbuf[MAX_WTYPE_LEN];
377: int freq = 0;
378: int order = 0;
379:
380: strcpy(buf, ent);
381: wtbuf[0] = 0;
382:
383: while (1) {
384:
385: n = find_token_end(cur);
386: if (!n) {
387: fprintf(stderr, "invalid \\ at the end of line (%s).\n",
388: ent);
389: return ;
390: }
391: if (*n) {
392: *n = 0;
393: } else {
394: n = NULL;
395: }
396:
397: if (cur[0] == '#') {
398: if (isalpha((unsigned char)cur[1])) {
399:
400: freq = parse_wtype(wtbuf, cur);
401: } else {
402: if (cur[1] == '_' &&
403: check_compound_candidate(mds, ye->index_xstr, &cur[1])) {
404:
405: push_back_word_entry(mds, ye, wtbuf, freq, cur, order);
406: order ++;
407: }
408: }
409: } else {
410:
411: if (!is_excluded_wtype(mds, wtbuf)) {
412:
413: push_back_word_entry(mds, ye, wtbuf, freq, cur, order);
414: order ++;
415: }
416:
417:
418:
419:
420: }
421: if (!n) {
422:
423: return ;
424: }
425: cur = n;
426: cur ++;
427: }
428: }
429:
430:
431: static int
432: check_same_word(struct yomi_entry *ye, int idx)
433: {
434: struct word_entry *base = &ye->entries[idx];
435: int i;
436: for (i = idx -1; i >= 0; i--) {
437: struct word_entry *cur = &ye->entries[i];
438: if (base->raw_freq != cur->raw_freq) {
439: return 0;
440: }
441: if (strcmp(base->wt_name, cur->wt_name)) {
442: return 0;
443: }
444: if (strcmp(base->word_utf8, cur->word_utf8)) {
445: return 0;
446: }
447:
448: return 1;
449: }
450: return 0;
451: }
452:
453:
454: static int
455: compare_word_entry_by_freq(const void *p1, const void *p2)
456: {
457: const struct word_entry *e1 = p1;
458: const struct word_entry *e2 = p2;
459: return e2->raw_freq - e1->raw_freq;
460: }
461:
462:
463: static int
464: compare_word_entry_by_wtype(const void *p1, const void *p2)
465: {
466: const struct word_entry *e1 = p1;
467: const struct word_entry *e2 = p2;
468: int ret = strcmp(e1->wt_name, e2->wt_name);
469: if (ret != 0) {
470: return ret;
471: } else {
472: return compare_word_entry_by_freq(p1, p2);
473: }
474: }
475:
476:
477: static int
478: normalize_word_entry(struct yomi_entry *ye)
479: {
480: int i, nr_dup = 0;
481: if (!ye) {
482: return 0;
483: }
484:
485: qsort(ye->entries, ye->nr_entries,
486: sizeof(struct word_entry),
487: compare_word_entry_by_freq);
488:
489: for (i = 0; i < ye->nr_entries; i++) {
490: if (check_same_word(ye, i)) {
491: ye->entries[i].raw_freq = 0;
492: nr_dup ++;
493: }
494: }
495:
496: qsort(ye->entries, ye->nr_entries,
497: sizeof(struct word_entry),
498: compare_word_entry_by_wtype);
499: return ye->nr_entries - nr_dup;
500: }
501:
502:
503:
504: struct yomi_entry *
505: find_yomi_entry(struct yomi_entry_list *yl, xstr *index, int create)
506: {
507: struct yomi_entry *ye;
508: int hash = index_hash(index);
509: int search = 0;
510:
511: for (ye = yl->hash[hash];ye ; ye = ye->hash_next) {
512: search ++;
513: if (!anthy_xstrcmp(ye->index_xstr, index)) {
514: return ye;
515: }
516: }
517: if (!create) {
518: return NULL;
519: }
520:
521:
522: ye = malloc(sizeof(struct yomi_entry));
523: ye->nr_entries = 0;
524: ye->entries = 0;
525: ye->next = NULL;
526: ye->index_xstr = anthy_xstr_dup(index);
527: ye->index_str = NULL;
528:
529:
530: ye->hash_next = yl->hash[hash];
531: yl->hash[hash] = ye;
532:
533:
534:
535: ye->next = yl->head;
536: yl->head = ye;
537:
538: yl->nr_entries ++;
539:
540: return ye;
541: }
542:
543:
544: static void
545: mark_hash_array(unsigned char *hash_array, xstr *xs)
546: {
547: int val, idx, bit, mask;
548: val = anthy_xstr_hash(xs);
549: val &= (YOMI_HASH_ARRAY_SIZE*YOMI_HASH_ARRAY_BITS-1);
550: idx=(val>>YOMI_HASH_ARRAY_SHIFT)&(YOMI_HASH_ARRAY_SIZE-1);
551: bit= val & ((1<<YOMI_HASH_ARRAY_SHIFT)-1);
552: mask = (1<<bit);
553: if (hash_array[idx] & mask) {
554: yomi_hash_collision ++;
555: }
556: hash_array[idx] |= mask;
557: }
558:
559:
560: static void
561: mk_yomi_hash(FILE *yomi_hash_out, struct yomi_entry_list *yl)
562: {
563: unsigned char *hash_array;
564: int i;
565: struct yomi_entry *ye;
566: hash_array = (unsigned char *)malloc(YOMI_HASH_ARRAY_SIZE);
567: for (i = 0; i < YOMI_HASH_ARRAY_SIZE; i++) {
568: hash_array[i] = 0;
569: }
570: for (i = 0; i < yl->nr_valid_entries; i++) {
571: ye = yl->ye_array[i];
572: mark_hash_array(hash_array, ye->index_xstr);
573: }
574: fwrite(hash_array, YOMI_HASH_ARRAY_SIZE, 1, yomi_hash_out);
575: printf("generated yomi hash bitmap (%d collisions/%d entries)\n",
576: yomi_hash_collision, yl->nr_valid_entries);
577:
578: }
579:
580: static struct adjust_command *
581: parse_modify_freq_command(const char *buf)
582: {
583: char *line = alloca(strlen(buf) + 1);
584: char *yomi, *wt, *word, *type_str;
585: struct adjust_command *cmd;
586: int type = 0;
587: strcpy(line, buf);
588: yomi = strtok(line, " ");
589: wt = strtok(NULL, " ");
590: word = strtok(NULL<