1:
2:
3:
4:
5:
6:
7:
8:
9:
10:
11:
12:
13:
14:
15:
16:
17:
18:
19:
20:
21:
22:
23:
24:
25:
26: #include <stdio.h>
27: #include <string.h>
28: #include <stdlib.h>
29:
30: #include <anthy/alloc.h>
31: #include <anthy/conf.h>
32: #include <anthy/ruleparser.h>
33: #include <anthy/xstr.h>
34: #include <anthy/logger.h>
35: #include <anthy/splitter.h>
36: #include <anthy/anthy.h>
37: #include <anthy/depgraph.h>
38: #include <anthy/diclib.h>
39:
/* Base directory used to locate the dependency-graph sources; the current
 * directory is the fallback when SRCDIR has not been defined already. */
#if !defined(SRCDIR)
#define SRCDIR "."
#endif

/* Non-zero makes parse_indep() print every rule it registers. */
static int verbose;

/* Node table: gNodes[i] and gNodeNames[i] describe the same node, and
 * nrNodes is the number of entries held by both arrays. */
static struct dep_node *gNodes;
static char **gNodeNames;
static int nrNodes;

/* Independent-word rule table and the number of rules stored in it. */
static struct wordseq_rule *gRules;
static int nrRules;
53:
54: static int
55: get_node_id_by_name(const char *name)
56: {
57: int i;
58:
59: for (i = 0; i < nrNodes; i++) {
60: if (!strcmp(name,gNodeNames[i])) {
61: return i;
62: }
63: }
64:
65: gNodes = realloc(gNodes, sizeof(struct dep_node)*(nrNodes+1));
66: gNodeNames = realloc(gNodeNames, sizeof(char*)*(nrNodes+1));
67: gNodes[nrNodes].nr_branch = 0;
68: gNodes[nrNodes].branch = 0;
69: gNodeNames[nrNodes] = strdup(name);
70: nrNodes++;
71: return nrNodes-1;
72: }
73:
74:
75:
76: static struct dep_branch *
77: find_branch(struct dep_node *node, xstr **strs, int nr_strs)
78: {
79: struct dep_branch *db;
80: int i, j;
81:
82: for (i = 0; i < node->nr_branch; i++) {
83: db = &node->branch[i];
84: if (nr_strs != db->nr_strs) {
85: continue ;
86: }
87: for (j = 0; j < nr_strs; j++) {
88: if (anthy_xstrcmp(db->str[j], strs[j])) {
89: goto fail;
90: }
91: }
92:
93: return db;
94: fail:;
95: }
96:
97: node->branch = realloc(node->branch,
98: sizeof(struct dep_branch)*(node->nr_branch+1));
99: db = &node->branch[node->nr_branch];
100: node->nr_branch++;
101: db->str = malloc(sizeof(xstr*)*nr_strs);
102: for (i = 0; i < nr_strs; i++) {
103: db->str[i] = strs[i];
104: }
105: db->nr_strs = nr_strs;
106: db->nr_transitions = 0;
107: db->transition = 0;
108: return db;
109: }
110:
111:
112:
113:
114:
115: static void
116: parse_transition(char *token, struct dep_transition *tr)
117: {
118: int ct = CT_NONE;
119: int pos = POS_NONE;
120: enum dep_class dc = DEP_NONE;
121: char *str = token;
122: tr->head_pos = POS_NONE;
123: tr->weak = 0;
124:
125: while (*token != '@') {
126: switch(*token){
127: case ':':
128: case '.':
129: tr->weak = 1;
130: break;
131: case 'C':
132:
133: switch (token[1]) {
134: case 'z': ct = CT_MIZEN; break;
135: case 'y': ct = CT_RENYOU; break;
136: case 's': ct = CT_SYUSI; break;
137: case 't': ct = CT_RENTAI; break;
138: case 'k': ct = CT_KATEI; break;
139: case 'm': ct = CT_MEIREI; break;
140: case 'g': ct = CT_HEAD; break;
141: }
142: token ++;
143: break;
144: case 'H':
145:
146: switch (token[1]) {
147: case 'n': tr->head_pos = POS_NOUN; break;
148: case 'v': tr->head_pos = POS_V; break;
149: case 'j': tr->head_pos = POS_AJV; break;
150: }
151: token ++;
152: break;
153: case 'S':
154:
155: switch (token[1]) {
156:
157: case 'f': dc = DEP_FUZOKUGO; break;
158: case 'k': dc = DEP_KAKUJOSHI; break;
159: case 'y': dc = DEP_RENYOU; break;
160: case 't': dc = DEP_RENTAI; break;
161: case 'e': dc = DEP_END; break;
162: case 'r': dc = DEP_RAW; break;
163: default: printf("unknown (S%c)\n", token[1]);
164: }
165: token ++;
166: break;
167: default:
168: printf("Unknown (%c) %s\n", *token, str);
169: break;
170: }
171: token ++;
172: }
173:
174: tr->next_node = get_node_id_by_name(token);
175:
176: tr->pos = pos;
177: tr->ct = ct;
178: tr->dc = dc;
179: }
180:
181:
182:
183:
184: static void
185: parse_dep(char **tokens, int nr)
186: {
187: int id, row = 0;
188: struct dep_branch *db;
189: struct dep_node *dn;
190: int nr_strs;
191: xstr **strs = alloca(sizeof(xstr*) * nr);
192:
193:
194: id = get_node_id_by_name(tokens[row]);
195: dn = &gNodes[id];
196: row ++;
197:
198: nr_strs = 0;
199:
200:
201: for (; row < nr && tokens[row][0] == '\"'; row++) {
202: char *s;
203: s = strdup(&tokens[row][1]);
204: s[strlen(s)-1] =0;
205: strs[nr_strs] = anthy_cstr_to_xstr(s, ANTHY_EUC_JP_ENCODING);
206: nr_strs ++;
207: free(s);
208: }
209:
210:
211: if (nr_strs == 0) {
212: char *s;
213: anthy_log(0, "node %s has a branch without any transition condition.\n",
214: tokens[0]);
215: s = strdup("");
216: strs[0] = anthy_cstr_to_xstr(s, ANTHY_EUC_JP_ENCODING);
217: nr_strs = 1;
218: free(s);
219: }
220:
221:
222: db = find_branch(dn, strs, nr_strs);
223: for ( ; row < nr; row++){
224: struct dep_transition *tr;
225: db->transition = realloc(db->transition,
226: sizeof(struct dep_transition)*
227: (db->nr_transitions+1));
228: tr = &db->transition[db->nr_transitions];
229: parse_transition(tokens[row], tr);
230: db->nr_transitions ++;
231: }
232: }
233:
234:
235: static void
236: check_nodes(void)
237: {
238: int i;
239: for (i = 1; i < nrNodes; i++) {
240: if (gNodes[i].nr_branch == 0) {
241: anthy_log(0, "node %s has no branch.\n", gNodeNames);
242: }
243: }
244: }
245:
246:
/*
 * Load the dependent-word definitions named by the "DEPWORD" configuration
 * entry and build the node table from them.
 *
 * Returns 0 on success, or -1 (after logging) when the entry is missing
 * or the file cannot be opened.
 */
static int
init_depword_tab(void)
{
  char **line_tokens;
  int nr_tokens;
  const char *path;

  /* Register "@" before anything else; on an empty table it gets id 0. */
  (void)get_node_id_by_name("@");

  path = anthy_conf_get_str("DEPWORD");
  if (path == NULL) {
    anthy_log(0, "Dependent word dictionary is unspecified.\n");
    return -1;
  }
  if (anthy_open_file(path) == -1) {
    anthy_log(0, "Failed to open dep word dict (%s).\n", path);
    return -1;
  }

  /* anthy_read_line() yields zero for as long as a line was read. */
  for (;;) {
    if (anthy_read_line(&line_tokens, &nr_tokens)) {
      break;
    }
    parse_dep(line_tokens, nr_tokens);
    anthy_free_line();
  }

  anthy_close_file();
  check_nodes();
  return 0;
}
276:
277:
278: static void
279: parse_indep(char **tokens, int nr)
280: {
281: if (nr < 2) {
282: printf("Syntex error in indepword defs"
283: " :%d.\n", anthy_get_line_number());
284: return ;
285: }
286: gRules = realloc(gRules, sizeof(struct wordseq_rule)*(nrRules+1));
287:
288:
289: gRules[nrRules].wt = anthy_init_wtype_by_name(tokens[0]);
290:
291:
292: gRules[nrRules].node_id = get_node_id_by_name(tokens[1]);
293:
294: if (verbose) {
295: printf("%d (%s)\n", nrRules, tokens[0]);
296: }
297:
298: nrRules ++;
299: }
300:
301:
/*
 * Load the independent-word definitions named by the "INDEPWORD"
 * configuration entry and build the rule table from them.
 *
 * Returns 0 on success, or -1 (after printing a message on stdout) when
 * the entry is missing or the file cannot be opened.
 */
static int
init_indep_word_seq_tab(void)
{
  char **line_tokens;
  int nr_tokens;
  const char *path = anthy_conf_get_str("INDEPWORD");

  if (path == NULL) {
    printf("independent word dict unspecified.\n");
    return -1;
  }
  if (anthy_open_file(path) == -1) {
    printf("Failed to open indep word dict (%s).\n", path);
    return -1;
  }

  /* anthy_read_line() yields zero for as long as a line was read. */
  for (;;) {
    if (anthy_read_line(&line_tokens, &nr_tokens)) {
      break;
    }
    parse_indep(line_tokens, nr_tokens);
    anthy_free_line();
  }

  anthy_close_file();
  return 0;
}
327:
328:
329:
330:
/*
 * Write one int to fp after converting it with anthy_dic_htonl()
 * (presumably host-to-network byte order; confirm against diclib).
 */
static void
write_nl(FILE* fp, int i)
{
  int converted = anthy_dic_htonl(i);

  fwrite(&converted, sizeof(converted), 1, fp);
}
337:
338: static void
339: write_transition(FILE* fp, struct dep_transition* transition)
340: {
341: write_nl(fp, transition->next_node);
342: write_nl(fp, transition->pos);
343: write_nl(fp, transition->ct);
344: write_nl(fp, transition->dc);
345: write_nl(fp, transition->head_pos);
346: write_nl(fp, transition->weak);
347: }
348:
349: static void
350: write_xstr(FILE* fp, xstr* str)
351: {
352: int i;
353: xchar c;
354: write_nl(fp, str->len);
355:
356: for (i = 0; i < str->len; i++) {
357: c = anthy_dic_htonl(str->str[i]);
358: fwrite(&c, sizeof(xchar), 1, fp);
359: }
360: }
361:
362: static void
363: write_branch(FILE* fp, struct dep_branch* branch)
364: {
365: int i;
366:
367: write_nl(fp, branch->nr_strs);
368: for (i = 0; i < branch->nr_strs; ++i) {
369: write_xstr(fp, branch->str[i]);
370: }
371:
372: write_nl(fp, branch->nr_transitions);
373: for (i = 0; i < branch->nr_transitions; ++i) {
374: write_transition(fp, &branch->transition[i]);
375: }
376: }
377:
378: static void
379: write_node(FILE* fp, struct dep_node* node)
380: {
381: int i;
382: write_nl(fp, node->nr_branch);
383: for (i = 0; i < node->nr_branch; ++i) {
384: write_branch(fp, &node->branch[i]);
385: }
386: }
387:
388: static void
389: write_wtype(FILE *fp, wtype_t wt)
390: {
391: fputc(anthy_wtype_get_pos(wt), fp);
392: fputc(anthy_wtype_get_cos(wt), fp);
393: fputc(anthy_wtype_get_scos(wt), fp);
394: fputc(anthy_wtype_get_cc(wt), fp);
395: fputc(anthy_wtype_get_ct(wt), fp);
396: fputc(anthy_wtype_get_wf(wt), fp);
397: fputc(0, fp);
398: fputc(0, fp);
399: }
400:
401: static void
402: write_file(const char* file_name)
403: {
404: int i;
405: FILE* fp = fopen(file_name, "w");
406: int* node_offset = malloc(sizeof(int) * nrNodes);
407:
408:
409: write_nl(fp, nrRules);
410: for (i = 0; i < nrRules; ++i) {
411: write_wtype(fp, gRules[i].wt);
412: write_nl(fp, gRules[i].node_id);
413: }
414:
415: write_nl(fp, nrNodes);
416:
417: for (i = 0; i < nrNodes; ++i) {
418: write_node(fp, &gNodes[i]);
419: }
420:
421: free(node_offset);
422: fclose(fp);
423: }
424:
425: int
426: main(int argc, char* argv[])
427: {
428:
429: anthy_conf_override("CONFFILE", "../anthy-conf");
430: anthy_conf_override("ANTHYDIR", SRCDIR "/../depgraph/");
431:
432: anthy_init_wtypes();
433: anthy_do_conf_init();
434:
435: init_depword_tab();
436:
437: init_indep_word_seq_tab();
438:
439: write_file("anthy.dep");
440:
441: return 0;
442: }