1:
2:
3:
4:
5:
6:
7:
8:
9:
10:
11:
12:
13:
14:
15:
16:
17:
18:
19:
20:
21:
22:
23:
24:
25:
26:
27:
28:
29:
30:
31:
32:
33:
34: #include <stdio.h>
35: #include <string.h>
36: #include <stdlib.h>
37: #include <math.h>
38:
39: #include <anthy/anthy.h>
40: #include <anthy/xstr.h>
41: #include <anthy/feature_set.h>
42: #include <anthy/diclib.h>
43: #include "input_set.h"
44: #include <anthy/corpus.h>
45:
/* Minimum number of feature columns emitted per dumped line (see dump_line). */
#define FEATURE_SET_SIZE NR_EM_FEATURES

/* Capacity of the fixed feature buffer held by struct array. */
#define ARRAY_SIZE 16

/* Fixed-capacity list of integer feature values. */
struct array {
  int len;            /* number of entries of f[] currently in use */
  int f[ARRAY_SIZE];  /* the feature values themselves */
};
54:
/* Upper bound on the number of segments tracked for one sentence. */
#define MAX_SEGMENT 64

/* Hash values recorded for one segment of a sentence. */
struct segment_info {
  int orig_hash;  /* hash taken from a line tagged '~' (see set_hash) */
  int hash;       /* hash taken from any other line */
};

/* Segments collected for the sentence currently being read. */
struct sentence_info {
  int nr_segments;                         /* number of segs[] in use */
  struct segment_info segs[MAX_SEGMENT];
};
66:
67:
68: struct input_info {
69:
70: struct input_set *cand_is;
71:
72: struct input_set *seg_is;
73:
74: struct corpus *indep_corpus;
75:
76:
77: struct array missed_cand_features;
78:
79:
80: int nth_input_file;
81:
82:
83: int nr_sentences;
84: int nr_connections;
85: };
86:
87: static struct input_info *
88: init_input_info(void)
89: {
90: struct input_info *m;
91: m = malloc(sizeof(struct input_info));
92: m->seg_is = input_set_create();
93: m->cand_is = input_set_create();
94: m->indep_corpus = corpus_new();
95: m->missed_cand_features.len = 0;
96: m->nth_input_file = 0;
97: m->nr_sentences = 0;
98: m->nr_connections = 0;
99: return m;
100: }
101:
102:
103: static void
104: parse_features(struct array *features, char *s)
105: {
106: char *tok, *str = s;
107: tok = strtok(str, ",");
108: features->len = 0;
109: do {
110: features->f[features->len] = atoi(tok);
111: features->len++;
112: tok = strtok(NULL, ",");
113: } while(tok);
114: }
115:
116: static void
117: add_seg_struct_info(struct input_info *m,
118: struct array *features,
119: int weight)
120: {
121: input_set_set_features(m->cand_is, features->f, features->len, weight);
122: }
123:
124: static void
125: set_hash(struct sentence_info *sinfo, int error_class,
126: char tag, int hash)
127: {
128: if (tag == '~') {
129: sinfo->segs[sinfo->nr_segments].orig_hash = hash;
130: } else {
131: sinfo->segs[sinfo->nr_segments].hash = hash;
132: }
133: if (!error_class) {
134: sinfo->nr_segments++;
135: }
136: }
137:
138: static int
139: compare_array(struct array *a1, struct array *a2)
140: {
141: int i;
142: if (a1->len != a2->len) {
143: return 1;
144: }
145: for (i = 0; i < a1->len; i++) {
146: if (a1->f[i] != a2->f[i]) {
147: return 1;
148: }
149: }
150: return 0;
151: }
152:
153:
154: static void
155: parse_indep(struct input_info *m, struct sentence_info *sinfo,
156: char *line, char *buf, int error_class)
157: {
158: struct array features;
159: char *s;
160: int weight = 1;
161:
162: s = strstr(buf, "features=");
163: if (s) {
164: s += 9;
165: parse_features(&features, s);
166: m->nr_connections ++;
167: }
168: s = strstr(buf, "hash=");
169: if (s) {
170: s += 5;
171: set_hash(sinfo, error_class, line[0], atoi(s));
172: }
173:
174:
175: if (error_class) {
176: if (line[0] == '~') {
177:
178: m->missed_cand_features = features;
179: }
180: if (line[0] == '!') {
181:
182: input_set_set_features(m->seg_is, features.f, features.len, -weight);
183: }
184: } else {
185:
186: input_set_set_features(m->seg_is, features.f, features.len, weight);
187:
188: if (m->missed_cand_features.len != 0 &&
189: compare_array(&features, &m->missed_cand_features)) {
190:
191: add_seg_struct_info(m, &m->missed_cand_features, -weight);
192: }
193: m->missed_cand_features.len = 0;
194: add_seg_struct_info(m, &features, weight);
195: }
196: }
197:
198: static void
199: init_sentence_info(struct sentence_info *sinfo)
200: {
201: int i;
202: sinfo->nr_segments = 0;
203: for (i = 0; i < MAX_SEGMENT; i++) {
204: sinfo->segs[i].orig_hash = 0;
205: sinfo->segs[i].hash = 0;
206: }
207: }
208:
209:
210:
211: static void
212: complete_sentence_info(struct input_info *m, struct sentence_info *sinfo)
213: {
214: int i;
215: if (m->nth_input_file > 0) {
216:
217: return ;
218: }
219: for (i = 0; i < sinfo->nr_segments; i++) {
220: int flags = ELM_NONE;
221: int nr = 1;
222: int buf[2];
223: if (i == 0) {
224: flags |= ELM_BOS;
225: }
226:
227: buf[0] = sinfo->segs[i].hash;
228: if (sinfo->segs[i].orig_hash) {
229:
230:
231:
232:
233: }
234: corpus_push_back(m->indep_corpus, buf, nr, flags);
235: }
236: }
237:
238: static void
239: do_read_file(struct input_info *m, FILE *fp)
240: {
241: char line[1024];
242: struct sentence_info sinfo;
243:
244: init_sentence_info(&sinfo);
245:
246: while (fgets(line, 1024, fp)) {
247: char *buf = line;
248: int error_class = 0;
249: if (!strncmp(buf, "eos", 3)) {
250: m->nr_sentences ++;
251: complete_sentence_info(m, &sinfo);
252: init_sentence_info(&sinfo);
253: }
254: if (line[0] == '~' || line[0] == '!' ||
255: line[0] == '^') {
256: buf ++;
257: error_class = 1;
258: }
259: if (!strncmp(buf, "indep_word", 10) ||
260: !strncmp(buf, "eos", 3)) {
261: parse_indep(m, &sinfo, line, buf, error_class);
262: }
263: }
264: }
265:
/*
 * Open the file named fn and feed its contents to do_read_file().
 *
 * A file that cannot be opened is skipped, but the failure is reported on
 * stderr (same message as convert_data) instead of being silently ignored.
 */
static void
read_file(struct input_info *m, char *fn)
{
  FILE *ifp = fopen(fn, "r");

  if (!ifp) {
    fprintf(stderr, "failed to open (%s)\n", fn);
    return;
  }
  do_read_file(m, ifp);
  fclose(ifp);
}
277:
/* Write i to fp as one raw int after conversion with anthy_dic_htonl(). */
static void
write_nl(FILE *fp, int i)
{
  int converted = anthy_dic_htonl(i);

  fwrite(&converted, sizeof(converted), 1, fp);
}
284:
285: static void
286: dump_line(FILE *ofp, struct input_line *il)
287: {
288: int i;
289: for (i = 0; i < FEATURE_SET_SIZE || i < il->nr_features; i++) {
290: if (i) {
291: fprintf(ofp, ", ");
292: }
293: if (i < il->nr_features) {
294: fprintf(ofp, "%d", il->features[i]);
295: } else {
296: fprintf(ofp, "0");
297: }
298: }
299: fprintf(ofp,",%d,%d\n", (int)il->negative_weight, (int)il->weight);
300: }
301:
302: static int
303: compare_line(const void *p1, const void *p2)
304: {
305: const struct input_line *const *il1 = p1;
306: const struct input_line *const *il2 = p2;
307: int i;
308: for (i = 0; i < (*il1)->nr_features &&
309: i < (*il2)->nr_features; i++) {
310: if ((*il1)->features[i] !=
311: (*il2)->features[i]) {
312: return (*il1)->features[i] - (*il2)->features[i];
313: }
314: }
315: return (*il1)->nr_features - (*il2)->nr_features;
316: }
317:
318: static void
319: dump_features(FILE *ofp, struct input_set *is)
320: {
321: struct input_line *il, **lines;
322: int i, nr = 0;
323: int weight = 0;
324:
325:
326: for (il = input_set_get_input_line(is); il; il = il->next_line) {
327: nr ++;
328: weight += (int)il->weight;
329: }
330:
331: lines = malloc(sizeof(struct input_line *) * nr);
332: for (il = input_set_get_input_line(is), i = 0; i < nr;
333: i++, il = il->next_line) {
334: lines[i] = il;
335: }
336:
337: qsort(lines, nr, sizeof(struct input_line *), compare_line);
338:
339: fprintf(ofp, "%d %d total_line_weight,count\n", weight, nr);
340:
341: for (i = 0; i < nr; i++) {
342: dump_line(ofp, lines[i]);
343: }
344: }
345:
346: static void
347: dump_input_info(FILE *ofp, struct input_info *m)
348: {
349: fprintf(ofp, "section anthy.trans_info ");
350: dump_features(ofp, m->seg_is);
351: fprintf(ofp, "section anthy.cand_info ");
352: dump_features(ofp, m->cand_is);
353: fprintf(ofp, "section anthy.corpus_bucket ");
354: corpus_write_bucket(ofp, m->indep_corpus);
355: fprintf(ofp, "section anthy.corpus_array ");
356: corpus_write_array(ofp, m->indep_corpus);
357:
358: fprintf(ofp, "section anthy.feature_info ");
359: input_set_output_feature_freq(ofp, m->seg_is);
360: }
361:
/*
 * Convert one comma separated text line to binary: every field is parsed
 * with atoi() and written through write_nl().
 *
 * buf is modified in place by strtok().  A line that yields no token
 * (empty, or commas only) writes nothing instead of passing NULL to atoi().
 */
static void
convert_line(FILE *ofp, char *buf)
{
  char *tok;

  for (tok = strtok(buf, ","); tok; tok = strtok(NULL, ",")) {
    write_nl(ofp, atoi(tok));
  }
}
373:
374: static void
375: convert_file(FILE *ifp)
376: {
377: char buf[1024];
378: FILE *ofp = NULL;
379: while (fgets(buf, 1024, ifp)) {
380:
381: if (buf[0] == '#') {
382: continue;
383: }
384: if (!strncmp("section", buf, 7)) {
385: int w, n, i;
386: char fn[1024];
387: if (ofp) {
388: fclose(ofp);
389: ofp = NULL;
390: }
391: sscanf(buf, "section %s %d %d", fn, &w, &n);
392: ofp = fopen(fn, "w");
393: if (!ofp) {
394: fprintf(stderr, "failed to open (%s)\n", fn);
395: abort();
396: }
397: write_nl(ofp, w);
398: write_nl(ofp, n);
399: for (i = 0; i < NR_EM_FEATURES; i++) {
400: write_nl(ofp, 0);
401: }
402: } else {
403: convert_line(ofp, buf);
404: }
405: }
406: if (ofp) {
407: fclose(ofp);
408: }
409: }
410:
/*
 * Run convert_file() on each of the nr_fn named files.
 * A file that cannot be opened is reported on stderr and skipped.
 */
static void
convert_data(int nr_fn, char **fns)
{
  int idx = 0;

  while (idx < nr_fn) {
    char *name = fns[idx++];
    FILE *ifp = fopen(name, "r");

    if (ifp) {
      convert_file(ifp);
      fclose(ifp);
    } else {
      fprintf(stderr, "failed to open (%s)\n", name);
    }
  }
}
427:
428:
/* Number of hash buckets in a string_pool. */
#define STRING_HASH_SIZE 256

/* One pooled string, chained within its hash bucket. */
struct string_node {
  int key;                        /* hash of str; 0 until it is computed */
  char *str;                      /* private copy of the string */
  struct string_node *next_hash;  /* next node in the same bucket */
};

/* Set of unique strings, bucketed by string_hash(). */
struct string_pool {
  int nr;                                     /* number of stored nodes */
  struct string_node hash[STRING_HASH_SIZE];  /* bucket heads; only next_hash is used */
  struct string_node **array;                 /* nodes sorted by key, set by string_pool_sort() */
};
440: struct resize_info {
441: char *indep;
442: int valid;
443: };
444: struct extract_stat {
445: int nr;
446: struct resize_info info[MAX_SEGMENT];
447: };
448:
449: static void
450: string_pool_init(struct string_pool *sp)
451: {
452: int i;
453: for (i = 0; i < STRING_HASH_SIZE; i++) {
454: sp->hash[i].next_hash = NULL;
455: }
456: sp->nr = 0;
457: }
458:
459: static int
460: compare_string_node(const void *p1, const void *p2)
461: {
462: const struct string_node *const *n1 = p1;
463: const struct string_node *const *n2 = p2;
464: return (*n1)->key -(*n2)->key;
465: }
466:
467: static void
468: string_pool_sort(struct string_pool *sp)
469: {
470: int idx, h;
471: sp->array = malloc(sizeof(struct string_node *) * sp->nr);
472: for (idx = 0, h = 0; h < STRING_HASH_SIZE; h++) {
473: struct string_node *node;
474: for (node = sp->hash[h].next_hash; node; node = node->next_hash) {
475: sp->array[idx] = node;
476: idx ++;
477: }
478: }
479:
480: qsort(sp->array, sp->nr, sizeof(struct string_node *), compare_string_node);
481: }
482:
483: static void
484: string_pool_dump(FILE *ofp, struct string_pool *sp)
485: {
486: int i;
487: fprintf(ofp, "section anthy.weak_words 0 %d\n", sp->nr);
488: for (i = 0; i < sp->nr; i++) {
489: fprintf(ofp, "%d\n", sp->array[i]->key);
490: }
491: }
492:
493: static unsigned int
494: string_hash(const unsigned char *str)
495: {
496: unsigned int h = 0;
497: while (*str) {
498: h += *str;
499: h *= 13;
500: str ++;
501: }
502: return h % STRING_HASH_SIZE;
503: }
504:
505: static struct string_node *
506: find_string_node(struct string_pool *sp, const char *str)
507: {
508: int h = (int)string_hash((const unsigned char *)str);
509: struct string_node *node;
510: for (node = sp->hash[h].next_hash; node; node = node->next_hash) {
511: if (!strcmp(str, node->str)) {
512: return node;
513: }
514: }
515:
516: node = malloc(sizeof(*node));
517: node->str = strdup(str);
518: node->key = 0;
519: node->next_hash = sp->hash[h].next_hash;
520: sp->hash[h].next_hash = node;
521: sp->nr ++;
522: return node;
523: }
524:
525: static void
526: flush_extract_stat(struct extract_stat *es, struct string_pool *sp)
527: {
528: int i;
529: for (i = 0; i < es->nr; i++) {
530: if (es->info[i].valid) {
531: struct string_node *node;
532: node = find_string_node(sp, es->info[i].indep);
533: if (node->key == 0) {
534: xstr *xs = anthy_cstr_to_xstr(node->str, ANTHY_EUC_JP_ENCODING);
535: node->key = anthy_xstr_hash(xs);
536: anthy_free_xstr(xs);
537: }
538:
539: }
540: free(es->info[i].indep);
541: es->info[i].indep = NULL;
542: }
543: es->nr = 0;
544: }
545:
/*
 * Return the part of buf that follows the second space after the first
 * '#', with a trailing newline removed.
 *
 * buf is modified in place and the result points into it.  Returns NULL
 * when buf has no '#' or fewer than two spaces after it.
 *
 * Only a real trailing '\n' is stripped: an empty field no longer causes a
 * write in front of the returned pointer, and a line without a final
 * newline keeps its last character.
 */
static char *
get_indep_part(char *buf)
{
  size_t len;
  char *c = strchr(buf, '#');

  if (!c) {
    return NULL;
  }
  c = strchr(c, ' ');
  if (!c) {
    return NULL;
  }
  c++;
  c = strchr(c, ' ');
  if (!c) {
    return NULL;
  }
  c++;
  len = strlen(c);
  if (len > 0 && c[len - 1] == '\n') {
    c[len - 1] = '\0';
  }
  return c;
}
568:
569: static