1:
2:
3:
4:
5:
6:
7:
8:
9:
10:
11:
12:
13:
14:
15:
16:
17:
18:
19:
20:
21:
22:
23: #include <stdio.h>
24: #include <string.h>
25: #include <stdlib.h>
26:
27: #include <anthy/corpus.h>
28:
29: #define MAX_NR_VAL 8
30: #define BUCKET_SIZE 8192
31: #define MAX_COLLISION 8
32:
33:
34: struct node {
35: int nr;
36: int val[MAX_NR_VAL];
37: int flags;
38: };
39:
40:
41: struct element {
42:
43: int val;
44:
45: int idx;
46:
47: int next_idx;
48:
49: int flags;
50: };
51:
52:
53: struct bucket {
54:
55: int key;
56:
57: int first_idx;
58:
59: int last_idx;
60: };
61:
62: struct corpus {
63:
64: struct node *array;
65: int nr_node;
66: int array_size;
67:
68:
69: int nr_values;
70: struct element *elms;
71:
72: int nr_buckets;
73: struct bucket *buckets;
74:
75:
76: int bucket_collision;
77: };
78:
79: static void
80: corpus_setup_bucket(struct corpus *c, int nr)
81: {
82: int i;
83: free(c->buckets);
84:
85: c->nr_buckets = nr;
86: c->buckets = malloc(sizeof(struct bucket) * nr);
87: for (i = 0; i < nr; i++) {
88: c->buckets[i].key = -1;
89: c->buckets[i].first_idx = -1;
90: c->buckets[i].last_idx = -1;
91: }
92: }
93:
94: struct corpus *
95: corpus_new(void)
96: {
97: struct corpus *c = malloc(sizeof(*c));
98: c->nr_node = 0;
99: c->array_size = 0;
100: c->array = NULL;
101: c->nr_values = 0;
102: c->elms = NULL;
103: c->buckets = NULL;
104: c->bucket_collision = 0;
105: return c;
106: }
107:
108: static void
109: corpus_ensure_array(struct corpus *c, int nr)
110: {
111: int i, size;
112: if (c->array_size >= nr) {
113: return ;
114: }
115: size = nr * 2;
116: c->array = realloc(c->array, sizeof(struct node) * size);
117: for (i = c->array_size; i < size; i++) {
118: c->array[i].nr = 0;
119: }
120: c->array_size = nr;
121: }
122:
123: void
124: corpus_dump(struct corpus *c)
125: {
126: int i;
127: for (i = 0; i < c->nr_values; i++) {
128: if (c->elms[i].flags & ELM_WORD_BORDER) {
129: printf("%d:", i);
130: }
131: printf("%d(%d) ", c->elms[i].val, c->elms[i].next_idx);
132: }
133: printf("\n");
134: }
135:
136: static int
137: count_nr_valid_values(struct corpus *c)
138: {
139: int i;
140: int nr = 0;
141: for (i = 0; i < c->nr_node; i++) {
142: struct node *nd = &c->array[i];
143: if (nd->flags & ELM_INVALID) {
144: continue;
145: }
146: nr += nd->nr;
147: }
148: return nr;
149: }
150:
151: static void
152: corpus_build_flatten(struct corpus *c)
153: {
154: int i, j;
155: int idx = 0;
156: int nr_valid_elms = count_nr_valid_values(c);
157: c->elms = malloc(sizeof(struct element) * nr_valid_elms);
158: for (i = 0; i < c->nr_node; i++) {
159: struct node *nd = &c->array[i];
160: if (nd->flags & ELM_INVALID) {
161: continue;
162: }
163: for (j = 0; j < nd->nr; j++) {
164: c->elms[idx].val = nd->val[j];
165: c->elms[idx].next_idx = -1;
166: c->elms[idx].flags = nd->flags;
167: if (j == 0) {
168: c->elms[idx].flags |= ELM_WORD_BORDER;
169: }
170: c->elms[idx].idx = idx;
171: idx++;
172: }
173: }
174: }
175:
176: static struct bucket *
177: find_bucket(struct corpus *c, int val)
178: {
179: int i;
180: int h = val % c->nr_buckets;
181: for (i = 0; i < MAX_COLLISION; i++) {
182: struct bucket *bkt = &c->buckets[h];
183: if (bkt->key == val) {
184: return bkt;
185: }
186: if (bkt->key == -1) {
187: bkt->key = val;
188: return bkt;
189: }
190:
191: h ++;
192: h %= c->nr_buckets;
193: }
194: c->bucket_collision ++;
195: return NULL;
196: }
197:
198: static void
199: corpus_build_link(struct corpus *c)
200: {
201: int i;
202: for (i = 0; i < c->nr_values; i++) {
203: struct element *elm = &c->elms[i];
204: struct bucket *bkt = find_bucket(c, elm->val);
205: if (!bkt) {
206: continue;
207: }
208: if (bkt->first_idx < 0) {
209:
210: bkt->first_idx = c->elms[i].idx;
211: } else {
212: c->elms[bkt->last_idx].next_idx = c->elms[i].idx;
213: }
214: bkt->last_idx = c->elms[i].idx;
215: c->elms[i].next_idx = -1;
216: }
217: }
218:
219: void
220: corpus_build(struct corpus *c)
221: {
222: corpus_setup_bucket(c, BUCKET_SIZE);
223:
224: corpus_build_flatten(c);
225: corpus_build_link(c);
226:
227: }
228:
229: void
230: corpus_push_back(struct corpus *c, int *val, int nr, int flags)
231: {
232: struct node nd;
233: int i;
234: for (i = 0; i < nr; i++) {
235: nd.val[i] = val[i];
236: }
237: nd.nr = nr;
238: nd.flags = flags;
239:
240: corpus_ensure_array(c, c->nr_node+1);
241: c->array[c->nr_node] = nd;
242: c->nr_node++;
243: c->nr_values += nd.nr;
244: }
245:
246: void
247: corpus_write_bucket(FILE *fp, struct corpus *c)
248: {
249: int i;
250: fprintf(fp, "0 %d\n", c->nr_buckets);
251: for (i = 0; i < c->nr_buckets; i++) {
252: fprintf(fp, "%d,%d\n", (c->buckets[i].key & CORPUS_KEY_MASK),
253: c->buckets[i].first_idx);
254: }
255: printf(" %d collisions in corpus bucket\n", c->bucket_collision);
256: }
257:
258: void
259: corpus_write_array(FILE *fp, struct corpus *c)
260: {
261: int i;
262: fprintf(fp, "0 %d\n", c->nr_values);
263: for (i = 0; i < c->nr_values; i++) {
264: struct element *elm = &c->elms[i];
265: int val;
266: val = elm->val;
267: val &= CORPUS_KEY_MASK;
268: if (elm->flags & ELM_BOS) {
269: val |= ELM_BOS;
270: }
271: if (elm->flags & ELM_WORD_BORDER) {
272: val |= ELM_WORD_BORDER;
273: }
274: fprintf(fp, "%d,%d\n", val,
275: c->elms[i].next_idx);
276: }
277: }