1:
2:
3:
4:
5:
6:
7:
8:
9: #include <stdio.h>
10: #include <stdlib.h>
11: #include <string.h>
12: #include <math.h>
13: #include "input_set.h"
14:
15: #define HASH_SIZE 1024
16:
17: struct int_map_node {
18: int key;
19: int val;
20: struct int_map_node *next;
21: };
22:
23: struct int_map {
24:
25: int nr;
26: struct int_map_node *hash_head;
27:
28: int array_size;
29: struct int_map_node **array;
30: };
31:
32: struct input_set {
33:
34: struct input_line *lines;
35: struct input_line *buckets[HASH_SIZE];
36:
37: struct int_map *feature_freq;
38: };
39:
40: static int
41: line_hash(const int *ar, int nr)
42: {
43: int i;
44: unsigned h = 0;
45: for (i = 0; i < nr; i++) {
46: h += ar[i];
47: }
48: return (h % HASH_SIZE);
49: }
50:
51: static struct input_line *
52: find_same_line(struct input_set *is, int *features, int nr)
53: {
54: struct input_line *il;
55: int h = line_hash(features, nr);
56: for (il = is->buckets[h]; il; il = il->next_in_hash) {
57: int i;
58: if (il->nr_features != nr) {
59: continue;
60: }
61: for (i = 0; i < nr; i++) {
62: if (il->features[i] != features[i]) {
63: break;
64: }
65: }
66: if (i >= nr) {
67: return il;
68: }
69: }
70: return NULL;
71: }
72:
73: static struct input_line *
74: add_line(struct input_set *is, int *features, int nr)
75: {
76: int i, h;
77: struct input_line *il;
78: il = malloc(sizeof(struct input_line));
79: il->nr_features = nr;
80: il->features = malloc(sizeof(int) * nr);
81: for (i = 0; i < nr; i++) {
82: il->features[i] = features[i];
83: }
84: il->weight = 0;
85: il->negative_weight = 0;
86:
87: il->next_line = is->lines;
88: is->lines = il;
89:
90: h = line_hash(features, nr);
91: il->next_in_hash = is->buckets[h];
92: is->buckets[h] = il;
93: return il;
94: }
95:
96: static void
97: add_feature_count(struct int_map *im, int nr, int *features, int weight)
98: {
99: int i;
100: for (i = 0; i < nr; i++) {
101: int f = features[i];
102: int v = int_map_peek(im, f);
103: int_map_set(im, f, v + weight);
104: }
105: }
106:
107:
108: void
109: input_set_set_features(struct input_set *is, int *features,
110: int nr, int weight)
111: {
112: struct input_line *il;
113: int abs_weight = abs(weight);
114:
115:
116: il = find_same_line(is, features, nr);
117: if (!il) {
118: il = add_line(is, features, nr);
119: }
120:
121: if (weight > 0) {
122: il->weight += weight;
123: } else {
124: il->negative_weight += abs_weight;
125: }
126:
127: add_feature_count(is->feature_freq, nr, features, abs_weight);
128: }
129:
130: struct input_set *
131: input_set_create(void)
132: {
133: int i;
134: struct input_set *is;
135: is = malloc(sizeof(struct input_set));
136: is->lines = NULL;
137:
138: for (i = 0; i < HASH_SIZE; i++) {
139: is->buckets[i] = NULL;
140: }
141:
142: is->feature_freq = int_map_new();
143:
144: return is;
145: }
146:
147: struct input_line *
148: input_set_get_input_line(struct input_set *is)
149: {
150: return is->lines;
151: }
152:
153: struct input_set *
154: input_set_filter(struct input_set *is,
155: double pos, double neg)
156: {
157: struct input_set *new_is = input_set_create();
158: struct input_line *il;
159: for (il = is->lines; il; il = il->next_line) {
160: if (il->weight > pos ||
161: il->negative_weight > neg) {
162: input_set_set_features(new_is, il->features,
163: il->nr_features, il->weight);
164: input_set_set_features(new_is, il->features,
165: il->nr_features, -il->negative_weight);
166: }
167: }
168: return new_is;
169: }
170:
171: void
172: input_set_output_feature_freq(FILE *fp, struct input_set *is)
173: {
174: int i;
175: struct int_map *im = is->feature_freq;
176: fprintf(fp, "0 %d\n", im->array_size);
177: int_map_flatten(im);
178: for (i = 0; i < im->array_size; i++) {
179: if (!im->array[i]) {
180: fprintf(fp, "0 0\n");
181: } else {
182: fprintf(fp, "%d %d\n", im->array[i]->key,
183: im->array[i]->val);
184: }
185: }
186: }
187:
188: struct int_map *
189: int_map_new(void)
190: {
191: int i;
192: struct int_map *im = malloc(sizeof(struct int_map));
193: im->nr = 0;
194: im->hash_head = malloc(sizeof(struct int_map_node) * HASH_SIZE);
195: for (i = 0; i < HASH_SIZE; i++) {
196: im->hash_head[i].next = NULL;
197: }
198: return im;
199: }
200:
201: static int
202: node_index(int idx)
203: {
204: return idx % HASH_SIZE;
205: }
206:
207: static struct int_map_node *
208: find_int_map_node(struct int_map *im, int idx)
209: {
210: struct int_map_node *node;
211: int h = node_index(idx);
212: for (node = im->hash_head[h].next; node; node = node->next) {
213: if (node->key == idx) {
214: return node;
215: }
216: }
217: return NULL;
218: }
219:
220: int
221: int_map_peek(struct int_map *im, int idx)
222: {
223: struct int_map_node *node = find_int_map_node(im, idx);
224: if (node) {
225: return node->val;
226: }
227: return 0;
228: }
229:
230: void
231: int_map_set(struct int_map *im, int idx, int val)
232: {
233: struct int_map_node *node = find_int_map_node(im, idx);
234: int h;
235: if (node) {
236: node->val = val;
237: return ;
238: }
239:
240: node = malloc(sizeof(struct int_map_node));
241: node->key = idx;
242: node->val = val;
243: h = node_index(idx);
244: node->next = im->hash_head[h].next;
245: im->hash_head[h].next = node;
246:
247: im->nr ++;
248: }
249:
250: void
251: int_map_flatten(struct int_map *im)
252: {
253: int i;
254: struct int_map_node *node;
255: int max_n = 0;
256:
257: im->array_size = im->nr * 2;
258: im->array = malloc(sizeof(struct int_map_node *) *
259: im->array_size);
260: for (i = 0; i < im->array_size; i++) {
261: im->array[i] = NULL;
262: }
263:
264: for (i = 0; i < HASH_SIZE; i++) {
265: for (node = im->hash_head[i].next; node; node = node->next) {
266: int n = 0;
267: while (1) {
268: int h;
269: h = node->key + n;
270: h %= im->array_size;
271: if (!im->array[h]) {
272: im->array[h] = node;
273: break;
274: }
275:
276: n++;
277: }
278: if (n > max_n) {
279: max_n = n;
280: }
281: }
282: }
283:
284: printf(" max_collision=%d\n", max_n);
285: }