(linenum→info "unix/slp.c:2238")

anthy/9100e/calctrans/corpus.c

    1: /*
    2:  * 例文の中を検索できるデータ構造を作る
    3:  * 現時点では例文をすべて入れているが、そのうちフィルターすることも考えられる
    4:  *
    5:  * Copyright (C) 2007 TABATA Yusuke
    6:  *
    7:  */
    8: /*
    9:   This library is free software; you can redistribute it and/or
   10:   modify it under the terms of the GNU Lesser General Public
   11:   License as published by the Free Software Foundation; either
   12:   version 2 of the License, or (at your option) any later version.
   13: 
   14:   This library is distributed in the hope that it will be useful,
   15:   but WITHOUT ANY WARRANTY; without even the implied warranty of
   16:   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   17:   Lesser General Public License for more details.
   18: 
   19:   You should have received a copy of the GNU Lesser General Public
   20:   License along with this library; if not, write to the Free Software
   21:   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
   22:  */
   23: #include <stdio.h>
   24: #include <string.h>
   25: #include <stdlib.h>
   26: 
   27: #include <anthy/corpus.h>
   28: 
   29: #define MAX_NR_VAL 8
   30: #define BUCKET_SIZE 8192
   31: #define MAX_COLLISION 8
   32: 
   33: /* word in source */
   34: struct node {
   35:   int nr;
   36:   int val[MAX_NR_VAL];
   37:   int flags;
   38: };
   39: 
   40: /* word feature in corpus file */
   41: struct element {
   42:   /* hash値 */
   43:   int val;
   44:   /* 有効な(ELM_INVALIDの無い)エントリとしてのindex */
   45:   int idx;
   46:   /* このhash値の次の出現場所 */
   47:   int next_idx;
   48:   /**/
   49:   int flags;
   50: };
   51: 
   52: /* index */
   53: struct bucket {
   54:   /* 検索のキー */
   55:   int key;
   56:   /* 最初の出現場所 */
   57:   int first_idx;
   58:   /**/
   59:   int last_idx;
   60: };
   61: 
   62: struct corpus {
   63:   /**/
   64:   struct node *array;
   65:   int nr_node;
   66:   int array_size;
   67: 
   68:   /**/
   69:   int nr_values;
   70:   struct element *elms;
   71:   /**/
   72:   int nr_buckets;
   73:   struct bucket *buckets;
   74: 
   75:   /**/
   76:   int bucket_collision;
   77: };
   78: 
   79: static void
   80: corpus_setup_bucket(struct corpus *c, int nr)
   81: {
   82:   int i;
   83:   free(c->buckets);
   84:   /**/
   85:   c->nr_buckets = nr;
   86:   c->buckets = malloc(sizeof(struct bucket) * nr);
   87:   for (i = 0; i < nr; i++) {
   88:     c->buckets[i].key = -1;
   89:     c->buckets[i].first_idx = -1;
   90:     c->buckets[i].last_idx = -1;
   91:   }
   92: }
   93: 
   94: struct corpus *
   95: corpus_new(void)
   96: {
   97:   struct corpus *c = malloc(sizeof(*c));
   98:   c->nr_node = 0;
   99:   c->array_size = 0;
  100:   c->array = NULL;
  101:   c->nr_values = 0;
  102:   c->elms = NULL;
  103:   c->buckets = NULL;
  104:   c->bucket_collision = 0;
  105:   return c;
  106: }
  107: 
  108: static void
  109: corpus_ensure_array(struct corpus *c, int nr)
  110: {
  111:   int i, size;
  112:   if (c->array_size >= nr) {
  113:     return ;
  114:   }
  115:   size = nr * 2;
  116:   c->array = realloc(c->array, sizeof(struct node) * size);
  117:   for (i = c->array_size; i < size; i++) {
  118:     c->array[i].nr = 0;
  119:   }
  120:   c->array_size = nr;
  121: }
  122: 
  123: void
  124: corpus_dump(struct corpus *c)
  125: {
  126:   int i;
  127:   for (i = 0; i < c->nr_values; i++) {
  128:     if (c->elms[i].flags & ELM_WORD_BORDER) {
  129:       printf("%d:", i);
  130:     }
  131:     printf("%d(%d) ", c->elms[i].val, c->elms[i].next_idx);
  132:   }
  133:   printf("\n");
  134: }
  135: 
  136: static int
  137: count_nr_valid_values(struct corpus *c)
  138: {
  139:   int i;
  140:   int nr = 0;
  141:   for (i = 0; i < c->nr_node; i++) {
  142:     struct node *nd = &c->array[i];
  143:     if (nd->flags & ELM_INVALID) {
  144:       continue;
  145:     }
  146:     nr += nd->nr;
  147:   }
  148:   return nr;
  149: }
  150: 
  151: static void
  152: corpus_build_flatten(struct corpus *c)
  153: {
  154:   int i, j;
  155:   int idx = 0;
  156:   int nr_valid_elms = count_nr_valid_values(c);
  157:   c->elms = malloc(sizeof(struct element) * nr_valid_elms);
  158:   for (i = 0; i < c->nr_node; i++) {
  159:     struct node *nd = &c->array[i];
  160:     if (nd->flags & ELM_INVALID) {
  161:       continue;
  162:     }
  163:     for (j = 0; j < nd->nr; j++) {
  164:       c->elms[idx].val = nd->val[j];
  165:       c->elms[idx].next_idx = -1;
  166:       c->elms[idx].flags = nd->flags;
  167:       if (j == 0) {
  168:         c->elms[idx].flags |= ELM_WORD_BORDER;
  169:       }
  170:       c->elms[idx].idx = idx;
  171:       idx++;
  172:     }
  173:   }
  174: }
  175: 
  176: static struct bucket *
  177: find_bucket(struct corpus *c, int val)
  178: {
  179:   int i;
  180:   int h = val % c->nr_buckets;
  181:   for (i = 0; i < MAX_COLLISION; i++) {
  182:     struct bucket *bkt = &c->buckets[h];
  183:     if (bkt->key == val) {
  184:       return bkt;
  185:     }
  186:     if (bkt->key == -1) {
  187:       bkt->key = val;
  188:       return bkt;
  189:     }
  190:     /**/
  191:     h ++;
  192:     h %= c->nr_buckets;
  193:   }
  194:   c->bucket_collision ++;
  195:   return NULL;
  196: }
  197: 
  198: static void
  199: corpus_build_link(struct corpus *c)
  200: {
  201:   int i;
  202:   for (i = 0; i < c->nr_values; i++) {
  203:     struct element *elm = &c->elms[i];
  204:     struct bucket *bkt = find_bucket(c, elm->val);
  205:     if (!bkt) {
  206:       continue;
  207:     }
  208:     if (bkt->first_idx < 0) {
  209:       /* 最初の出現 */
  210:       bkt->first_idx = c->elms[i].idx;
  211:     } else {
  212:       c->elms[bkt->last_idx].next_idx = c->elms[i].idx;
  213:     }
  214:     bkt->last_idx = c->elms[i].idx;
  215:     c->elms[i].next_idx = -1;
  216:   }
  217: }
  218: 
  219: void
  220: corpus_build(struct corpus *c)
  221: {
  222:   corpus_setup_bucket(c, BUCKET_SIZE);
  223:   /**/
  224:   corpus_build_flatten(c);
  225:   corpus_build_link(c);
  226:   /**/
  227: }
  228: 
  229: void
  230: corpus_push_back(struct corpus *c, int *val, int nr, int flags)
  231: {
  232:   struct node nd;
  233:   int i;
  234:   for (i = 0; i < nr; i++) {
  235:     nd.val[i] = val[i];
  236:   }
  237:   nd.nr = nr;
  238:   nd.flags = flags;
  239:   /**/
  240:   corpus_ensure_array(c, c->nr_node+1);
  241:   c->array[c->nr_node] = nd;
  242:   c->nr_node++;
  243:   c->nr_values += nd.nr;
  244: }
  245: 
  246: void
  247: corpus_write_bucket(FILE *fp, struct corpus *c)
  248: {
  249:   int i;
  250:   fprintf(fp, "0 %d\n", c->nr_buckets);
  251:   for (i = 0; i < c->nr_buckets; i++) {
  252:     fprintf(fp, "%d,%d\n", (c->buckets[i].key & CORPUS_KEY_MASK),
  253:             c->buckets[i].first_idx);
  254:   }
  255:   printf(" %d collisions in corpus bucket\n", c->bucket_collision);
  256: }
  257: 
  258: void
  259: corpus_write_array(FILE *fp, struct corpus *c)
  260: {
  261:   int i;
  262:   fprintf(fp, "0 %d\n", c->nr_values);
  263:   for (i = 0; i < c->nr_values; i++) {
  264:     struct element *elm = &c->elms[i];
  265:     int val;
  266:     val = elm->val;
  267:     val &= CORPUS_KEY_MASK;
  268:     if (elm->flags & ELM_BOS) {
  269:       val |= ELM_BOS;
  270:     }
  271:     if (elm->flags & ELM_WORD_BORDER) {
  272:       val |= ELM_WORD_BORDER;
  273:     }
  274:     fprintf(fp, "%d,%d\n", val,
  275:             c->elms[i].next_idx);
  276:   }
  277: }
Syntax (Markdown)