(linenum→info "unix/slp.c:2238")

anthy/9100e/src-worddic/texttrie.c

    1: /*
    2:  * DEPRECATED, it is too hard to debug.
    3:  * you may use textdict instead
    4:  *
    5:  * Trie in Text
    6:  *
    7:  * *issues
    8:  *  +correct API
    9:  *   -iterator vs callback
   10:  *  +robustness
   11:  *   -error detection
   12:  *   -auto correction
   13:  *   -concurrent access
   14:  *  +efficiency
   15:  *   -lower memory consumption
   16:  *   -disk space?
   17:  *
   18:  * on some file system like jffs2 on linux, writable mmap
   19:  * is not allowed, though you can write it.
   20:  *
   21:  */
   22: /*
   23:  * API
   24:  *  anthy_trie_open()
   25:  *  anthy_trie_close()
   26:  *  anthy_trie_add()
   27:  *  anthy_trie_find()
   28:  *  anthy_trie_delete()
   29:  *  anthy_trie_find_next_key()
   30:  *  anthy_trie_find_prefix()
   31:  *  anthy_trie_print_array()
   32:  *
   33:  * Copyright (C) 2005-2006 TABATA Yusuke
   34:  *
   35:  */
   36: /*
   37:   This library is free software; you can redistribute it and/or
   38:   modify it under the terms of the GNU Lesser General Public
   39:   License as published by the Free Software Foundation; either
   40:   version 2 of the License, or (at your option) any later version.
   41: 
   42:   This library is distributed in the hope that it will be useful,
   43:   but WITHOUT ANY WARRANTY; without even the implied warranty of
   44:   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   45:   Lesser General Public License for more details.
   46: 
   47:   You should have received a copy of the GNU Lesser General Public
   48:   License along with this library; if not, write to the Free Software
   49:   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
   50:  */
   51: /* open & mmap */
   52: #include <unistd.h>
   53: #include <sys/types.h>
   54: #include <sys/stat.h>
   55: #include <fcntl.h>
   56: /**/
   57: #include <stdlib.h>
   58: #include <stdio.h>
   59: #include <string.h>
   60: #include <ctype.h>
   61: #include <anthy/texttrie.h>
   62: #include <anthy/filemap.h>
   63: #include "dic_main.h"
   64: 
   65: /* configs */
   66: #define OBJ_LEN 20
   67: #define LINE_LEN 32
   68: #define EXPAND_COUNT 16
   69: 
   70: /* cell type */
   71: #define TT_SUPER 0
   72: #define TT_UNUSED 1
   73: #define TT_ALLOCED 2
   74: #define TT_NODE 3
   75: #define TT_BODY 4
   76: #define TT_TAIL 5
   77: 
   78: /* cell structure */
   79: struct cell {
   80:   /* (common) type */
   81:   int type;
   82:   /* union */
   83:   union {
   84:     /* unused */
   85:     int next_unused;
   86:     /* super */
   87:     struct {
   88:       int first_unused;
   89:       int root_cell;
   90:       int size;
   91:       int serial;
   92:     } super;
   93:     /* node */
   94:     struct {
   95:       int key;
   96:       int next;
   97:       int child;
   98:       int body;
   99:       int parent;
  100:     } node;
  101:     /* body */
  102:     struct {
  103:       int owner;
  104:       char *obj;
  105:     } body;
  106:     /* tail */
  107:     struct {
  108:       char *obj;
  109:       int prev;
  110:     } tail;
  111:   } u;
  112:   /* body & tail */
  113:   int next_tail;
  114: };
  115: 
  116: struct text_trie {
  117:   /**/
  118:   int fatal;
  119:   /**/
  120:   char *fn;
  121:   FILE *wfp;
  122:   struct filemapping *mapping;
  123:   char *ptr;
  124:   /**/
  125:   struct cell super;
  126:   int valid_super;
  127: };
  128: 
  129: struct path {
  130:   /**/
  131:   const char *key_str;
  132:   /**/
  133:   int max_len;
  134:   int *path;
  135:   int len;
  136:   int cur;
  137: };
  138: 
  139: static void
  140: print_super_cell(struct cell *c)
  141: {
  142:   printf("super first_unused=%d, root_cell=%d, size=%d, serial=%d\n",
  143:          c->u.super.first_unused, c->u.super.root_cell,
  144:          c->u.super.size, c->u.super.serial);
  145: }
  146: 
  147: static void
  148: print_alloced_cell(void)
  149: {
  150:   printf("alloc-ed\n");
  151: }
  152: 
  153: static void
  154: print_node_cell(struct cell *c)
  155: {
  156:   printf("node key=%d", c->u.node.key);
  157:   if (c->u.node.key > 0 && isprint(c->u.node.key)) {
  158:     printf("(%c)", c->u.node.key);
  159:   }
  160:   printf(" parent=%d next=%d child=%d body=%d\n",
  161:          c->u.node.parent, c->u.node.next, c->u.node.child, c->u.node.body);
  162: }
  163: 
  164: static void
  165: print_unused_cell(struct cell *c)
  166: {
  167:   printf("unused next_unused=%d\n",
  168:          c->u.next_unused);
  169: }
  170: 
  171: static void
  172: print_body_cell(struct cell *c)
  173: {
  174:   printf("body object=(%s), owner=%d, next_tail=%d\n",
  175:          c->u.body.obj, c->u.body.owner, c->next_tail);
  176: }
  177: 
  178: static void
  179: print_tail_cell(struct cell *c)
  180: {
  181:   printf("tail object=(%s), prev=%d, next_tail=%d\n",
  182:          c->u.tail.obj, c->u.tail.prev, c->next_tail);
  183: }
  184: 
  185: static void
  186: print_cell(int idx, struct cell *c)
  187: {
  188:   if (!c) {
  189:     printf("idx =%d(null cell):\n", idx);
  190:     return ;
  191:   }
  192:   printf("idx=%d:", idx);
  193:   switch (c->type) {
  194:   case TT_SUPER:
  195:     print_super_cell(c);
  196:     break;
  197:   case TT_ALLOCED:
  198:     print_alloced_cell();
  199:     break;
  200:   case TT_NODE:
  201:     print_node_cell(c);
  202:     break;
  203:   case TT_UNUSED:
  204:     print_unused_cell(c);
  205:     break;
  206:   case TT_BODY:
  207:     print_body_cell(c);
  208:     break;
  209:   case TT_TAIL:
  210:     print_tail_cell(c);
  211:     break;
  212:   default:
  213:     printf("unknown\n");
  214:   }
  215: }
  216: 
  217: static void
  218: path_setup(struct path *path, const char *key, int len, int *buf)
  219: {
  220:   unsigned char *p = (unsigned char *)key;
  221:   path->key_str = key;
  222:   path->max_len = len;
  223:   path->path = buf;
  224:   path->len = 0;
  225:   path->cur = 0;
  226:   /**/
  227:   while (*p) {
  228:     path->path[path->len] = p[0] * 256 + p[1];
  229:     path->len ++;
  230:     p++;
  231:     if (p[0]) {
  232:       p++;
  233:     }
  234:   }
  235: }
  236: 
  237: static void
  238: path_copy_to_str(struct path *path, char *str, int buf_len)
  239: {
  240:   unsigned char *p = (unsigned char *)str;
  241:   int i, o;
  242:   for (i = 0, o = 0; i < path->cur && o < buf_len - 2; i++) {
  243:     p[o] = (path->path[i]>>8)&255;
  244:     p[o+1] = path->path[i]&255;
  245:     o += 2;
  246:   }
  247:   p[o] = 0;
  248: }
  249: 
  250: static int
  251: sput_int(char *buf, int num)
  252: {
  253:   unsigned char *tmp = (unsigned char *)buf;
  254:   tmp[0] = (num>>24)&255;
  255:   tmp[1] = (num>>16)&255;
  256:   tmp[2] = (num>>8)&255;
  257:   tmp[3] = num&255;
  258:   return 4;
  259: }
  260: 
  261: static char *
  262: sget_int(char *buf, int *num)
  263: {
  264:   unsigned int res;
  265:   unsigned char *tmp = (unsigned char *)buf;
  266:   res = 0;
  267:   res += tmp[0] << 24;
  268:   res += tmp[1] << 16;
  269:   res += tmp[2] << 8;
  270:   res += tmp[3];
  271:   *num = res;
  272:   buf += 4;
  273:   return buf;
  274: }
  275: 
  276: static char *
  277: pass_str(char *buf, const char *str)
  278: {
  279:   buf += strlen(str);
  280:   return buf;
  281: }
  282: 
  283: static void
  284: encode_super(struct cell *c, char *buf)
  285: {
  286:   buf += sprintf(buf, "S ");
  287:   buf += sput_int(buf, c->u.super.size);
  288:   buf += sput_int(buf, c->u.super.root_cell);
  289:   buf += sput_int(buf, c->u.super.first_unused);
  290:   buf += sput_int(buf, c->u.super.serial);
  291:   buf += sput_int(buf, LINE_LEN);
  292: }
  293: 
  294: static void
  295: encode_node(struct cell *c, char *buf)
  296: {
  297:   buf += sprintf(buf, "N ");
  298:   buf += sput_int(buf, c->u.node.key);
  299:   buf += sput_int(buf, c->u.node.parent);
  300:   buf += sput_int(buf, c->u.node.next);
  301:   buf += sput_int(buf, c->u.node.child);
  302:   buf += sput_int(buf, c->u.node.body);
  303: }
  304: 
  305: static void
  306: encode_body(struct cell *c, char *buf)
  307: {
  308:   buf += sprintf(buf, "B");
  309:   buf += sput_int(buf, c->next_tail);
  310:   buf += sput_int(buf, c->u.body.owner);
  311:   sprintf(buf, "\"%s\"",
  312:           c->u.body.obj);
  313: }
  314: 
  315: static void
  316: encode_unused(struct cell *c, char *buf)
  317: {
  318:   buf += sprintf(buf, "-next=");
  319:   buf += sput_int(buf, c->u.next_unused);
  320: }
  321: 
  322: static void
  323: encode_tail(struct cell *c, char *buf)
  324: {
  325:   buf += sprintf(buf, "T");
  326:   buf += sput_int(buf, c->u.tail.prev);
  327:   buf += sput_int(buf, c->next_tail);
  328:   sprintf(buf, "\"%s\"",
  329:           c->u.tail.obj);
  330: }
  331: 
  332: static void
  333: encode_unknown(char *buf)
  334: {
  335:   sprintf(buf, "?");
  336: }
  337: 
  338: static void
  339: encode_cell(struct cell *c, char *buf)
  340: {
  341:   switch (c->type) {
  342:   case TT_SUPER:
  343:     encode_super(c, buf);
  344:     break;
  345:   case TT_NODE:
  346:     encode_node(c, buf);
  347:     break;
  348:   case TT_UNUSED:
  349:     encode_unused(c, buf);
  350:     break;
  351:   case TT_BODY:
  352:     encode_body(c, buf);
  353:     break;
  354:   case TT_TAIL:
  355:     encode_tail(c, buf);
  356:     break;
  357:   default:
  358:     encode_unknown(buf);
  359:     break;
  360:   }
  361: }
  362: 
  363: static void
  364: write_back_cell(struct text_trie *tt, struct cell *c, int idx)
  365: {
  366:   int i;
  367:   char buf[LINE_LEN+1];
  368:   /* sanity check */
  369:   if (((anthy_mmap_size(tt->mapping) / LINE_LEN) < (idx + 1)) ||
  370:       idx < 0) {
  371:     return ;
  372:   }
  373:   for (i = 0; i < LINE_LEN; i++) {
  374:     buf[i] = ' ';
  375:   }
  376:   encode_cell(c, buf);
  377:   buf[LINE_LEN-1] = '\n';
  378:   if (anthy_mmap_is_writable(tt->mapping)) {
  379:     memcpy(&tt->ptr[idx*LINE_LEN], buf, LINE_LEN);
  380:   } else {
  381:     fseek(tt->wfp, idx*LINE_LEN, SEEK_SET);
  382:     fwrite(buf, LINE_LEN, 1, tt->wfp);
  383:     fflush(tt->wfp);
  384:   }
  385: }
  386: 
  387: static char *
  388: decode_str(char *raw_buf, int off)
  389: {
  390:   char *head;
  391:   char copy_buf[LINE_LEN + 1];
  392:   char *buf;
  393:   int i;
  394:   /* from off to before last '\n' */
  395:   for (i = 0; i < LINE_LEN - off - 1; i++) {
  396:     copy_buf[i] = raw_buf[i];
  397:   }
  398:   copy_buf[i] = 0;
  399:   buf = copy_buf;
  400:   /* find first double quote */
  401:   while (*buf && *buf != '\"') {
  402:     buf ++;
  403:   }
  404:   if (!*buf) {
  405:     /* cant find double quote */
  406:     return strdup("");
  407:   }
  408:   buf ++;
  409:   head = buf;
  410:   /* go to the end of string */
  411:   while (*buf) {
  412:     buf ++;
  413:   }
  414:   /* find last double quote */
  415:   while (*buf != '\"') {
  416:     buf--;
  417:   }
  418:   *buf = 0;
  419:   /**/
  420:   return strdup(head);
  421: }
  422: 
  423: static void
  424: release_cell_str(struct cell *c)
  425: {
  426:   if (!c) {
  427:     return ;
  428:   }
  429:   if (c->type == TT_BODY) {
  430:     free(c->u.body.obj);
  431:   }
  432:   if (c->type == TT_TAIL) {
  433:     free(c->u.tail.obj);
  434:   }
  435: }
  436: 
  437: static int
  438: decode_super(struct cell *c, char *buf)
  439: {
  440:   c->type = TT_SUPER;
  441:   buf = pass_str(buf, "S ");
  442:   buf = sget_int(buf, &c->u.super.size);
  443:   buf = sget_int(buf, &c->u.super.root_cell);
  444:   buf = sget_int(buf, &c->u.super.first_unused);
  445:   buf = sget_int(buf, &c->u.super.serial);
  446:   return 0;
  447: }
  448: 
  449: static int
  450: decode_unuse(struct cell *c, char *buf)
  451: {
  452:   c->type = TT_UNUSED;
  453:   buf = pass_str(buf, "-next=");
  454:   buf = sget_int(buf, &c->u.next_unused);
  455:   return 0;
  456: }
  457: 
  458: static int
  459: decode_node(struct cell *c, char *buf)
  460: {
  461:   c->type = TT_NODE;
  462:   buf = pass_str(buf, "N ");
  463:   buf = sget_int(buf, &c->u.node.key);
  464:   buf = sget_int(buf, &c->u.node.parent);
  465:   buf = sget_int(buf, &c->u.node.next);
  466:   buf = sget_int(buf, &c->u.node.child);
  467:   buf = sget_int(buf, &c->u.node.body);
  468:   return 0;
  469: }
  470: 
  471: static int
  472: decode_body(struct cell *c, char *buf)
  473: {
  474:   c->type = TT_BODY;
  475:   buf = pass_str(buf, "B");
  476:   buf = sget_int(buf, &c->next_tail);
  477:   buf = sget_int(buf, &c->u.body.owner);
  478:   c->u.body.obj = decode_str(buf, 9);
  479:   return 0;
  480: }
  481: 
  482: static int
  483: decode_tail(struct cell *c, char *buf)
  484: {
  485:   c->type = TT_TAIL;
  486:   buf = pass_str(buf, "T");
  487:   buf = sget_int(buf, &c->u.tail.prev);
  488:   buf = sget_int(buf, &c->next_tail);
  489:   c->u.tail.obj = decode_str(buf, 9);
  490:   return 0;
  491: }
  492: 
  493: static int
  494: decode_alloced(struct cell *c)
  495: {
  496:   c->type = TT_ALLOCED;
  497:   return 0;
  498: }
  499: 
  500: static struct cell *
  501: decode_nth_cell(struct text_trie *tt, struct cell *c, int nth)
  502: {
  503:   int res;
  504:   char *buf;
  505:   if (nth < 0 ||
  506:       (anthy_mmap_size(tt->mapping) / LINE_LEN) <
  507:       (nth + 1)) {
  508:     return NULL;
  509:   }
  510:   buf = &tt->ptr[nth*LINE_LEN];
  511: 
  512:   res = -1;
  513:   switch (buf[0]) {
  514:   case 'S':
  515:     res = decode_super(c, buf);
  516:     break;
  517:   case '-':
  518:     res = decode_unuse(c, buf);
  519:     break;
  520:   case 'N':
  521:     res = decode_node(c, buf);
  522:     break;
  523:   case 'B':
  524:     res = decode_body(c, buf);
  525:     break;
  526:   case 'T':
  527:     res = decode_tail(c, buf);
  528:     break;
  529:   case '?':
  530:     res =  decode_alloced(c);
  531:     break;
  532:   default:
  533:     /*printf("decode fail (nth=%d::%s).\n", nth, buf);*/
  534:     ;
  535:   }
  536:   if (res) {
  537:     c->type = TT_UNUSED;
  538:   }
  539:   return c;
  540: }
  541: 
  542: static struct cell *
  543: decode_nth_node(struct text_trie *tt, struct cell *c, int nth)
  544: {
  545:   if (!decode_nth_cell(tt, c, nth)) {
  546:     return NULL;
  547:   }
  548:   if (c->type != TT_NODE) {
  549:     return NULL;
  550:   }
  551:   return c;
  552: }
  553: 
  554: static int
  555: update_mapping(struct text_trie *tt)
  556: {
  557:   if (tt->mapping) {
  558:     anthy_munmap(tt->mapping);
  559:   }
  560:   tt->mapping = anthy_mmap(tt->fn, 1);
  561:   if (!tt->mapping) {
  562:     /* try to fall back read-only mmap */
  563:     tt->mapping = anthy_mmap(tt->fn, 0);
  564:   }
  565:   if (!tt->mapping) {
  566:     tt->ptr = NULL;
  567:     return 1;
  568:   }
  569:   tt->ptr = anthy_mmap_address(tt->mapping);
  570:   return 0;
  571: }
  572: 
  5