(linenum→info "unix/slp.c:2238")

gauche/0.8.12/src/hash.c

    1: /*
    2:  * hash.c - hash table implementation
    3:  *
    4:  *   Copyright (c) 2000-2007  Shiro Kawai  <shiro@acm.org>
    5:  * 
    6:  *   Redistribution and use in source and binary forms, with or without
    7:  *   modification, are permitted provided that the following conditions
    8:  *   are met:
    9:  * 
   10:  *   1. Redistributions of source code must retain the above copyright
   11:  *      notice, this list of conditions and the following disclaimer.
   12:  *
   13:  *   2. Redistributions in binary form must reproduce the above copyright
   14:  *      notice, this list of conditions and the following disclaimer in the
   15:  *      documentation and/or other materials provided with the distribution.
   16:  *
   17:  *   3. Neither the name of the authors nor the names of its contributors
   18:  *      may be used to endorse or promote products derived from this
   19:  *      software without specific prior written permission.
   20:  *
   21:  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   22:  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   23:  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   24:  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   25:  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   26:  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
   27:  *   TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   28:  *   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   29:  *   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   30:  *   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   31:  *   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   32:  *
   33:  *  $Id: hash.c,v 1.58 2007/08/24 23:55:42 shirok Exp $
   34:  */
   35: 
   36: #define LIBGAUCHE_BODY
   37: #include "gauche.h"
   38: #include "gauche/class.h"
   39: 
   40: /*============================================================
   41:  * Internal structures
   42:  */
   43: 
   44: 
   45: /* The beginning of this structure must match ScmDictEntry. */
   46: typedef struct EntryRec {
   47:     intptr_t key;
   48:     intptr_t value;
   49:     struct EntryRec *next;
   50:     u_long   hashval;
   51: } Entry;
   52: 
   53: #define BUCKETS(hc)   ((Entry**)hc->buckets)
   54: 
   55: #define DEFAULT_NUM_BUCKETS    4
   56: #define MAX_AVG_CHAIN_LIMITS   3
   57: #define EXTEND_BITS            2
   58: 
   59: /* We limit hash value to 32bits, for it must be portable across platforms.
   60:    (Especially EQUAL-hash value */
   61: #define HASHMASK  0xffffffffUL
   62: 
   63: typedef Entry *SearchProc(ScmHashCore *core, intptr_t key, ScmDictOp op);
   64: 
   65: static unsigned int round2up(unsigned int val);
   66: 
   67: /*============================================================
   68:  * Hash functions
   69:  */
   70: 
   71: /* Hash function calculates 32bit hash value from the given object.
   72:    HASH2INDEX macro maps the hash value to the bucket number.
   73:    (On 64 bit architecture, it's OK to calculate 64bit, but the
   74:    upper bits are discarded by HASH2INDEX to maintain compatibility. */
   75: 
   76: /* For String
   77:  *
   78:  * Usually, "shift+add" scheme for string hasing works well.  But
   79:  * I found that it works well only if you take the lower bits.
   80:  * Unfortunately, we need to take higher bits for multiplicative
   81:  * hashing of integers and addresses.  So, in HASH2INDEX function,
   82:  * I take both lower bits and higher bits.
   83:  */
   84: 
   85: #define STRING_HASH(hv, chars, size)                                    \
   86:     do {                                                                \
   87:         int i_ = (size);                                                \
   88:         (hv) = 0;                                                       \
   89:         while (i_-- > 0) {                                              \
   90:             (hv) = ((hv)<<5) - (hv) + ((unsigned char)*chars++);        \
   91:         }                                                               \
   92:     } while (0)
   93: 
   94: /* Integer and address. */
   95: /* Integer and address hash is a variation of "multiplicative hashing"
   96:    scheme described in Knuth, TAOCP, section 6.4.  The final shifting
   97:    is done by HASH2INDEX macro  */
   98: 
   99: #define SMALL_INT_HASH(result, val) \
  100:     (result) = ((val)*2654435761UL)
  101: 
  102: #define ADDRESS_HASH(result, val) \
  103:     (result) = (u_long)((SCM_WORD(val) >> 3)*2654435761UL)
  104: 
  105: /* HASH2INDEX
  106:    Map a hash value to bucket number.
  107:    We fix the word length to 32bits, since the multiplication
  108:    constant above is fixed. */
  109: #define HASH2INDEX(tabsiz, bits, hashval) \
  110:     (((hashval)+((hashval)>>(32-(bits)))) & ((tabsiz) - 1))
  111: 
  112: /* Combining two hash values. */
  113: #define COMBINE(hv1, hv2)   ((hv1)*5+(hv2))
  114: 
  115: u_long Scm_EqHash(ScmObj obj)
  116: {
  117:     u_long hashval;
  118:     ADDRESS_HASH(hashval, obj);
  119:     return hashval&HASHMASK;
  120: }
  121: 
  122: u_long Scm_EqvHash(ScmObj obj)
  123: {
  124:     u_long hashval;
  125:     if (SCM_NUMBERP(obj)) {
  126:         if (SCM_INTP(obj)) {
  127:             SMALL_INT_HASH(hashval, SCM_INT_VALUE(obj));
  128:         } else if (SCM_BIGNUMP(obj)) {
  129:             u_int i;
  130:             u_long u = 0;
  131:             for (i=0; i<SCM_BIGNUM_SIZE(obj); i++) {
  132:                 u += SCM_BIGNUM(obj)->values[i];
  133:             }
  134:             SMALL_INT_HASH(hashval, u);
  135:         } else if (SCM_FLONUMP(obj)) {
  136:             /* TODO: I'm not sure this is a good hash. */
  137:             hashval = (u_long)(SCM_FLONUM_VALUE(obj)*2654435761UL);
  138:         } else if (SCM_RATNUMP(obj)) {
  139:             /* Ratnum must be normalized, so we can simply combine
  140:                hashvals of numerator and denominator. */
  141:             u_long h1 = Scm_EqvHash(SCM_RATNUM_NUMER(obj));
  142:             u_long h2 = Scm_EqvHash(SCM_RATNUM_DENOM(obj));
  143:             hashval = COMBINE(h1, h2);
  144:         } else {
  145:             /* TODO: I'm not sure this is a good hash. */
  146:             hashval = (u_long)((SCM_COMPNUM_REAL(obj)+SCM_COMPNUM_IMAG(obj))*2654435761UL);
  147:         }
  148:     } else {
  149:         ADDRESS_HASH(hashval, obj);
  150:     }
  151:     return hashval&HASHMASK;
  152: }
  153: 
  154: /* General hash function */
  155: u_long Scm_Hash(ScmObj obj)
  156: {
  157:     u_long hashval;
  158:     if (!SCM_PTRP(obj)) {
  159:         SMALL_INT_HASH(hashval, (u_long)SCM_WORD(obj));
  160:         return hashval;
  161:     } else if (SCM_NUMBERP(obj)) {
  162:         return Scm_EqvHash(obj);
  163:     } else if (SCM_STRINGP(obj)) {
  164:         goto string_hash;
  165:     } else if (SCM_PAIRP(obj)) {
  166:         u_long h = 0, h2;
  167:         ScmObj cp;
  168:         SCM_FOR_EACH(cp, obj) {
  169:             h2 = Scm_Hash(SCM_CAR(cp));
  170:             h = COMBINE(h, h2);
  171:         }
  172:         h2 = Scm_Hash(cp);
  173:         h = COMBINE(h, h2);
  174:         return h;
  175:     } else if (SCM_VECTORP(obj)) {
  176:         int i, siz = SCM_VECTOR_SIZE(obj);
  177:         u_long h = 0, h2;
  178:         for (i=0; i<siz; i++) {
  179:             h2 = Scm_Hash(SCM_VECTOR_ELEMENT(obj, i));
  180:             h = COMBINE(h, h2);
  181:         }
  182:         return h;
  183:     } else if (SCM_SYMBOLP(obj)) {
  184:         obj = SCM_OBJ(SCM_SYMBOL_NAME(obj));
  185:         goto string_hash;
  186:     } else if (SCM_KEYWORDP(obj)) {
  187:         obj = SCM_OBJ(SCM_KEYWORD_NAME(obj));
  188:         goto string_hash;
  189:     } else {
  190:         /* Call specialized object-hash method */
  191:         ScmObj r = Scm_ApplyRec(SCM_OBJ(&Scm_GenericObjectHash),
  192:                                 SCM_LIST1(obj));
  193:         if (SCM_INTP(r)) {
  194:             return (u_long)SCM_INT_VALUE(r);
  195:         }
  196:         if (SCM_BIGNUMP(r)) {
  197:             /* NB: Scm_GetUInteger clamps the result to [0, ULONG_MAX],
  198:                but taking the LSW would give better distribution. */
  199:             return SCM_BIGNUM(r)->values[0];
  200:         }
  201:         Scm_Error("object-hash returned non-integer: %S", r);
  202:         return 0;               /* dummy */
  203:     }
  204:   string_hash:
  205:     {
  206:         const char *p;
  207:         const ScmStringBody *b = SCM_STRING_BODY(obj);
  208:         p = SCM_STRING_BODY_START(b);
  209:         STRING_HASH(hashval, p, SCM_STRING_BODY_SIZE(b));
  210:         return hashval;
  211:     }
  212: }
  213: 
  214: u_long Scm_HashString(ScmString *str, u_long modulo)
  215: {
  216:     u_long hashval;
  217:     const char *p;
  218:     const ScmStringBody *b = SCM_STRING_BODY(str);
  219:     p = SCM_STRING_BODY_START(b);
  220:     STRING_HASH(hashval, p, SCM_STRING_BODY_SIZE(b));
  221:     return (hashval % modulo);
  222: }
  223: 
  224: /*------------------------------------------------------------
  225:  * Parameterization
  226:  *
  227:  * Conceptually hash tables are parameterized by hash function and
  228:  * compare function.  However, if they are trivial functions, calling
  229:  * them via function pointers incur overhead.  So we layered the
  230:  * parameterization.
  231:  *
  232:  * For the pre-defined simple hash tables, the calls to the hash and
  233:  * compare functions are inlined in a single "access" function.
  234:  * (In this case hashfn and cmpfn are never used.)
  235:  * For the generic hash tables, the general_access function uses
  236:  * the info in hashfn and cmpfn fields.
  237:  *
  238:  * The accessor function takes three arguments.
  239:  *
  240:  *     ScmHashCore *core   : hash table core
  241:  *     intptr_t key        : key
  242:  *     ScmDictOp op        : operation
  243:  */
  244: 
  245: /* NOTE: eq?, eqv?, and string=? hash tables are guaranteed not to
  246:  * throw an error during hash table access (except the case that string=?
  247:  * hash table gets non-string key).  So the caller doesn't need to
  248:  * set unwind handler in case it needs cleanup (like unlocking mutex).
  249:  * However, equal? hash may call back to Scheme method, so it can
  250:  * throw Scheme error.  Be aware of that.
  251:  */
  252: 
  253: /*
  254:  * Common function called when the accessor function needs to add an entry.
  255:  */
  256: static Entry *insert_entry(ScmHashCore *table,
  257:                            intptr_t key,
  258:                            u_long   hashval,
  259:                            int index)
  260: {
  261:     Entry *e = SCM_NEW(Entry);
  262:     Entry **buckets = BUCKETS(table);
  263:     e->key = key;
  264:     e->value = 0;
  265:     e->next = buckets[index];
  266:     e->hashval = hashval;
  267:     buckets[index] = e;
  268:     table->numEntries++;
  269: 
  270:     if (table->numEntries > table->numBuckets*MAX_AVG_CHAIN_LIMITS) {
  271:         /* Extend the table */
  272:         Entry **newb, *f;
  273:         ScmHashIter iter;
  274:         int i, newsize = (table->numBuckets << EXTEND_BITS);
  275:         int newbits = table->numBucketsLog2 + EXTEND_BITS;
  276: 
  277:         newb = SCM_NEW_ARRAY(Entry*, newsize);
  278:         for (i=0; i<newsize; i++) newb[i] = NULL;
  279:         
  280:         Scm_HashIterInit(&iter, table);
  281:         while ((f = (Entry*)Scm_HashIterNext(&iter)) != NULL) {
  282:             index = HASH2INDEX(newsize, newbits, f->hashval);
  283:             f->next = newb[index];
  284:             newb[index] = f;
  285:         }
  286:         table->numBuckets = newsize;
  287:         table->numBucketsLog2 = newbits;
  288:         table->buckets = (void**)newb;
  289:     }
  290:     return e;
  291: }
  292: 
  293: /* NB: Deleting entry E doesn't modify E's key and value, but cut
  294:    the "next" link for the sake of weak-gc robustness.  The hash core
  295:    iterator prefetches a pointer to the next entry, so deleting the
  296:    "current" entry of iteration is safe as far as other iterators
  297:    are running on the same hash table. */
  298: static Entry *delete_entry(ScmHashCore *table,
  299:                            Entry *entry, Entry *prev,
  300:                            int index)
  301: {
  302:     if (prev) prev->next = entry->next;
  303:     else table->buckets[index] = (void*)entry->next;
  304:     table->numEntries--;
  305:     SCM_ASSERT(table->numEntries >= 0);
  306:     entry->next = NULL;         /* GC friendliness */
  307:     return entry;
  308: }
  309: 
  310: #define FOUND(table, op, e, p, index)                   \
  311:     do {                                                \
  312:         switch (op) {                                   \
  313:         case SCM_DICT_GET:;                             \
  314:         case SCM_DICT_CREATE:;                          \
  315:             return e;                                   \
  316:         case SCM_DICT_DELETE:;                          \
  317:             return delete_entry(table, e, p, index);    \
  318:         }                                               \
  319:     } while (0)
  320: 
  321: #define NOTFOUND(table, op, key, hashval, index)                \
  322:     do {                                                        \
  323:         if (op == SCM_DICT_CREATE) {                            \
  324:            return insert_entry(table, key, hashval, index);     \
  325:         } else {                                                \
  326:            return NULL;                                         \
  327:         }                                                       \
  328:     } while (0)
  329: 
  330: /*
  331:  * Accessor function for address.   Used for EQ-type hash.
  332:  */
  333: static Entry *address_access(ScmHashCore *table,
  334:                              intptr_t key,
  335:                              ScmDictOp op)
  336: {
  337:     u_long hashval, index;
  338:     Entry *e, *p, **buckets = (Entry**)table->buckets;
  339: 
  340:     ADDRESS_HASH(hashval, key);
  341:     index = HASH2INDEX(table->numBuckets, table->numBucketsLog2, hashval);
  342:     
  343:     for (e = buckets[index], p = NULL; e; p = e, e = e->next) {
  344:         if (e->key == key) FOUND(table, op, e, p, index);
  345:     }
  346:     NOTFOUND(table, op, key, hashval, index);
  347: }
  348: 
  349: static u_long address_hash(const ScmHashCore *ht, intptr_t obj)
  350: {
  351:     u_long hashval;
  352:     ADDRESS_HASH(hashval, obj);
  353:     return hashval;
  354: }
  355: 
  356: static int address_cmp(const ScmHashCore *ht, intptr_t key, intptr_t k2)
  357: {
  358:     return (key == k2);
  359: }
  360: 
  361: /*
  362:  * Accessor function for equal and eqv-hash.
  363:  * We assume KEY is ScmObj.
  364:  */
  365: static u_long eqv_hash(const ScmHashCore *table, intptr_t key)
  366: {
  367:     return Scm_EqvHash(SCM_OBJ(key));
  368: }
  369: 
  370: static int eqv_cmp(const ScmHashCore *table, intptr_t key, intptr_t k2)
  371: {
  372:     return Scm_EqvP(SCM_OBJ(key), SCM_OBJ(k2));
  373: }
  374: 
  375: static u_long equal_hash(const ScmHashCore *table, intptr_t key)
  376: {
  377:     return Scm_Hash(SCM_OBJ(key));
  378: }
  379: 
  380: static int equal_cmp(const ScmHashCore *table, intptr_t key, intptr_t k2)
  381: {
  382:     return Scm_EqualP(SCM_OBJ(key), SCM_OBJ(k2));
  383: }
  384: 
  385: 
  386: /*
  387:  * Accessor function for string type.
  388:  */
  389: static Entry *string_access(ScmHashCore *table, intptr_t k, ScmDictOp op)
  390: {
  391:     u_long hashval, index;
  392:     int size;
  393:     const char *s;
  394:     ScmObj key = SCM_OBJ(k);
  395:     Entry *e, *p, **buckets;
  396:     const ScmStringBody *keyb;
  397:     
  398:     if (!SCM_STRINGP(key)) {
  399:         Scm_Error("Got non-string key %S to the string hashtable.", key);
  400:     }
  401:     keyb = SCM_STRING_BODY(key);
  402:     s = SCM_STRING_BODY_START(keyb);
  403:     size = SCM_STRING_BODY_SIZE(keyb);
  404:     STRING_HASH(hashval, s, size);
  405:     index = HASH2INDEX(table->numBuckets, table->numBucketsLog2, hashval);
  406:     buckets = (Entry**)table->buckets;
  407: 
  408:     for (e = buckets[index], p = NULL; e; p = e, e = e->next) {
  409:         ScmObj ee = SCM_OBJ(e->key);
  410:         const ScmStringBody *eeb = SCM_STRING_BODY(ee);
  411:         int eesize = SCM_STRING_BODY_SIZE(eeb);
  412:         if (size == eesize
  413:             && memcmp(SCM_STRING_BODY_START(keyb),
  414:                       SCM_STRING_BODY_START(eeb), eesize) == 0){
  415:             FOUND(table, op, e, p, index);
  416:         }
  417:     }
  418:     NOTFOUND(table, op, k, hashval, index);
  419: }
  420: 
  421: static u_long string_hash(const ScmHashCore *table, intptr_t key)
  422: {
  423:     u_long hashval;
  424:     const char *p;
  425:     const ScmStringBody *b = SCM_STRING_BODY(key);
  426:     p = SCM_STRING_BODY_START(b);
  427:     STRING_HASH(hashval, p, SCM_STRING_BODY_SIZE(b));
  428:     return hashval;
  429: }
  430: 
  431: static int string_cmp(const ScmHashCore *table, intptr_t k1, intptr_t k2)
  432: {
  433:     const ScmStringBody *b1 = SCM_STRING_BODY(k1);
  434:     const ScmStringBody *b2 = SCM_STRING_BODY(k2);
  435:     return ((SCM_STRING_BODY_SIZE(b1) == SCM_STRING_BODY_SIZE(b2))
  436:             && (memcmp(SCM_STRING_BODY_START(b1),
  437:                        SCM_STRING_BODY_START(b2),
  438:                        SCM_STRING_BODY_SIZE(b1)) == 0));
  439: }
  440: 
  441: /*
  442:  * Accessor function for multiword raw hashtable.
  443:  * Key points to an array of N words.
  444:  */
  445: static u_long multiword_hash(const ScmHashCore *table, intptr_t key)
  446: {
  447:     ScmWord keysize = (ScmWord)table->data;
  448:     ScmWord *keyarray = (ScmWord*)key;
  449:     u_long h = 0, h1;
  450:     int i;
  451:     for (i=0; i<keysize; i++) {
  452:         ADDRESS_HASH(h1, keyarray[i]);
  453:         h = COMBINE(h, h1);
  454:     }
  455:     return h;
  456: }
  457: 
  458: #if 0
  459: static Entry *multiword_access(ScmHashCore *table, intptr_t k, ScmDictOp op)
  460: {
  461:     u_long hashval, index;
  462:     ScmWord keysize = (ScmWord)table->data;
  463:     Entry *e, *p, **buckets;
  464:     
  465:     hashval = multiword_hash(table, k);
  466:     index = HASH2INDEX(table->numBuckets, table->numBucketsLog2, hashval);
  467:     buckets = (Entry**)table->buckets;
  468: 
  469:     for (e = buckets[index], p = NULL; e; p = e, e = e->next) {
  470:         if (memcmp((void*)k, (void*)e->key, keysize*sizeof(ScmWord)) == 0)
  471:             FOUND(table, op, e, p, index);
  472:     }
  473:     NOTFOUND(table, op, k, hashval, index);
  474: }
  475: #endif
  476: 
  477: 
  478: /*
  479:  * Accessor function for general case
  480:  *    (hashfn and cmpfn are given by user)
  481:  */
  482: static Entry *general_access(ScmHashCore *table, intptr_t key, ScmDictOp op)
  483: {
  484:     u_long hashval, index;
  485:     Entry *e, *p, **buckets;
  486: 
  487:     hashval = table->hashfn(table, key);
  488:     index = HASH2INDEX(table->numBuckets, table->numBucketsLog2, hashval);
  489:     buckets = (Entry**)table->buckets;
  490:     
  491:     for (e = buckets[index], p = NULL; e; p = e, e = e->next) {
  492:         if (table->cmpfn(table, key, e->key)) FOUND(table, op, e, p, index);
  493:     }
  494:     NOTFOUND(table, op, key, hashval, index);
  495: }
  496: 
  497: /*============================================================
  498:  * Hash Core functions
  499:  */
  500: 
  501: static void hash_core_init(ScmHashCore *table,
  502:                            SearchProc  *accessfn,
  503:                            ScmHashProc *hashfn,
  504:                            ScmHashCompareProc *cmpfn,
  505:                            unsigned int initSize,
  506:                            void *data)
  507: {
  508:     Entry **b;
  509:     u_int i;
  510:     
  511:     if (initSize != 0) initSize = round2up(initSize);
  512:     else initSize = DEFAULT_NUM_BUCKETS;
  513: 
  514:     b = SCM_NEW_ARRAY(Entry*, initSize);
  515:     table->buckets = (void**)b;
  516:     table->numBuckets = initSize;
  517:     table->numEntries = 0;
  518:     table->accessfn = (void*)accessfn;
  519:     table->hashfn = hashfn;
  520:     table->cmpfn = cmpfn;
  521:     table->data = data;
  522:     for (i=initSize, table->numBucketsLog2=0; i > 1; i /= 2) {
  523:         table->numBucketsLog2++;
  524:     }
  525:     for (i=0; i<initSize; i++) table->buckets[i] = NULL;
  526: }
  527: 
  528: /* choose appropriate procedures for predefined hash types. */
  529: int  hash_core_predef_procs(ScmHashType type,
  530:                             SearchProc  **accessfn,
  531:                             ScmHashProc **hashfn,
  532:                             ScmHashCompareProc **cmpfn)
  533: {
  534:     switch (type) {
  535:     case SCM_HASH_EQ:
  536:     case SCM_HASH_WORD:
  537:         *accessfn = address_access;
  538:         *hashfn = address_hash;
  539:         *cmpfn  = address_cmp;
  540:         return TRUE;
  541:     case SCM_HASH_EQV:
  542:         *accessfn = general_access;
  543:         *hashfn = eqv_hash;
  544:         *cmpfn  = eqv_cmp;
  545:         return TRUE;
  546:     case SCM_HASH_EQUAL:
  547:         *accessfn = general_access;
  548:         *hashfn = equal_hash;
  549:         *cmpfn  = equal_cmp;
  550:         return TRUE;
  551:     case SCM_HASH_STRING:
  552:         *accessfn = string_access;
  553:         *hashfn = string_hash;
  554:         *cmpfn  = string_cmp;
  555:         return TRUE;
  556:     default:
  557:         return FALSE;
  558:     }
  559: }
  560: 
  561: void Scm_HashCoreInitSimple(ScmHashCore *core,
  562: