(linenum→info "unix/slp.c:2238")

ruby/1.9.0/encoding.c

    1: /**********************************************************************
    2: 
    3:   encoding.c -
    4: 
    5:   $Author: nobu $
    6:   $Date: 2007-12-25 19:01:06 +0900 (Tue, 25 Dec 2007) $
    7:   created at: Thu May 24 17:23:27 JST 2007
    8: 
    9:   Copyright (C) 2007 Yukihiro Matsumoto
   10: 
   11: **********************************************************************/
   12: 
   13: #include "ruby/ruby.h"
   14: #include "ruby/encoding.h"
   15: #include "regenc.h"
   16: #include <ctype.h>
   17: #ifdef HAVE_LANGINFO_H
   18: #include <langinfo.h>
   19: #endif
   20: 
/* id_encoding: lazily interned "encoding" ID (see rb_id_encoding), used as
 * the instance-variable name holding an out-of-line encoding index.
 * id_based_encoding: instance-variable name under which set_based_encoding
 * stores the base Encoding object. */
static ID id_encoding, id_based_encoding;
/* The Encoding class; its address also serves as the "no wrapper yet"
 * sentinel (see ENC_UNINITIALIZED). */
static VALUE rb_cEncoding;

/* One slot of the encoding table: the registered name and the encoding
 * definition stored for that slot. */
struct rb_encoding_entry {
    const char *name;   /* registered name; NULL for an unused slot */
    rb_encoding *enc;   /* per-slot copy allocated by enc_register_at */
};

static struct rb_encoding_entry *enc_table;  /* indexed by encoding index */
static int enc_table_count;  /* slot counter maintained by rb_enc_init/enc_register */
static int enc_table_size;   /* number of allocated slots in enc_table */
static st_table *enc_table_alias;  /* alias name -> encoding index, created on first enc_alias */
   33: 
/* Sentinel kept in auxiliary_data while an encoding has no wrapper object. */
#define ENC_UNINITIALIZED (&rb_cEncoding)
/* True once auxiliary_data holds something other than the sentinel. */
#define enc_initialized_p(enc) ((enc)->auxiliary_data != &rb_cEncoding)
/* The wrapper VALUE stored in auxiliary_data by enc_new. */
#define ENC_FROM_ENCODING(enc) ((VALUE)(enc)->auxiliary_data)

/* Object flag set on the wrapper of a dummy encoding; ENC_DUMMY_P takes the
 * wrapper VALUE (not an rb_encoding pointer) and reads its flags. */
#define ENC_DUMMY FL_USER2
#define ENC_DUMMY_P(enc) (RBASIC(enc)->flags & ENC_DUMMY)
   40: 
/*
 * Mark callback installed on Encoding wrapper objects.  It marks nothing;
 * its address is also compared against RDATA(obj)->dmark elsewhere in this
 * file to recognise Encoding wrappers.
 */
static void
enc_mark(void *ptr)
{
    /* intentionally empty */
}
   45: 
   46: static VALUE
   47: enc_new(rb_encoding *encoding)
   48: {
   49:     VALUE enc = Data_Wrap_Struct(rb_cEncoding, enc_mark, -1, encoding);
   50:     encoding->auxiliary_data = (void *)enc;
   51:     return enc;
   52: }
   53: 
   54: VALUE
   55: rb_enc_from_encoding(rb_encoding *encoding)
   56: {
   57:     if (!encoding) return Qnil;
   58:     if (enc_initialized_p(encoding))
   59:         return ENC_FROM_ENCODING(encoding);
   60:     return enc_new(encoding);
   61: }
   62: 
   63: static int
   64: enc_check_encoding(VALUE obj)
   65: {
   66:     int index;
   67:     if (SPECIAL_CONST_P(obj) || BUILTIN_TYPE(obj) != T_DATA ||
   68:         RDATA(obj)->dmark != enc_mark) {
   69:         return -1;
   70:     }
   71:     index = rb_enc_to_index(RDATA(obj)->data);
   72:     if (rb_enc_from_index(index) != RDATA(obj)->data)
   73:         return -1;
   74:     return index;
   75: }
   76: 
   77: int
   78: rb_to_encoding_index(VALUE enc)
   79: {
   80:     int idx;
   81: 
   82:     if (NIL_P(enc)) return 0;
   83:     idx = enc_check_encoding(enc);
   84:     if (idx >= 0) {
   85:         return idx;
   86:     }
   87:     else if (NIL_P(enc = rb_check_string_type(enc))) {
   88:         return -1;
   89:     }
   90:     else {
   91:         return rb_enc_find_index(StringValueCStr(enc));
   92:     }
   93: }
   94: 
   95: rb_encoding *
   96: rb_to_encoding(VALUE enc)
   97: {
   98:     int idx;
   99: 
  100:     if (NIL_P(enc)) return 0;
  101:     idx = enc_check_encoding(enc);
  102:     if (idx >= 0) return RDATA(enc)->data;
  103:     if (NIL_P(enc = rb_check_string_type(enc))) {
  104:         return 0;
  105:     }
  106:     if ((idx = rb_enc_find_index(StringValueCStr(enc))) < 0) {
  107:         rb_raise(rb_eArgError, "unknown encoding name - %s", RSTRING_PTR(enc));
  108:     }
  109:     return rb_enc_from_index(idx);
  110: }
  111: 
  112: void
  113: rb_gc_mark_encodings(void)
  114: {
  115:     int i;
  116:     for (i = 0; i < enc_table_size; ++i) {
  117:         rb_encoding *enc = enc_table[i].enc;
  118:         if (enc && enc_initialized_p(enc)) {
  119:             rb_gc_mark(ENC_FROM_ENCODING(enc));
  120:         }
  121:     }
  122: }
  123: 
  124: static int
  125: enc_table_expand(int newsize)
  126: {
  127:     struct rb_encoding_entry *ent;
  128: 
  129:     if (enc_table_size >= newsize) return newsize;
  130:     ent = realloc(enc_table, sizeof(*enc_table) * newsize);
  131:     if (!ent) return -1;
  132:     memset(ent + enc_table_size, 0, sizeof(*ent)*(newsize - enc_table_size));
  133:     enc_table = ent;
  134:     enc_table_size = newsize;
  135:     return newsize;
  136: }
  137: 
  138: static int
  139: enc_register_at(int index, const char *name, rb_encoding *encoding)
  140: {
  141:     struct rb_encoding_entry *ent = &enc_table[index];
  142: 
  143:     ent->name = name;
  144:     if (!ent->enc) ent->enc = malloc(sizeof(rb_encoding));
  145:     *ent->enc = *encoding;
  146:     encoding = ent->enc;
  147:     encoding->name = name;
  148:     if (rb_cEncoding) {
  149:         /* initialize encoding data */
  150:         enc_new(encoding);
  151:     }
  152:     else {
  153:         encoding->auxiliary_data = ENC_UNINITIALIZED;
  154:     }
  155:     return index;
  156: }
  157: 
  158: static int
  159: enc_register(const char *name, rb_encoding *encoding)
  160: {
  161:     int index = enc_table_count;
  162: 
  163:     if (index >= ENCODING_INLINE_MAX) index = enc_table_size;
  164:     if ((index = enc_table_expand(index + 1)) < 0) return -1;
  165:     enc_table_count = index;
  166:     return enc_register_at(index - 1, name, encoding);
  167: }
  168: 
  169: static void set_encoding_const(const char *, rb_encoding *);
  170: int rb_enc_registered(const char *name);
  171: 
  172: int
  173: rb_enc_register(const char *name, rb_encoding *encoding)
  174: {
  175:     int index = rb_enc_registered(name);
  176: 
  177:     if (index >= 0) {
  178:         rb_encoding *oldenc = rb_enc_from_index(index);
  179:         if (strcasecmp(name, rb_enc_name(oldenc))) {
  180:             st_data_t key = (st_data_t)name, alias;
  181:             st_delete(enc_table_alias, &key, &alias);
  182:             index = enc_register(name, encoding);
  183:         }
  184:         else if (enc_initialized_p(oldenc) &&
  185:                  !ENC_DUMMY_P(ENC_FROM_ENCODING(oldenc))) {
  186:             enc_register_at(index, name, encoding);
  187:         }
  188:         else {
  189:             rb_raise(rb_eArgError, "encoding %s is already registered", name);
  190:         }
  191:     }
  192:     else {
  193:         index = enc_register(name, encoding);
  194:     }
  195:     set_encoding_const(name, rb_enc_from_index(index));
  196:     return index;
  197: }
  198: 
  199: static void
  200: enc_check_duplication(const char *name)
  201: {
  202:     if (rb_enc_registered(name) >= 0) {
  203:         rb_raise(rb_eArgError, "encoding %s is already registered", name);
  204:     }
  205: }
  206: 
  207: static VALUE
  208: set_based_encoding(int index, rb_encoding *based)
  209: {
  210:     VALUE enc = rb_enc_from_encoding(enc_table[index].enc);
  211: 
  212:     rb_ivar_set(enc, id_based_encoding, rb_enc_from_encoding(based));
  213:     return enc;
  214: }
  215: 
  216: int
  217: rb_enc_replicate(const char *name, rb_encoding *encoding)
  218: {
  219:     int index = enc_table_size;
  220: 
  221:     enc_check_duplication(name);
  222:     if (enc_table_expand(index + 1) < 0) return -1;
  223:     enc_register_at(index, name, encoding);
  224:     set_based_encoding(index, encoding);
  225:     return index;
  226: }
  227: 
  228: int
  229: rb_define_dummy_encoding(const char *name)
  230: {
  231:     int index = enc_table_size;
  232:     rb_encoding *encoding;
  233:     VALUE enc;
  234: 
  235:     enc_check_duplication(name);
  236:     if (index < ENCODING_INLINE_MAX) index = ENCODING_INLINE_MAX;
  237:     if (enc_table_expand(index + 1) < 0) return -1;
  238:     encoding = rb_ascii8bit_encoding();
  239:     enc_register_at(index, name, encoding);
  240:     enc = set_based_encoding(index, encoding);
  241:     FL_SET(enc, ENC_DUMMY);
  242:     return index;
  243: }
  244: 
  245: int
  246: rb_enc_dummy_p(rb_encoding *enc)
  247: {
  248:     VALUE encoding = rb_enc_from_encoding(enc);
  249:     return ENC_DUMMY_P(encoding);
  250: }
  251: 
  252: /*
  253:  * call-seq:
  254:  *   enc.dummy? => true or false
  255:  *
  256:  * Returns true for dummy encoding.
  257:  * A dummy encoding is a encoding which character handling is not properly
  258:  * implemented.
  259:  * It is used for stateful encoding.
  260:  *
  261:  *   Encoding::ISO_2022_JP.dummy?       #=> true
  262:  *   Encoding::UTF_8.dummy?             #=> false
  263:  *
  264:  */
  265: static VALUE
  266: enc_dummy_p(VALUE enc)
  267: {
  268:     return rb_enc_dummy_p(rb_to_encoding(enc)) ? Qtrue : Qfalse;
  269: }
  270: 
  271: static int
  272: enc_alias(const char *alias, const char *orig)
  273: {
  274:     st_data_t data;
  275:     int idx;
  276: 
  277:     if (!enc_table_alias) {
  278:         enc_table_alias = st_init_strcasetable();
  279:     }
  280:     if ((idx = rb_enc_find_index(orig)) < 0) {
  281:         if (!st_lookup(enc_table_alias, (st_data_t)orig, &data))
  282:             return -1;
  283:         idx = (int)data;
  284:     }
  285:     st_insert(enc_table_alias, (st_data_t)alias, (st_data_t)idx);
  286:     return idx;
  287: }
  288: 
  289: int
  290: rb_enc_alias(const char *alias, const char *orig)
  291: {
  292:     enc_check_duplication(alias);
  293:     return enc_alias(alias, orig);
  294: }
  295: 
  296: enum {
  297:     ENCINDEX_ASCII,
  298:     ENCINDEX_EUC_JP,
  299:     ENCINDEX_SJIS,
  300:     ENCINDEX_UTF8,
  301:     ENCINDEX_BUILTIN_MAX
  302: };
  303: 
  304: void
  305: rb_enc_init(void)
  306: {
  307:     enc_table_count = enc_table_expand(ENCINDEX_BUILTIN_MAX);
  308: #define ENC_REGISTER(enc) enc_register_at(ENCINDEX_##enc, rb_enc_name(ONIG_ENCODING_##enc), ONIG_ENCODING_##enc)
  309:     ENC_REGISTER(ASCII);
  310:     ENC_REGISTER(EUC_JP);
  311:     ENC_REGISTER(SJIS);
  312:     ENC_REGISTER(UTF8);
  313: #undef ENC_REGISTER
  314:     enc_alias("ASCII", rb_enc_name(ONIG_ENCODING_ASCII));
  315:     enc_alias("BINARY", rb_enc_name(ONIG_ENCODING_ASCII));
  316:     enc_alias("eucJP", rb_enc_name(ONIG_ENCODING_EUC_JP)); /* UI-OSF Application Platform Profile for Japanese Environment Version 1.1 */
  317:     enc_alias("SJIS", rb_enc_name(ONIG_ENCODING_SJIS));
  318: }
  319: 
  320: rb_encoding *
  321: rb_enc_from_index(int index)
  322: {
  323:     if (!enc_table) {
  324:         rb_enc_init();
  325:     }
  326:     if (index < 0 || enc_table_size <= index) {
  327:         return 0;
  328:     }
  329:     return enc_table[index].enc;
  330: }
  331: 
  332: int
  333: rb_enc_registered(const char *name)
  334: {
  335:     int i;
  336:     st_data_t alias = 0;
  337: 
  338:     if (!name) return -1;
  339:     if (!enc_table) {
  340:         rb_enc_init();
  341:     }
  342:     for (i=0; i<enc_table_size; i++) {
  343:         if (!enc_table[i].name) {
  344:             if (i < ENCODING_INLINE_MAX - 1) i = ENCODING_INLINE_MAX - 1;
  345:             continue;
  346:         }
  347:         if (strcasecmp(name, enc_table[i].name) == 0) {
  348:             return i;
  349:         }
  350:     }
  351:     if (!alias && enc_table_alias) {
  352:         if (st_lookup(enc_table_alias, (st_data_t)name, &alias)) {
  353:             return (int)alias;
  354:         }
  355:     }
  356:     return -1;
  357: }
  358: 
  359: static VALUE
  360: require_enc(VALUE enclib)
  361: {
  362:     return rb_require_safe(enclib, rb_safe_level());
  363: }
  364: 
  365: int
  366: rb_enc_find_index(const char *name)
  367: {
  368:     int i = rb_enc_registered(name);
  369:     if (i < 0) {
  370:         VALUE enclib = rb_sprintf("enc/%s", name);
  371:         char *s = RSTRING_PTR(enclib) + 4, *e = RSTRING_END(enclib);
  372:         while (s < e) {
  373:             if (!ISALNUM(*s)) *s = '_';
  374:             else if (ISUPPER(*s)) *s = tolower(*s);
  375:             ++s;
  376:         }
  377:         OBJ_FREEZE(enclib);
  378:         if (RTEST(rb_protect(require_enc, enclib, 0)))
  379:             i = rb_enc_registered(name);
  380:         rb_set_errinfo(Qnil);
  381:     }
  382:     return i;
  383: }
  384: 
  385: rb_encoding *
  386: rb_enc_find(const char *name)
  387: {
  388:     int idx = rb_enc_find_index(name);
  389:     if (idx < 0) idx = 0;
  390:     return rb_enc_from_index(idx);
  391: }
  392: 
  393: static inline int
  394: enc_capable(VALUE obj)
  395: {
  396:     if (IMMEDIATE_P(obj)) return Qfalse;
  397:     switch (BUILTIN_TYPE(obj)) {
  398:       case T_STRING:
  399:       case T_REGEXP:
  400:       case T_FILE:
  401:         return Qtrue;
  402:       case T_DATA:
  403:         if (RDATA(obj)->dmark == enc_mark) return Qtrue;
  404:       default:
  405:         return Qfalse;
  406:     }
  407: }
  408: 
  409: static void
  410: enc_check_capable(VALUE x)
  411: {
  412:     if (!enc_capable(x)) {
  413:         const char *etype;
  414: 
  415:         if (NIL_P(x)) {
  416:             etype = "nil";
  417:         }
  418:         else if (FIXNUM_P(x)) {
  419:             etype = "Fixnum";
  420:         }
  421:         else if (SYMBOL_P(x)) {
  422:             etype = "Symbol";
  423:         }
  424:         else if (rb_special_const_p(x)) {
  425:             etype = RSTRING_PTR(rb_obj_as_string(x));
  426:         }
  427:         else {
  428:             etype = rb_obj_classname(x);
  429:         }
  430:         rb_raise(rb_eTypeError, "wrong argument type %s (not encode capable)", etype);
  431:     }
  432: }
  433: 
  434: ID
  435: rb_id_encoding(void)
  436: {
  437:     if (!id_encoding) {
  438:         id_encoding = rb_intern("encoding");
  439:     }
  440:     return id_encoding;
  441: }
  442: 
  443: void
  444: rb_enc_associate_index(VALUE obj, int idx)
  445: {
  446:     enc_check_capable(obj);
  447:     if (!ENC_CODERANGE_ASCIIONLY(obj) ||
  448:         !rb_enc_asciicompat(rb_enc_from_index(idx))) {
  449:         ENC_CODERANGE_CLEAR(obj);
  450:     }
  451:     if (idx < ENCODING_INLINE_MAX) {
  452:         ENCODING_SET(obj, idx);
  453:         return;
  454:     }
  455:     ENCODING_SET(obj, ENCODING_INLINE_MAX);
  456:     rb_ivar_set(obj, rb_id_encoding(), INT2NUM(idx));
  457:     return;
  458: }
  459: 
  460: int
  461: rb_enc_to_index(rb_encoding *enc)
  462: {
  463:     int i;
  464: 
  465:     if (!enc) return 0;
  466:     for (i=0; i<enc_table_size; i++) {
  467:         if (enc_table[i].enc == enc) {
  468:             return i;
  469:         }
  470:     }
  471:     return 0;
  472: }
  473: 
  474: void
  475: rb_enc_associate(VALUE obj, rb_encoding *enc)
  476: {
  477:     rb_enc_associate_index(obj, rb_enc_to_index(enc));
  478: }
  479: 
  480: int
  481: rb_enc_get_index(VALUE obj)
  482: {
  483:     int i;
  484: 
  485:     if (!enc_capable(obj)) return -1;
  486:     i = ENCODING_GET(obj);
  487:     if (i == ENCODING_INLINE_MAX) {
  488:         VALUE iv;
  489: 
  490:         iv = rb_ivar_get(obj, rb_id_encoding());
  491:         i = NUM2INT(iv);
  492:     }
  493:     return i;
  494: }
  495: 
  496: rb_encoding*
  497: rb_enc_get(VALUE obj)
  498: {
  499:     return rb_enc_from_index(rb_enc_get_index(obj));
  500: }
  501: 
  502: rb_encoding*
  503: rb_enc_check(VALUE str1, VALUE str2)
  504: {
  505:     rb_encoding *enc = rb_enc_compatible(str1, str2);
  506:     if (!enc)
  507:         rb_raise(rb_eArgError, "character encodings differ");
  508:     return enc;
  509: }
  510: 
  511: rb_encoding*
  512: rb_enc_compatible(VALUE str1, VALUE str2)
  513: {
  514:     int idx1, idx2;
  515:     rb_encoding *enc;
  516: 
  517:     idx1 = rb_enc_get_index(str1);
  518:     idx2 = rb_enc_get_index(str2);
  519: 
  520:     if (idx1 == idx2) {
  521:         return rb_enc_from_index(idx1);
  522:     }
  523:     if (BUILTIN_TYPE(str1) != T_STRING) {
  524:         VALUE tmp = str1;
  525:         int idx0 = idx1;
  526:         str1 = str2;
  527:         str2 = tmp;
  528:         idx1 = idx2;
  529:         idx2 = idx0;
  530:     }
  531:     if (BUILTIN_TYPE(str1) == T_STRING) {
  532:         int cr1, cr2;
  533: 
  534:         cr1 = rb_enc_str_coderange(str1);
  535:         if (BUILTIN_TYPE(str2) == T_STRING) {
  536:             cr2 = rb_enc_str_coderange(str2);
  537:             if (cr1 != cr2) {
  538:                 /* may need to handle ENC_CODERANGE_BROKEN */
  539:                 if (cr1 == ENC_CODERANGE_7BIT) return rb_enc_from_index(idx2);
  540:                 if (cr2 == ENC_CODERANGE_7BIT) return rb_enc_from_index(idx1);
  541:             }
  542:             if (cr2 == ENC_CODERANGE_7BIT) {
  543:                 if (idx1 == 0) return rb_enc_from_index(idx2);
  544:                 return rb_enc_from_index(idx1);
  545:             }
  546:         }
  547:         if (cr1 == ENC_CODERANGE_7BIT &&
  548:             rb_enc_asciicompat(enc = rb_enc_from_index(idx2)))
  549:             return enc;
  550:     }
  551:     return 0;
  552: }
  553: 
  554: void
  555: rb_enc_copy(VALUE obj1, VALUE obj2)
  556: {
  557:     rb_enc_associate_index(obj1, rb_enc_get_index(obj2));
  558: }
  559: 
  560: 
  561: /*
  562:  *  call-seq:
  563:  *     obj.encoding   => encoding
  564:  *
  565:  *  Returns the Encoding object that represents the encoding of obj.
  566:  */
  567: 
  568: VALUE
  569: rb_obj_encoding(VALUE obj)
  570: {
  571:     rb_encoding *enc = rb_enc_get(obj);
  572:     if (!enc) {
  573:         rb_raise(rb_eTypeError, "unknown encoding");
  574:     }
  575:     return rb_enc_from_encoding(enc);
  576: }
  577: 
  578: 
  579: char*
  580: rb_enc_nth(const char *p, const char *e, int nth, rb_encoding *enc)
  581: {
  582:     int c;
  583: 
  584:     if (rb_enc_mbmaxlen(enc) == 1) {
  585:         p += nth;
  586:     }
  587:     else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
  588:         p += nth * rb_enc_mbmaxlen(enc);
  589:     }
  590:     else {
  591:         for (c=0; p<e && nth--; c++) {
  592:             int n = rb_enc_mbclen(p, e, enc);
  593: 
  594:             p += n;
  595:         }
  596:     }
  597:     return (char*)p;
  598: }
  599: 
  600: long
  601: rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
  602: {
  603:     long c;
  604: 
  605:     if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
  606:         return (e - p) / rb_enc_mbminlen(enc);
  607:     }
  608: 
  609:     for (c=0; p<e; c++) {
  610:         int n = rb_enc_mbclen(p, e, enc);
  611: 
  612:         p += n;
  613:     }
  614:     return c;
  615: }
  616: 
  617: int
  618: rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc)
  619: {
  620:     int n = ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e);
  621:     n = MBCLEN_CHARFOUND(n);
  622:     if (0 < n && n <= e</