(linenum→info "unix/slp.c:2238")

glibc/2.7/localedata/gen-unicode-ctype.c

    1: /* Generate a Unicode conforming LC_CTYPE category from a UnicodeData file.
    2:    Copyright (C) 2000-2001 Free Software Foundation, Inc.
    3:    This file is part of the GNU C Library.
    4:    Contributed by Bruno Haible <haible@clisp.cons.org>, 2000.
    5: 
    6:    The GNU C Library is free software; you can redistribute it and/or
    7:    modify it under the terms of the GNU Lesser General Public
    8:    License as published by the Free Software Foundation; either
    9:    version 2.1 of the License, or (at your option) any later version.
   10: 
   11:    The GNU C Library is distributed in the hope that it will be useful,
   12:    but WITHOUT ANY WARRANTY; without even the implied warranty of
   13:    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   14:    Lesser General Public License for more details.
   15: 
   16:    You should have received a copy of the GNU Lesser General Public
   17:    License along with the GNU C Library; if not, write to the Free
   18:    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   19:    02111-1307 USA.  */
   20: 
   21: /* Usage example:
   22:      $ gen-unicode /usr/local/share/Unidata/UnicodeData.txt 3.1
   23:  */
   24: 
   25: #include <stdio.h>
   26: #include <stdlib.h>
   27: #include <stdbool.h>
   28: #include <string.h>
   29: #include <time.h>
   30: 
/* This structure represents one line in the UnicodeData.txt file.
   String members hold "" when the corresponding field was empty; the
   case mappings hold NONE in that situation.  A NULL `name' marks a
   table slot that has not been filled in.  */
struct unicode_attribute
{
  const char *name;           /* Character name; NULL if slot is unfilled */
  const char *category;       /* General category */
  const char *combining;      /* Canonical combining classes */
  const char *bidi;           /* Bidirectional category */
  const char *decomposition;  /* Character decomposition mapping */
  const char *decdigit;       /* Decimal digit value */
  const char *digit;          /* Digit value */
  const char *numeric;        /* Numeric value */
  int mirrored;               /* Nonzero if the mirrored field starts with 'Y' */
  const char *oldname;        /* Old Unicode 1.0 name */
  const char *comment;        /* Comment */
  unsigned int upper;         /* Uppercase mapping, or NONE */
  unsigned int lower;         /* Lowercase mapping, or NONE */
  unsigned int title;         /* Titlecase mapping, or NONE */
};
   49: 
/* Missing fields are represented with "" for strings, and NONE for
   characters.  NONE is the all-ones unsigned int value.  */
#define NONE (~(unsigned int)0)

/* The entire contents of the UnicodeData.txt file, one slot per index
   in the range 0 .. 0x10FFFF.  */
struct unicode_attribute unicode_attributes [0x110000];
   56: 
   57: /* Stores in unicode_attributes[i] the values from the given fields.  */
   58: static void
   59: fill_attribute (unsigned int i,
   60:                 const char *field1, const char *field2,
   61:                 const char *field3, const char *field4,
   62:                 const char *field5, const char *field6,
   63:                 const char *field7, const char *field8,
   64:                 const char *field9, const char *field10,
   65:                 const char *field11, const char *field12,
   66:                 const char *field13, const char *field14)
   67: {
   68:   struct unicode_attribute * uni;
   69: 
   70:   if (i >= 0x110000)
   71:     {
   72:       fprintf (stderr, "index too large\n");
   73:       exit (1);
   74:     }
   75:   if (strcmp (field2, "Cs") == 0)
   76:     /* Surrogates are UTF-16 artefacts, not real characters. Ignore them.  */
   77:     return;
   78:   uni = &unicode_attributes[i];
   79:   /* Copy the strings.  */
   80:   uni->name          = strdup (field1);
   81:   uni->category      = (field2[0] == '\0' ? "" : strdup (field2));
   82:   uni->combining     = (field3[0] == '\0' ? "" : strdup (field3));
   83:   uni->bidi          = (field4[0] == '\0' ? "" : strdup (field4));
   84:   uni->decomposition = (field5[0] == '\0' ? "" : strdup (field5));
   85:   uni->decdigit      = (field6[0] == '\0' ? "" : strdup (field6));
   86:   uni->digit         = (field7[0] == '\0' ? "" : strdup (field7));
   87:   uni->numeric       = (field8[0] == '\0' ? "" : strdup (field8));
   88:   uni->mirrored      = (field9[0] == 'Y');
   89:   uni->oldname       = (field10[0] == '\0' ? "" : strdup (field10));
   90:   uni->comment       = (field11[0] == '\0' ? "" : strdup (field11));
   91:   uni->upper = (field12[0] =='\0' ? NONE : strtoul (field12, NULL, 16));
   92:   uni->lower = (field13[0] =='\0' ? NONE : strtoul (field13, NULL, 16));
   93:   uni->title = (field14[0] =='\0' ? NONE : strtoul (field14, NULL, 16));
   94: }
   95: 
   96: /* Maximum length of a field in the UnicodeData.txt file.  */
   97: #define FIELDLEN 120
   98: 
   99: /* Reads the next field from STREAM.  The buffer BUFFER has size FIELDLEN.
  100:    Reads up to (but excluding) DELIM.
  101:    Returns 1 when a field was successfully read, otherwise 0.  */
  102: static int
  103: getfield (FILE *stream, char *buffer, int delim)
  104: {
  105:   int count = 0;
  106:   int c;
  107: 
  108:   for (; (c = getc (stream)), (c != EOF && c != delim); )
  109:     {
  110:       /* The original unicode.org UnicodeData.txt file happens to have
  111:          CR/LF line terminators.  Silently convert to LF.  */
  112:       if (c == '\r')
  113:         continue;
  114: 
  115:       /* Put c into the buffer.  */
  116:       if (++count >= FIELDLEN - 1)
  117:         {
  118:           fprintf (stderr, "field too long\n");
  119:           exit (1);
  120:         }
  121:       *buffer++ = c;
  122:     }
  123: 
  124:   if (c == EOF)
  125:     return 0;
  126: 
  127:   *buffer = '\0';
  128:   return 1;
  129: }
  130: 
  131: /* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt
  132:    file.  */
  133: static void
  134: fill_attributes (const char *unicodedata_filename)
  135: {
  136:   unsigned int i, j;
  137:   FILE *stream;
  138:   char field0[FIELDLEN];
  139:   char field1[FIELDLEN];
  140:   char field2[FIELDLEN];
  141:   char field3[FIELDLEN];
  142:   char field4[FIELDLEN];
  143:   char field5[FIELDLEN];
  144:   char field6[FIELDLEN];
  145:   char field7[FIELDLEN];
  146:   char field8[FIELDLEN];
  147:   char field9[FIELDLEN];
  148:   char field10[FIELDLEN];
  149:   char field11[FIELDLEN];
  150:   char field12[FIELDLEN];
  151:   char field13[FIELDLEN];
  152:   char field14[FIELDLEN];
  153:   int lineno = 0;
  154: 
  155:   for (i = 0; i < 0x110000; i++)
  156:     unicode_attributes[i].name = NULL;
  157: 
  158:   stream = fopen (unicodedata_filename, "r");
  159:   if (stream == NULL)
  160:     {
  161:       fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
  162:       exit (1);
  163:     }
  164: 
  165:   for (;;)
  166:     {
  167:       int n;
  168: 
  169:       lineno++;
  170:       n = getfield (stream, field0, ';');
  171:       n += getfield (stream, field1, ';');
  172:       n += getfield (stream, field2, ';');
  173:       n += getfield (stream, field3, ';');
  174:       n += getfield (stream, field4, ';');
  175:       n += getfield (stream, field5, ';');
  176:       n += getfield (stream, field6, ';');
  177:       n += getfield (stream, field7, ';');
  178:       n += getfield (stream, field8, ';');
  179:       n += getfield (stream, field9, ';');
  180:       n += getfield (stream, field10, ';');
  181:       n += getfield (stream, field11, ';');
  182:       n += getfield (stream, field12, ';');
  183:       n += getfield (stream, field13, ';');
  184:       n += getfield (stream, field14, '\n');
  185:       if (n == 0)
  186:         break;
  187:       if (n != 15)
  188:         {
  189:           fprintf (stderr, "short line in'%s':%d\n",
  190:                    unicodedata_filename, lineno);
  191:           exit (1);
  192:         }
  193:       i = strtoul (field0, NULL, 16);
  194:       if (field1[0] == '<'
  195:           && strlen (field1) >= 9
  196:           && !strcmp (field1 + strlen(field1) - 8, ", First>"))
  197:         {
  198:           /* Deal with a range. */
  199:           lineno++;
  200:           n = getfield (stream, field0, ';');
  201:           n += getfield (stream, field1, ';');
  202:           n += getfield (stream, field2, ';');
  203:           n += getfield (stream, field3, ';');
  204:           n += getfield (stream, field4, ';');
  205:           n += getfield (stream, field5, ';');
  206:           n += getfield (stream, field6, ';');
  207:           n += getfield (stream, field7, ';');
  208:           n += getfield (stream, field8, ';');
  209:           n += getfield (stream, field9, ';');
  210:           n += getfield (stream, field10, ';');
  211:           n += getfield (stream, field11, ';');
  212:           n += getfield (stream, field12, ';');
  213:           n += getfield (stream, field13, ';');
  214:           n += getfield (stream, field14, '\n');
  215:           if (n != 15)
  216:             {
  217:               fprintf (stderr, "missing end range in '%s':%d\n",
  218:                        unicodedata_filename, lineno);
  219:               exit (1);
  220:             }
  221:           if (!(field1[0] == '<'
  222:                 && strlen (field1) >= 8
  223:                 && !strcmp (field1 + strlen (field1) - 7, ", Last>")))
  224:             {
  225:               fprintf (stderr, "missing end range in '%s':%d\n",
  226:                        unicodedata_filename, lineno);
  227:               exit (1);
  228:             }
  229:           field1[strlen (field1) - 7] = '\0';
  230:           j = strtoul (field0, NULL, 16);
  231:           for (; i <= j; i++)
  232:             fill_attribute (i, field1+1, field2, field3, field4, field5,
  233:                                field6, field7, field8, field9, field10,
  234:                                field11, field12, field13, field14);
  235:         }
  236:       else
  237:         {
  238:           /* Single character line */
  239:           fill_attribute (i, field1, field2, field3, field4, field5,
  240:                              field6, field7, field8, field9, field10,
  241:                              field11, field12, field13, field14);
  242:         }
  243:     }
  244:   if (ferror (stream) || fclose (stream))
  245:     {
  246:       fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
  247:       exit (1);
  248:     }
  249: }
  250: 
  251: /* Character mappings.  */
  252: 
  253: static unsigned int
  254: to_upper (unsigned int ch)
  255: {
  256:   if (unicode_attributes[ch].name != NULL
  257:       && unicode_attributes[ch].upper != NONE)
  258:     return unicode_attributes[ch].upper;
  259:   else
  260:     return ch;
  261: }
  262: 
  263: static unsigned int
  264: to_lower (unsigned int ch)
  265: {
  266:   if (unicode_attributes[ch].name != NULL
  267:       && unicode_attributes[ch].lower != NONE)
  268:     return unicode_attributes[ch].lower;
  269:   else
  270:     return ch;
  271: }
  272: 
  273: static unsigned int
  274: to_title (unsigned int ch)
  275: {
  276:   if (unicode_attributes[ch].name != NULL
  277:       && unicode_attributes[ch].title != NONE)
  278:     return unicode_attributes[ch].title;
  279:   else
  280:     return ch;
  281: }
  282: 
  283: /* Character class properties.  */
  284: 
  285: static bool
  286: is_upper (unsigned int ch)
  287: {
  288:   return (to_lower (ch) != ch);
  289: }
  290: 
  291: static bool
  292: is_lower (unsigned int ch)
  293: {
  294:   return (to_upper (ch) != ch)
  295:          /* <U00DF> is lowercase, but without simple to_upper mapping.  */
  296:          || (ch == 0x00DF);
  297: }
  298: 
  299: static bool
  300: is_alpha (unsigned int ch)
  301: {
  302:   return (unicode_attributes[ch].name != NULL
  303:           && ((unicode_attributes[ch].category[0] == 'L'
  304:                /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
  305:                   <U0E2F>, <U0E46> should belong to is_punct.  */
  306:                && (ch != 0x0E2F) && (ch != 0x0E46))
  307:               /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
  308:                  <U0E31>, <U0E34>..<U0E3A>, <U0E47>..<U0E4E> are is_alpha.  */
  309:               || (ch == 0x0E31)
  310:               || (ch >= 0x0E34 && ch <= 0x0E3A)
  311:               || (ch >= 0x0E47 && ch <= 0x0E4E)
  312:               /* Avoid warning for <U0345>.  */
  313:               || (ch == 0x0345)
  314:               /* Avoid warnings for <U2160>..<U217F>.  */
  315:               || (unicode_attributes[ch].category[0] == 'N'
  316:                   && unicode_attributes[ch].category[1] == 'l')
  317:               /* Avoid warnings for <U24B6>..<U24E9>.  */
  318:               || (unicode_attributes[ch].category[0] == 'S'
  319:                   && unicode_attributes[ch].category[1] == 'o'
  320:                   && strstr (unicode_attributes[ch].name, " LETTER ")
  321:                      != NULL)
  322:               /* Consider all the non-ASCII digits as alphabetic.
  323:                  ISO C 99 forbids us to have them in category "digit",
  324:                  but we want iswalnum to return true on them.  */
  325:               || (unicode_attributes[ch].category[0] == 'N'
  326:                   && unicode_attributes[ch].category[1] == 'd'
  327:                   && !(ch >= 0x0030 && ch <= 0x0039))));
  328: }
  329: 
  330: static bool
  331: is_digit (unsigned int ch)
  332: {
  333: #if 0
  334:   return (unicode_attributes[ch].name != NULL
  335:           && unicode_attributes[ch].category[0] == 'N'
  336:           && unicode_attributes[ch].category[1] == 'd');
  337:   /* Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
  338:      a zero.  Must add <0> in front of them by hand.  */
  339: #else
  340:   /* SUSV2 gives us some freedom for the "digit" category, but ISO C 99
  341:      takes it away:
  342:      7.25.2.1.5:
  343:         The iswdigit function tests for any wide character that corresponds
  344:         to a decimal-digit character (as defined in 5.2.1).
  345:      5.2.1:
  346:         the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
  347:    */
  348:   return (ch >= 0x0030 && ch <= 0x0039);
  349: #endif
  350: }
  351: 
  352: static bool
  353: is_outdigit (unsigned int ch)
  354: {
  355:   return (ch >= 0x0030 && ch <= 0x0039);
  356: }
  357: 
  358: static bool
  359: is_blank (unsigned int ch)
  360: {
  361:   return (ch == 0x0009 /* '\t' */
  362:           /* Category Zs without mention of "<noBreak>" */
  363:           || (unicode_attributes[ch].name != NULL
  364:               && unicode_attributes[ch].category[0] == 'Z'
  365:               && unicode_attributes[ch].category[1] == 's'
  366:               && !strstr (unicode_attributes[ch].decomposition, "<noBreak>")));
  367: }
  368: 
  369: static bool
  370: is_space (unsigned int ch)
  371: {
  372:   /* Don't make U+00A0 a space. Non-breaking space means that all programs
  373:      should treat it like a punctuation character, not like a space. */
  374:   return (ch == 0x0020 /* ' ' */
  375:           || ch == 0x000C /* '\f' */
  376:           || ch == 0x000A /* '\n' */
  377:           || ch == 0x000D /* '\r' */
  378:           || ch == 0x0009 /* '\t' */
  379:           || ch == 0x000B /* '\v' */
  380:           /* Categories Zl, Zp, and Zs without mention of "<noBreak>" */
  381:           || (unicode_attributes[ch].name != NULL
  382:               && unicode_attributes[ch].category[0] == 'Z'
  383:               && (unicode_attributes[ch].category[1] == 'l'
  384:                   || unicode_attributes[ch].category[1] == 'p'
  385:                   || (unicode_attributes[ch].category[1] == 's'
  386:                       && !strstr (unicode_attributes[ch].decomposition,
  387:                                   "<noBreak>")))));
  388: }
  389: 
  390: static bool
  391: is_cntrl (unsigned int ch)
  392: {
  393:   return (unicode_attributes[ch].name != NULL
  394:           && (!strcmp (unicode_attributes[ch].name, "<control>")
  395:               /* Categories Zl and Zp */
  396:               || (unicode_attributes[ch].category[0] == 'Z'
  397:                   && (unicode_attributes[ch].category[1] == 'l'
  398:                       || unicode_attributes[ch].category[1] == 'p'))));
  399: }
  400: 
  401: static bool
  402: is_xdigit (unsigned int ch)
  403: {
  404: #if 0
  405:   return is_digit (ch)
  406:          || (ch >= 0x0041 && ch <= 0x0046)
  407:          || (ch >= 0x0061 && ch <= 0x0066);
  408: #else
  409:   /* SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
  410:      takes it away:
  411:      7.25.2.1.12:
  412:         The iswxdigit function tests for any wide character that corresponds
  413:         to a hexadecimal-digit character (as defined in 6.4.4.1).
  414:      6.4.4.1:
  415:         hexadecimal-digit: one of 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
  416:    */
  417:   return (ch >= 0x0030 && ch <= 0x0039)
  418:          || (ch >= 0x0041 && ch <= 0x0046)
  419:          || (ch >= 0x0061 && ch <= 0x0066);
  420: #endif
  421: }
  422: 
  423: static bool
  424: is_graph (unsigned int ch)
  425: {
  426:   return (unicode_attributes[ch].name != NULL
  427:           && strcmp (unicode_attributes[ch].name, "<control>")
  428:           && !is_space (ch));
  429: }
  430: 
  431: static bool
  432: is_print (unsigned int ch)
  433: {
  434:   return (unicode_attributes[ch].name != NULL
  435:           && strcmp (unicode_attributes[ch].name, "<control>")
  436:           /* Categories Zl and Zp */
  437:           && !(unicode_attributes[ch].name != NULL
  438:                && unicode_attributes[ch].category[0] == 'Z'
  439:                && (unicode_attributes[ch].category[1] == 'l'
  440:                    || unicode_attributes[ch].category[1] == 'p')));
  441: }
  442: 
  443: static bool
  444: is_punct (unsigned int ch)
  445: {
  446: #if 0
  447:   return (unicode_attributes[ch].name != NULL
  448:           && unicode_attributes[ch].category[0] == 'P');
  449: #else
  450:   /* The traditional POSIX definition of punctuation is every graphic,
  451:      non-alphanumeric character.  */
  452:   return (is_graph (ch) && !is_alpha (ch) && !is_digit (ch));
  453: #endif
  454: }
  455: 
  456: static bool
  457: is_combining (unsigned int ch)
  458: {
  459:   /* Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
  460:      file. In 3.0.1 it was identical to the union of the general categories
  461:      "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
  462:      PropList.txt file, so we take the latter definition.  */
  463:   return (unicode_attributes[ch].name != NULL
  464:           && unicode_attributes[ch].category[0] == 'M'
  465:           && (unicode_attributes[ch].category[1] == 'n'
  466:               || unicode_attributes[ch].category[1] == 'c'
  467:               || unicode_attributes[ch].category[1] == 'e'));
  468: }
  469: 
  470: static bool
  471: is_combining_level3 (unsigned int ch)
  472: {
  473:   return is_combining (ch)
  474:          && !(unicode_attributes[ch].combining[0] != '\0'
  475:               && unicode_attributes[ch].combining[0] != '0'
  476:               && strtoul (unicode_attributes[ch].combining, NULL, 10) >= 200);
  477: }
  478: 
  479: /* Return the UCS symbol string for a Unicode character.  */
  480: static const char *
  481: ucs_symbol (unsigned int i)
  482: {
  483:   static char buf[11+1];
  484: 
  485:   sprintf (buf, (i < 0x10000 ? "<U%04X>" : "<U%08X>"), i);
  486:   return buf;
  487: }
  488: 
  489: /* Return the UCS symbol range string for a Unicode characters interval.  */
  490: static const char *
  491: ucs_symbol_range (unsigned int low, unsigned int high)
  492: {
  493:   static char buf[24+1];
  494: 
  495:   strcpy (buf, ucs_symbol (low));
  496:   strcat (buf, "..");
  497:   strcat (buf, ucs_symbol (high));
  498:   return buf;
  499: }
  500: 
  501: /* Output a character class (= property) table.  */
  502: 
  503: static void
  504: output_charclass (FILE *stream, const char *classname,
  505:                   bool (*func) (unsigned int))
  506: {
  507:   char table[0x110000];
  508:   unsigned int i;