(linenum→info "unix/slp.c:2238")

glibc/2.7/iconv/iconv_charmap.c

    1: /* Convert using charmaps and possibly iconv().
    2:    Copyright (C) 2001, 2005, 2006 Free Software Foundation, Inc.
    3:    This file is part of the GNU C Library.
    4:    Contributed by Ulrich Drepper <drepper@redhat.com>, 2001.
    5: 
    6:    This program is free software; you can redistribute it and/or modify
    7:    it under the terms of the GNU General Public License as published
    8:    by the Free Software Foundation; version 2 of the License, or
    9:    (at your option) any later version.
   10: 
   11:    This program is distributed in the hope that it will be useful,
   12:    but WITHOUT ANY WARRANTY; without even the implied warranty of
   13:    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   14:    GNU General Public License for more details.
   15: 
   16:    You should have received a copy of the GNU General Public License
   17:    along with this program; if not, write to the Free Software Foundation,
   18:    Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
   19: 
   20: #include <assert.h>
   21: #include <errno.h>
   22: #include <error.h>
   23: #include <fcntl.h>
   24: #include <iconv.h>
   25: #include <libintl.h>
   26: #include <stdio.h>
   27: #include <stdlib.h>
   28: #include <unistd.h>
   29: #include <sys/mman.h>
   30: #include <sys/stat.h>
   31: 
   32: #include "iconv_prog.h"
   33: 
   34: 
/* Prototypes for a few program-wide used functions.  These allocation
   helpers are defined outside this file; the code below uses the
   pointers they return without testing them for NULL.  */
extern void *xmalloc (size_t __n);
extern void *xcalloc (size_t __n, size_t __s);
   38: 
   39: 
/* One level of the byte-indexed lookup tree that maps input byte
   sequences to output character sequences.  Each of the 256 possible
   byte values owns one slot in VAL and one flag bit in TERM.  */
struct convtable
{
  /* Flag bits maintained by is_term/set_term/clear_term.  Those helpers
     store eight flags in each array element (index idx / 8, bit idx % 8),
     so 256 / 8 elements cover every byte value.  */
  int term[256 / 8];
  union
  {
    /* Meaningful while the flag bit is clear: the table for the next
       input byte, or NULL if no sequence continues with this byte.  */
    struct convtable *sub;
    /* Meaningful while the flag bit is set: the output sequence for the
       input sequence that ends with this byte.  */
    struct charseq *out;
  } val[256];
};
   49: 
   50: 
   51: static inline struct convtable *
   52: allocate_table (void)
   53: {
   54:   return (struct convtable *) xcalloc (1, sizeof (struct convtable));
   55: }
   56: 
   57: 
   58: static inline int
   59: is_term (struct convtable *tbl, unsigned int idx)
   60: {
   61:   return tbl->term[idx / 8] & (1 << (idx % 8));
   62: }
   63: 
   64: 
   65: static inline void
   66: clear_term (struct convtable *tbl, unsigned int idx)
   67: {
   68:   tbl->term[idx / 8] &= ~(1 << (idx % 8));
   69: }
   70: 
   71: 
   72: static inline void
   73: set_term (struct convtable *tbl, unsigned int idx)
   74: {
   75:   tbl->term[idx / 8] |= 1 << (idx % 8);
   76: }
   77: 
   78: 
/* Generate the conversion table.  One builder exists for each
   combination of available charmaps; each returns the root of the
   table.  The two builders that take a charset name return NULL when
   iconv_open fails for that name.  */
static struct convtable *use_from_charmap (struct charmap_t *from_charmap,
                                           const char *to_code);
static struct convtable *use_to_charmap (const char *from_code,
                                         struct charmap_t *to_charmap);
static struct convtable *use_both_charmaps (struct charmap_t *from_charmap,
                                            struct charmap_t *to_charmap);

/* Prototypes for the functions doing the actual work.  process_fd and
   process_file collect the whole input and hand it to process_block,
   which performs the table lookup and writes to OUTPUT.  */
static int process_block (struct convtable *tbl, char *addr, size_t len,
                          FILE *output);
static int process_fd (struct convtable *tbl, int fd, FILE *output);
static int process_file (struct convtable *tbl, FILE *input, FILE *output);
   92: 
   93: 
   94: int
   95: charmap_conversion (const char *from_code, struct charmap_t *from_charmap,
   96:                     const char *to_code, struct charmap_t *to_charmap,
   97:                     int argc, int remaining, char *argv[], FILE *output)
   98: {
   99:   struct convtable *cvtbl;
  100:   int status = EXIT_SUCCESS;
  101: 
  102:   /* We have three different cases to handle:
  103: 
  104:      - both, from_charmap and to_charmap, are available.  This means we
  105:        can assume that the symbolic names match and use them to create
  106:        the mapping.
  107: 
  108:      - only from_charmap is available.  In this case we can only hope that
  109:        the symbolic names used are of the <Uxxxx> form in which case we
  110:        can use a UCS4->"to_code" iconv() conversion for the second step.
  111: 
  112:      - only to_charmap is available.  This is similar, only that we would
  113:        use iconv() for the "to_code"->UCS4 conversion.
  114: 
  115:        We first create a table which maps input bytes into output bytes.
  116:        Once this is done we can handle all three of the cases above
  117:        equally.  */
  118:   if (from_charmap != NULL)
  119:     {
  120:       if (to_charmap == NULL)
  121:         cvtbl = use_from_charmap (from_charmap, to_code);
  122:       else
  123:         cvtbl = use_both_charmaps (from_charmap, to_charmap);
  124:     }
  125:   else
  126:     {
  127:       assert (to_charmap != NULL);
  128:       cvtbl = use_to_charmap (from_code, to_charmap);
  129:     }
  130: 
  131:   /* If we couldn't generate a table stop now.  */
  132:   if (cvtbl == NULL)
  133:     return EXIT_FAILURE;
  134: 
  135:   /* We can now start the conversion.  */
  136:   if (remaining == argc)
  137:     {
  138:       if (process_file (cvtbl, stdin, output) != 0)
  139:         status = EXIT_FAILURE;
  140:     }
  141:   else
  142:     do
  143:       {
  144:         struct stat st;
  145:         char *addr;
  146:         int fd;
  147: 
  148:         if (verbose)
  149:           printf ("%s:\n", argv[remaining]);
  150:         if (strcmp (argv[remaining], "-") == 0)
  151:           fd = 0;
  152:         else
  153:           {
  154:             fd = open (argv[remaining], O_RDONLY);
  155: 
  156:             if (fd == -1)
  157:               {
  158:                 error (0, errno, _("cannot open input file `%s'"),
  159:                        argv[remaining]);
  160:                 status = EXIT_FAILURE;
  161:                 continue;
  162:               }
  163:           }
  164: 
  165: #ifdef _POSIX_MAPPED_FILES
  166:         /* We have possibilities for reading the input file.  First try
  167:            to mmap() it since this will provide the fastest solution.  */
  168:         if (fstat (fd, &st) == 0
  169:             && ((addr = mmap (NULL, st.st_size, PROT_READ, MAP_PRIVATE,
  170:                               fd, 0)) != MAP_FAILED))
  171:           {
  172:             /* Yes, we can use mmap().  The descriptor is not needed
  173:                anymore.  */
  174:             if (close (fd) != 0)
  175:               error (EXIT_FAILURE, errno,
  176:                      _("error while closing input `%s'"), argv[remaining]);
  177: 
  178:             if (process_block (cvtbl, addr, st.st_size, output) < 0)
  179:               {
  180:                 /* Something went wrong.  */
  181:                 status = EXIT_FAILURE;
  182: 
  183:                 /* We don't need the input data anymore.  */
  184:                 munmap ((void *) addr, st.st_size);
  185: 
  186:                 /* We cannot go on with producing output since it might
  187:                    lead to problem because the last output might leave
  188:                    the output stream in an undefined state.  */
  189:                 break;
  190:               }
  191: 
  192:             /* We don't need the input data anymore.  */
  193:             munmap ((void *) addr, st.st_size);
  194:           }
  195:         else
  196: #endif  /* _POSIX_MAPPED_FILES */
  197:           {
  198:             /* Read the file in pieces.  */
  199:             if (process_fd (cvtbl, fd, output) != 0)
  200:               {
  201:                 /* Something went wrong.  */
  202:                 status = EXIT_FAILURE;
  203: 
  204:                 /* We don't need the input file anymore.  */
  205:                 close (fd);
  206: 
  207:                 /* We cannot go on with producing output since it might
  208:                    lead to problem because the last output might leave
  209:                    the output stream in an undefined state.  */
  210:                 break;
  211:               }
  212: 
  213:             /* Now close the file.  */
  214:             close (fd);
  215:           }
  216:       }
  217:     while (++remaining < argc);
  218: 
  219:   /* All done.  */
  220:   return status;
  221: }
  222: 
  223: 
  224: static void
  225: add_bytes (struct convtable *tbl, struct charseq *in, struct charseq *out)
  226: {
  227:   int n = 0;
  228:   unsigned int byte;
  229: 
  230:   assert (in->nbytes > 0);
  231: 
  232:   byte = ((unsigned char *) in->bytes)[n];
  233:   while (n + 1 < in->nbytes)
  234:     {
  235:       if (is_term (tbl, byte) || tbl->val[byte].sub == NULL)
  236:         {
  237:           /* Note that we simply ignore a definition for a byte sequence
  238:              which is also the prefix for a longer one.  */
  239:           clear_term (tbl, byte);
  240:           tbl->val[byte].sub =
  241:             (struct convtable *) xcalloc (1, sizeof (struct convtable));
  242:         }
  243: 
  244:       tbl = tbl->val[byte].sub;
  245: 
  246:       byte = ((unsigned char *) in->bytes)[++n];
  247:     }
  248: 
  249:   /* Only add the new sequence if there is none yet and the byte sequence
  250:      is not part of an even longer one.  */
  251:   if (! is_term (tbl, byte) && tbl->val[byte].sub == NULL)
  252:     {
  253:       set_term (tbl, byte);
  254:       tbl->val[byte].out = out;
  255:     }
  256: }
  257: 
  258: 
  259: static struct convtable *
  260: use_from_charmap (struct charmap_t *from_charmap, const char *to_code)
  261: {
  262:   /* We iterate over all entries in the from_charmap and for those which
  263:      have a known UCS4 representation we use an iconv() call to determine
  264:      the mapping to the to_code charset.  */
  265:   struct convtable *rettbl;
  266:   iconv_t cd;
  267:   void *ptr = NULL;
  268:   const void *key;
  269:   size_t keylen;
  270:   void *data;
  271: 
  272:   cd = iconv_open (to_code, "WCHAR_T");
  273:   if (cd == (iconv_t) -1)
  274:     /* We cannot do anything.  */
  275:     return NULL;
  276: 
  277:   rettbl = allocate_table ();
  278: 
  279:   while (iterate_table (&from_charmap->char_table, &ptr, &key, &keylen, &data)
  280:          >= 0)
  281:     {
  282:       struct charseq *in = (struct charseq *) data;
  283: 
  284:       if (in->ucs4 != UNINITIALIZED_CHAR_VALUE)
  285:         {
  286:           /* There is a chance.  Try the iconv module.  */
  287:           wchar_t inbuf[1] = { in->ucs4 };
  288:           unsigned char outbuf[64];
  289:           char *inptr = (char *) inbuf;
  290:           size_t inlen = sizeof (inbuf);
  291:           char *outptr = (char *) outbuf;
  292:           size_t outlen = sizeof (outbuf);
  293: 
  294:           (void) iconv (cd, &inptr, &inlen, &outptr, &outlen);
  295: 
  296:           if (outptr != (char *) outbuf)
  297:             {
  298:               /* We got some output.  Good, use it.  */
  299:               struct charseq *newp;
  300: 
  301:               outlen = sizeof (outbuf) - outlen;
  302:               assert ((char *) outbuf + outlen == outptr);
  303: 
  304:               newp = (struct charseq *) xmalloc (sizeof (struct charseq)
  305:                                                  + outlen);
  306:               newp->name = in->name;
  307:               newp->ucs4 = in->ucs4;
  308:               newp->nbytes = outlen;
  309:               memcpy (newp->bytes, outbuf, outlen);
  310: 
  311:               add_bytes (rettbl, in, newp);
  312:             }
  313: 
  314:           /* Clear any possible state left behind.  */
  315:           (void) iconv (cd, NULL, NULL, NULL, NULL);
  316:         }
  317:     }
  318: 
  319:   iconv_close (cd);
  320: 
  321:   return rettbl;
  322: }
  323: 
  324: 
  325: static struct convtable *
  326: use_to_charmap (const char *from_code, struct charmap_t *to_charmap)
  327: {
  328:   /* We iterate over all entries in the to_charmap and for those which
  329:      have a known UCS4 representation we use an iconv() call to determine
  330:      the mapping to the from_code charset.  */
  331:   struct convtable *rettbl;
  332:   iconv_t cd;
  333:   void *ptr = NULL;
  334:   const void *key;
  335:   size_t keylen;
  336:   void *data;
  337: 
  338:   /* Note that the conversion we use here is the reverse direction.  Without
  339:      exhaustive search we cannot figure out which input yields the UCS4
  340:      character we are looking for.  Therefore we determine it the other
  341:      way round.  */
  342:   cd = iconv_open (from_code, "WCHAR_T");
  343:   if (cd == (iconv_t) -1)
  344:     /* We cannot do anything.  */
  345:     return NULL;
  346: 
  347:   rettbl = allocate_table ();
  348: 
  349:   while (iterate_table (&to_charmap->char_table, &ptr, &key, &keylen, &data)
  350:          >= 0)
  351:     {
  352:       struct charseq *out = (struct charseq *) data;
  353: 
  354:       if (out->ucs4 != UNINITIALIZED_CHAR_VALUE)
  355:         {
  356:           /* There is a chance.  Try the iconv module.  */
  357:           wchar_t inbuf[1] = { out->ucs4 };
  358:           unsigned char outbuf[64];
  359:           char *inptr = (char *) inbuf;
  360:           size_t inlen = sizeof (inbuf);
  361:           char *outptr = (char *) outbuf;
  362:           size_t outlen = sizeof (outbuf);
  363: 
  364:           (void) iconv (cd, &inptr, &inlen, &outptr, &outlen);
  365: 
  366:           if (outptr != (char *) outbuf)
  367:             {
  368:               /* We got some output.  Good, use it.  */
  369:               union
  370:               {
  371:                 struct charseq seq;
  372:                 struct
  373:                 {
  374:                   const char *name;
  375:                   uint32_t ucs4;
  376:                   int nbytes;
  377:                   unsigned char bytes[outlen];
  378:                 } mem;
  379:               } new;
  380: 
  381:               outlen = sizeof (outbuf) - outlen;
  382:               assert ((char *) outbuf + outlen == outptr);
  383: 
  384:               new.mem.name = out->name;
  385:               new.mem.ucs4 = out->ucs4;
  386:               new.mem.nbytes = outlen;
  387:               memcpy (new.mem.bytes, outbuf, outlen);
  388: 
  389:               add_bytes (rettbl, &new.seq, out);
  390:             }
  391: 
  392:           /* Clear any possible state left behind.  */
  393:           (void) iconv (cd, NULL, NULL, NULL, NULL);
  394:         }
  395:     }
  396: 
  397:   iconv_close (cd);
  398: 
  399:   return rettbl;
  400: }
  401: 
  402: 
  403: static struct convtable *
  404: use_both_charmaps (struct charmap_t *from_charmap,
  405:                    struct charmap_t *to_charmap)
  406: {
  407:   /* In this case we iterate over all the entries in the from_charmap,
  408:      determine the internal name, and find an appropriate entry in the
  409:      to_charmap (if it exists).  */
  410:   struct convtable *rettbl = allocate_table ();
  411:   void *ptr = NULL;
  412:   const void *key;
  413:   size_t keylen;
  414:   void *data;
  415: 
  416:   while (iterate_table (&from_charmap->char_table, &ptr, &key, &keylen, &data)
  417:          >= 0)
  418:     {
  419:       struct charseq *in = (struct charseq *) data;
  420:       struct charseq *out = charmap_find_value (to_charmap, key, keylen);
  421: 
  422:       if (out != NULL)
  423:         add_bytes (rettbl, in, out);
  424:     }
  425: 
  426:   return rettbl;
  427: }
  428: 
  429: 
  430: static int
  431: process_block (struct convtable *tbl, char *addr, size_t len, FILE *output)
  432: {
  433:   size_t n = 0;
  434: 
  435:   while (n < len)
  436:     {
  437:       struct convtable *cur = tbl;
  438:       unsigned char *curp = (unsigned char *) addr;
  439:       unsigned int byte = *curp;
  440:       int cnt;
  441:       struct charseq *out;
  442: 
  443:       while (! is_term (cur, byte))
  444:         if (cur->val[byte].sub == NULL)
  445:           {
  446:             /* This is a invalid sequence.  Skip the first byte if we are
  447:                ignoring errors.  Otherwise punt.  */
  448:             if (! omit_invalid)
  449:               {
  450:                 error (0, 0, _("illegal input sequence at position %Zd"), n);
  451:                 return -1;
  452:               }
  453: 
  454:             n -= curp - (unsigned char *) addr;
  455: 
  456:             byte = *(curp = (unsigned char *) ++addr);
  457:             if (++n >= len)
  458:               /* All converted.  */
  459:               return 0;
  460: 
  461:             cur = tbl;
  462:           }
  463:         else
  464:           {
  465:             cur = cur->val[byte].sub;
  466: 
  467:             if (++n >= len)
  468:               {
  469:                 error (0, 0, _("\
  470: incomplete character or shift sequence at end of buffer"));
  471:                 return -1;
  472:               }
  473: 
  474:             byte = *++curp;
  475:           }
  476: 
  477:       /* We found a final byte.  Write the output bytes.  */
  478:       out = cur->val[byte].out;
  479:       for (cnt = 0; cnt < out->nbytes; ++cnt)
  480:         fputc_unlocked (out->bytes[cnt], output);
  481: 
  482:       addr = (char *) curp + 1;
  483:       ++n;
  484:     }
  485: 
  486:   return 0;
  487: }
  488: 
  489: 
  490: static int
  491: process_fd (struct convtable *tbl, int fd, FILE *output)
  492: {
  493:   /* We have a problem with reading from a descriptor since we must not
  494:      provide the iconv() function an incomplete character or shift
  495:      sequence at the end of the buffer.  Since we have to deal with
  496:      arbitrary encodings we must read the whole text in a buffer and
  497:      process it in one step.  */
  498:   static char *inbuf = NULL;
  499:   static size_t maxlen = 0;
  500:   char *inptr = inbuf;
  501:   size_t actlen = 0;
  502: 
  503:   while (actlen < maxlen)
  504:     {
  505:       ssize_t n = read (fd, inptr, maxlen - actlen);
  506: 
  507:       if (n == 0)
  508:         /* No more text to read.  */
  509:         break;
  510: 
  511:       if (n == -1)
  512:         {
  513:           /* Error while reading.  */
  514:           error (0, errno, _("error while reading the input"));
  515:           return -1;
  516:         }
  517: 
  518:       inptr += n;
  519:       actlen += n;
  520:     }
  521: 
  522:   if (actlen == maxlen)
  523:     while (1)
  524:       {
  525:         ssize_t n;
  526:         char *new_inbuf;
  527: 
  528:         /* Increase the buffer.  */
  529:         new_inbuf = (char *) realloc (inbuf, maxlen + 32768);
  530:         if (new_inbuf == NULL)
  531:           {
  532:             error (0, errno, _("unable to allocate buffer for input"));
  533:             return -1;
  534:           }
  535:         inbuf = new_inbuf;
  536:         maxlen += 32768;
  537:         inptr = inbuf + actlen;
  538: 
  539:         do
  540:           {
  541:             n = read (fd, inptr, maxlen - actlen);
  542: 
  543:             if (n == 0)
  544:               /* No more text to read.  */
  545:               break;
  546: 
  547:             if (n == -1)
  548:               {
  549:                 /* Error while reading.  */
  550:                 error (0, errno, _("error while reading the input"));
  551:                 return -1;
  552:               }
  553: 
  554:             inptr += n;
  555:             actlen += n;
  556:           }
  557:         while (actlen < maxlen);
  558: 
  559:         if (n == 0)
  560:           /* Break again so we leave both loops.  */
  561:           break;
  562:       }
  563: 
  564:   /* Now we have all the input in the buffer.  Process it in one run.  */
  565:   return process_block (tbl, inbuf, actlen, output);
  566: }
  567: 
  568: 
  569: static int
  570: process_file (struct convtable *tbl, FILE *input, FILE *output)
  571: {
  572:   /* This should be safe since we use this function only for `stdin' and
  573:      we haven't read anything so far.  */
  574:   return process_fd (tbl, fileno (input), output);
  575: }
1