(linenum→info "unix/slp.c:2238")

glibc/2.7/iconv/iconv_prog.c

    1: /* Convert text in given files from the specified from-set to the to-set.
    2:    Copyright (C) 1998-2004, 2005, 2006, 2007 Free Software Foundation, Inc.
    3:    This file is part of the GNU C Library.
    4:    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.
    5: 
    6:    This program is free software; you can redistribute it and/or modify
    7:    it under the terms of the GNU General Public License as published
    8:    by the Free Software Foundation; version 2 of the License, or
    9:    (at your option) any later version.
   10: 
   11:    This program is distributed in the hope that it will be useful,
   12:    but WITHOUT ANY WARRANTY; without even the implied warranty of
   13:    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   14:    GNU General Public License for more details.
   15: 
   16:    You should have received a copy of the GNU General Public License
   17:    along with this program; if not, write to the Free Software Foundation,
   18:    Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
   19: 
   20: #include <argp.h>
   21: #include <assert.h>
   22: #include <ctype.h>
   23: #include <errno.h>
   24: #include <error.h>
   25: #include <fcntl.h>
   26: #include <iconv.h>
   27: #include <langinfo.h>
   28: #include <locale.h>
   29: #include <search.h>
   30: #include <stdbool.h>
   31: #include <stdio.h>
   32: #include <stdlib.h>
   33: #include <string.h>
   34: #include <unistd.h>
   35: #include <libintl.h>
   36: #ifdef _POSIX_MAPPED_FILES
   37: # include <sys/mman.h>
   38: #endif
   39: #include <charmap.h>
   40: #include <gconv_int.h>
   41: #include "iconv_prog.h"
   42: #include "iconvconfig.h"
   43: 
   44: /* Get libc version number.  */
   45: #include "../version.h"
   46: 
   47: #define PACKAGE _libc_intl_domainname
   48: 
   49: 
   50: /* Name and version of program.  */
   51: static void print_version (FILE *stream, struct argp_state *state);
   52: void (*argp_program_version_hook) (FILE *, struct argp_state *) = print_version;
   53: /* Option keys; both are matched by the switch in parse_opt.  */
   54: #define OPT_VERBOSE     1000
   55: #define OPT_LIST        'l'
   56: 
   57: /* Definitions of arguments for argp functions.  */
   58: static const struct argp_option options[] =
   59: {
   60:   { NULL, 0, NULL, 0, N_("Input/Output format specification:") },
   61:   { "from-code", 'f', "NAME", 0, N_("encoding of original text") },
   62:   { "to-code", 't', "NAME", 0, N_("encoding for output") },
   63:   { NULL, 0, NULL, 0, N_("Information:") },
   64:   { "list", 'l', NULL, 0, N_("list all known coded character sets") },
   65:   { NULL, 0, NULL, 0, N_("Output control:") },
   66:   { NULL, 'c', NULL, 0, N_("omit invalid characters from output") },
   67:   { "output", 'o', "FILE", 0, N_("output file") },
   68:   { "silent", 's', NULL, 0, N_("suppress warnings") },
   69:   { "verbose", OPT_VERBOSE, NULL, 0, N_("print progress information") },
   70:   { NULL, 0, NULL, 0, NULL }
   71: };
   72: 
   73: /* Short description of program.  */
   74: static const char doc[] = N_("\
   75: Convert encoding of given files from one encoding to another.");
   76: 
   77: /* Strings for arguments in help texts.  */
   78: static const char args_doc[] = N_("[FILE...]");
   79: 
   80: /* Prototype for option handler.  */
   81: static error_t parse_opt (int key, char *arg, struct argp_state *state);
   82: 
   83: /* Function to print some extra text in the help message.  */
   84: static char *more_help (int key, const char *text, void *input);
   85: 
   86: /* Data structure to communicate with argp functions.  */
   87: static struct argp argp =
   88: {
   89:   options, parse_opt, args_doc, doc, NULL, more_help
   90: };
   91: 
   92: /* Code sets to convert from and to respectively.  An empty string as the
   93:    default causes the 'iconv_open' function to look up the charset of the
   94:    currently selected locale and use it.  */
   95: static const char *from_code = "";
   96: static const char *to_code = "";
   97: 
   98: /* File to write output to.  If NULL or "-" write to stdout.  */
   99: static const char *output_file;
  100: 
  101: /* Nonzero if verbose output is wanted.  */
  102: int verbose;
  103: 
  104: /* Nonzero if list of all coded character sets is wanted.  */
  105: static int list;
  106: 
  107: /* If nonzero omit invalid characters from output.  */
  108: int omit_invalid;
  109: 
  110: /* Prototypes for the functions doing the actual work.  */
  111: static int process_block (iconv_t cd, char *addr, size_t len, FILE *output);
  112: static int process_fd (iconv_t cd, int fd, FILE *output);
  113: static int process_file (iconv_t cd, FILE *input, FILE *output);
  114: static void print_known_names (void) internal_function;
  115: /* Convert the files named on the command line (or stdin when there are  */
  116: /* none) from FROM_CODE to TO_CODE.  Returns the exit status.  */
  117: int
  118: main (int argc, char *argv[])
  119: {
  120:   int status = EXIT_SUCCESS;
  121:   int remaining;
  122:   FILE *output;
  123:   iconv_t cd;
  124:   const char *orig_to_code;
  125:   struct charmap_t *from_charmap = NULL;
  126:   struct charmap_t *to_charmap = NULL;
  127: 
  128:   /* Set locale via LC_ALL.  */
  129:   setlocale (LC_ALL, "");
  130: 
  131:   /* Set the text message domain.  */
  132:   textdomain (_libc_intl_domainname);
  133: 
  134:   /* Parse and process arguments.  */
  135:   argp_parse (&argp, argc, argv, 0, &remaining, NULL);
  136: 
  137:   /* List all coded character sets if wanted.  */
  138:   if (list)
  139:     {
  140:       print_known_names ();
  141:       exit (EXIT_SUCCESS);
  142:     }
  143: 
  144:   /* If we have to ignore errors (-c) append the IGNORE error handler to
  145:      the to-character-set name.  */
  146:   orig_to_code = to_code;
  147:   if (omit_invalid)
  148:     {
  149:       const char *errhand = strchrnul (to_code, '/');
  150:       int nslash = 2;
  151:       char *newp;
  152:       char *cp;
  153:       /* Count the '/' separators already in TO_CODE; two are needed.  */
  154:       if (*errhand == '/')
  155:         {
  156:           --nslash;
  157:           errhand = strchrnul (errhand + 1, '/');
  158:           /* (+ 1 above: start after the first slash to find a second.)  */
  159:           if (*errhand == '/')
  160:             {
  161:               --nslash;
  162:               errhand = strchr (errhand, '\0');
  163:             }
  164:         }
  165:       /* 7 + 1: room for ",IGNORE" plus the terminating NUL.  */
  166:       newp = (char *) alloca (errhand - to_code + nslash + 7 + 1);
  167:       cp = mempcpy (newp, to_code, errhand - to_code);
  168:       while (nslash-- > 0)
  169:         *cp++ = '/';
  170:       if (cp[-1] != '/')
  171:         *cp++ = ',';
  172:       memcpy (cp, "IGNORE", sizeof ("IGNORE"));
  173: 
  174:       to_code = newp;
  175:     }
  176: 
  177:   /* POSIX 1003.2b introduces a silly thing: the arguments to -t and -f
  178:      can be file names of charmaps.  In this case iconv will have to read
  179:      those charmaps and use them to do the conversion.  But there are
  180:      holes in the specification.  There is nothing said that if -f is a
  181:      charmap filename that -t must be, too.  And vice versa.  There is
  182:      also no word about the symbolic names used.  What if they don't
  183:      match?  */
  184:   if (strchr (from_code, '/') != NULL)
  185:     /* The from-name might be a charmap file name.  Try reading the
  186:        file.  */
  187:     from_charmap = charmap_read (from_code, /*0, 1*/1, 0, 0, 0);
  188: 
  189:   if (strchr (orig_to_code, '/') != NULL)
  190:     /* The to-name might be a charmap file name.  Try reading the
  191:        file.  */
  192:     to_charmap = charmap_read (orig_to_code, /*0, 1,*/1, 0, 0, 0);
  193: 
  194: 
  195:   /* Determine output file.  */
  196:   if (output_file != NULL && strcmp (output_file, "-") != 0)
  197:     {
  198:       output = fopen (output_file, "w");
  199:       if (output == NULL)
  200:         error (EXIT_FAILURE, errno, _("cannot open output file"));
  201:     }
  202:   else
  203:     output = stdout;
  204: 
  205:   /* At this point we have to handle two cases.  The first one is
  206:      where a charmap is used for the from- or to-charset, or both.  We
  207:      handle this special since it is very different from the sane way of
  208:      doing things.  The other case allows converting using the iconv()
  209:      function.  */
  210:   if (from_charmap != NULL || to_charmap != NULL)
  211:     /* Construct the conversion table and do the conversion.  */
  212:     status = charmap_conversion (from_code, from_charmap, to_code, to_charmap,
  213:                                  argc, remaining, argv, output);
  214:   else
  215:     {
  216:       /* Let's see whether we have these coded character sets.  */
  217:       cd = iconv_open (to_code, from_code);
  218:       if (cd == (iconv_t) -1)
  219:         {
  220:           if (errno == EINVAL)
  221:             {
  222:               /* Try to be nice with the user and tell her which of the
  223:                  two encoding names is wrong.  This is possible because
  224:                  all supported encodings can be converted from/to Unicode,
  225:                  in other words, because the graph of encodings is
  226:                  connected.  */
  227:               bool from_wrong =
  228:                 (iconv_open ("UTF-8", from_code) == (iconv_t) -1
  229:                  && errno == EINVAL);
  230:               bool to_wrong =
  231:                 (iconv_open (to_code, "UTF-8") == (iconv_t) -1
  232:                  && errno == EINVAL);
  233:               const char *from_pretty =
  234:                 (from_code[0] ? from_code : nl_langinfo (CODESET));
  235:               const char *to_pretty =
  236:                 (orig_to_code[0] ? orig_to_code : nl_langinfo (CODESET));
  237: 
  238:               if (from_wrong)
  239:                 {
  240:                   if (to_wrong)
  241:                     error (0, 0,
  242:                            _("\
  243: conversions from `%s' and to `%s' are not supported"),
  244:                            from_pretty, to_pretty);
  245:                   else
  246:                     error (0, 0,
  247:                            _("conversion from `%s' is not supported"),
  248:                            from_pretty);
  249:                 }
  250:               else
  251:                 {
  252:                   if (to_wrong)
  253:                     error (0, 0,
  254:                            _("conversion to `%s' is not supported"),
  255:                            to_pretty);
  256:                   else
  257:                     error (0, 0,
  258:                            _("conversion from `%s' to `%s' is not supported"),
  259:                            from_pretty, to_pretty);
  260:                 }
  261: 
  262:               argp_help (&argp, stderr, ARGP_HELP_SEE,
  263:                          program_invocation_short_name);
  264:               exit (1);
  265:             }
  266:           else
  267:             error (EXIT_FAILURE, errno,
  268:                    _("failed to start conversion processing"));
  269:         }
  270: 
  271:       /* Now process the remaining files.  Write them to stdout or the file
  272:          specified with the `-o' parameter.  If we have no file given as
  273:          the parameter process all from stdin.  */
  274:       if (remaining == argc)
  275:         {
  276:           if (process_file (cd, stdin, output) != 0)
  277:             status = EXIT_FAILURE;
  278:         }
  279:       else
  280:         do
  281:           {
  282: #ifdef _POSIX_MAPPED_FILES
  283:             struct stat st;
  284:             char *addr;
  285: #endif
  286:             int fd, ret;
  287: 
  288:             if (verbose)
  289:               fprintf (stderr, "%s:\n", argv[remaining]);
  290:             if (strcmp (argv[remaining], "-") == 0)
  291:               fd = 0;
  292:             else
  293:               {
  294:                 fd = open (argv[remaining], O_RDONLY);
  295: 
  296:                 if (fd == -1)
  297:                   {
  298:                     error (0, errno, _("cannot open input file `%s'"),
  299:                            argv[remaining]);
  300:                     status = EXIT_FAILURE;
  301:                     continue;
  302:                   }
  303:               }
  304: 
  305: #ifdef _POSIX_MAPPED_FILES
  306:             /* We have possibilities for reading the input file.  First try
  307:                to mmap() it since this will provide the fastest solution.  */
  308:             if (fstat (fd, &st) == 0
  309:                 && ((addr = mmap (NULL, st.st_size, PROT_READ, MAP_PRIVATE,
  310:                                   fd, 0)) != MAP_FAILED))
  311:               {
  312:                 /* Yes, we can use mmap().  The descriptor is not needed
  313:                    anymore.  */
  314:                 if (close (fd) != 0)
  315:                   error (EXIT_FAILURE, errno,
  316:                          _("error while closing input `%s'"),
  317:                          argv[remaining]);
  318: 
  319:                 ret = process_block (cd, addr, st.st_size, output);
  320: 
  321:                 /* We don't need the input data anymore.  */
  322:                 munmap ((void *) addr, st.st_size);
  323: 
  324:                 if (ret != 0)
  325:                   {
  326:                     status = EXIT_FAILURE;
  327: 
  328:                     if (ret < 0)
  329:                       /* We cannot go on with producing output since it might
  330:                          lead to problem because the last output might leave
  331:                          the output stream in an undefined state.  */
  332:                       break;
  333:                   }
  334:               }
  335:             else
  336: #endif  /* _POSIX_MAPPED_FILES */
  337:               {
  338:                 /* Read the file in pieces.  */
  339:                 ret = process_fd (cd, fd, output);
  340: 
  341:                 /* Now close the file.  */
  342:                 close (fd);
  343: 
  344:                 if (ret != 0)
  345:                   {
  346:                     /* Something went wrong.  */
  347:                     status = EXIT_FAILURE;
  348: 
  349:                     if (ret < 0)
  350:                       /* We cannot go on with producing output since it might
  351:                          lead to problem because the last output might leave
  352:                          the output stream in an undefined state.  */
  353:                       break;
  354:                   }
  355:               }
  356:           }
  357:         while (++remaining < argc);
  358:     }
  359: 
  360:   /* Close the output file now.  */
  361:   if (fclose (output))
  362:     error (EXIT_FAILURE, errno, _("error while closing output file"));
  363: 
  364:   return status;
  365: }
  366: 
  367: /* Handle program arguments: store each recognized option in its global.  */
  368: /* Returns 0, or ARGP_ERR_UNKNOWN for keys not handled here.  */
  369: static error_t
  370: parse_opt (int key, char *arg, struct argp_state *state)
  371: {
  372:   switch (key)
  373:     {
  374:     case 'f':
  375:       from_code = arg;
  376:       break;
  377:     case 't':
  378:       to_code = arg;
  379:       break;
  380:     case 'o':
  381:       output_file = arg;
  382:       break;
  383:     case 's':
  384:       /* Nothing, for now at least.  We are not giving out any information
  385:          about missing character or so.  */
  386:       break;
  387:     case 'c':
  388:       /* Omit invalid characters from output.  */
  389:       omit_invalid = 1;
  390:       break;
  391:     case OPT_VERBOSE:
  392:       verbose = 1;
  393:       break;
  394:     case OPT_LIST:
  395:       list = 1;
  396:       break;
  397:     default:
  398:       return ARGP_ERR_UNKNOWN;
  399:     }
  400:   return 0;
  401: }
  402: /* Return extra help text: a strdup'd bug-reporting note when KEY is  */
  403: /* ARGP_KEY_HELP_EXTRA, otherwise TEXT unchanged.  INPUT is unused.  */
  404: static char *
  405: more_help (int key, const char *text, void *input)
  406: {
  407:   switch (key)
  408:     {
  409:     case ARGP_KEY_HELP_EXTRA:
  410:       /* We print some extra information.  */
  411:       return strdup (gettext ("\
  412: For bug reporting instructions, please see:\n\
  413: <http://www.gnu.org/software/libc/bugs.html>.\n"));
  414:     default:
  415:       break;
  416:     }
  417:   return (char *) text;
  418: }
  419: 
  420: /* Print the version information.  */
  421: /* Writes a banner, copyright notice and author line to STREAM.  */
  422: static void
  423: print_version (FILE *stream, struct argp_state *state)
  424: {
  425:   fprintf (stream, "iconv (GNU %s) %s\n", PACKAGE, VERSION);
  426:   fprintf (stream, gettext ("\
  427: Copyright (C) %s Free Software Foundation, Inc.\n\
  428: This is free software; see the source for copying conditions.  There is NO\n\
  429: warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n\
  430: "), "2007");
  431:   fprintf (stream, gettext ("Written by %s.\n"), "Ulrich Drepper");
  432: }
  433: /* Convert LEN bytes at ADDR with CD and write the result to OUTPUT.  */
  434: /* Returns 0 on success, 1 if EILSEQ was ignored, -1 on error.  */
  435: static int
  436: process_block (iconv_t cd, char *addr, size_t len, FILE *output)
  437: {
  438: #define OUTBUF_SIZE     32768
  439:   const char *start = addr;
  440:   char outbuf[OUTBUF_SIZE];
  441:   char *outptr;
  442:   size_t outlen;
  443:   size_t n;
  444:   int ret = 0;
  445: 
  446:   while (len > 0)
  447:     {
  448:       outptr = outbuf;
  449:       outlen = OUTBUF_SIZE;
  450:       n = iconv (cd, &addr, &len, &outptr, &outlen);
  451:       /* With omit_invalid, EILSEQ is not fatal: note it and keep going.  */
  452:       if (n == (size_t) -1 && omit_invalid && errno == EILSEQ)
  453:         {
  454:           ret = 1;
  455:           if (len == 0)
  456:             n = 0;
  457:           else
  458:             errno = E2BIG;
  459:         }
  460: 
  461:       if (outptr != outbuf)
  462:         {
  463:           /* We have something to write out.  */
  464:           int errno_save = errno;
  465: 
  466:           if (fwrite (outbuf, 1, outptr - outbuf, output)
  467:               < (size_t) (outptr - outbuf)
  468:               || ferror (output))
  469:             {
  470:               /* Error occurred while printing the result.  */
  471:               error (0, 0, _("\
  472: conversion stopped due to problem in writing the output"));
  473:               return -1;
  474:             }
  475: 
  476:           errno = errno_save;
  477:         }
  478: 
  479:       if (n != (size_t) -1)
  480:         {
  481:           /* All the input text is processed.  For state-dependent
  482:              character sets we have to flush the state now.  */
  483:           outptr = outbuf;
  484:           outlen = OUTBUF_SIZE;
  485:           n = iconv (cd, NULL, NULL, &outptr, &outlen);
  486: 
  487:           if (outptr != outbuf)
  488:             {
  489:               /* We have something to write out.  */
  490:               int errno_save = errno;
  491: 
  492:               if (fwrite (outbuf, 1, outptr - outbuf, output)
  493:                   < (size_t) (outptr - outbuf)
  494:                   || ferror (output))
  495:                 {
  496:                   /* Error occurred while printing the result.  */
  497:                   error (0, 0, _("\
  498: conversion stopped due to problem in writing the output"));
  499:                   return -1;
  500:                 }
  501: 
  502:               errno = errno_save;
  503:             }
  504: 
  505:           if (n != (size_t) -1)
  506:             break;
  507: 
  508:           if (omit_invalid && errno == EILSEQ)
  509:             {
  510:               ret = 1;
  511:               break;
  512:             }
  513:         }
  514:       /* Anything but E2BIG ends the conversion with a diagnostic.  */
  515:       if (errno != E2BIG)
  516:         {
  517:           /* iconv() ran into a problem.  */
  518:           switch (errno)
  519:             {
  520:             case EILSEQ:
  521:               if (! omit_invalid)
  522:                 error (0, 0, _("illegal input sequence at position %ld"),
  523:                        (long int) (addr - start));
  524:               break;
  525:             case EINVAL:
  526:               error (0, 0, _("\
  527: incomplete character or shift sequence at end of buffer"));
  528:               break;
  529:             case EBADF:
  530:               error (0, 0, _("internal error (illegal descriptor)"));
  531:               break;
  532:             default:
  533:               error (0, 0, _("unknown iconv() error %d"), errno);
  534:               break;
  535:             }
  536: 
  537:           return -1;
  538:         }
  539:     }
  540: 
  541:   return ret;
  542: }
  543: 
  544: 
  545: static int
  546: process_fd (iconv_t cd, int fd, FILE *output)
  547: {
  548:   /* we have a problem with reading from a desriptor since we must not
  549:      provide the iconv() function an incomplete character or shift
  550:      sequence at the end of the buffer.  Since we have to deal with
  551:      arbitrary encodings we must read the whole text in a buffer and
  552:      process it in one step.  */
  553:   static char *inbuf = NULL;
  554:   static size_t maxlen = 0;
  555:   char *inptr = NULL;
  556:   size_t actlen = 0;
  557: 
  558:   while (actlen < maxlen)
  559:     {
  560:       ssize_t n = read (fd, inptr, maxlen - actlen);
  561: 
  562:       if (n == 0)
  563:         /* No more text to read.  */
  564:         break;
  565: 
  566:       if (n == -1)
  567:         {
  568:           /* Error while reading.  */
  569:           error (0, errno, _("error while reading the input"));
  570:           return -1;
  571:         }
  572: 
  573:       inptr += n;
  574:       actlen += n;
  575:     }
  576: 
  577:   if (actlen == maxlen)
  578:     while (1)
  579:       {
  580:         ssize_t n;
  581:         char *new_inbuf;
  582: 
  583:         /* Increase the buffer.  */
  584:         new_inbuf = (char *) realloc (inbuf, maxlen + 32768);
  585: