rpm: file/src/ascmagic.c Source File

00001 /*
00002  * Copyright (c) Ian F. Darwin 1986-1995.
00003  * Software written by Ian F. Darwin and others;
00004  * maintained 1995-present by Christos Zoulas and others.
00005  * 
00006  * Redistribution and use in source and binary forms, with or without
00007  * modification, are permitted provided that the following conditions
00008  * are met:
00009  * 1. Redistributions of source code must retain the above copyright
00010  *    notice immediately at the beginning of the file, without modification,
00011  *    this list of conditions, and the following disclaimer.
00012  * 2. Redistributions in binary form must reproduce the above copyright
00013  *    notice, this list of conditions and the following disclaimer in the
00014  *    documentation and/or other materials provided with the distribution.
00015  *  
00016  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
00017  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00018  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00019  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
00020  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
00021  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
00022  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
00023  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00024  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
00025  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
00026  * SUCH DAMAGE.
00027  */
00028 /*
00029  * ASCII magic -- file types that we know based on keywords
00030  * that can appear anywhere in the file.
00031  *
00032  * Extensively modified by Eric Fischer <enf@pobox.com> in July, 2000,
00033  * to handle character codes other than ASCII on a unified basis.
00034  *
00035  * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit
00036  * international characters, now subsumed into this file.
00037  */
00038 
00039 #include "file.h"
00040 #include "magic.h"
00041 #include <stdio.h>
00042 #include <string.h>
00043 #include <memory.h>
00044 #include <ctype.h>
00045 #include <stdlib.h>
00046 #ifdef HAVE_UNISTD_H
00047 #include <unistd.h>
00048 #endif
00049 #include "names.h"
00050 
00051 #ifndef lint
00052 FILE_RCSID("@(#)$Id: ascmagic.c,v 1.43 2005/06/25 15:52:14 christos Exp $")
00053 #endif  /* lint */
00054 
/*
 * One decoded character.  Each looks_*() test below stores exactly one
 * unichar per character it accepts, whatever the encoding of the input.
 */
00055 typedef unsigned long unichar;
00056 
/*
 * file_ascmagic() reports "very long lines" once more than MAXLINELEN
 * characters have gone by since the last line terminator.
 */
00057 #define MAXLINELEN 300  /* longest sane line length */
/*
 * Whitespace test applied to raw bytes and to unichars alike: space, tab,
 * CR, LF, form feed and 0x85 (NEL).  The argument is expanded six times,
 * so it must not have side effects.
 */
00058 #define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \
00059                   || (x) == 0x85 || (x) == '\f')
00060 
/*
 * Forward declarations of the file-local helpers defined after
 * file_ascmagic().
 *
 * Each looks_*() test examines buf[0 .. nbytes-1], writes the characters
 * it accepts into ubuf (one unichar each), keeps the running count in
 * *ulen, and returns non-zero only when the whole buffer fits the
 * encoding.  looks_unicode() returns 1 for little-endian and 2 for
 * big-endian input.  from_ebcdic() translates nbytes bytes from buf into
 * out, and ascmatch() compares a NUL-terminated token with a decoded word.
 *
 * The trailing annotation comments look like splint/LCLint markup and
 * have no effect on compilation.
 */
00061 private int looks_ascii(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
00062         /*@modifies *ubuf, *ulen @*/;
00063 private int looks_utf8(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
00064         /*@modifies *ubuf, *ulen @*/;
00065 private int looks_unicode(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
00066         /*@modifies *ubuf, *ulen @*/;
00067 private int looks_latin1(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
00068         /*@modifies *ubuf, *ulen @*/;
00069 private int looks_extended(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
00070         /*@modifies *ubuf, *ulen @*/;
00071 private void from_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out)
00072         /*@modifies *out @*/;
00073 private int ascmatch(const unsigned char *s, const unichar *us, size_t ulen)
00074         /*@*/;
00075 
00076 
00077 protected int
00078 file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes)
00079 {
00080         size_t i;
00081         unsigned char *nbuf = NULL;
00082         unichar *ubuf = NULL;   
00083         size_t ulen;
00084         struct names *p;
00085         int rv = -1;
00086 
00087         const char *code = NULL;
00088         const char *code_mime = NULL;
00089         const char *type = NULL;
00090         const char *subtype = NULL;
00091         const char *subtype_mime = NULL;
00092 
00093         int has_escapes = 0;
00094         int has_backspace = 0;
00095         int seen_cr = 0;
00096 
00097         int n_crlf = 0;
00098         int n_lf = 0;
00099         int n_cr = 0;
00100         int n_nel = 0;
00101 
00102         int last_line_end = -1;
00103         int has_long_lines = 0;
00104 
00105         /*
00106          * Undo the NUL-termination kindly provided by process()
00107          * but leave at least one byte to look at
00108          */
00109         while (nbytes > 1 && buf[nbytes - 1] == '\0')
00110                 nbytes--;
00111 
00112         if ((nbuf = malloc((nbytes + 1) * sizeof(nbuf[0]))) == NULL)
00113                 goto done;
00114         if ((ubuf = malloc((nbytes + 1) * sizeof(ubuf[0]))) == NULL)
00115                 goto done;
00116 
00117         /*
00118          * Then try to determine whether it's any character code we can
00119          * identify.  Each of these tests, if it succeeds, will leave
00120          * the text converted into one-unichar-per-character Unicode in
00121          * ubuf, and the number of characters converted in ulen.
00122          */
00123         if (looks_ascii(buf, nbytes, ubuf, &ulen)) {
00124                 code = "ASCII";
00125                 code_mime = "us-ascii";
00126                 type = "text";
00127         } else if (looks_utf8(buf, nbytes, ubuf, &ulen)) {
00128                 code = "UTF-8 Unicode";
00129                 code_mime = "utf-8";
00130                 type = "text";
00131         } else if ((i = looks_unicode(buf, nbytes, ubuf, &ulen)) != 0) {
00132                 if (i == 1)
00133                         code = "Little-endian UTF-16 Unicode";
00134                 else
00135                         code = "Big-endian UTF-16 Unicode";
00136 
00137                 type = "character data";
00138                 code_mime = "utf-16";    /* is this defined? */
00139         } else if (looks_latin1(buf, nbytes, ubuf, &ulen)) {
00140                 code = "ISO-8859";
00141                 type = "text";
00142                 code_mime = "iso-8859-1"; 
00143         } else if (looks_extended(buf, nbytes, ubuf, &ulen)) {
00144                 code = "Non-ISO extended-ASCII";
00145                 type = "text";
00146                 code_mime = "unknown";
00147         } else {
00148                 from_ebcdic(buf, nbytes, nbuf);
00149 
00150                 if (looks_ascii(nbuf, nbytes, ubuf, &ulen)) {
00151                         code = "EBCDIC";
00152                         type = "character data";
00153                         code_mime = "ebcdic";
00154                 } else if (looks_latin1(nbuf, nbytes, ubuf, &ulen)) {
00155                         code = "International EBCDIC";
00156                         type = "character data";
00157                         code_mime = "ebcdic";
00158                 } else {
00159                         rv = 0;
00160                         goto done;  /* doesn't look like text at all */
00161                 }
00162         }
00163 
00164         /*
00165          * for troff, look for . + letter + letter or .\";
00166          * this must be done to disambiguate tar archives' ./file
00167          * and other trash from real troff input.
00168          *
00169          * I believe Plan 9 troff allows non-ASCII characters in the names
00170          * of macros, so this test might possibly fail on such a file.
00171          */
00172         if (*ubuf == '.') {
00173                 unichar *tp = ubuf + 1;
00174 
00175                 while (ISSPC(*tp))
00176                         ++tp;   /* skip leading whitespace */
00177                 if ((tp[0] == '\\' && tp[1] == '\"') ||
00178                     (isascii((unsigned char)tp[0]) &&
00179                      isalnum((unsigned char)tp[0]) &&
00180                      isascii((unsigned char)tp[1]) &&
00181                      isalnum((unsigned char)tp[1]) &&
00182                      ISSPC(tp[2]))) {
00183                         subtype_mime = "text/troff";
00184                         subtype = "troff or preprocessor input";
00185                         goto subtype_identified;
00186                 }
00187         }
00188 
00189         if ((*buf == 'c' || *buf == 'C') && ISSPC(buf[1])) {
00190                 subtype_mime = "text/fortran";
00191                 subtype = "fortran program";
00192                 goto subtype_identified;
00193         }
00194 
00195         /* look for tokens from names.h - this is expensive! */
00196 
00197         i = 0;
00198         while (i < ulen) {
00199                 size_t end;
00200 
00201                 /*
00202                  * skip past any leading space
00203                  */
00204                 while (i < ulen && ISSPC(ubuf[i]))
00205                         i++;
00206                 if (i >= ulen)
00207                         break;
00208 
00209                 /*
00210                  * find the next whitespace
00211                  */
00212                 for (end = i + 1; end < nbytes; end++)
00213                         if (ISSPC(ubuf[end]))
00214                                 /*@innerbreak@*/ break;
00215 
00216                 /*
00217                  * compare the word thus isolated against the token list
00218                  */
00219                 for (p = names; p < names + NNAMES; p++) {
00220                         if (ascmatch((const unsigned char *)p->name, ubuf + i,
00221                             end - i)) {
00222                                 subtype = types[p->type].human;
00223                                 subtype_mime = types[p->type].mime;
00224                                 goto subtype_identified;
00225                         }
00226                 }
00227 
00228                 i = end;
00229         }
00230 
00231 subtype_identified:
00232 
00233         /*
00234          * Now try to discover other details about the file.
00235          */
00236         for (i = 0; i < ulen; i++) {
00237                 if (ubuf[i] == '\n') {
00238                         if (seen_cr)
00239                                 n_crlf++;
00240                         else
00241                                 n_lf++;
00242                         last_line_end = i;
00243                 } else if (seen_cr)
00244                         n_cr++;
00245 
00246                 seen_cr = (ubuf[i] == '\r');
00247                 if (seen_cr)
00248                         last_line_end = i;
00249 
00250                 if (ubuf[i] == 0x85) { /* X3.64/ECMA-43 "next line" character */
00251                         n_nel++;
00252                         last_line_end = i;
00253                 }
00254 
00255                 /* If this line is _longer_ than MAXLINELEN, remember it. */
00256                 if (i > last_line_end + MAXLINELEN)
00257                         has_long_lines = 1;
00258 
00259                 if (ubuf[i] == '\033')
00260                         has_escapes = 1;
00261                 if (ubuf[i] == '\b')
00262                         has_backspace = 1;
00263         }
00264 
00265         /* Beware, if the data has been truncated, the final CR could have
00266            been followed by a LF.  If we have HOWMANY bytes, it indicates
00267            that the data might have been truncated, probably even before
00268            this function was called. */
00269         if (seen_cr && nbytes < HOWMANY)
00270                 n_cr++;
00271 
00272         if ((ms->flags & MAGIC_MIME)) {
00273                 if (subtype_mime) {
00274                         if (file_printf(ms, subtype_mime) == -1)
00275                                 goto done;
00276                 } else {
00277                         if (file_printf(ms, "text/plain") == -1)
00278                                 goto done;
00279                 }
00280 
00281                 if (code_mime) {
00282                         if (file_printf(ms, "; charset=") == -1)
00283                                 goto done;
00284                         if (file_printf(ms, code_mime) == -1)
00285                                 goto done;
00286                 }
00287         } else {
00288                 if (file_printf(ms, code) == -1)
00289                         goto done;
00290 
00291                 if (subtype) {
00292                         if (file_printf(ms, " ") == -1)
00293                                 goto done;
00294                         if (file_printf(ms, subtype) == -1)
00295                                 goto done;
00296                 }
00297 
00298                 if (file_printf(ms, " ") == -1)
00299                         goto done;
00300                 if (file_printf(ms, type) == -1)
00301                         goto done;
00302 
00303                 if (has_long_lines)
00304                         if (file_printf(ms, ", with very long lines") == -1)
00305                                 goto done;
00306 
00307                 /*
00308                  * Only report line terminators if we find one other than LF,
00309                  * or if we find none at all.
00310                  */
00311                 if ((n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) ||
00312                     (n_crlf != 0 || n_cr != 0 || n_nel != 0)) {
00313                         if (file_printf(ms, ", with") == -1)
00314                                 goto done;
00315 
00316                         if (n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0)                        {
00317                                 if (file_printf(ms, " no") == -1)
00318                                         goto done;
00319                         } else {
00320                                 if (n_crlf) {
00321                                         if (file_printf(ms, " CRLF") == -1)
00322                                                 goto done;
00323                                         if (n_cr || n_lf || n_nel)
00324                                                 if (file_printf(ms, ",") == -1)
00325                                                         goto done;
00326                                 }
00327                                 if (n_cr) {
00328                                         if (file_printf(ms, " CR") == -1)
00329                                                 goto done;
00330                                         if (n_lf || n_nel)
00331                                                 if (file_printf(ms, ",") == -1)
00332                                                         goto done;
00333                                 }
00334                                 if (n_lf) {
00335                                         if (file_printf(ms, " LF") == -1)
00336                                                 goto done;
00337                                         if (n_nel)
00338                                                 if (file_printf(ms, ",") == -1)
00339                                                         goto done;
00340                                 }
00341                                 if (n_nel)
00342                                         if (file_printf(ms, " NEL") == -1)
00343                                                 goto done;
00344                         }
00345 
00346                         if (file_printf(ms, " line terminators") == -1)
00347                                 goto done;
00348                 }
00349 
00350                 if (has_escapes)
00351                         if (file_printf(ms, ", with escape sequences") == -1)
00352                                 goto done;
00353                 if (has_backspace)
00354                         if (file_printf(ms, ", with overstriking") == -1)
00355                                 goto done;
00356         }
00357         rv = 1;
00358 done:
00359         if (nbuf)
00360                 free(nbuf);
00361         if (ubuf)
00362                 free(ubuf);
00363 
00364         return rv;
00365 }
00366 
/*
 * ascmatch -- compare a NUL-terminated token with a decoded word.
 *
 * s     NUL-terminated token to look for
 * us    word to test, one unichar per character; not terminated
 * ulen  number of characters in us
 *
 * Returns 1 when s is exactly ulen characters long and every character
 * equals the corresponding element of us, otherwise 0.
 */
private int
ascmatch(const unsigned char *s, const unichar *us, size_t ulen)
{
        size_t i;

        for (i = 0; i < ulen; i++) {
                /*
                 * Reaching the end of s means the token is shorter than
                 * the word.  This is tested explicitly: a zero element
                 * in us would otherwise compare equal to the terminator
                 * and the loop would read past the end of s.
                 */
                if (s[i] == '\0' || s[i] != us[i])
                        return 0;
        }

        /* The word is used up; it matches only if the token is too. */
        return s[i] == '\0';
}
00382 
00383 /*
00384  * This table reflects a particular philosophy about what constitutes
00385  * "text," and there is room for disagreement about it.
00386  *
00387  * Version 3.31 of the file command considered a file to be ASCII if
00388  * each of its characters was approved by either the isascii() or
00389  * isalpha() function.  On most systems, this would mean that any
00390  * file consisting only of characters in the range 0x00 ... 0x7F
00391  * would be called ASCII text, but many systems might reasonably
00392  * consider some characters outside this range to be alphabetic,
00393  * so the file command would call such characters ASCII.  It might
00394  * have been more accurate to call this "considered textual on the
00395  * local system" than "ASCII."
00396  *
00397  * It considered a file to be "International language text" if each
00398  * of its characters was either an ASCII printing character (according
00399  * to the real ASCII standard, not the above test), a character in
00400  * the range 0x80 ... 0xFF, or one of the following control characters:
00401  * backspace, tab, line feed, vertical tab, form feed, carriage return,
00402  * escape.  No attempt was made to determine the language in which files
00403  * of this type were written.
00404  *
00405  *
00406  * The table below considers a file to be ASCII if all of its characters
00407  * are either ASCII printing characters (again, according to the X3.4
00408  * standard, not isascii()) or any of the following controls: bell,
00409  * backspace, tab, line feed, form feed, carriage return, esc, nextline.
00410  *
00411  * I include bell because some programs (particularly shell scripts)
00412  * use it literally, even though it is rare in normal text.  I exclude
00413  * vertical tab because it never seems to be used in real text.  I also
00414  * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85),
00415  * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline
00416  * character to.  It might be more appropriate to include it in the 8859
00417  * set instead of the ASCII set, but it's got to be included in *something*
00418  * we recognize or EBCDIC files aren't going to be considered textual.
00419  * Some old Unix source files use SO/SI (^N/^O) to shift between Greek
00420  * and Latin characters, so these should possibly be allowed.  But they
00421  * make a real mess on VT100-style displays if they're not paired properly,
00422  * so we are probably better off not calling them text.
00423  *
00424  * A file is considered to be ISO-8859 text if its characters are all
00425  * either ASCII, according to the above definition, or printing characters
00426  * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF.
00427  *
00428  * Finally, a file is considered to be international text from some other
00429  * character code if its characters are all either ISO-8859 (according to
00430  * the above definition) or characters in the range 0x80 ... 0x9F, which
00431  * ISO-8859 considers to be control characters but the IBM PC and Macintosh
00432  * consider to be printing characters.
00433  */
00434 
#define F 0   /* character never appears in text */
#define T 1   /* character appears in plain ASCII text */
#define I 2   /* character appears in ISO-8859 text */
#define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */

/*
 * Class of every possible byte value, indexed by the byte itself.  The
 * looks_*() tests accept a buffer when all of its bytes fall into the
 * classes that test allows.  The table is only ever read, hence const.
 */
/*@unchecked@*/ /*@observer@*/
private const char text_chars[256] = {
        /*                  BEL BS HT LF    FF CR    */
        F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
        /*                              ESC          */
        F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
        T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
        T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
        T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
        T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
        T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
        T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
        /*            NEL                            */
        X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
        X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
        I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
        I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
        I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
        I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
        I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
        I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
};
00462 
00463 private int
00464 looks_ascii(const unsigned char *buf, size_t nbytes, unichar *ubuf,
00465     size_t *ulen)
00466 {
00467         int i;
00468 
00469         *ulen = 0;
00470 
00471         for (i = 0; i < nbytes; i++) {
00472                 int t = text_chars[buf[i]];
00473 
00474                 if (t != T)
00475                         return 0;
00476 
00477                 ubuf[(*ulen)++] = buf[i];
00478         }
00479 
00480         return 1;
00481 }
00482 
00483 private int
00484 looks_latin1(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
00485 {
00486         int i;
00487 
00488         *ulen = 0;
00489 
00490         for (i = 0; i < nbytes; i++) {
00491                 int t = text_chars[buf[i]];
00492 
00493                 if (t != T && t != I)
00494                         return 0;
00495 
00496                 ubuf[(*ulen)++] = buf[i];
00497         }
00498 
00499         return 1;
00500 }
00501 
00502 private int
00503 looks_extended(const unsigned char *buf, size_t nbytes, unichar *ubuf,
00504     size_t *ulen)
00505 {
00506         int i;
00507 
00508         *ulen = 0;
00509 
00510         for (i = 0; i < nbytes; i++) {
00511                 int t = text_chars[buf[i]];
00512 
00513                 if (t != T && t != I && t != X)
00514                         return 0;
00515 
00516                 ubuf[(*ulen)++] = buf[i];
00517         }
00518 
00519         return 1;
00520 }
00521 
00522 private int
00523 looks_utf8(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
00524 {
00525         int i, n;
00526         unichar c;
00527         int gotone = 0;
00528 
00529         *ulen = 0;
00530 
00531         for (i = 0; i < nbytes; i++) {
00532                 if ((buf[i] & 0x80) == 0) {        /* 0xxxxxxx is plain ASCII */
00533                         /*
00534                          * Even if the whole file is valid UTF-8 sequences,
00535                          * still reject it if it uses weird control characters.
00536                          */
00537 
00538                         if (text_chars[buf[i]] != T)
00539                                 return 0;
00540 
00541                         ubuf[(*ulen)++] = buf[i];
00542                 } else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
00543                         return 0;
00544                 } else {                           /* 11xxxxxx begins UTF-8 */
00545                         int following;
00546 
00547                         if ((buf[i] & 0x20) == 0) {             /* 110xxxxx */
00548                                 c = buf[i] & 0x1f;
00549                                 following = 1;
00550                         } else if ((buf[i] & 0x10) == 0) {      /* 1110xxxx */
00551                                 c = buf[i] & 0x0f;
00552                                 following = 2;
00553                         } else if ((buf[i] & 0x08) == 0) {      /* 11110xxx */
00554                                 c = buf[i] & 0x07;
00555                                 following = 3;
00556                         } else if ((buf[i] & 0x04) == 0) {      /* 111110xx */
00557                                 c = buf[i] & 0x03;
00558                                 following = 4;
00559                         } else if ((buf[i] & 0x02) == 0) {      /* 1111110x */
00560                                 c = buf[i] & 0x01;
00561                                 following = 5;
00562                         } else
00563                                 return 0;
00564 
00565                         for (n = 0; n < following; n++) {
00566                                 i++;
00567                                 if (i >= nbytes)
00568                                         goto done;
00569 
00570                                 if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40))
00571                                         return 0;
00572 
00573                                 c = (c << 6) + (buf[i] & 0x3f);
00574                         }
00575 
00576                         ubuf[(*ulen)++] = c;
00577                         gotone = 1;
00578                 }
00579         }
00580 done:
00581         return gotone;   /* don't claim it's UTF-8 if it's all 7-bit */
00582 }
00583 
00584 private int
00585 looks_unicode(const unsigned char *buf, size_t nbytes, unichar *ubuf,
00586     size_t *ulen)
00587 {
00588         int bigend;
00589         int i;
00590 
00591         if (nbytes < 2)
00592                 return 0;
00593 
00594         if (buf[0] == 0xff && buf[1] == 0xfe)
00595                 bigend = 0;
00596         else if (buf[0] == 0xfe && buf[1] == 0xff)
00597                 bigend = 1;
00598         else
00599                 return 0;
00600 
00601         *ulen = 0;
00602 
00603         for (i = 2; i + 1 < nbytes; i += 2) {
00604                 /* XXX fix to properly handle chars > 65536 */
00605 
00606                 if (bigend)
00607                         ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i];
00608                 else
00609                         ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1];
00610 
00611                 if (ubuf[*ulen - 1] == 0xfffe)
00612                         return 0;
00613                 if (ubuf[*ulen - 1] < 128 &&
00614                     text_chars[(size_t)ubuf[*ulen - 1]] != T)
00615                         return 0;
00616         }
00617 
00618         return 1 + bigend;
00619 }
00620 
/*
 * The one-letter class codes are needed only by the text_chars table and
 * the looks_*() tests above; undefine them so that such short names do
 * not affect anything that follows.
 */
00621 #undef F
00622 #undef T
00623 #undef I
00624 #undef X
00625 
00626 /*
00627  * This table maps each EBCDIC character to an (8-bit extended) ASCII
00628  * character, as specified in the rationale for the dd(1) command in
00629  * draft 11.2 (September, 1991) of the POSIX P1003.2 standard.
00630  *
00631  * Unfortunately it does not seem to correspond exactly to any of the
00632  * five variants of EBCDIC documented in IBM's _Enterprise Systems
00633  * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh
00634  * Edition, July, 1999, pp. I-1 - I-4.
00635  *
00636  * Fortunately, though, all versions of EBCDIC, including this one, agree
00637  * on most of the printing characters that also appear in (7-bit) ASCII.
00638  * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all.
00639  *
00640  * Fortunately too, there is general agreement that codes 0x00 through
00641  * 0x3F represent control characters, 0x41 a nonbreaking space, and the
00642  * remainder printing characters.
00643  *
00644  * This is sufficient to allow us to identify EBCDIC text and to distinguish
00645  * between old-style and internationalized examples of text.
00646  */
00647 
/*
 * EBCDIC-to-ASCII translation table, indexed by the EBCDIC byte value.
 * It is only ever read, hence const.  The explicit size makes the
 * compiler reject a table with more than 256 entries, and any index
 * taken from an unsigned char is always in range.
 */
/*@unchecked@*/ /*@observer@*/
private const unsigned char ebcdic_to_ascii[256] = {
  0,   1,   2,   3, 156,   9, 134, 127, 151, 141, 142,  11,  12,  13,  14,  15,  /* 0x0X */
 16,  17,  18,  19, 157, 133,   8, 135,  24,  25, 146, 143,  28,  29,  30,  31,  /* 0x1X */
128, 129, 130, 131, 132,  10,  23,  27, 136, 137, 138, 139, 140,   5,   6,   7,  /* 0x2X */
144, 145,  22, 147, 148, 149, 150,   4, 152, 153, 154, 155,  20,  21, 158,  26,  /* 0x3X */
' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|',  /* 0x4X */
'&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~',  /* 0x5X */
'-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',  /* 0x6X */
186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"',  /* 0x7X */
195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201,  /* 0x8X */
202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208,  /* 0x9X */
209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215,  /* 0xaX */
216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231,  /* 0xbX */
'{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237,  /* 0xcX */
'}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243,  /* 0xdX */
'\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249,  /* 0xeX */
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255   /* 0xfX */
};
00667 
00668 #ifdef notdef
00669 /*
00670  * The following EBCDIC-to-ASCII table may relate more closely to reality,
00671  * or at least to modern reality.  It comes from
00672  *
00673  *   http://ftp.s390.ibm.com/products/oe/bpxqp9.html
00674  *
00675  * and maps the characters of EBCDIC code page 1047 (the code used for
00676  * Unix-derived software on IBM's 390 systems) to the corresponding
00677  * characters from ISO 8859-1.
00678  *
00679  * If this table is used instead of the above one, some of the special
00680  * cases for the NEL character can be taken out of the code.
00681  */
/*
 * NOTE(review): this table is compiled only when notdef is defined, and
 * none of the code visible in this file refers to it; from_ebcdic()
 * uses ebcdic_to_ascii.  It is indexed by the EBCDIC byte value, sixteen
 * entries per row.
 */
00682 
00683 /*@unchecked@*/ /*@observer@*/
00684 private unsigned char ebcdic_1047_to_8859[] = {
00685 0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F,
00686 0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F,
00687 0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07,
00688 0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A,
00689 0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C,
00690 0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E,
00691 0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F,
00692 0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22,
00693 0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1,
00694 0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4,
00695 0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE,
00696 0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7,
00697 0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5,
00698 0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF,
00699 0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5,
00700 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F
00701 };
00702 #endif
00703 
00704 /*
00705  * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII.
00706  */
00707 private void
00708 from_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out)
00709 {
00710         int i;
00711 
00712         for (i = 0; i < nbytes; i++) {
00713                 out[i] = ebcdic_to_ascii[buf[i]];
00714         }
00715 }