Logo Search packages:      
Sourcecode: halibut version File versions

iso2022s.c

/*
 * iso2022s.c - support for ISO-2022 subset encodings.
 * 
 * (The `s' suffix on the filename is there to leave `iso2022.c'
 * free for the unlikely event that I ever attempt to implement
 * _full_ ISO-2022 in this library!)
 */

#ifndef ENUM_CHARSETS

#include <stdio.h>
#include <string.h>
#include <assert.h>

#include "charset.h"
#include "internal.h"
#include "sbcsdat.h"

/*
 * Control characters which can begin a shift or escape sequence; the
 * reader treats any of these as a possible sequence introducer.
 */
#define SO (0x0E)
#define SI (0x0F)
#define ESC (0x1B)

/* Functional description of a single ISO 2022 escape sequence. */
struct iso2022_escape {
    /* The bytes of the sequence, as a NUL-terminated string. */
    char const *sequence;
    /*
     * Effect of the sequence on the s1 state word: when the
     * sequence is recognised on input (or emitted on output), s1
     * is ANDed with `andbits' and then XORed with `xorbits'.
     */
    unsigned long andbits, xorbits;
    /*
     * For output, these variables help us figure out which escape
     * sequences we need to get where we want to be.
     * 
     * `container' should be in the range 0-3, but can also be ORed
     * with the bit flag RO to indicate that this is not a
     * preferred container to use for this charset during output.
     *
     * The tables in this file put -1 in both fields for sequences
     * which are only recognised on input; the writer looks entries
     * up by sub-charset index, so such an entry is never selected.
     */
    int container, subcharset;
};
/* Flag ORed into `container': recognised on input, not chosen on output. */
#define RO 0x80

/*
 * Description of one ISO 2022 subset encoding. The generic reader and
 * writer in this file are driven entirely by one of these structures.
 */
struct iso2022 {
    /*
     * List of escape sequences supported in this subset. Must be
     * in ASCII order, so that we can narrow down the list as
     * necessary.
     */
    const struct iso2022_escape *escapes;/* must be sorted in ASCII order! */
    int nescapes;

    /*
     * We assign indices from 0 upwards to the sub-charsets of a
     * given ISO 2022 subset. nbytes[i] tells us how many bytes per
     * character are required by sub-charset i. (It's a string
     * mainly because that makes it easier to declare in C syntax
     * than an int array.)
     */
    char const *nbytes;

    /*
     * The characters in this string are indices-plus-one (so that
     * NUL can still terminate) of escape sequences in `escapes'.
     * These escapes are output in the given sequence to reset the
     * encoding state, unless it turns out that a given escape
     * would not change the state at all.
     */
    char const *reset;

    /*
     * Initial value of s1, in case the default container contents
     * needs to be something other than charset 0 in all cases.
     * (Note that this must have the top bit set!)
     */
    unsigned long s1;

    /*
     * For output, some ISO 2022 subsets _mandate_ an initial shift
     * sequence. If so, here it is so we can output it. (For the
     * sake of basic sanity we won't bother to _require_ it on
     * input, although it should of course be listed under
     * `escapes' above so that we ignore it when present.)
     *
     * May be NULL, in which case nothing is output.
     */
    char const *initial_sequence;

    /*
     * Is this an 8-bit ISO 2022 subset?
     */
    int eightbit;

    /*
     * Function calls to do the actual translation.
     *
     * to_ucs takes a sub-charset index and the accumulated bytes
     * of one character; its return value is emitted as-is by the
     * reader.
     *
     * from_ucs returns nonzero and fills in *subcharset and *bytes
     * if the character can be encoded, or zero if it cannot.
     */
    long int (*to_ucs)(int subcharset, unsigned long bytes);
    int (*from_ucs)(long int ucs, int *subcharset, unsigned long *bytes);
};

/*
 * Feed one input byte to the ISO 2022 subset decoder.
 *
 *  - `charset->data' must point at the struct iso2022 describing the
 *    subset in use.
 *  - `input_chr' is the next byte of encoded input.
 *  - `state' carries the decoder state between calls (layout
 *    described below).
 *  - `emit' is called, with `emitctx' as its first argument, once for
 *    every output value produced: decoded characters, control
 *    characters passed through verbatim, or ERROR for malformed
 *    input. A single call may emit zero, one or several values.
 */
static void read_iso2022s(charset_spec const *charset, long int input_chr,
                          charset_state *state,
                          void (*emit)(void *ctx, long int output),
                          void *emitctx)
{
    struct iso2022 const *iso = (struct iso2022 const *)charset->data;

    /*
     * For reading ISO-2022 subsets, we divide up our state
     * variables as follows:
     * 
     *      - The top byte of s0 (bits 31:24) indicates, if nonzero,
     *        that we are part-way through a recognised ISO-2022 escape
     *        sequence. Five of those bits (31:27) give the index of
     *        the first member of the escapes list matching what we
     *        have so far; the remaining three (26:24) give the number
     *        of characters we have seen so far.
     * 
     *      - The top bit of s1 (bit 31) is non-zero at all times, to
     *        indicate that we have performed any necessary
     *        initialisation. When we start, we detect a zero s1 and
     *        respond to it by initialising the default container
     *        contents.
     * 
     *      - The next three bits of s1 (bits 30:28) indicate which
     *        _container_ is currently selected. This isn't quite as
     *        simple as it sounds, since we have to preserve memory of
     *        which of the SI/SO containers we came from when we're
     *        temporarily in SS2/SS3. Hence, what happens is:
     *         + bit 28 indicates SI/SO.
     *         + if we're in an SS2/SS3 container, that's indicated by
     *           the two bits above that being nonzero and holding
     *           either 2 or 3.
     *         + Hence: 0 is SI, 1 is SO, 4 is SS2-from-SI, 5 is
     *           SS2-from-SO, 6 is SS3-from-SI, 7 is SS3-from-SO.
     *         + For added fun: in an _8-bit_ ISO 2022 subset, we have
     *           the further special value 2, which means that we're
     *           theoretically in SI but the current character being
     *           accumulated is composed of 8-bit characters and will
     *           therefore be interpreted as if in SO.
     * 
     *      - The next nibble of s1 (27:24) indicates how many bytes
     *        have been accumulated in the current character.
     * 
     *      - The remaining three bytes of s1 are divided into four
     *        six-bit sections, and each section gives the current
     *        sub-charset selected in one of the possible containers.
     *        (Those containers are SI, SO, SS2 and SS3, respectively
     *        and in order from the bottom of s1 to the top.)
     * 
     *      - The bottom 24 bits of s0 give the accumulated character
     *        data so far.
     * 
     * (Note that this means s1 contains all the parts of the state
     * which might need to be operated on by escape sequences.
     * Cunning, eh?)
     */

    if (!(state->s1 & 0x80000000)) {
        state->s1 = iso->s1;
    }

    /*
     * So. Firstly, we process escape sequences, if we're in the
     * middle of one or if we see a possible introducer (SI, SO,
     * ESC).
     */
    if ((state->s0 >> 24) ||
        (input_chr == SO || input_chr == SI || input_chr == ESC)) {
        int n = (state->s0 >> 24) & 7, i = (state->s0 >> 27), oi = i, j;

        /*
         * If this is the start of an escape sequence, we might be
         * in mid-character. If so, clear the character state and
         * emit an error token for the incomplete character.
         */
        if (state->s1 & 0x0F000000) {
            state->s1 &= ~0x0F000000;
            state->s0 &= 0xFF000000;
            /*
             * If we were in the SS2 or SS3 container, we
             * automatically exit it.
             */
            if (state->s1 & 0x60000000)
                state->s1 &= 0x9FFFFFFF;
            emit(emitctx, ERROR);
        }

        /*
         * Advance i through the (sorted) entries sharing the n-byte
         * prefix seen so far, stopping at the first one whose next
         * byte is not less than the byte just received.
         *
         * The sequences are NUL-terminated strings, so strncmp is
         * used: it never looks beyond the end of a shorter entry.
         */
        j = i;
        while (j < iso->nescapes &&
               !strncmp(iso->escapes[j].sequence,
                        iso->escapes[oi].sequence, n)) {
            if (iso->escapes[j].sequence[n] < input_chr)
                i = ++j;
            else
                break;
        }
        if (i >= iso->nescapes ||
            strncmp(iso->escapes[i].sequence,
                    iso->escapes[oi].sequence, n) ||
            iso->escapes[i].sequence[n] != input_chr) {
            /*
             * This character does not appear in any valid escape
             * sequence. Therefore, we must emit all the characters
             * we had previously swallowed, plus this one, and
             * return to non-escape-sequence state.
             */
            for (j = 0; j < n; j++)
                emit(emitctx, iso->escapes[oi].sequence[j]);
            emit(emitctx, input_chr);
            state->s0 = 0;
            return;
        }

        /*
         * Otherwise, we have found an additional character in our
         * escape sequence. See if we have reached the _end_ of our
         * sequence (and therefore must process the sequence).
         */
        n++;
        if (!iso->escapes[i].sequence[n]) {
            state->s0 = 0;
            state->s1 &= iso->escapes[i].andbits;
            state->s1 ^= iso->escapes[i].xorbits;
            return;
        }

        /*
         * Failing _that_, we simply update our escape-sequence-
         * tracking state.
         *
         * The shifts are done in unsigned long: shifting an int
         * index of 16 or more left by 27 would overflow a 32-bit
         * signed int.
         */
        assert(i < 32 && n < 8);
        state->s0 = ((unsigned long)i << 27) | ((unsigned long)n << 24);
        return;
    }

    /*
     * If this isn't an escape sequence, it must be part of a
     * character. One possibility is that it's a control character
     * (00-20 or 7F-9F; also in non-8-bit ISO 2022 subsets I'm
     * going to treat all top-half characters as controls), in
     * which case we output it verbatim.
     */
    if (input_chr < 0x21 ||
        (input_chr > 0x7E && (!iso->eightbit || input_chr < 0xA0))) {
        /*
         * We might be in mid-multibyte-character. If so, clear the
         * character state and emit an error token for the
         * incomplete character.
         */
        if (state->s1 & 0x0F000000) {
            state->s1 &= ~0x0F000000;
            state->s0 &= 0xFF000000;
            emit(emitctx, ERROR);
            /*
             * If we were in the SS2 or SS3 container, we
             * automatically exit it.
             */
            if (state->s1 & 0x60000000)
                state->s1 &= 0x9FFFFFFF;
        }

        emit(emitctx, input_chr);
        return;
    }

    /*
     * Otherwise, accumulate character data.
     */
    {
        unsigned long chr;
        int chrlen, cont, subcharset, bytes;

        /*
         * Verify that we've seen the right kind of character for
         * what we're currently doing. This only matters in 8-bit
         * subsets.
         */
        if (iso->eightbit) {
            cont = (state->s1 >> 28) & 7;
            /*
             * If cont==0, we're entitled to see either GL or GR
             * characters. If cont==2, we expect only GR; otherwise
             * we expect only GL.
             * 
             * If we see a GR character while cont==0, we set
             * cont=2 immediately.
             */
            if ((cont == 2 && !(input_chr & 0x80)) ||
                (cont != 0 && cont != 2 && (input_chr & 0x80))) {
                /*
                 * Clear the previous character; it was prematurely
                 * terminated by this error.
                 */
                state->s1 &= ~0x0F000000;
                state->s0 &= 0xFF000000;
                emit(emitctx, ERROR);
                /*
                 * If we were in the SS2 or SS3 container, we
                 * automatically exit it.
                 */
                if (state->s1 & 0x60000000)
                    state->s1 &= 0x9FFFFFFF;
            }

            if (cont == 0 && (input_chr & 0x80)) {
                state->s1 |= 0x20000000;
            }
        }

        /* The current character and its length. */
        chr = ((state->s0 & 0x00FFFFFF) << 8) | (input_chr & 0x7F);
        chrlen = ((state->s1 >> 24) & 0xF) + 1;
        /*
         * The current sub-charset. Container codes above 1 are
         * halved to give the index of the six-bit field in s1
         * (2 -> SO, 4/5 -> SS2, 6/7 -> SS3).
         */
        cont = (state->s1 >> 28) & 7;
        if (cont > 1) cont >>= 1;
        subcharset = (state->s1 >> (6*cont)) & 0x3F;
        /* The number of bytes-per-character in that sub-charset. */
        bytes = iso->nbytes[subcharset];

        /*
         * If this character is now complete, we convert and emit
         * it. Otherwise, we simply update the state and return.
         */
        if (chrlen >= bytes) {
            emit(emitctx, iso->to_ucs(subcharset, chr));
            chr = chrlen = 0;
            /*
             * If we were in the SS2 or SS3 container, we
             * automatically exit it.
             */
            if (state->s1 & 0x60000000)
                state->s1 &= 0x9FFFFFFF;
        }
        state->s0 = (state->s0 & 0xFF000000) | chr;
        state->s1 = (state->s1 & 0xF0FFFFFF) | ((unsigned long)chrlen << 24);
    }
}

/*
 * Encode one Unicode character into the ISO 2022 subset described by
 * `charset->data', or reset the encoding state.
 *
 *  - `input_chr' is the character to encode, or -1 to request that
 *    the subset's reset sequences be output.
 *  - `state' carries the encoder state between calls.
 *  - `emit' is called, with `emitctx' as its first argument, once for
 *    each output byte.
 *
 * Returns TRUE on success. Returns FALSE, having emitted nothing and
 * left the state untouched, if the character cannot be represented in
 * this subset or if `input_chr' is a negative value other than -1.
 */
static int write_iso2022s(charset_spec const *charset, long int input_chr,
                          charset_state *state,
                          void (*emit)(void *ctx, long int output),
                          void *emitctx)
{
    struct iso2022 const *iso = (struct iso2022 const *)charset->data;
    int subcharset = 0, len, i, j, cont, topbit = 0;
    unsigned long bytes = 0;

    /*
     * For output, our s1 state variable contains most of the same
     * stuff as it did for input - initial-state indicator bit,
     * current container, and current subcharset selected in each
     * container.
     */

    /*
     * Analyse the character and find out what subcharset it needs
     * to go in. The only negative value with a meaning is -1
     * (reset); reject any other one here, since otherwise we would
     * carry on below without a sub-charset or bytes to write.
     */
    if (input_chr >= 0) {
        if (!iso->from_ucs(input_chr, &subcharset, &bytes))
            return FALSE;
    } else if (input_chr != -1) {
        return FALSE;
    }

    if (!(state->s1 & 0x80000000)) {
        state->s1 = iso->s1;
        if (iso->initial_sequence)
            for (i = 0; iso->initial_sequence[i]; i++)
                emit(emitctx, iso->initial_sequence[i]);
    }

    if (input_chr == -1) {
        unsigned long oldstate;
        int k;

        /*
         * Special case: reset encoding state. Apply each reset
         * escape to the state in turn, and only output the ones
         * which actually change something.
         */
        for (i = 0; iso->reset[i]; i++) {
            j = iso->reset[i] - 1;     /* stored as index-plus-one */
            oldstate = state->s1;
            state->s1 &= iso->escapes[j].andbits;
            state->s1 ^= iso->escapes[j].xorbits;
            if (state->s1 != oldstate) {
                /* We must actually emit this sequence. */
                for (k = 0; iso->escapes[j].sequence[k]; k++)
                    emit(emitctx, iso->escapes[j].sequence[k]);
            }
        }

        return TRUE;
    }

    /*
     * Now begins the fun. We now know what subcharset we want. So
     * we must find out which container we should select it into,
     * select it into it if necessary, select that _container_ if
     * necessary, and then output the given bytes.
     */
    for (i = 0; i < iso->nescapes; i++)
        if (iso->escapes[i].subcharset == subcharset &&
            !(iso->escapes[i].container & RO))
            break;
    assert(i < iso->nescapes);

    /*
     * We've found the escape sequence which would select this
     * subcharset into a container. However, that subcharset might
     * already _be_ selected in that container! Check before we go
     * to the effort of emitting the sequence.
     */
    cont = iso->escapes[i].container &~ RO;
    if (((state->s1 >> (6*cont)) & 0x3F) != (unsigned)subcharset) {
        for (j = 0; iso->escapes[i].sequence[j]; j++)
            emit(emitctx, iso->escapes[i].sequence[j]);
        state->s1 &= iso->escapes[i].andbits;
        state->s1 ^= iso->escapes[i].xorbits;
    }

    /*
     * Now we know what container our subcharset is in, so we want
     * to select that container.
     */
    if (cont > 1) {
        /* SS2 or SS3; just output the sequence and be done. */
        emit(emitctx, ESC);
        emit(emitctx, 'L' + cont);     /* comes out to 'N' or 'O' */
    } else {
        /*
         * Emit SI or SO, but only if the current container isn't already
         * the right one.
         * 
         * Also, in an 8-bit subset, we need not do this; we'll
         * just use 8-bit characters to output SO-container
         * characters.
         */
        if (iso->eightbit && cont == 1 && ((state->s1 >> 28) & 7) == 0) {
            topbit = 0x80;
        } else if (((state->s1 >> 28) & 7) != (unsigned)cont) {
            emit(emitctx, cont ? SO : SI);
            state->s1 = (state->s1 & 0x8FFFFFFF) | ((unsigned long)cont << 28);
        }
    }

    /*
     * We're done. Subcharset is selected in container, container
     * is selected. All we need now is to write out the bytes,
     * most significant first.
     */
    len = iso->nbytes[subcharset];
    while (len--)
        emit(emitctx, ((bytes >> (8*len)) & 0xFF) | topbit);

    return TRUE;
}

/*
 * ISO-2022-JP, defined in RFC 1468.
 */
/*
 * Sub-charsets of ISO-2022-JP as implemented here:
 *   0 - ASCII                (one byte,  selected by ESC ( B)
 *   1 - JIS X 0201 Roman     (one byte,  selected by ESC ( J)
 *   2 - JIS X 0208           (two bytes, selected by ESC $ B or ESC $ @)
 *
 * RFC 1468 defines ESC ( J as the JIS X 0201 "Roman" set: ASCII with
 * 0x5C replaced by the yen sign and 0x7E replaced by the overline.
 * It is _not_ the half-width katakana set, which ISO-2022-JP does
 * not include.
 */

/*
 * Convert the accumulated bytes of one character in the given
 * sub-charset to Unicode. Returns ERROR for an unknown sub-charset
 * (or whatever jisx0208_to_unicode returns for sub-charset 2).
 */
static long int iso2022jp_to_ucs(int subcharset, unsigned long bytes)
{
    switch (subcharset) {
      case 0:                        /* one-byte ASCII */
        return bytes;
      case 1:                        /* JIS X 0201 Roman */
        if (bytes == 0x5C)
            return 0xA5;             /* YEN SIGN */
        if (bytes == 0x7E)
            return 0x203E;           /* OVERLINE */
        return bytes;                /* everything else matches ASCII */
      case 2:                        /* two-byte JIS X 0208 */
        return jisx0208_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
                                   ((bytes     ) & 0xFF) - 0x21);
      default:
        return ERROR;
    }
}

/*
 * Find the ISO-2022-JP representation of a Unicode character. On
 * success, stores the sub-charset index in *subcharset and the
 * character's bytes (most significant first) in *bytes, and returns
 * 1. Returns 0 if the character cannot be represented.
 */
static int iso2022jp_from_ucs(long int ucs, int *subcharset,
                              unsigned long *bytes)
{
    int r, c;
    if (ucs < 0x80) {
        /* Plain ASCII, including backslash and tilde. */
        *subcharset = 0;
        *bytes = ucs;
        return 1;
    } else if (ucs == 0xA5 || ucs == 0x203E) {
        /* The two characters in which JIS X 0201 Roman differs. */
        *subcharset = 1;
        *bytes = (ucs == 0xA5 ? 0x5C : 0x7E);
        return 1;
    } else if (unicode_to_jisx0208(ucs, &r, &c)) {
        *subcharset = 2;
        *bytes = ((r+0x21) << 8) | (c+0x21);
        return 1;
    } else {
        return 0;
    }
}
/*
 * Escape sequences recognised in ISO-2022-JP, in ASCII order. Each
 * one replaces the low six bits of s1 (the sub-charset selected in
 * container 0) with the value given in the xorbits column.
 */
static const struct iso2022_escape iso2022jp_escapes[] = {
    /* Has the same effect on input as ESC $ B; never chosen for output. */
    {"\033$@", 0xFFFFFFC0, 0x00000002, -1, -1},   /* we ignore this one */
    {"\033$B", 0xFFFFFFC0, 0x00000002, 0, 2},     /* sub-charset 2 */
    {"\033(B", 0xFFFFFFC0, 0x00000000, 0, 0},     /* sub-charset 0 */
    {"\033(J", 0xFFFFFFC0, 0x00000001, 0, 1},     /* sub-charset 1 */
};
/*
 * Subset description for ISO-2022-JP: sub-charsets 0 and 1 are
 * one byte per character and sub-charset 2 is two; reset outputs
 * escape number 3 in the table above (ESC ( B); the initial state
 * has sub-charset 0 everywhere; there is no mandatory initial
 * sequence; and the encoding is 7-bit.
 */
static const struct iso2022 iso2022jp = {
    iso2022jp_escapes, lenof(iso2022jp_escapes),
    "\1\1\2", "\3", 0x80000000, NULL, FALSE,
    iso2022jp_to_ucs, iso2022jp_from_ucs
};
/* Charset descriptor binding ISO-2022-JP to the generic reader and writer. */
const charset_spec charset_CS_ISO2022_JP = {
    CS_ISO2022_JP, read_iso2022s, write_iso2022s, &iso2022jp
};

/*
 * ISO-2022-KR, defined in RFC 1557.
 */
static long int iso2022kr_to_ucs(int subcharset, unsigned long bytes)
{
    switch (subcharset) {
      case 0: return bytes;          /* one-byte ASCII */
      case 1: return ksx1001_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
                              ((bytes     ) & 0xFF) - 0x21);
      default: return ERROR;
    }
}
/*
 * Find the ISO-2022-KR representation of a Unicode character. On
 * success, fills in *subcharset and *bytes and returns 1; returns 0
 * if the character is neither ASCII nor found by unicode_to_ksx1001.
 */
static int iso2022kr_from_ucs(long int ucs, int *subcharset,
                              unsigned long *bytes)
{
    int row, col;

    /* ASCII goes out unchanged in sub-charset 0. */
    if (ucs < 0x80) {
        *subcharset = 0;
        *bytes = ucs;
        return 1;
    }

    if (!unicode_to_ksx1001(ucs, &row, &col))
        return 0;

    /* Two bytes, row first, each offset by 0x21. */
    *subcharset = 1;
    *bytes = ((row + 0x21) << 8) | (col + 0x21);
    return 1;
}
/*
 * Shift and escape sequences recognised in ISO-2022-KR, in ASCII
 * order. SO and SI rewrite the container-selection bits of s1
 * (30:28); the ESC sequence rewrites the sub-charset held in
 * container 1.
 */
static const struct iso2022_escape iso2022kr_escapes[] = {
    {"\016", 0x8FFFFFFF, 0x10000000, -1, -1},    /* SO: bits[30:28] <- 1 */
    {"\017", 0x8FFFFFFF, 0x00000000, 0, 0},      /* SI: bits[30:28] <- 0 */
    {"\033$)C", 0xFFFFF03F, 0x00000040, 1, 1},   /* bits[11:6] <- 1 */
};
/*
 * Subset description for ISO-2022-KR: sub-charset 0 is one byte per
 * character and sub-charset 1 is two; reset outputs escape number 2
 * in the table above (SI); the initial state already has sub-charset
 * 1 in container 1; ESC $ ) C is output as the initial sequence; and
 * the encoding is 7-bit.
 */
static const struct iso2022 iso2022kr = {
    iso2022kr_escapes, lenof(iso2022kr_escapes),
    "\1\2", "\2", 0x80000040, "\033$)C", FALSE,
    iso2022kr_to_ucs, iso2022kr_from_ucs
};
/* Charset descriptor binding ISO-2022-KR to the generic reader and writer. */
const charset_spec charset_CS_ISO2022_KR = {
    CS_ISO2022_KR, read_iso2022s, write_iso2022s, &iso2022kr
};

/*
 * The COMPOUND_TEXT encoding used in X selections. Defined by the
 * X consortium.
 * 
 * This encoding has quite a few sub-charsets. The order I assign
 * to them here is given in an enum.
 */
/*
 * Sub-charset indices for COMPOUND_TEXT. These values are stored in
 * the six-bit container fields of s1 and used to index the
 * bytes-per-character string, so the order here is significant.
 */
enum {
    /* This must match the bytes-per-character string given below. */
    CTEXT_ASCII,
    CTEXT_JISX0201_LEFT,
    CTEXT_JISX0201_RIGHT,
    CTEXT_ISO8859_1,
    CTEXT_ISO8859_2,
    CTEXT_ISO8859_3,
    CTEXT_ISO8859_4,
    CTEXT_ISO8859_5,
    CTEXT_ISO8859_6,
    CTEXT_ISO8859_7,
    CTEXT_ISO8859_8,
    CTEXT_ISO8859_9,
    CTEXT_GB2312,
    CTEXT_KSC5601,
    CTEXT_JISX0208,
    CTEXT_JISX0212
};
static long int ctext_to_ucs(int subcharset, unsigned long bytes)
{
    switch (subcharset) {
      case CTEXT_ASCII: return bytes;            /* one-byte ASCII */
      case CTEXT_JISX0201_LEFT:        /* ASCII with yen and overline */
      return sbcs_to_unicode(&sbcsdata_CS_JISX0201, bytes & 0x7F);
      case CTEXT_JISX0201_RIGHT:       /* JIS X 0201 half-width katakana */
      return sbcs_to_unicode(&sbcsdata_CS_JISX0201, (bytes & 0x7F) | 0x80);
      case CTEXT_ISO8859_1:
      return sbcs_to_unicode(&sbcsdata_CS_ISO8859_1, (bytes & 0x7F) | 0x80);
      case CTEXT_ISO8859_2:
      return sbcs_to_unicode(&sbcsdata_CS_ISO8859_2, (bytes & 0x7F) | 0x80);
      case CTEXT_ISO8859_3:
      return sbcs_to_unicode(&sbcsdata_CS_ISO8859_3, (bytes & 0x7F) | 0x80);
      case CTEXT_ISO8859_4:
      return sbcs_to_unicode(&sbcsdata_CS_ISO8859_4, (bytes & 0x7F) | 0x80);
      case CTEXT_ISO8859_5:
      return sbcs_to_unicode(&sbcsdata_CS_ISO8859_5, (bytes & 0x7F) | 0x80);
      case CTEXT_ISO8859_6:
      return sbcs_to_unicode(&sbcsdata_CS_ISO8859_6, (bytes & 0x7F) | 0x80);
      case CTEXT_ISO8859_7:
      return sbcs_to_unicode(&sbcsdata_CS_ISO8859_7, (bytes & 0x7F) | 0x80);
      case CTEXT_ISO8859_8:
      return sbcs_to_unicode(&sbcsdata_CS_ISO8859_8, (bytes & 0x7F) | 0x80);
      case CTEXT_ISO8859_9:
      return sbcs_to_unicode(&sbcsdata_CS_ISO8859_9, (bytes & 0x7F) | 0x80);
      case CTEXT_GB2312:
      return gb2312_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
                         ((bytes     ) & 0xFF) - 0x21);
      case CTEXT_KSC5601:
      return ksx1001_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
                          ((bytes     ) & 0xFF) - 0x21);
      case CTEXT_JISX0208:
      return jisx0208_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
                           ((bytes     ) & 0xFF) - 0x21);
      case CTEXT_JISX0212:
      return jisx0212_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
                           ((bytes     ) & 0xFF) - 0x21);
      default: return ERROR;
    }
}
static int ctext_from_ucs(long int ucs, int *subcharset, unsigned long *bytes)
{
    int r, c;
    if (ucs < 0x80) {
      *subcharset = CTEXT_ASCII;
      *bytes = ucs;
      return 1;
    } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_1, ucs)) != ERROR) {
      *subcharset = CTEXT_ISO8859_1;
      *bytes = c - 0x80;
      return 1;
    } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_2, ucs)) != ERROR) {
      *subcharset = CTEXT_ISO8859_2;
      *bytes = c - 0x80;
      return 1;
    } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_3, ucs)) != ERROR) {
      *subcharset = CTEXT_ISO8859_3;
      *bytes = c - 0x80;
      return 1;
    } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_4, ucs)) != ERROR) {
      *subcharset = CTEXT_ISO8859_4;
      *bytes = c - 0x80;
      return 1;
    } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_5, ucs)) != ERROR) {
      *subcharset = CTEXT_ISO8859_5;
      *bytes = c - 0x80;
      return 1;
    } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_6, ucs)) != ERROR) {
      *subcharset = CTEXT_ISO8859_6;
      *bytes = c - 0x80;
      return 1;
    } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_7, ucs)) != ERROR) {
      *subcharset = CTEXT_ISO8859_7;
      *bytes = c - 0x80;
      return 1;
    } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_8, ucs)) != ERROR) {
      *subcharset = CTEXT_ISO8859_8;
      *bytes = c - 0x80;
      return 1;
    } else if ((c = sbcs_from_unicode(&sbcsdata_CS_ISO8859_9, ucs)) != ERROR) {
      *subcharset = CTEXT_ISO8859_9;
      *bytes = c - 0x80;
      return 1;
    } else if ((c = sbcs_from_unicode(&sbcsdata_CS_JISX0201, ucs)) != ERROR) {
      if (c < 0x80) {
          *subcharset = CTEXT_JISX0201_LEFT;
      } else {
          *subcharset = CTEXT_JISX0201_RIGHT;
          c -= 0x80;
      }
      *bytes = c;
      return 1;
    } else if (unicode_to_gb2312(ucs, &r, &c)) {
      *subcharset = CTEXT_GB2312;
      *bytes = ((r+0x21) << 8) | (c+0x21);
      return 1;
    } else if (unicode_to_ksx1001(ucs, &r, &c)) {
      *subcharset = CTEXT_KSC5601;
      *bytes = ((r+0x21) << 8) | (c+0x21);
      return 1;
    } else if (unicode_to_jisx0208(ucs, &r, &c)) {
      *subcharset = CTEXT_JISX0208;
      *bytes = ((r+0x21) << 8) | (c+0x21);
      return 1;
    } else if (unicode_to_jisx0212(ucs, &r, &c)) {
      *subcharset = CTEXT_JISX0212;
      *bytes = ((r+0x21) << 8) | (c+0x21);
      return 1;
    } else {
      return 0;
    }
}
/*
 * Build an iso2022_escape entry for the sequence `str' which selects
 * sub-charset `cs' into container `cont' (optionally ORed with RO).
 * The and/xor masks clear the container's six-bit field in s1 and
 * then store `cs' in it; RO is stripped when computing the field
 * position but kept in the stored container value.
 */
#define SEQ(str,cont,cs) \
    {str,~(63<<(6*(((cont)&~RO)))),(cs)<<(6*(((cont)&~RO))),(cont),(cs)}
/*
 * Compound text defines restrictions on which container can take
 * which character sets. Things labelled `left half of' can only go
 * in GL; things labelled `right half of' can only go in GR; and 96
 * or 96^n character sets only _fit_ in GR. Thus:
 *  - ASCII can only go in GL since it is the left half of 8859-*.
 *  - All the 8859 sets can only go in GR.
 *  - JISX0201 left is GL only; JISX0201 right is GR only.
 *  - The four multibyte sets (GB2312, JISX0208, KSC5601, JISX0212)
 *    can go in either; we prefer GR where possible since this leads
 *    to a more compact EUC-like encoding.
 */
/*
 * Escape sequences recognised in COMPOUND_TEXT. The entries must
 * stay in ASCII order of their sequence strings. Entries whose
 * container is marked RO are recognised on input, but the writer
 * skips them when choosing how to select a sub-charset.
 */
static const struct iso2022_escape ctext_escapes[] = {
    /* Two-byte sets selected into container 0 (input only). */
    SEQ("\033$(A", 0|RO, CTEXT_GB2312),
    SEQ("\033$(B", 0|RO, CTEXT_JISX0208),
    SEQ("\033$(C", 0|RO, CTEXT_KSC5601),
    SEQ("\033$(D", 0|RO, CTEXT_JISX0212),
    /* Two-byte sets selected into container 1. */
    SEQ("\033$)A", 1, CTEXT_GB2312),
    SEQ("\033$)B", 1, CTEXT_JISX0208),
    SEQ("\033$)C", 1, CTEXT_KSC5601),
    SEQ("\033$)D", 1, CTEXT_JISX0212),
    /* One-byte sets selected into container 0. */
    SEQ("\033(B", 0, CTEXT_ASCII),
    SEQ("\033(J", 0, CTEXT_JISX0201_LEFT),
    /* One-byte sets selected into container 1. */
    SEQ("\033)I", 1, CTEXT_JISX0201_RIGHT),
    SEQ("\033-A", 1, CTEXT_ISO8859_1),
    SEQ("\033-B", 1, CTEXT_ISO8859_2),
    SEQ("\033-C", 1, CTEXT_ISO8859_3),
    SEQ("\033-D", 1, CTEXT_ISO8859_4),
    SEQ("\033-F", 1, CTEXT_ISO8859_7),
    SEQ("\033-G", 1, CTEXT_ISO8859_6),
    SEQ("\033-H", 1, CTEXT_ISO8859_8),
    SEQ("\033-L", 1, CTEXT_ISO8859_5),
    SEQ("\033-M", 1, CTEXT_ISO8859_9),
};
/*
 * Subset description for COMPOUND_TEXT: the reset string and the
 * initial sequence are both empty; the initial state has ASCII in
 * container 0 and ISO 8859-1 in container 1; and the encoding is
 * 8-bit.
 */
static const struct iso2022 ctext = {
    ctext_escapes, lenof(ctext_escapes),
    "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\2\2",  /* must match the enum above */
    "", 0x80000000 | (CTEXT_ASCII<<0) | (CTEXT_ISO8859_1<<6), "", TRUE,
    ctext_to_ucs, ctext_from_ucs
};
/* Charset descriptor binding COMPOUND_TEXT to the generic reader and writer. */
const charset_spec charset_CS_CTEXT = {
    CS_CTEXT, read_iso2022s, write_iso2022s, &ctext
};

#else /* ENUM_CHARSETS */

/*
 * When ENUM_CHARSETS is defined, this file contributes only the list
 * of charsets it implements. ENUM_CHARSET is not defined here;
 * presumably the including file supplies it.
 */
ENUM_CHARSET(CS_ISO2022_JP)
ENUM_CHARSET(CS_ISO2022_KR)
ENUM_CHARSET(CS_CTEXT)

#endif /* ENUM_CHARSETS */

Generated by  Doxygen 1.6.0   Back to index