File intl_support.c¶

File List > base > intl_support.c
/*
 * Copyright 2008 Search Solution Corporation
 * Copyright 2016 CUBRID Corporation
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 */

/*
 * intl_support.c : platform independent internationalization functions.
 */

#ident "$Id$"

#include "config.h"

#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <locale.h>
#include <ctype.h>
#include <wctype.h>

#include "error_manager.h"
#include "intl_support.h"
#include "language_support.h"
#include "chartype.h"
#include "system_parameter.h"
#include "charset_converters.h"
// XXX: SHOULD BE THE LAST INCLUDE HEADER
#include "memory_wrapper.hpp"

/* When SUPPRESS_STRLEN_WARNING is defined, every strlen() call in this file
 * is wrapped so that its result is converted to int. */
#if defined (SUPPRESS_STRLEN_WARNING)
#define strlen(s1)  ((int) strlen(s1))
#endif /* defined (SUPPRESS_STRLEN_WARNING) */

/* Shifts the value right by 7 bits; for an unsigned 8-bit value the result is
 * non-zero exactly when the top bit is set (i.e. the byte is not ASCII). */
#define IS_8BIT(c)              ((c) >> 7)
/* Special values for EUC encodings */
/* SS3 is the lead byte that the EUC routines in this file treat as the start
 * of a three-byte character; it is only defined here when no earlier header
 * already provides it. */
#ifndef SS3
#define SS3                     143
#endif

/* Locale name strings.  NOTE(review): they are not referenced in the part of
 * the file visible here; presumably they are handed to setlocale() - confirm. */
#define LOCALE_C        "C"
#if defined(AIX)
#define LOCALE_KOREAN   "ko_KR.IBM-eucKR"
#else
#define LOCALE_KOREAN   "korean"
#endif

#if defined (ENABLE_UNUSED_FUNCTION)
/* EUC-KR characters may be used with the ISO-8859-1 charset when
 * PRM_SINGLE_BYTE_COMPARE is 'no'.
 * An EUC-KR character has either three bytes (when the first byte is SS3) or
 * two bytes; this macro checks whether a byte lies in the 0xa1..0xfe range
 * used by the two-byte form.
 *
 * The argument is parenthesized so that expression arguments (for example
 * 'cond ? a : b') are converted as a whole; note that it is evaluated twice,
 * so it must not have side effects. */
#define IS_PSEUDO_KOREAN(ch) \
          ( ((unsigned char) (ch) >= (unsigned char) 0xa1)       \
              && ((unsigned char) (ch) <= (unsigned char) 0xfe) )
#endif

/* conversion from turkish ISO 8859-9 to UTF-8 */
/* First and last codepoint covered by the codepoint-indexed reverse table
 * declared below (its size is LAST - FIRST + 1 entries). */
#define ISO_8859_9_FIRST_CP 0x11e
#define ISO_8859_9_LAST_CP 0x15f

/* One entry per possible byte value (256 entries). */
static CONV_CP_TO_BYTES iso8859_9_To_utf8_conv[256];
/* One entry per codepoint in [ISO_8859_9_FIRST_CP, ISO_8859_9_LAST_CP]. */
static CONV_CP_TO_BYTES utf8_Cp_to_iso_8859_9_conv[ISO_8859_9_LAST_CP - ISO_8859_9_FIRST_CP + 1];

/* conversion from Latin 1 ISO 8859-1 to UTF-8: */
/* One entry per possible byte value (256 entries). */
static CONV_CP_TO_BYTES iso8859_1_To_utf8_conv[256];


/* identifiers : support for multibyte chars in INTL_CODESET_ISO88591 codeset
 * (default legacy codeset) */
/* When false, the intl_mbs_* functions in this file fall back to the plain
 * single-byte C string routines (strchr, strlen, strcasecmp, ...). */
bool intl_Mbs_support = true;
/* NOTE(review): not read in the part of the file visible here; presumably
 * enables validation of input strings - confirm against its users. */
bool intl_String_validation = false;

/* Forward declarations of the file-local helpers. */

/* General EUC string manipulations */
/* Case conversion copies byte_size bytes from src to d and returns a
 * character count; the counters convert between byte and character lengths. */
static int intl_tolower_euc (const unsigned char *src, unsigned char *d, int byte_size);
static int intl_toupper_euc (const unsigned char *src, unsigned char *d, int byte_size);
static int intl_count_euc_chars (const unsigned char *s, int length_in_bytes);
static int intl_count_euc_bytes (const unsigned char *s, int length_in_chars);
#if defined (ENABLE_UNUSED_FUNCTION)
/* Only compiled when the unused-function set is enabled. */
static wchar_t *intl_copy_lowercase (const wchar_t * ws, size_t n);
static int intl_is_korean (unsigned char ch);
#endif /* ENABLE_UNUSED_FUNCTION */

/* UTF-8 string manipulations */
/* NOTE(review): the definitions of the helpers below are outside the part of
 * the file visible here; see each definition for its exact contract. */
static int intl_tolower_utf8 (const ALPHABET_DATA * a, const unsigned char *s, unsigned char *d, int length_in_chars,
                  int *d_size);
static int intl_toupper_utf8 (const ALPHABET_DATA * a, const unsigned char *s, unsigned char *d, int length_in_chars,
                  int *d_size);
static int intl_count_utf8_bytes (const unsigned char *s, int length_in_chars);
static int intl_char_tolower_utf8 (const ALPHABET_DATA * a, const unsigned char *s, const int size, unsigned char *d,
                   unsigned char **next);
static int intl_char_toupper_utf8 (const ALPHABET_DATA * a, const unsigned char *s, const int size, unsigned char *d,
                   unsigned char **next);
static int intl_strcasecmp_utf8_one_cp (const ALPHABET_DATA * alphabet, unsigned char *str1, unsigned char *str2,
                    const int size_str1, const int size_str2, unsigned int cp1, unsigned int cp2,
                    int *skip_size1, int *skip_size2);
/* Init functions referenced by the TEXT_CONVERSION descriptors of this file. */
static void intl_init_conv_iso8859_9_to_utf8 (void);
static void intl_init_conv_iso8859_1_to_utf8 (void);


/*
 * Built-in text conversion descriptor for ISO 8859-9.
 * The two conversion-table slots start out empty (0, 0, NULL) and are, per
 * the field comments, filled in by the init function named in the last field.
 */
TEXT_CONVERSION con_Iso_8859_9_conv = {
  TEXT_CONV_ISO_88599_BUILTIN,  /* type */
  (char *) "28599",     /* Windows Code page */
  (char *) "iso88599",      /* Linux charset identifiers */
  {0},              /* byte flags : not used for ISO */
  0, 0, NULL,           /* UTF-8 to console : filled by init function */
  0, 0, NULL,           /* console to UTF-8 : filled by init function */
  intl_text_utf8_to_single_byte,    /* UTF-8 to console conversion function */
  intl_text_single_byte_to_utf8,    /* console to UTF-8 conversion function */
  intl_init_conv_iso8859_9_to_utf8, /* init function */
};

/*
 * Built-in text conversion descriptor for ISO 8859-1 (Latin 1).
 * It uses the same pair of single-byte <-> UTF-8 conversion functions as the
 * ISO 8859-9 descriptor, with its own init function; the two
 * conversion-table slots start out empty (0, 0, NULL).
 */
TEXT_CONVERSION con_Iso_8859_1_conv = {
  TEXT_CONV_ISO_88591_BUILTIN,  /* type */
  (char *) "28591",     /* Windows Code page */
  (char *) "iso88591",      /* Linux charset identifiers */
  {0},              /* byte flags : not used for ISO */
  0, 0, NULL,           /* UTF-8 to console : filled by init function */
  0, 0, NULL,           /* console to UTF-8 : filled by init function */
  intl_text_utf8_to_single_byte,    /* UTF-8 to console conversion function */
  intl_text_single_byte_to_utf8,    /* console to UTF-8 conversion function */
  intl_init_conv_iso8859_1_to_utf8, /* init function */
};


/*
 * intl_mbs_chr() - find first occurrence of the given character
 *   return: a pointer to the first occurrence of the given character in
 *           the given multibyte string (a pointer to the terminating null
 *           byte when wc is L'\0'), or NULL if no occurrence is found.
 *           NULL is also returned, with errno set to EINVAL, when an invalid
 *           byte sequence is met before the character is found.
 *   mbs(in): null-terminated multibyte string to search
 *   wc(in): wide character to look for
 *
 * Note: when multibyte support is disabled the search is delegated to
 *       strchr().
 */
char *
intl_mbs_chr (const char *mbs, wchar_t wc)
{
  int nbytes;
  wchar_t cur_wc;

  assert (mbs != NULL);

  if (!intl_Mbs_support)
    {
      return (char *) (strchr (mbs, (int) wc));
    }

  for (;;)
    {
      nbytes = mbtowc (&cur_wc, mbs, MB_LEN_MAX);
      if (nbytes < 0)
        {
          /* invalid sequence: do not report the offending byte as a match */
          errno = EINVAL;
          return NULL;
        }

      if (nbytes == 0)
        {
          /* reached the terminator: it only matches a search for L'\0' */
          return (wc == L'\0') ? (char *) mbs : NULL;
        }

      if (cur_wc == wc)
        {
          return (char *) mbs;
        }

      mbs += nbytes;
    }
}

/*
 * intl_mbs_len() - counts the multibyte characters of a multibyte string,
 *             the terminating zero byte excluded
 *   return: number of characters on success.
 *           0 with errno set to EINVAL when mbs contains an invalid byte
 *           sequence.
 *   mbs(in): null-terminated multibyte string
 *
 * Note: when multibyte support is disabled the result is strlen (mbs).
 */
int
intl_mbs_len (const char *mbs)
{
  int count = 0;
  int char_bytes;

  assert (mbs != NULL);

  if (!intl_Mbs_support)
    {
      return strlen (mbs);
    }

  /* walk the string one multibyte character at a time */
  while ((char_bytes = mblen (mbs, MB_LEN_MAX)) > 0 && *mbs)
    {
      mbs += char_bytes;
      count++;
    }

  if (char_bytes < 0)
    {
      errno = EINVAL;
      return 0;
    }

  return count;
}

/*
 * intl_mbs_nth() - finds the nth multibyte character in the multibyte string
 *   return: a pointer to the character at (zero-based) position n; when n
 *           equals the number of characters, a pointer to the terminating
 *           null byte.
 *           NULL if mbs is NULL, if an invalid byte sequence is met or if
 *           there are fewer than n characters in the string
 *   mbs(in): null-terminated multibyte string
 *   n(in): number of characters to skip
 *
 * Note: errno is set to EINVAL for an invalid byte sequence and, when
 *       multibyte support is disabled, also for a too short string.
 */

const char *
intl_mbs_nth (const char *mbs, size_t n)
{
  size_t num_of_chars;
  int clen;

  assert (mbs != NULL);
  if (mbs == NULL)
    {
      return NULL;
    }

  if (!intl_Mbs_support)
    {
      /* compare as size_t: narrowing n to int could wrap a huge index into a
       * small value and let an out-of-range n pass the check */
      if ((size_t) strlen (mbs) < n)
        {
          errno = EINVAL;
          return NULL;
        }
      return &mbs[n];
    }

  for (num_of_chars = 0, clen = 0; num_of_chars < n && (clen = mblen (mbs, MB_LEN_MAX)) > 0 && *mbs;
       mbs += clen, num_of_chars++)
    {
      continue;
    }

  if (clen < 0)
    {
      /* invalid byte sequence */
      errno = EINVAL;
      mbs = NULL;
    }
  else if (num_of_chars < n)
    {
      /* the string ended before n characters were skipped */
      mbs = NULL;
    }

  return mbs;
}

/*
 * intl_mbs_spn() - return the size of the prefix of the given multibyte string
 *             consisting of the given wide characters.
 *   return: size in bytes.
 *           If mbs contains an invalid byte sequence,
 *           errno is set to EINVAL and 0 is returned.
 *   mbs(in): null-terminated multibyte string
 *   chars(in): null-terminated wide string holding the accepted characters
 *
 * Note: when multibyte support is disabled every byte of mbs is treated as
 *       one character and is looked up in chars by its unsigned byte value.
 */
int
intl_mbs_spn (const char *mbs, const wchar_t * chars)
{
  int clen;
  wchar_t wc;
  int size;

  assert (mbs != NULL && chars != NULL);

  if (!intl_Mbs_support)
    {
      /* chars is a wide string: it cannot be handed to strspn() as if it
       * were a byte string, so widen each byte and search the wide set */
      for (size = 0; mbs[size] != '\0'; size++)
        {
          if (wcschr (chars, (wchar_t) (unsigned char) mbs[size]) == NULL)
            {
              break;
            }
        }
      return size;
    }

  for (size = 0; (clen = mbtowc (&wc, mbs, MB_LEN_MAX)) > 0 && *mbs && wcschr (chars, wc); mbs += clen, size += clen)
    {
      continue;
    }

  if (clen < 0)
    {
      errno = EINVAL;
      size = 0;
    }

  return size;
}

#if defined (ENABLE_UNUSED_FUNCTION)
/*
 * intl_mbs_namecmp() - case-insensitive comparison of two multi-byte
 *                 identifier strings, ignoring enclosing brackets
 *   return: 0 if the identifiers are the "same",
 *           positive number if mbs1 is greater than mbs2,
 *           negative number otherwise.
 *   mbs1(in): first identifier
 *   mbs2(in): second identifier
 *
 * Note: "same" means that the brackets '[', ']' are ignored,
 *       so mbs1 = "[value]" and mbs2 = "value" returns 0.
 *       An identifier starting with '[' has its length reduced by two, i.e.
 *       a matching closing ']' is taken for granted.
 */
int
intl_mbs_namecmp (const char *mbs1, const char *mbs2)
{
  const char *name1;
  const char *name2;
  int len1, len2;

  assert (mbs1 != NULL && mbs2 != NULL);

  name1 = mbs1;
  name2 = mbs2;
  len1 = strlen (name1);
  len2 = strlen (name2);

  /* skip an opening bracket and discount the bracket pair */
  if (*name1 == '[')
    {
      name1++;
      len1 -= 2;
    }

  if (*name2 == '[')
    {
      name2++;
      len2 -= 2;
    }

  if (len1 == len2)
    {
      return intl_mbs_ncasecmp (name1, name2, len1);
    }

  /* different lengths: the full comparison yields the (non-equal) ordering */
  return intl_mbs_casecmp (name1, name2);
}
#endif

/*
 * intl_mbs_casecmp() - compares successive multi-byte character elements
 *                 from two multi-byte strings, ignoring case
 *   return: 0 if all the multi-byte character elements are the same,
 *           positive number if mbs1 is greater than mbs2,
 *           negative number otherwise.
 *   mbs1(in): first null-terminated multibyte string
 *   mbs2(in): second null-terminated multibyte string
 *
 * Note: This function does not use the collating sequences specified
 *       in the LC_COLLATE category of the current locale.
 *       This function sets errno to EINVAL if an invalid multi-byte
 *       character is met in mbs1 or mbs2.
 *       When multibyte support is disabled the comparison is delegated to
 *       the platform's case-insensitive string comparison.
 */
int
intl_mbs_casecmp (const char *mbs1, const char *mbs2)
{
  /* initialized so that a failing first mbtowc() never leaves the values
   * compared at the end indeterminate */
  wchar_t wc1 = L'\0';
  wchar_t wc2 = L'\0';
  int mb1_len, mb2_len;

  assert (mbs1 != NULL && mbs2 != NULL);

  if (!intl_Mbs_support)
    {
#if defined(WINDOWS)
      return _stricmp (mbs1, mbs2);
#else
      return strcasecmp (mbs1, mbs2);
#endif
    }

  /* advance while both strings decode to equal (case-folded) characters */
  for (mb1_len = mbtowc (&wc1, mbs1, MB_LEN_MAX), mb2_len = mbtowc (&wc2, mbs2, MB_LEN_MAX);
       mb1_len > 0 && mb2_len > 0 && wc1 && wc2 && !(towlower (wc1) - towlower (wc2));)
    {
      mbs1 += mb1_len;
      mbs2 += mb2_len;

      mb1_len = mbtowc (&wc1, mbs1, MB_LEN_MAX);
      mb2_len = mbtowc (&wc2, mbs2, MB_LEN_MAX);
    }

  if (mb1_len < 0 || mb2_len < 0)
    {
      errno = EINVAL;
    }

  return (int) (towlower (wc1) - towlower (wc2));
}

#if defined (ENABLE_UNUSED_FUNCTION)
/*
 * intl_mbs_cmp() - compares successive multi-byte character elements
 *             from two multi-byte strings (case sensitive)
 *   return: 0 if all the multi-byte character elements are the same,
 *           positive number if mbs1 is greater than mbs2,
 *           negative number otherwise.
 *   mbs1(in): first null-terminated multibyte string
 *   mbs2(in): second null-terminated multibyte string
 *
 * Note: errno is set to EINVAL if an invalid multi-byte character is met.
 *       When multibyte support is disabled the comparison is delegated to
 *       strcmp().
 */
int
intl_mbs_cmp (const char *mbs1, const char *mbs2)
{
  /* initialized so that a failing first mbtowc() never leaves the values
   * compared at the end indeterminate */
  wchar_t wc1 = L'\0';
  wchar_t wc2 = L'\0';
  int mb1_len, mb2_len;

  assert (mbs1 != NULL && mbs2 != NULL);

  if (!intl_Mbs_support)
    {
      return strcmp (mbs1, mbs2);
    }

  /* advance while both strings decode to the same character */
  for (mb1_len = mbtowc (&wc1, mbs1, MB_LEN_MAX), mb2_len = mbtowc (&wc2, mbs2, MB_LEN_MAX);
       mb1_len > 0 && mb2_len > 0 && wc1 && wc2 && !(wc1 - wc2);)
    {
      mbs1 += mb1_len;
      mbs2 += mb2_len;

      mb1_len = mbtowc (&wc1, mbs1, MB_LEN_MAX);
      mb2_len = mbtowc (&wc2, mbs2, MB_LEN_MAX);
    }

  if (mb1_len < 0 || mb2_len < 0)
    {
      errno = EINVAL;
    }

  return (int) (wc1 - wc2);
}
#endif

/*
 * intl_mbs_ncasecmp() - compares the first n successive multi-byte character elements
 *                  from two multi-byte strings, ignoring case
 *   return: 0 if the first n multi-byte character elements are the same
 *           (always 0 when n is 0),
 *           positive number if mbs1 is greater than mbs2,
 *           negative number otherwise.
 *   mbs1(in): first null-terminated multibyte string
 *   mbs2(in): second null-terminated multibyte string
 *   n (in): maximum number of characters to compare
 *
 * Note: This function does not use the collating sequences specified
 *       in the LC_COLLATE category of the current locale.
 *       This function sets errno to EINVAL if an invalid multi-byte
 *       character is met in mbs1 or mbs2.
 */
int
intl_mbs_ncasecmp (const char *mbs1, const char *mbs2, size_t n)
{
  /* initialized so that a failing first mbtowc() never leaves the values
   * compared at the end indeterminate */
  wchar_t wc1 = L'\0';
  wchar_t wc2 = L'\0';
  int mb1_len, mb2_len;
  size_t num_of_chars;

  assert (mbs1 != NULL && mbs2 != NULL);

  if (n == 0)
    {
      /* nothing to compare: equal by definition, as with strncasecmp() */
      return 0;
    }

  if (!intl_Mbs_support)
    {
#if defined(WINDOWS)
      return _strnicmp (mbs1, mbs2, n);
#else
      return strncasecmp (mbs1, mbs2, n);
#endif
    }

  /* num_of_chars counts the characters decoded so far (the pair currently
   * held in wc1/wc2 included) */
  for (num_of_chars = 1, mb1_len = mbtowc (&wc1, mbs1, MB_LEN_MAX), mb2_len = mbtowc (&wc2, mbs2, MB_LEN_MAX);
       mb1_len > 0 && mb2_len > 0 && wc1 && wc2 && num_of_chars < n && !(towlower (wc1) - towlower (wc2));
       num_of_chars++)
    {
      mbs1 += mb1_len;
      mbs2 += mb2_len;

      mb1_len = mbtowc (&wc1, mbs1, MB_LEN_MAX);
      mb2_len = mbtowc (&wc2, mbs2, MB_LEN_MAX);
    }

  if (mb1_len < 0 || mb2_len < 0)
    {
      errno = EINVAL;
    }

  return (int) (towlower (wc1) - towlower (wc2));
}

/*
 * intl_mbs_ncpy() - Copy characters from mbs2 to mbs1 at most (n-1) bytes
 *   return: mbs1, null-terminated string (left untouched when n is 0).
 *   mbs1(out): destination buffer
 *   mbs2(in): null-terminated multibyte source string
 *   n(in): size of destination buffer, including null-terminator
 *
 * Note: Only whole multi-byte characters are copied in multibyte mode.
 *   If mbs2 contains an invalid multi-byte character, errno is set to EINVAL
 *   and the function returns NULL.  In this case, the contents of mbs1 are
 *   undefined.
 */

char *
intl_mbs_ncpy (char *mbs1, const char *mbs2, size_t n)
{
  size_t num_of_bytes;
  int clen, i;
  char *dest;

  assert (mbs1 != NULL && mbs2 != NULL);

  if (n == 0)
    {
      /* no room even for the terminator; 'n - 1' below would wrap around to
       * SIZE_MAX and turn the copy into an unbounded one */
      return mbs1;
    }

  if (!intl_Mbs_support)
    {
      size_t src_len = strlen (mbs2);

      strncpy (mbs1, mbs2, n - 1);
      if (src_len < n)
        {
          mbs1[src_len] = '\0';
        }
      else
        {
          /* source was truncated: strncpy() did not terminate the copy */
          mbs1[n - 1] = '\0';
        }

      return mbs1;
    }

  for (num_of_bytes = 0, clen = mblen (mbs2, MB_LEN_MAX), dest = mbs1; clen > 0 && (num_of_bytes + clen) <= n - 1;
       clen = mblen (mbs2, MB_LEN_MAX))
    {
      /* copy the next multi-byte char */
      for (i = 0; i < clen; i++)
        {
          *dest++ = *mbs2++;
        }

      /* advance the byte counter */
      num_of_bytes += clen;
    }

  if (clen < 0)
    {
      errno = EINVAL;
      mbs1 = NULL;
    }
  else
    {
      *dest = '\0';
    }

  return mbs1;
}

#if defined (ENABLE_UNUSED_FUNCTION)
/*
 * intl_mbs_lower() - convert given characters to lowercase characters
 *   return: always 0
 *   mbs1(in): null-terminated source string; NULL is treated as an empty
 *             string
 *   mbs2(out): destination buffer; receives a null-terminated string
 *
 * Note: the destination is terminated at the byte length of the source, so
 *       it must hold at least strlen (mbs1) + 1 bytes.
 */
int
intl_mbs_lower (const char *mbs1, char *mbs2)
{
  int char_count = 0;
  int length_in_bytes = 0;

  if (mbs1 == NULL)
    {
      /* same result in both modes; the single-byte path used to hand the
       * NULL pointer to strcpy() */
      mbs2[0] = '\0';
      return 0;
    }

  if (!intl_Mbs_support)
    {
      char *s;
      s = strcpy (mbs2, mbs1);
      while (*s)
        {
          *s = char_tolower (*s);
          s++;
        }
      return 0;
    }

  length_in_bytes = strlen (mbs1);

  if (length_in_bytes)
    {
      intl_char_count ((unsigned char *) mbs1, length_in_bytes, lang_charset (), &char_count);
      intl_lower_string ((unsigned char *) mbs1, (unsigned char *) mbs2, char_count, lang_charset ());
      mbs2[length_in_bytes] = '\0';
    }
  else
    {
      mbs2[0] = '\0';
    }

  return 0;
}

/*
 * intl_mbs_nlower() - lowercase a string into a bounded destination buffer
 *   return: always 0
 *   dest(out) : destination buffer; receives a null-terminated string
 *   src(in) : source string; NULL yields an empty destination
 *   max_len(in) : size of the destination buffer, terminator included
 */

int
intl_mbs_nlower (char *dest, const char *src, const int max_len)
{
  int num_chars = 0;
  int num_bytes;

  if (src == NULL)
    {
      dest[0] = '\0';
      return 0;
    }

  if (!intl_Mbs_support)
    {
      /* single-byte mode: convert byte by byte, keeping room for '\0' */
      int pos = 0;

      while ((src[pos] != '\0') && (pos < max_len - 1))
        {
          dest[pos] = char_tolower (src[pos]);
          pos++;
        }
      dest[pos] = '\0';
      return 0;
    }

  num_bytes = strlen (src);
  if (num_bytes >= max_len)
    {
      /* keep one byte for the terminator */
      num_bytes = max_len - 1;
    }

  if (num_bytes <= 0)
    {
      dest[0] = '\0';
      return 0;
    }

  intl_char_count ((unsigned char *) src, num_bytes, lang_charset (), &num_chars);
  intl_lower_string ((unsigned char *) src, (unsigned char *) dest, num_chars, lang_charset ());
  dest[num_bytes] = '\0';

  return 0;
}

/*
 * intl_mbs_upper() - convert given characters to uppercase characters
 *   return: always 0
 *   mbs1(in): null-terminated source string; NULL is treated as an empty
 *             string
 *   mbs2(out): destination buffer; receives a null-terminated string
 *
 * Note: the destination is terminated at the byte length of the source, so
 *       it must hold at least strlen (mbs1) + 1 bytes.
 */
int
intl_mbs_upper (const char *mbs1, char *mbs2)
{
  int char_count = 0;
  int length_in_bytes = 0;

  if (mbs1 == NULL)
    {
      /* same result in both modes; the single-byte path used to hand the
       * NULL pointer to strcpy() */
      mbs2[0] = '\0';
      return 0;
    }

  if (!intl_Mbs_support)
    {
      char *s;

      for (s = strcpy (mbs2, mbs1); *s; s++)
        {
          *s = char_toupper (*s);
        }
      return 0;
    }

  length_in_bytes = strlen (mbs1);

  if (length_in_bytes)
    {
      intl_char_count ((unsigned char *) mbs1, length_in_bytes, lang_charset (), &char_count);
      intl_upper_string ((unsigned char *) mbs1, (unsigned char *) mbs2, char_count, lang_charset ());
      mbs2[length_in_bytes] = '\0';
    }
  else
    {
      mbs2[0] = '\0';
    }
  return 0;
}

/*
 * intl_copy_lowercase() - converts the given wide character string to
 *                    a newly allocated lowercase wide character string
 *   return: new wide character string, or NULL if the allocation fails.
 *           At most n wide characters will be converted and the new wide
 *           character string is null terminated.
 *   ws(in): source wide string; it needs no terminator when it holds at
 *           least n characters
 *   n (in): maximum number of wide characters to convert
 *
 * Note: The returned pointer must be freed using wcs_delete().
 */
static wchar_t *
intl_copy_lowercase (const wchar_t * ws, size_t n)
{
  size_t i;
  wchar_t *lower_ws;

  lower_ws = (wchar_t *) malloc (sizeof (wchar_t) * (n + 1));
  if (lower_ws)
    {
      /* test the bound first so that ws[n] is never read */
      for (i = 0; i < n && ws[i]; i++)
        {
          lower_ws[i] = towlower (ws[i]);
        }
      lower_ws[i] = L'\0';
    }

  return lower_ws;
}
#endif /* ENABLE_UNUSED_FUNCTION */

/*
 * ISO 8859-1 encoding functions
 */

/*
 * intl_tolower_iso8859() - lowercases an ISO88591 string in place by passing
 *                          every byte through char_tolower_iso8859 ()
 *   return: character count, which is the given length
 *   s(in/out): string to lowercase
 *   length(in): length of the string in bytes
 */
int
intl_tolower_iso8859 (unsigned char *s, int length)
{
  int i;

  assert (s != NULL);

  for (i = 0; i < length; i++)
    {
      s[i] = char_tolower_iso8859 (s[i]);
    }

  /* one byte per character in this codeset */
  return length;
}

/*
 * intl_toupper_iso8859() - uppercases an ISO88591 string in place by passing
 *                          every byte through char_toupper_iso8859 ()
 *   return: character count, which is the given length
 *   s(in/out): string to uppercase
 *   length(in): length of the string in bytes
 */
int
intl_toupper_iso8859 (unsigned char *s, int length)
{
  int i;

  assert (s != NULL);

  for (i = 0; i < length; i++)
    {
      s[i] = char_toupper_iso8859 (s[i]);
    }

  /* one byte per character in this codeset */
  return length;
}

/*
 * general routines for EUC encoding
 */

/*
 * intl_nextchar_euc() - steps over one character of an EUC encoded string
 *   return: pointer just past the character that starts at s
 *   s(in): string, positioned on the first byte of a character
 *   curr_char_length(out): length in bytes of the character at s
 *
 * Note: the length is derived from the first byte only: 1 for a byte without
 *       the top bit set, 3 for SS3, 2 for any other byte.
 */
const unsigned char *
intl_nextchar_euc (const unsigned char *s, int *curr_char_length)
{
  int width;

  assert (s != NULL);

  /* ASCII -> 1 byte; Code Set 3 (SS3 lead byte) -> 3 bytes; CS1/CS2 -> 2 */
  width = !IS_8BIT (*s) ? 1 : ((*s == SS3) ? 3 : 2);

  *curr_char_length = width;
  return s + width;
}

/*
 * intl_prevchar_euc() - steps back over one character of an EUC encoded
 *                   string
 *   return: pointer to the start of the character that ends just before s
 *   s(in): string position; must be greater than s_start
 *   s_start(in) : start of buffer string
 *   prev_char_length(out): length in bytes of the previous character
 *
 * Note: the candidate lead bytes are inspected without scanning from the
 *       start of the buffer: SS3 three bytes back means a 3-byte character,
 *       otherwise a byte with the top bit set two bytes back means a 2-byte
 *       character, otherwise the character is a single byte.
 */
const unsigned char *
intl_prevchar_euc (const unsigned char *s, const unsigned char *s_start, int *prev_char_length)
{
  int width = 1;

  assert (s != NULL);
  assert (s > s_start);

  if (s - 3 >= s_start && s[-3] == SS3)
    {
      width = 3;
    }
  else if (s - 2 >= s_start && IS_8BIT (s[-2]))
    {
      width = 2;
    }

  *prev_char_length = width;
  return s - width;
}

/*
 * intl_tolower_euc() - copies an EUC encoded string into d, passing every
 *                      byte through char_tolower ()
 *   return: number of EUC characters in the source string
 *   src(in): EUC string to lowercase
 *   d(out): destination buffer of at least byte_size bytes
 *   byte_size(in): size in bytes of source string
 */
static int
intl_tolower_euc (const unsigned char *src, unsigned char *d, int byte_size)
{
  int i;

  assert (src != NULL);

  for (i = 0; i < byte_size; i++)
    {
      d[i] = char_tolower (src[i]);
    }

  return intl_count_euc_chars (src, byte_size);
}

/*
 * intl_toupper_euc() - copies an EUC encoded string into d, passing every
 *                      byte through char_toupper ()
 *   return: number of EUC characters in the source string
 *   src(in): EUC string to uppercase
 *   d(out): destination buffer of at least byte_size bytes
 *   byte_size(in): size in bytes of source string
 */
static int
intl_toupper_euc (const unsigned char *src, unsigned char *d, int byte_size)
{
  int i;

  assert (src != NULL);

  for (i = 0; i < byte_size; i++)
    {
      d[i] = char_toupper (src[i]);
    }

  return intl_count_euc_chars (src, byte_size);
}

/*
 * intl_count_euc_chars() - Counts the number of EUC encoded characters in the
 *                     string.  Embedded NULL characters are counted.
 *   return: number of whole EUC characters found
 *   s(in): string
 *   length_in_bytes(in): length of the string
 *
 * Note: Only whole characters are counted.
 *       A trailing character whose bytes extend past
 *       s[length_in_bytes-1] is not counted.
 */
static int
intl_count_euc_chars (const unsigned char *s, int length_in_bytes)
{
  const unsigned char *const limit = s + length_in_bytes;
  const unsigned char *cursor = s;
  int width;
  int count = 0;

  assert (s != NULL);

  while (cursor < limit)
    {
      cursor = intl_nextchar_euc (cursor, &width);
      if (cursor > limit)
        {
          /* the last character is cut off by the byte limit */
          break;
        }
      count++;
    }

  return count;
}

/*
 * intl_count_euc_bytes() - Counts the number of bytes it takes to encode the
 *                     next <length_in_chars> EUC characters in the string
 *   return: byte count
 *   s(in): EUC encoded string
 *   length_in_chars(in): number of characters to measure
 *
 * Note: the string is not bounds-checked; it must hold at least
 *       length_in_chars characters.
 */
static int
intl_count_euc_bytes (const unsigned char *s, int length_in_chars)
{
  int remaining;
  int width;
  int total = 0;

  assert (s != NULL);

  for (remaining = length_in_chars; remaining > 0; remaining--)
    {
      s = intl_nextchar_euc (s, &width);
      total += width;
    }

  return total;
}

/*
 * string handling functions
 */

/*
 * intl_convert_charset() - converts a character string from one codeset to another
 *   return: error code; always ER_QSTR_BAD_SRC_CODESET for now
 *   src(in): string to convert
 *   length_in_chars(in): number of characters from src to convert
 *   src_codeset(in): enumeration of src codeset
 *   dest(out): string of converted characters
 *   dest_codeset(in): enumeration of dest codeset
 *   unconverted(out): number of chars that could not be converted
 *
 * Note: Currently, codeset conversion is not supported: no argument is read
 *       and neither dest nor unconverted is written.
 */
int
intl_convert_charset (const unsigned char *src, int length_in_chars, INTL_CODESET src_codeset, unsigned char *dest,
                      INTL_CODESET dest_codeset, int *unconverted)
{
  /* every source codeset (ISO88591, KSC5601_EUC, UTF8, RAW_BYTES and any
   * other value) is rejected alike */
  return ER_QSTR_BAD_SRC_CODESET;
}

/*
 * intl_char_count() - Counts the number of characters in the string
 *   return: number of characters found (same value as *char_count)
 *   src(in): string of characters to count; may be NULL when
 *            length_in_bytes is 0
 *   length_in_bytes(in): length of the string
 *   src_codeset(in): enumeration of src codeset
 *   char_count(out): number of characters found
 *
 * Note: Embedded NULL characters are counted.
 */
int
intl_char_count (const unsigned char *src, int length_in_bytes, INTL_CODESET src_codeset, int *char_count)
{
  int count;

  /* no need to check codeset for NULL string */
  if (src == NULL)
    {
      assert (length_in_bytes == 0);

      *char_count = 0;
      return 0;
    }

  switch (src_codeset)
    {
    case INTL_CODESET_KSC5601_EUC:
      count = intl_count_euc_chars (src, length_in_bytes);
      break;

    case INTL_CODESET_UTF8:
      count = intl_count_utf8_chars (src, length_in_bytes);
      break;

    case INTL_CODESET_ISO88591:
    case INTL_CODESET_RAW_BYTES:
      /* one byte per character */
      count = length_in_bytes;
      break;

    default:
      assert (false);
      count = 0;
      break;
    }

  *char_count = count;
  return count;
}

/*
 * intl_char_size() - returns the number of bytes in a string given the
 *                   start and character length of the string
 *   return: number of bytes (same value as *byte_count)
 *   src(in): start of the string; may be NULL when length_in_chars is 0
 *   length_in_chars(in): length of the string in characters
 *   src_codeset(in): enumeration of src codeset
 *   byte_count(out): number of bytes used to encode the number of
 *                    characters specified
 *
 * Note: Embedded NULL's are counted as characters.
 */
int
intl_char_size (const unsigned char *src, int length_in_chars, INTL_CODESET src_codeset, int *byte_count)
{
  int size;

  /* no need to check codeset for NULL string */
  if (src == NULL)
    {
      assert (length_in_chars == 0);

      *byte_count = 0;
      return 0;
    }

  switch (src_codeset)
    {
    case INTL_CODESET_KSC5601_EUC:
      size = intl_count_euc_bytes (src, length_in_chars);
      break;

    case INTL_CODESET_UTF8:
      size = intl_count_utf8_bytes (src, length_in_chars);
      break;

    case INTL_CODESET_ISO88591:
    case INTL_CODESET_RAW_BYTES:
      /* one byte per character */
      size = length_in_chars;
      break;

    default:
      assert (false);
      size = 0;
      break;
    }

  *byte_count = size;
  return size;
}

#if defined (ENABLE_UNUSED_FUNCTION)
/*
 * intl_char_size_pseudo_kor() - returns the number of bytes in a string given
 *               the start and character length of the string
 *
 *   return: number of bytes (same value as *byte_count)
 *   src(in): start of the string
 *   length_in_chars(in): length of the string in characters
 *   src_codeset(in): enumeration of src codeset
 *   byte_count(out): number of bytes used to encode the number of
 *                    characters specified
 *
 * Note: Embedded NULL's are counted as characters.
 *   This is similar to 'intl_char_size' except that with the
 *   INTL_CODESET_ISO88591 codeset, and PRM_ID_SINGLE_BYTE_COMPARE turned off,
 *   some bytes are considered the start of korean characters: SS3 starts a
 *   3-byte character and a byte accepted by IS_PSEUDO_KOREAN a 2-byte one.
 *   This function is used in context of some specific string functions.
 */
int
intl_char_size_pseudo_kor (const unsigned char *src, int length_in_chars, INTL_CODESET src_codeset, int *byte_count)
{
  switch (src_codeset)
    {
    case INTL_CODESET_ISO88591:
      if (prm_get_bool_value (PRM_ID_SINGLE_BYTE_COMPARE))
        {
          /* plain single-byte handling */
          *byte_count = length_in_chars;
        }
      else
        {
          int total = 0;
          int width;
          int remaining;

          for (remaining = length_in_chars; remaining > 0; remaining--)
            {
              if (*src == SS3)
                {
                  width = 3;
                }
              else if (IS_PSEUDO_KOREAN (*src))
                {
                  width = 2;
                }
              else
                {
                  width = 1;
                }

              total += width;
              src += width;
            }
          *byte_count = total;
        }
      break;

    case INTL_CODESET_KSC5601_EUC:
      *byte_count = intl_count_euc_bytes (src, length_in_chars);
      break;

    case INTL_CODESET_UTF8:
      *byte_count = intl_count_utf8_bytes (src, length_in_chars);
      break;

    default:
      assert (false);
      *byte_count = 0;
      break;
    }

  return *byte_count;
}
#endif

/*
 * intl_prev_char() - returns pointer to the previous char in string
 *
 *   return : pointer to the start of the character that ends just before s
 *   s(in) : string position; must be greater than s_start
 *   s_start(in) : start of buffer string
 *   codeset(in) : enumeration of src codeset
 *   prev_char_size(out) : size in bytes of previous character
 */
const unsigned char *
intl_prev_char (const unsigned char *s, const unsigned char *s_start, INTL_CODESET codeset, int *prev_char_size)
{
  assert (s > s_start);

  switch (codeset)
    {
    case INTL_CODESET_ISO88591:
    case INTL_CODESET_RAW_BYTES:
      /* single-byte codesets: handled after the switch */
      break;

    case INTL_CODESET_KSC5601_EUC:
      return intl_prevchar_euc (s, s_start, prev_char_size);

    case INTL_CODESET_UTF8:
      return intl_prevchar_utf8 (s, s_start, prev_char_size);

    default:
      assert (false);
      break;
    }

  /* step back a single byte */
  *prev_char_size = 1;
  return s - 1;
}

#if defined (ENABLE_UNUSED_FUNCTION)
/*
 * intl_prev_char_pseudo_kor() - returns pointer to the previous char in
 *               string
 *
 *   return : pointer to the start of the character that ends just before s;
 *            never a position before s_start
 *   s(in) : string position; must be greater than s_start
 *   s_start(in) : start of buffer string
 *   codeset(in) : enumeration of src codeset
 *   prev_char_size(out) : size in bytes of previous character
 *
 * Note: This is similar to 'intl_prev_char' except with INTL_CODESET_ISO88591
 *   codeset, some bytes are considered korean characters
 *   This function is used in context of some specific string functions.
 *   The lead byte positions are the ones used by intl_prevchar_euc (): SS3
 *   three bytes back marks a 3-byte character, a pseudo-korean byte two
 *   bytes back a 2-byte character.
 */
unsigned char *
intl_prev_char_pseudo_kor (const unsigned char *s, const unsigned char *s_start, INTL_CODESET codeset,
                           int *prev_char_size)
{
  assert (s > s_start);

  switch (codeset)
    {
    case INTL_CODESET_ISO88591:
      /* only a byte in the korean range can end a multi-byte character */
      if (!prm_get_bool_value (PRM_ID_SINGLE_BYTE_COMPARE) && IS_PSEUDO_KOREAN (*(s - 1)))
        {
          /* look at the candidate lead byte, and only when the whole
           * character lies inside the buffer */
          if (s - s_start >= 3 && *(s - 3) == SS3)
            {
              *prev_char_size = 3;
              return (unsigned char *) (s - 3);
            }
          else if (s - s_start >= 2 && IS_PSEUDO_KOREAN (*(s - 2)))
            {
              *prev_char_size = 2;
              return (unsigned char *) (s - 2);
            }
        }

      break;

    case INTL_CODESET_KSC5601_EUC:
      return (unsigned char *) intl_prevchar_euc (s, s_start, prev_char_size);

    case INTL_CODESET_UTF8:
      return (unsigned char *) intl_prevchar_utf8 (s, s_start, prev_char_size);

    default:
      assert (false);
    }

  *prev_char_size = 1;
  return (unsigned char *) (s - 1);
}
#endif

/*
 * intl_next_char () - returns pointer to the next char in string
 *
 *   return: pointer just past the character that starts at s
 *           (s itself for an unknown codeset)
 *   s(in) : string, positioned on the first byte of a character
 *   codeset(in) : enumeration of the codeset of s
 *   current_char_size(out) : length in bytes of the character at s
 *                            (0 for an unknown codeset)
 */
const unsigned char *
intl_next_char (const unsigned char *s, INTL_CODESET codeset, int *current_char_size)
{
  switch (codeset)
    {
    case INTL_CODESET_KSC5601_EUC:
      return intl_nextchar_euc (s, current_char_size);

    case INTL_CODESET_UTF8:
      /* the length is looked up from the lead byte */
      *current_char_size = intl_Len_utf8_char[*s];
      return s + *current_char_size;

    case INTL_CODESET_ISO88591:
    case INTL_CODESET_RAW_BYTES:
      *current_char_size = 1;
      return s + 1;

    default:
      assert (false);
      *current_char_size = 0;
      return s;
    }
}

#if defined (ENABLE_UNUSED_FUNCTION)
/*
 * intl_next_char_pseudo_kor () - returns pointer to the next char in string
 *
 *   return: Pointer to the next character in the string.
 *   s(in) : string
 *   codeset(in) : enumeration of the codeset of s
 *   current_char_size(out) : length of the character at s
 *
 * Note: This is similar to 'intl_next_char' except with INTL_CODESET_ISO88591
 *   codeset, some bytes are considered korean characters
 *   This function should be used only in context of string functions
 *   where korean characters are expected to be handled.
 */
unsigned char *
intl_next_char_pseudo_kor (const unsigned char *s, INTL_CODESET codeset, int *current_char_size)
{
  int size;

  switch (codeset)
    {
    case INTL_CODESET_KSC5601_EUC:
      return intl_nextchar_euc (s, current_char_size);

    case INTL_CODESET_UTF8:
      size = intl_Len_utf8_char[*s];
      break;

    case INTL_CODESET_ISO88591:
      size = 1;
      if (!prm_get_bool_value (PRM_ID_SINGLE_BYTE_COMPARE) && IS_PSEUDO_KOREAN (*s))
        {
          /* NOTE(review): IS_PSEUDO_KOREAN accepts 0xa1..0xfe only, so with the default
           * SS3 value (143) the three-byte case can never be selected here -- confirm
           * whether SS3 was meant to be tested outside the range check. */
          size = (*s == SS3) ? 3 : 2;
        }
      break;

    default:
      assert (false);
      size = 0;
      break;
    }

  *current_char_size = size;
  return s + size;
}
#endif

/*
 * intl_cmp_char() - compares the first character of two strings
 *   return: zero if the characters are equal, non-zero otherwise
 *   s1(in): first string
 *   s2(in): second string
 *   codeset(in): codeset of both strings
 *   char_size(out): size in bytes of the first character in s1
 *
 *  Note: it is assumed that both strings contain at least one character of
 *    the given codeset.
 *
 */
int
intl_cmp_char (const unsigned char *s1, const unsigned char *s2, INTL_CODESET codeset, int *char_size)
{
  switch (codeset)
    {
    case INTL_CODESET_ISO88591:
    case INTL_CODESET_RAW_BYTES:
      /* single byte characters: the result is the plain byte difference */
      *char_size = 1;
      return *s1 - *s2;

    case INTL_CODESET_UTF8:
      *char_size = intl_Len_utf8_char[*s1];
      break;

    case INTL_CODESET_KSC5601_EUC:
      /* only the size of the character at s1 is needed, not the next pointer */
      (void) intl_nextchar_euc (s1, char_size);
      break;

    default:
      assert (false);
      *char_size = 1;
      return 0;
    }

  /* multi-byte codesets: compare as many bytes as the character in s1 occupies */
  return memcmp (s1, s2, *char_size);
}

#if defined (ENABLE_UNUSED_FUNCTION)
/*
 * intl_cmp_char_pseudo_kor() - compares the first character of two strings
 *   return: zero if character are equal, non-zero otherwise
 *   s1(in):
 *   s2(in):
 *   codeset:
 *   char_size(out): size of char in bytes of the first character in s1
 *
 *  Note: same as intl_cmp_char, except that with ISO-8859-1 codeset, some
 *    bytes are handled as Korean characters.
 *
 */
int
intl_cmp_char_pseudo_kor (const unsigned char *s1, const unsigned char *s2, INTL_CODESET codeset, int *char_size)
{
  switch (codeset)
    {
    case INTL_CODESET_ISO88591:
      if (!prm_get_bool_value (PRM_ID_SINGLE_BYTE_COMPARE) && IS_PSEUDO_KOREAN (*s1))
        {
          /* NOTE(review): IS_PSEUDO_KOREAN accepts 0xa1..0xfe only, so with the default
           * SS3 value (143) the three-byte case can never be selected here -- confirm. */
          *char_size = (*s1 == SS3) ? 3 : 2;
          return memcmp (s1, s2, *char_size);
        }
      /* plain single byte character */
      *char_size = 1;
      return *s1 - *s2;

    case INTL_CODESET_UTF8:
      *char_size = intl_Len_utf8_char[*s1];
      break;

    case INTL_CODESET_KSC5601_EUC:
      (void) intl_nextchar_euc ((unsigned char *) s1, char_size);
      break;

    default:
      assert (false);
      *char_size = 1;
      return 0;
    }

  /* multi-byte codesets: compare as many bytes as the character in s1 occupies */
  return memcmp (s1, s2, *char_size);
}

/*
 * intl_kor_cmp() - compares two byte strings, taking pseudo-Korean bytes
 *                  two at a time
 *   return: 0 if no difference is found within 'size' bytes, otherwise the
 *           non-zero result of the first comparison that differs
 *   s1(in):
 *   s2(in):
 *   size(in): max size in bytes to compare
 *
 *  Note: this function is used only in context of 'replace' string function
 *    strncmp function should be used.
 */
int
intl_kor_cmp (unsigned char *s1, unsigned char *s2, int size)
{
  while (size > 0)
    {
      bool single_byte = prm_get_bool_value (PRM_ID_SINGLE_BYTE_COMPARE);
      bool kor1 = IS_PSEUDO_KOREAN (*s1);

      if (!single_byte && kor1 && IS_PSEUDO_KOREAN (*s2))
        {
          /* both sides start a two-byte unit: compare the pair at once */
          int pair_diff = memcmp (s1, s2, 2);

          if (pair_diff != 0)
            {
              return pair_diff;
            }
          s1 += 2;
          s2 += 2;
          size -= 2;
          continue;
        }

      /* a pseudo-Korean lead byte on s1 only, or different bytes: mismatch */
      if ((!single_byte && kor1) || *s1 != *s2)
        {
          return (*s1 - *s2);
        }

      s1++;
      s2++;
      size--;
    }

  return 0;
}
#endif

/*
 * intl_pad_char() - returns the pad character of requested codeset
 *   return: none
 *   codeset(in): International codeset.
 *   pad_char(in/out): Pointer to array which will be filled with
 *             the pad character.
 *   pad_size(out): Size of pad character.
 *
 * Note:
 *     There is a pad character associated with every character code
 *     set.  This function will retrieve the pad character for a given
 *     code set.  The pad character is written into an array that must
 *     be allocated by the caller.
 *
 */
void
intl_pad_char (const INTL_CODESET codeset, unsigned char *pad_char, int *pad_size)
{
  switch (codeset)
    {
    case INTL_CODESET_KSC5601_EUC:
      /* two-byte pad: octal 241 (0xa1) stored twice */
      pad_char[0] = '\241';
      pad_char[1] = '\241';
      *pad_size = 2;
      return;

    case INTL_CODESET_ASCII:
    case INTL_CODESET_ISO88591:
    case INTL_CODESET_UTF8:
      /* text codesets pad with a single space */
      pad_char[0] = ' ';
      *pad_size = 1;
      return;

    case INTL_CODESET_RAW_BITS:
    case INTL_CODESET_RAW_BYTES:
      /* raw data pads with a single zero byte */
      pad_char[0] = '\0';
      *pad_size = 1;
      return;

    default:
      /* unknown codeset: both outputs are left untouched */
      assert (false);
      return;
    }
}

/*
 * intl_pad_size() - Returns the byte size of the pad character for the given
 *           codeset.
 *   return: size of padding char in bytes
 *   codeset(in): International codeset.
 *
 * Note:
 *     Only INTL_CODESET_KSC5601_EUC has a two-byte pad character; every
 *     other codeset value, including unrecognized ones, yields 1.
 *
 */
int
intl_pad_size (INTL_CODESET codeset)
{
  return (codeset == INTL_CODESET_KSC5601_EUC) ? 2 : 1;
}

/*
 * intl_upper_string_size() - determine the size required for holding
 *               upper case of the input string
 *   return: required size
 *   alphabet(in): alphabet data
 *   src(in): string to uppercase
 *   src_size(in): buffer size
 *   src_length(in): length of the string measured in characters
 */
int
intl_upper_string_size (const ALPHABET_DATA * alphabet, const unsigned char *src, int src_size, int src_length)
{
  int char_count;
  int req_size = src_size;

  assert (alphabet != NULL);

  switch (alphabet->codeset)
    {
    case INTL_CODESET_ISO88591:
    case INTL_CODESET_RAW_BYTES:
      break;

    case INTL_CODESET_KSC5601_EUC:
      break;

    case INTL_CODESET_UTF8:
      {
    unsigned char upper[INTL_UTF8_MAX_CHAR_SIZE];
    unsigned char *next = NULL;

    req_size = 0;
    for (char_count = 0; char_count < src_length && src_size > 0; char_count++)
      {
        req_size += intl_char_toupper_utf8 (alphabet, src, src_size, upper, &next);
        src_size -= CAST_STRLEN (next - src);
        src = next;
      }
      }
      break;

    default:
      assert (false);
      break;
    }

  return req_size;
}

/*
 * intl_upper_string() - replace all lower case characters with their
 *                       upper case characters
 *   return: character counts
 *   alphabet(in): alphabet data
 *   src(in): string source to uppercase
 *   dst(out): output string
 *   length_in_chars(in): length of the string measured in characters
 */
int
intl_upper_string (const ALPHABET_DATA * alphabet, const unsigned char *src, unsigned char *dst, int length_in_chars)
{
  int converted = 0;
  int byte_count;
  int unused_size;
  int i;

  assert (alphabet != NULL);

  switch (alphabet->codeset)
    {
    case INTL_CODESET_UTF8:
      /* the destination size reported by the helper is not needed here */
      converted = intl_toupper_utf8 (alphabet, src, dst, length_in_chars, &unused_size);
      break;

    case INTL_CODESET_KSC5601_EUC:
      /* the EUC helper works on a byte count, so translate the length first */
      intl_char_size (src, length_in_chars, INTL_CODESET_KSC5601_EUC, &byte_count);
      if (byte_count > 0)
        {
          converted = intl_toupper_euc (src, dst, byte_count);
        }
      break;

    case INTL_CODESET_ISO88591:
      /* one byte per character: convert byte by byte */
      for (i = 0; i < length_in_chars; i++)
        {
          dst[i] = char_toupper_iso8859 (src[i]);
        }
      converted = length_in_chars;
      break;

    case INTL_CODESET_RAW_BYTES:
      /* raw bytes have no case: plain copy */
      memcpy (dst, src, length_in_chars);
      converted = length_in_chars;
      break;

    default:
      assert (false);
      break;
    }

  return converted;
}

/*
 * intl_lower_string_size() - determine the size required for holding
 *               lower case of the input string
 *   return: required size
 *   alphabet(in): alphabet data
 *   src(in): string to lowercase
 *   src_size(in): buffer size
 *   src_length(in): length of the string measured in characters
 */
int
intl_lower_string_size (const ALPHABET_DATA * alphabet, const unsigned char *src, int src_size, int src_length)
{
  int char_count;
  int req_size = src_size;

  assert (alphabet != NULL);

  switch (alphabet->codeset)
    {
    case INTL_CODESET_ISO88591:
    case INTL_CODESET_RAW_BYTES:
      break;

    case INTL_CODESET_KSC5601_EUC:
      break;

    case INTL_CODESET_UTF8:
      {
    unsigned char lower[INTL_UTF8_MAX_CHAR_SIZE];
    unsigned char *next;

    req_size = 0;
    for (char_count = 0; char_count < src_length && src_size > 0; char_count++)
      {
        req_size += intl_char_tolower_utf8 (alphabet, src, src_size, lower, &next);
        src_size -= CAST_STRLEN (next - src);
        src = next;
      }
      }
      break;

    default:
      assert (false);
      break;
    }

  return req_size;
}

/*
 * intl_lower_string() - replace all upper case characters with their
 *                      lower case characters
 *   return: character counts
 *   alphabet(in): alphabet data
 *   src(in): string to lowercase
 *   dst(out): output string
 *   length_in_chars(in): length of the string measured in characters
 */
int
intl_lower_string (const ALPHABET_DATA * alphabet, const unsigned char *src, unsigned char *dst, int length_in_chars)
{
  int char_count = 0;

  assert (alphabet != NULL);

  switch (alphabet->codeset)
    {
    case INTL_CODESET_ISO88591:
      {
        unsigned char *d;
        const unsigned char *s;

        /* one byte per character: convert byte by byte */
        for (d = dst, s = src; d < dst + length_in_chars; d++, s++)
          {
            *d = char_tolower_iso8859 (*s);
          }
        char_count = length_in_chars;
      }
      break;

    case INTL_CODESET_RAW_BYTES:
      /* Raw bytes have no case: plain copy.  Every byte is one character, so report the
       * count exactly as intl_upper_string does; previously this case returned 0. */
      memcpy (dst, src, length_in_chars);
      char_count = length_in_chars;
      break;

    case INTL_CODESET_KSC5601_EUC:
      {
        int byte_count;

        /* the EUC helper works on a byte count, so translate the length first */
        intl_char_size (src, length_in_chars, INTL_CODESET_KSC5601_EUC, &byte_count);
        if (byte_count > 0)
          {
            char_count = intl_tolower_euc (src, dst, byte_count);
          }
      }
      break;

    case INTL_CODESET_UTF8:
      {
        int dummy_size;

        /* the destination size reported by the helper is not needed here */
        char_count = intl_tolower_utf8 (alphabet, src, dst, length_in_chars, &dummy_size);
      }
      break;

    default:
      assert (false);
      break;
    }

  return char_count;
}

#if defined (ENABLE_UNUSED_FUNCTION)
/*
 * intl_is_korean() - test for a korean character
 *   return: non-zero if ch is a korean character,
 *           0 otherwise.
 *   ch(in): the character to be tested
 */
static int
intl_is_korean (unsigned char ch)
{
  /* Never Korean when single byte comparison is enabled.  Otherwise the byte must lie in
   * 0xa1..0xfe; the narrower 0xb0..0xc8 range tested previously lies entirely inside it. */
  return !prm_get_bool_value (PRM_ID_SINGLE_BYTE_COMPARE) && ch >= 0xa1 && ch <= 0xfe;
}

/*
 * intl_language() - Returns the language for the given category of the
 *                   current locale
 *   return: INTL_LANG enumeration
 *   category(in): category argument to setlocale()
 */
INTL_LANG
intl_language (int category)
{
  /* a NULL locale argument only queries the current setting of the category */
  const char *current = setlocale (category, NULL);

#if defined(WINDOWS) || defined(SOLARIS)
  /* on these platforms the locale name is not inspected */
  return INTL_LANG_ENGLISH;
#else /* !WINDOWS && !SOLARIS */
  if (current == NULL || strcmp (current, LOCALE_KOREAN) != 0)
    {
      return INTL_LANG_ENGLISH;
    }

  return INTL_LANG_KOREAN;
#endif
}
#endif /* ENABLE_UNUSED_FUNCTION */

/*
 * intl_zone() - Return the zone matching the current language
 *   return: INTL_ZONE enumeration
 *   category(in): not used; the zone is derived from lang_id ()
 */
INTL_ZONE
intl_zone (int category)
{
  /* Korean maps to the KR zone; English and any other language map to US */
  return (lang_id () == INTL_LANG_KOREAN) ? INTL_ZONE_KR : INTL_ZONE_US;
}

/*
 * intl_reverse_string() - reverse characters of source string,
 *             into destination string
 *   return: character counts
 *   src(in): source string
 *   dst(out): destination string
 *   length_in_chars(in): length of the string measured in characters
 *   size_in_bytes(in): size of the string in bytes
 *   codeset(in): enumeration of source string
 */
int
intl_reverse_string (const unsigned char *src, unsigned char *dst, int length_in_chars, int size_in_bytes,
                     INTL_CODESET codeset)
{
  const unsigned char *end, *s;
  unsigned char *d;
  int char_count = 0;
  int char_size, i;

  assert (src != NULL);
  assert (dst != NULL);

  s = src;

  switch (codeset)
    {
    case INTL_CODESET_ISO88591:
    case INTL_CODESET_RAW_BYTES:
      /* one byte per character: mirror the first length_in_chars bytes */
      for (i = 0; i < length_in_chars; i++)
        {
          dst[length_in_chars - 1 - i] = src[i];
          char_count++;
        }
      break;

    case INTL_CODESET_KSC5601_EUC:
    case INTL_CODESET_UTF8:
      /* Multi-byte codesets: characters are copied from the front of src to the back of
       * dst, keeping the byte order inside each character.  'd' is always one past the
       * last free byte of dst. */
      d = dst + size_in_bytes;
      end = src + size_in_bytes;
      for (; s < end && char_count < length_in_chars; char_count++)
        {
          if (codeset == INTL_CODESET_UTF8)
            {
              char_size = intl_Len_utf8_char[*s];
            }
          else if (!IS_8BIT (*s))
            {
              char_size = 1;    /* ASCII character */
            }
          else if (*s == SS3)
            {
              char_size = 3;    /* Code Set 3 character */
            }
          else
            {
              char_size = 2;    /* 2 byte character (CS1 or CS2) */
            }

          /* A lead byte may announce more bytes than the buffer still holds (text is not
           * necessarily validated).  Clamp so that nothing is read past 'end' and nothing
           * is written in front of 'dst'; the truncated tail is copied as it is. */
          if (char_size > end - s)
            {
              char_size = (int) (end - s);
            }

          d -= char_size;
          for (i = 0; i < char_size; i++)
            {
              d[i] = s[i];
            }
          s += char_size;
        }
      break;

    default:
      assert (false);
      break;
    }

  return char_count;
}

/*
 * intl_is_max_bound_chr () -
 *
 * return: check if chr points to a char representing the upper bound
 *     codepoint in the selected codeset, for LIKE index optimization.
 *
 * codeset(in) : the codeset to consider
 * chr(in) : upper bound, as bytes
 */
bool
intl_is_max_bound_chr (INTL_CODESET codeset, const unsigned char *chr)
{
  switch (codeset)
    {
    case INTL_CODESET_UTF8:
      /* four-byte sequence F4 8F BF BF */
      return chr[0] == 0xf4 && chr[1] == 0x8f && chr[2] == 0xbf && chr[3] == 0xbf;

    case INTL_CODESET_KSC5601_EUC:
      /* two bytes, both 0xff */
      return chr[0] == 0xff && chr[1] == 0xff;

    case INTL_CODESET_ISO88591:
    case INTL_CODESET_RAW_BYTES:
    default:
      /* single byte codesets and anything unrecognized: one 0xff byte */
      return chr[0] == 0xff;
    }
}

/*
 * intl_is_min_bound_chr () -
 *
 * return: check if chr points to a char representing the lower bound
 *     codepoint in the selected codeset, for LIKE index optimization.
 *
 * codeset(in) : the codeset to consider (not used: the same check is applied
 *     to every codeset)
 * chr(in) : candidate lower bound, as bytes
 *
 * Note: the lower bound is the space character; only the first byte of
 *    'chr' is examined.
 */
bool
intl_is_min_bound_chr (INTL_CODESET codeset, const unsigned char *chr)
{
  return *chr == ' ';
}

/*
 * intl_set_min_bound_chr () - sets chr to a byte array representing
 *                 the lowest bound codepoint in the selected
 *                 codeset, for LIKE index optimization.
 *
 * return: the number of bytes added to chr (always 1)
 *
 * codeset(in) : the codeset to consider (not used: a space is written for
 *     every codeset)
 * chr(out) : char pointer where to place the bound
 */
int
intl_set_min_bound_chr (INTL_CODESET codeset, char *chr)
{
  const int bound_size = 1;

  chr[0] = ' ';

  return bound_size;
}

/*
 * intl_set_max_bound_chr () - sets chr to a byte array representing
 *                 the up-most bound codepoint in the selected
 *                 codeset, for LIKE index optimization.
 *
 * return: the number of bytes added to chr
 *
 * codeset(in) : the codeset to consider
 * chr(in) : char pointer where to place the bound
 *
 * Note: 'chr' buffer should be able to store at least one more char:
 *   4 bytes (UTF-8), 2 bytes (EUC-KR), 1 byte (ISO-8859-1).
 *
 */
int
intl_set_max_bound_chr (INTL_CODESET codeset, char *chr)
{
  switch (codeset)
    {
    case INTL_CODESET_UTF8:
      /* four-byte sequence F4 8F BF BF */
      memcpy (chr, "\xf4\x8f\xbf\xbf", 4);
      return 4;

    case INTL_CODESET_KSC5601_EUC:
      /* two bytes, both 0xff */
      memcpy (chr, "\xff\xff", 2);
      return 2;

    case INTL_CODESET_ISO88591:
    case INTL_CODESET_RAW_BYTES:
    default:
      /* single byte codesets and anything unrecognized: one 0xff byte */
      chr[0] = (char) 0xff;
      return 1;
    }
}

/*
 * general routines for UTF-8 encoding
 */

/*
 * Length in bytes of a UTF-8 sequence, indexed by the value of its first byte.
 * Bytes that cannot start a sequence (0x80-0xbf, 0xfe, 0xff) are given length 1.
 */
static const unsigned char len_utf8_char[256] = {
  /* 0x00 - 0x1f : 1 */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1,
  /* 0x20 - 0x3f : 1 */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1,
  /* 0x40 - 0x5f : 1 */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1,
  /* 0x60 - 0x7f : 1 */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1,
  /* 0x80 - 0x9f : 1 (10xxxxxx, not a lead byte) */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1,
  /* 0xa0 - 0xbf : 1 (10xxxxxx, not a lead byte) */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1,
  /* 0xc0 - 0xdf : 2 */
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2,
  /* 0xe0 - 0xef : 3, 0xf0 - 0xf7 : 4, 0xf8 - 0xfb : 5, 0xfc - 0xfd : 6, 0xfe - 0xff : 1 */
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5,
  5, 5, 5, 6, 6, 1, 1
};

/* externally visible read-only view of the table above */
const unsigned char *const intl_Len_utf8_char = len_utf8_char;

/*
 * intl_nextchar_utf8() - returns a pointer to the next character in the
 *              UTF-8 encoded string.
 *   return: pointer to the next character
 *   s(in): input string
 *   curr_char_length(out): length of the character at s
 */
const unsigned char *
intl_nextchar_utf8 (const unsigned char *s, int *curr_char_length)
{
  /* INTL_GET_NEXTCHAR_UTF8 is defined outside this part of the file.  It receives both the
   * local copy of s and *curr_char_length; presumably it stores the byte length of the
   * character at s and advances s past it -- TODO confirm against the macro definition. */
  INTL_GET_NEXTCHAR_UTF8 (s, *curr_char_length);
  return s;
}

/*
 * intl_prevchar_utf8() - returns a pointer to the previous character in the
 *                   UTF-8 encoded string.
 *   return: pointer to the previous character
 *   s(in): string
 *   s_start(in) : start of buffer string
 *   prev_char_length(out): length of the previous character
 *
 * Note: the caller is expected to pass s > s_start.  Bytes in front of
 *   s_start are never read; when no lead byte is found inside the buffer
 *   (or within 6 bytes) the function steps back a single byte.
 */
const unsigned char *
intl_prevchar_utf8 (const unsigned char *s, const unsigned char *s_start, int *prev_char_length)
{
  int l = 1;

  /* walk backwards over continuation bytes (10xxxxxx), at most 6 bytes in total and
   * never further than the start of the buffer */
  while (l < 6 && l <= s - s_start && (s[-l] & 0xc0) == 0x80)
    {
      l++;
    }

  /* The scan may stop because the buffer start was passed rather than because a lead
   * byte was found.  In that case s[-l] lies outside the buffer: it must not be read
   * and the result must not point in front of s_start, so step back one byte only.
   * The same fallback applies when 6 bytes were scanned without meeting a lead byte. */
  if (l > s - s_start || (s[-l] & 0xc0) == 0x80)
    {
      l = 1;
    }

  s -= l;
  *prev_char_length = l;

  return s;
}

/*
 * intl_tolower_utf8() - Replaces all upper case characters inside an UTF-8
 *           encoded string with their lower case codes.
 *   return: character counts
 *   alphabet(in): alphabet to use
 *   s(in): UTF-8 string to lowercase
 *   d(out): output string
 *   length_in_chars(in): length of the string measured in characters
 *   d_size(out): size in bytes of destination
 */
static int
intl_tolower_utf8 (const ALPHABET_DATA * alphabet, const unsigned char *s, unsigned char *d, int length_in_chars,
                   int *d_size)
{
  unsigned char *next = NULL;
  int bytes_left;
  int written;
  int char_count = 0;

  assert (s != NULL);
  assert (d_size != NULL);

  /* byte size of the first length_in_chars characters of the source */
  intl_char_size (s, length_in_chars, INTL_CODESET_UTF8, &bytes_left);
  *d_size = 0;

  /* convert one character per iteration until either limit is reached */
  while (char_count < length_in_chars && bytes_left > 0)
    {
      written = intl_char_tolower_utf8 (alphabet, s, bytes_left, d, &next);
      d += written;
      *d_size += written;

      bytes_left -= CAST_STRLEN (next - s);
      s = next;
      char_count++;
    }

  return char_count;
}

/*
 * intl_toupper_utf8() - Replaces all lower case characters inside an UTF-8
 *           encoded string with their upper case codes.
 *   return: character counts
 *   alphabet(in): alphabet to use
 *   s(in): UTF-8 string to uppercase
 *   d(out): output string
 *   length_in_chars(in): length of the string measured in characters
 *   d_size(out): size in bytes of destination
 */
static int
intl_toupper_utf8 (const ALPHABET_DATA * alphabet, const unsigned char *s, unsigned char *d, int length_in_chars,
                   int *d_size)
{
  unsigned char *next = NULL;
  int bytes_left;
  int written;
  int char_count = 0;

  assert (s != NULL);
  assert (d_size != NULL);

  /* byte size of the first length_in_chars characters of the source */
  intl_char_size (s, length_in_chars, INTL_CODESET_UTF8, &bytes_left);
  *d_size = 0;

  /* convert one character per iteration until either limit is reached */
  while (char_count < length_in_chars && bytes_left > 0)
    {
      written = intl_char_toupper_utf8 (alphabet, s, bytes_left, d, &next);
      d += written;
      *d_size += written;

      bytes_left -= CAST_STRLEN (next - s);
      s = next;
      char_count++;
    }

  return char_count;
}

/*
 * intl_count_utf8_chars() - Counts the number of UTF-8 encoded characters in
 *                     the string. Embedded NULL characters are counted.
 *   return: number of UTF-8 encoded characters found
 *   s(in): string
 *   length_in_bytes(in): length of the string
 *
 * Note: Only whole characters are counted.
 *       if s[length_in_bytes-1] is not the last byte of a multi-byte
 *       character or a single byte character, then that character is not
 *       counted.
 */
int
intl_count_utf8_chars (const unsigned char *s, int length_in_bytes)
{
  const unsigned char *end;
  const unsigned char *scan;
  bool only_ascii = true;
  int char_count = 0;

  assert (s != NULL);

  end = s + length_in_bytes;

  /* Pass 1 - look for any byte with the high bit set.  Only ASCII bytes (0xxxxxxx) have
   * it clear; lead bytes (110xxxxx, 1110xxxx, 11110xxx) and continuation bytes
   * (10xxxxxx) all have it set.  Eight bytes are tested at once through a 64-bit mask,
   * the remaining tail one byte at a time. */
  for (scan = s; only_ascii && scan + 8 <= end; scan += 8)
    {
      UINT64 word;

      memcpy (&word, scan, sizeof (word));
      only_ascii = (word & UINT64_C (0x8080808080808080)) == 0;
    }
  for (; only_ascii && scan < end; scan++)
    {
      only_ascii = (*scan < 0x80);
    }

  if (only_ascii)
    {
      /* every byte is a character of its own */
      return length_in_bytes;
    }

  /* Pass 2 - multi-byte data present: hop from lead byte to lead byte using the length
   * table; a character that would extend beyond 'end' is not counted. */
  while (s < end)
    {
      s += intl_Len_utf8_char[*s];
      if (s <= end)
        {
          char_count++;
        }
    }

  return char_count;
}

/*
 * intl_count_utf8_bytes() - Counts the number of bytes it takes to encode the
 *                     next <length_in_chars> UTF-8 characters in the string
 *   return: byte counts
 *   s(in): UTF-8 encoded string
 *   length_in_chars(in): length of the string in characters
 */
static int
intl_count_utf8_bytes (const unsigned char *s, int length_in_chars)
{
  const unsigned char *scan;
  const unsigned char *scan_end;
  bool only_ascii = true;
  int char_count;
  int byte_count = 0;

  assert (s != NULL);

  /* Pass 1 - test whether the first length_in_chars bytes are all ASCII.  If so each of
   * them is a complete character and the byte count equals the character count.  Only
   * ASCII bytes (0xxxxxxx) have the high bit clear; lead bytes (110xxxxx, 1110xxxx,
   * 11110xxx) and continuation bytes (10xxxxxx) have it set.  Eight bytes are tested at
   * once through a 64-bit mask, the remaining tail one byte at a time. */
  scan_end = s + length_in_chars;
  for (scan = s; only_ascii && scan + 8 <= scan_end; scan += 8)
    {
      UINT64 word;

      memcpy (&word, scan, sizeof (word));
      only_ascii = (word & UINT64_C (0x8080808080808080)) == 0;
    }
  for (; only_ascii && scan < scan_end; scan++)
    {
      only_ascii = (*scan < 0x80);
    }

  if (only_ascii)
    {
      return length_in_chars;
    }

  /* Pass 2 - multi-byte data present: add up the length announced by each lead byte */
  for (char_count = 0; char_count < length_in_chars; char_count++)
    {
      byte_count += intl_Len_utf8_char[s[byte_count]];
    }

  return byte_count;
}

/*
 * intl_char_tolower_utf8() - convert uppercase character to lowercase
 *   return: size of UTF-8 lowercase character corresponding to the argument
 *   alphabet(in): alphabet to use
 *   s (in): the UTF-8 buffer holding character to be converted
 *   size(in): size of UTF-8 buffer
 *   d (out): output buffer
 *   next (out): pointer to next character
 *
 *  Note : allocated size of 'd' is assumed to be large enough to fit any
 *     UTF-8 character
 */
static int
intl_char_tolower_utf8 (const ALPHABET_DATA * alphabet, const unsigned char *s, const int size, unsigned char *d,
                        unsigned char **next)
{
  const unsigned int cp = intl_utf8_to_cp (s, size, next);
  const unsigned int *case_p;
  int total_bytes = 0;
  int idx = 0;

  assert (alphabet != NULL);

  /* code points outside the alphabet table have no case mapping */
  if (cp >= (unsigned int) (alphabet->l_count))
    {
      if (cp == 0xffffffff)
        {
          /* this may happen when UTF-8 text validation is disabled (by default) */
          *d = *s;
          return 1;
        }

      return intl_cp_to_utf8 (cp, d);
    }

  /* one-to-one mapping: a single table lookup */
  if (alphabet->lower_multiplier == 1)
    {
      return intl_cp_to_utf8 (alphabet->lower_cp[cp], d);
    }

  assert (alphabet->lower_multiplier > 1 && alphabet->lower_multiplier <= INTL_CASING_EXPANSION_MULTIPLIER);

  /* Expanding mapping: each code point owns 'lower_multiplier' consecutive table slots.
   * The first slot is always written; later slots are written until a zero entry or
   * the end of the group is reached. */
  case_p = &(alphabet->lower_cp[cp * alphabet->lower_multiplier]);
  do
    {
      total_bytes += intl_cp_to_utf8 (case_p[idx], d + total_bytes);
      idx++;
    }
  while (idx < alphabet->lower_multiplier && case_p[idx] != 0);

  return total_bytes;
}

/*
 * intl_char_toupper_utf8() - convert lowercase character to uppercase
 *   return: size of UTF-8 uppercase character corresponding to the argument
 *   alphabet(in): alphabet to use
 *   s (in): the UTF-8 buffer holding character to be converted
 *   size(in): size of UTF-8 buffer
 *   d (out): output buffer
 *   next (out): pointer to next character
 *
 *  Note : allocated size of 'd' is assumed to be large enough to fit any
 *     UTF-8 character
 */
static int
intl_char_toupper_utf8 (const ALPHABET_DATA * alphabet, const unsigned char *s, const int size, unsigned char *d,
                        unsigned char **next)
{
  const unsigned int cp = intl_utf8_to_cp (s, size, next);
  const unsigned int *case_p;
  int total_bytes = 0;
  int idx = 0;

  assert (alphabet != NULL);

  /* code points outside the alphabet table have no case mapping */
  if (cp >= (unsigned int) (alphabet->l_count))
    {
      if (cp == 0xffffffff)
        {
          /* this may happen when UTF-8 text validation is disabled (by default) */
          *d = *s;
          return 1;
        }

      return intl_cp_to_utf8 (cp, d);
    }

  /* one-to-one mapping: a single table lookup */
  if (alphabet->upper_multiplier == 1)
    {
      return intl_cp_to_utf8 (alphabet->upper_cp[cp], d);
    }

  assert (alphabet->upper_multiplier > 1 && alphabet->upper_multiplier <= INTL_CASING_EXPANSION_MULTIPLIER);

  /* Expanding mapping: each code point owns 'upper_multiplier' consecutive table slots.
   * The first slot is always written; later slots are written until a zero entry or
   * the end of the group is reached. */
  case_p = &(alphabet->upper_cp[cp * alphabet->upper_multiplier]);
  do
    {
      total_bytes += intl_cp_to_utf8 (case_p[idx], d + total_bytes);
      idx++;
    }
  while (idx < alphabet->upper_multiplier && case_p[idx] != 0);

  return total_bytes;
}

/*
 * intl_identifier_casecmp_w_size()
 *   return:  0 if strings are equal, -1 if str1 < str2 , 1 if str1 > str2
 *   str1(in):
 *   str2(in):
 *   size_str1(in): size in bytes of str1
 *   size_str2(in): size in bytes of str2
 *
 */
int
intl_identifier_casecmp_w_size (const INTL_LANG lang_id, unsigned char *str1, unsigned char *str2, const int size_str1,
                                const int size_str2)
{
  /* ordering by byte size alone; zero when both sizes are equal */
  const int size_order = (size_str1 == size_str2) ? 0 : ((size_str1 < size_str2) ? -1 : 1);

#if INTL_IDENTIFIER_CASING_SIZE_MULTIPLIER <= 1
  /* without casing expansion, identifiers of different sizes can never match */
  if (size_order != 0)
    {
      return size_order;
    }
#endif

  switch (lang_charset ())
    {
    case INTL_CODESET_UTF8:
      {
        const LANG_LOCALE_DATA *loc = lang_get_specific_locale (lang_id, INTL_CODESET_UTF8);
        const ALPHABET_DATA *alphabet;
        unsigned char *end1, *end2;
        unsigned char *ignored;

        assert (loc != NULL);

        alphabet = &(loc->ident_alphabet);
        end1 = str1 + size_str1;
        end2 = str2 + size_str2;

        /* compare one code point from each side per iteration; the helper reports how
         * many bytes were consumed on each side */
        while (str1 < end1 && str2 < end2)
          {
            int skip1 = 0, skip2 = 0;
            unsigned int cp1, cp2;
            int res;

            cp1 = intl_utf8_to_cp (str1, CAST_STRLEN (end1 - str1), &ignored);
            cp2 = intl_utf8_to_cp (str2, CAST_STRLEN (end2 - str2), &ignored);

            res = intl_strcasecmp_utf8_one_cp (alphabet, str1, str2, CAST_STRLEN (end1 - str1),
                                               CAST_STRLEN (end2 - str2), cp1, cp2, &skip1, &skip2);
            if (res != 0)
              {
                return res;
              }

            str1 += skip1;
            str2 += skip2;
          }

        /* whichever side still has bytes left is the greater one */
        if (str1 < end1)
          {
            return 1;
          }
        return (str2 < end2) ? -1 : 0;
      }

    case INTL_CODESET_ISO88591:
      {
        int i;

        if (size_order != 0)
          {
            return size_order;
          }

        /* equal sizes: compare byte by byte, folding to lower case only on a mismatch */
        for (i = 0; i < size_str1; i++)
          {
            unsigned char lower1, lower2;

            if (str1[i] == str2[i])
              {
                continue;
              }

            lower1 = char_tolower_iso8859 (str1[i]);
            lower2 = char_tolower_iso8859 (str2[i]);
            if (lower1 != lower2)
              {
                return (lower1 < lower2) ? -1 : 1;
              }
          }

        return 0;
      }

    case INTL_CODESET_KSC5601_EUC:
    default:
      /* ASCII */
      if (size_order != 0)
        {
          return size_order;
        }

      return strncasecmp ((char *) str1, (char *) str2, size_str1);
    }
}

/*
 * intl_case_match_tok() - performs case insensitive matching
 *   return:  0 if the token matches the start of the source string,
 *            non-zero otherwise
 *   lang_id(in):
 *   codeset(in):
 *   tok(in): token to check
 *   src(in): string to check for token
 *   size_tok(in): size in bytes of token
 *   size_src(in): size in bytes of source string
 *   matched_size_src(out): size in bytes of matched token in source
 *
 *  Note : Matching is performed by folding to LOWER case;
 *     it takes into account case expansion (length in chars may differ).
 */
int
intl_case_match_tok (const INTL_LANG lang_id, const INTL_CODESET codeset, unsigned char *tok, unsigned char *src,
                     const int size_tok, const int size_src, int *matched_size_src)
{
  assert (tok != NULL);
  assert (src != NULL);

  assert (size_tok > 0);
  assert (size_src >= 0);

  assert (matched_size_src != NULL);

  *matched_size_src = 0;

  switch (codeset)
    {
    case INTL_CODESET_UTF8:
      {
        const LANG_LOCALE_DATA *loc = lang_get_specific_locale (lang_id, INTL_CODESET_UTF8);
        const ALPHABET_DATA *alphabet;
        unsigned char *tok_end, *src_end;
        unsigned char *ignored;

        assert (loc != NULL);

        alphabet = &(loc->alphabet);
        tok_end = tok + size_tok;
        src_end = src + size_src;

        /* compare one code point from each side per iteration; the bytes consumed on the
         * source side accumulate into the matched size */
        while (tok < tok_end && src < src_end)
          {
            int skip_tok = 0, skip_src = 0;
            unsigned int cp_tok, cp_src;
            int res;

            cp_tok = intl_utf8_to_cp (tok, CAST_STRLEN (tok_end - tok), &ignored);
            cp_src = intl_utf8_to_cp (src, CAST_STRLEN (src_end - src), &ignored);

            res = intl_strcasecmp_utf8_one_cp (alphabet, tok, src, CAST_STRLEN (tok_end - tok),
                                               CAST_STRLEN (src_end - src), cp_tok, cp_src, &skip_tok, &skip_src);
            if (res != 0)
              {
                return res;
              }

            tok += skip_tok;
            src += skip_src;
            *matched_size_src += skip_src;
          }

        /* token bytes left over mean the source ended before the token did */
        return (tok < tok_end) ? 1 : 0;
      }

    case INTL_CODESET_ISO88591:
      {
        int i;

        if (size_tok > size_src)
          {
            return 1;
          }

        *matched_size_src = size_tok;

        /* compare byte by byte, folding to lower case only on a mismatch */
        for (i = 0; i < size_tok; i++)
          {
            unsigned char lower_tok, lower_src;

            if (tok[i] == src[i])
              {
                continue;
              }

            lower_tok = char_tolower_iso8859 (tok[i]);
            lower_src = char_tolower_iso8859 (src[i]);
            if (lower_tok != lower_src)
              {
                return (lower_tok < lower_src) ? -1 : 1;
              }
          }

        return 0;
      }

    case INTL_CODESET_KSC5601_EUC:
    default:
      if (size_tok > size_src)
        {
          return 1;
        }

      *matched_size_src = size_tok;
      return strncasecmp ((char *) tok, (char *) src, size_tok);
    }
}

/*
 * intl_strcasecmp_utf8_one_cp() - compares the first codepoints from two
 *                 strings case insensitive
 *   return:  0 if the compared codepoints are equal ignoring case,
 *        negative if str1 sorts before str2, positive otherwise
 *        (-1 / 1 on most paths; the final array comparison returns the
 *        raw memcmp result)
 *   alphabet(in): alphabet supplying the lower/upper casing tables
 *   str1(in):
 *   str2(in):
 *   size_str1(in): size in bytes of str1
 *   size_str2(in): size in bytes of str2
 *   cp1(in): first codepoint in str1
 *   cp2(in): first codepoint in str2
 *   skip_size1(out):  bytes to skip from str1
 *   skip_size2(out):  bytes to skip from str2
 *
 *  Note : skip_size1, skip_size2 are valid only when strings are equal
 *     (returned value is zero).
 */
static int
intl_strcasecmp_utf8_one_cp (const ALPHABET_DATA * alphabet, unsigned char *str1, unsigned char *str2,
                 const int size_str1, const int size_str2, unsigned int cp1, unsigned int cp2,
                 int *skip_size1, int *skip_size2)
{
  int alpha_cnt;
  unsigned int l_array_1[INTL_CASING_EXPANSION_MULTIPLIER];
  unsigned int l_array_2[INTL_CASING_EXPANSION_MULTIPLIER];
  int skip_len1 = 1, skip_len2 = 1;
  int l_count_1 = 0, l_count_2 = 0, l_count = 0;
  int res;
  bool use_original_str1, use_original_str2;

  unsigned int *casing_arr;
  int casing_multiplier;

  assert (alphabet != NULL);
  assert (str1 != NULL);
  assert (str2 != NULL);
  assert (skip_size1 != NULL);
  assert (skip_size2 != NULL);

  /* fast path: identical codepoints need no casing lookup; skip one character in each string */
  if (cp1 == cp2)
    {
      (void) intl_char_size (str1, 1, INTL_CODESET_UTF8, skip_size1);
      (void) intl_char_size (str2, 1, INTL_CODESET_UTF8, skip_size2);

      return 0;
    }

  alpha_cnt = alphabet->l_count;

  /* simple alphabets: every codepoint maps to exactly one lower and one upper codepoint, so comparing the lower
   * case mapping of each codepoint is enough */
  if (alphabet->lower_multiplier == 1 && alphabet->upper_multiplier == 1)
    {
      /* codepoints outside the alphabet range are compared unchanged */
      if (cp1 < (unsigned int) alpha_cnt)
    {
      cp1 = alphabet->lower_cp[cp1];
    }

      if (cp2 < (unsigned int) alpha_cnt)
    {
      cp2 = alphabet->lower_cp[cp2];
    }

      if (cp1 != cp2)
    {
      return (cp1 < cp2) ? (-1) : 1;
    }

      (void) intl_char_size (str1, 1, INTL_CODESET_UTF8, skip_size1);
      (void) intl_char_size (str2, 1, INTL_CODESET_UTF8, skip_size2);

      return 0;
    }

  /*
   * Multipliers can be either 1 or 2, as imposed by the LDML parsing code.
   * Currently, alphabets with both multipliers equal to 2 are not supported
   * for case sensitive comparisons.
   */
  assert (alphabet->lower_multiplier == 1 || alphabet->upper_multiplier == 1);
  /* use the casing table that has the larger multiplier (the one that can expand a codepoint) */
  if (alphabet->lower_multiplier > alphabet->upper_multiplier)
    {
      casing_arr = alphabet->lower_cp;
      casing_multiplier = alphabet->lower_multiplier;
    }
  else
    {
      casing_arr = alphabet->upper_cp;
      casing_multiplier = alphabet->upper_multiplier;
    }

  /* str1: if casing changes cp1, compare its casing expansion; otherwise compare codepoints read from str1 itself */
  use_original_str1 = true;
  if (cp1 < (unsigned int) alpha_cnt)
    {
      memcpy (l_array_1, &(casing_arr[cp1 * casing_multiplier]), casing_multiplier * sizeof (unsigned int));

      if (cp1 != l_array_1[0])
    {
      /* drop trailing zero entries of the expansion, keeping at least one codepoint */
      l_count_1 = casing_multiplier;
      while (l_count_1 > 1 && l_array_1[l_count_1 - 1] == 0)
        {
          l_count_1--;
        }

      use_original_str1 = false;
    }
    }

  /* str2: same decision as for str1 */
  use_original_str2 = true;
  if (cp2 < (unsigned int) alpha_cnt)
    {
      memcpy (l_array_2, &(casing_arr[cp2 * casing_multiplier]), casing_multiplier * sizeof (unsigned int));

      if (cp2 != l_array_2[0])
    {
      l_count_2 = casing_multiplier;
      while (l_count_2 > 1 && l_array_2[l_count_2 - 1] == 0)
        {
          l_count_2--;
        }

      use_original_str2 = false;
    }
    }

  /* when the original string is used, read up to casing_multiplier codepoints from it (overwrites l_array_N) */
  if (use_original_str1)
    {
      (void) intl_utf8_to_cp_list (str1, size_str1, l_array_1, casing_multiplier, &l_count_1);
    }

  if (use_original_str2)
    {
      (void) intl_utf8_to_cp_list (str2, size_str2, l_array_2, casing_multiplier, &l_count_2);
    }

  l_count = MIN (l_count_1, l_count_2);

  /* an original string is limited to the common count and consumes that many characters; a casing expansion
   * always corresponds to exactly one source character */
  if (use_original_str1)
    {
      l_count_1 = MIN (l_count, l_count_1);
      skip_len1 = l_count_1;
    }
  else
    {
      skip_len1 = 1;
    }

  if (use_original_str2)
    {
      l_count_2 = MIN (l_count, l_count_2);
      skip_len2 = l_count_2;
    }
  else
    {
      skip_len2 = 1;
    }

  /* counts can still differ when an expansion is longer than what the other side provides */
  if (l_count_1 != l_count_2)
    {
      return (l_count_1 < l_count_2) ? (-1) : (1);
    }

  assert (l_count_1 == l_count_2);

  /* compare lower codepoints */
  /* NOTE(review): memcmp compares byte by byte, so the sign of a non-zero result follows the in-memory byte
   * order of unsigned int rather than the numeric order of the codepoints - confirm callers only test != 0 */
  res = memcmp (l_array_1, l_array_2, l_count * sizeof (unsigned int));
  if (res != 0)
    {
      return res;
    }

  /* convert supplementary characters in bytes size to skip */
  (void) intl_char_size (str1, skip_len1, INTL_CODESET_UTF8, skip_size1);
  (void) intl_char_size (str2, skip_len2, INTL_CODESET_UTF8, skip_size2);

  return 0;
}

/*
 * intl_identifier_casecmp_for_dblink() - compares two identifier strings
 *                 case insensitive, ignoring the quotes around the
 *                 dblink column name
 *
 *   return: 0 if dblink_col_name equals remote_col_name, otherwise the
 *       result of intl_identifier_casecmp_w_size
 *   dblink_col_name(in): identifier, may be enclosed in '"' or '`'
 *   remote_col_name(in): identifier, never quoted
 *
 * NOTE: this routine is the same as intl_identifier_casecmp
 *       the first argument dblink_col_name may start with double quote
 *       (or back-quote) but the remote_col_name never.
 *       When the name starts with a quote, the first and the last byte are
 *       left out of the comparison. A name made of a single quote byte is
 *       compared as it is, so that a negative size is never produced.
 */
int
intl_identifier_casecmp_for_dblink (const char *dblink_col_name, const char *remote_col_name)
{
  int str1_size;
  int str2_size;
  char *str1 = (char *) dblink_col_name;
  char *str2 = (char *) remote_col_name;

  assert (str1 != NULL);
  assert (str2 != NULL);

  str1_size = strlen (str1);
  str2_size = strlen (str2);

  /* strip the enclosing quotes only when there is room for both of them */
  if (str1_size >= 2 && (*str1 == '\"' || *str1 == '`'))
    {
      str1_size = str1_size - 2;
      str1 = str1 + 1;
    }

  return intl_identifier_casecmp_w_size (lang_id (), (unsigned char *) str1, (unsigned char *) str2, str1_size,
                                         str2_size);
}

/*
 * intl_identifier_casecmp() - case insensitive comparison of two
 *                 nul-terminated identifier strings
 *   return: 0 if strings are equal, -1 if str1 < str2 , 1 if str1 > str2
 *   str1(in): first identifier
 *   str2(in): second identifier
 *
 * NOTE: the byte lengths of both strings are measured here and the work is
 *   delegated to intl_identifier_casecmp_w_size, which defines how
 *   identifiers of different length compare.
 */
int
intl_identifier_casecmp (const char *str1, const char *str2)
{
  assert (str1 != NULL);
  assert (str2 != NULL);

  const int size1 = strlen (str1);
  const int size2 = strlen (str2);

  return intl_identifier_casecmp_w_size (lang_id (), (unsigned char *) str1, (unsigned char *) str2, size1, size2);
}

/*
 * intl_identifier_ncasecmp() - case insensitive comparison of the leading
 *                  characters of two identifier strings
 *   return: result of intl_identifier_casecmp_w_size on the two prefixes
 *   str1(in): first identifier
 *   str2(in): second identifier
 *   len(in): number of chars to compare
 *
 * NOTE: 'len' is a character count; it is converted to a byte size for each
 *   string separately with intl_char_size, using the language charset.
 */
int
intl_identifier_ncasecmp (const char *str1, const char *str2, const int len)
{
  unsigned char *const ustr1 = (unsigned char *) str1;
  unsigned char *const ustr2 = (unsigned char *) str2;
  int bytes1 = 0;
  int bytes2 = 0;

  (void) intl_char_size (ustr1, len, lang_charset (), &bytes1);
  (void) intl_char_size (ustr2, len, lang_charset (), &bytes2);

  return intl_identifier_casecmp_w_size (lang_id (), ustr1, ustr2, bytes1, bytes2);
}

/*
 * intl_identifier_cmp() - case sensitive comparison of two identifier
 *             strings
 *   return: result of strcmp: zero when equal, negative when str1 sorts
 *       before str2, positive otherwise
 *   str1(in): first nul-terminated identifier
 *   str2(in): second nul-terminated identifier
 *
 */
int
intl_identifier_cmp (const char *str1, const char *str2)
{
  /* identifiers only need a stable byte-wise order; the current collation plays no role here */
  const int byte_order = strcmp (str1, str2);

  return byte_order;
}


#if defined(ENABLE_UNUSED_FUNCTION)
/*
 * intl_identifier_namecmp() - compares two identifier strings, ignoring
 *                 enclosing brackets
 *   return: 0 if the identifiers are the "same",
 *           positive number if str1 is greater than str2,
 *           negative number otherwise.
 *   str1(in): first identifier
 *   str2(in): second identifier
 *
 * Note: "same" means that this function ignores bracket '[', ']'
 *       so str1 = "[value]" and str2 = "value" returns 0.
 *       Only the first byte is tested for '['; when it matches, the first
 *       and the last byte of that string are left out of the comparison.
 */
int
intl_identifier_namecmp (const char *str1, const char *str2)
{
  assert (str1 != NULL && str2 != NULL);

  const char *name1 = str1;
  const char *name2 = str2;
  int name1_size = strlen (str1);
  int name2_size = strlen (str2);

  if (*name1 == '[')
    {
      name1 += 1;
      name1_size -= 2;
    }

  if (*name2 == '[')
    {
      name2 += 1;
      name2_size -= 2;
    }

  return intl_identifier_casecmp_w_size (lang_id (), (unsigned char *) name1, (unsigned char *) name2, name1_size,
                                         name2_size);
}
#endif /* ENABLE_UNUSED_FUNCTION */

/*
 * intl_identifier_lower_string_size() - determine the size required for holding
 *                   lower case of the input string
 *   return: required size in bytes (terminator not counted)
 *   src(in): nul-terminated string to lowercase
 *
 * Note: the size differs from the input size only for UTF-8 and only when
 *   INTL_IDENTIFIER_CASING_SIZE_MULTIPLIER is greater than 1; in every other
 *   configuration the input byte size is returned.
 */
int
intl_identifier_lower_string_size (const char *src)
{
  const INTL_CODESET codeset = lang_charset ();
  const int src_size = strlen (src);
  int lower_size = src_size;

#if (INTL_IDENTIFIER_CASING_SIZE_MULTIPLIER > 1)
  if (codeset == INTL_CODESET_UTF8)
    {
      const ALPHABET_DATA *alphabet = &(lang_locale ()->ident_alphabet);
      const unsigned char *cur = REINTERPRET_CAST (const unsigned char *, src);
      const unsigned char *const end = cur + src_size;
      unsigned char encoded[INTL_UTF8_MAX_CHAR_SIZE];

      /* sum the UTF-8 size of the lower case form of every character */
      lower_size = 0;
      while (cur < end)
        {
          unsigned char *next = NULL;
          const unsigned int cp = intl_utf8_to_cp (cur, CAST_STRLEN (end - cur), &next);

          if (cp < (unsigned int) (alphabet->l_count))
            {
              /* the expansion holds up to lower_multiplier codepoints and stops at the first zero entry */
              const unsigned int *expansion = &(alphabet->lower_cp[cp * alphabet->lower_multiplier]);
              int k;

              for (k = 0; k < alphabet->lower_multiplier && expansion[k] != 0; k++)
                {
                  lower_size += intl_cp_to_utf8 (expansion[k], encoded);
                }
            }
          else
            {
              /* no casing data for this codepoint: it keeps its own encoding */
              lower_size += intl_cp_to_utf8 (cp, encoded);
            }

          cur = next;
        }
    }
#else
  (void) codeset;
#endif

  return lower_size;
}

/*
 * intl_identifier_lower() - convert given characters to lowercase characters
 *   return: always 0
 *   src(in) : source buffer
 *   dst(out) : destination buffer
 *
 *  Note : 'dst' has always enough size
 */
int
intl_identifier_lower (const char *src, char *dst)
{
  int d_size = 0;
  int length_in_bytes = 0;
  int length_in_chars = 0;
  unsigned char *d;
  const unsigned char *s;

  if (src)
    {
      length_in_bytes = strlen (src);
    }

  unsigned char *udst = REINTERPRET_CAST (unsigned char *, dst);
  const unsigned char *usrc = REINTERPRET_CAST (const unsigned char *, src);

  switch (lang_charset ())
    {
    case INTL_CODESET_UTF8:
      {
    const LANG_LOCALE_DATA *locale = lang_locale ();
    const ALPHABET_DATA *alphabet = &(locale->ident_alphabet);
    length_in_chars = intl_count_utf8_chars (usrc, length_in_bytes);
    (void) intl_tolower_utf8 (alphabet, usrc, udst, length_in_chars, &d_size);
    d = udst + d_size;
      }
      break;

    case INTL_CODESET_ISO88591:
      {
    for (d = udst, s = usrc; d < udst + length_in_bytes; d++, s++)
      {
        *d = char_tolower_iso8859 (*s);
      }
      }
      break;

    case INTL_CODESET_KSC5601_EUC:
    default:
      {
    for (d = udst, s = usrc; d < udst + length_in_bytes; d++, s++)
      {
        *d = char_tolower (*s);
      }
      }
      break;
    }

  *d = '\0';

  return 0;
}

/*
 * intl_identifier_upper_string_size() - determine the size required for holding
 *                   upper case of the input string
 *   return: required size in bytes (terminator not counted)
 *   src(in): nul-terminated string to uppercase
 *
 * Note: the size differs from the input size only for UTF-8 and only when
 *   INTL_IDENTIFIER_CASING_SIZE_MULTIPLIER is greater than 1; in every other
 *   configuration the input byte size is returned.
 */
int
intl_identifier_upper_string_size (const char *src)
{
  const INTL_CODESET codeset = lang_charset ();
  const int src_size = strlen (src);
  const unsigned char *cur = REINTERPRET_CAST (const unsigned char *, src);
  int upper_size = src_size;

#if (INTL_IDENTIFIER_CASING_SIZE_MULTIPLIER > 1)
  if (codeset == INTL_CODESET_UTF8)
    {
      const ALPHABET_DATA *alphabet = &(lang_locale ()->ident_alphabet);
      const unsigned char *const end = cur + src_size;
      unsigned char encoded[INTL_UTF8_MAX_CHAR_SIZE];

      /* sum the UTF-8 size of the upper case form of every character */
      upper_size = 0;
      while (cur < end)
        {
          unsigned char *next = NULL;
          const unsigned int cp = intl_utf8_to_cp (cur, CAST_STRLEN (end - cur), &next);

          if (cp < (unsigned int) (alphabet->l_count))
            {
              /* the expansion holds up to upper_multiplier codepoints and stops at the first zero entry */
              const unsigned int *expansion = &(alphabet->upper_cp[cp * alphabet->upper_multiplier]);
              int k;

              for (k = 0; k < alphabet->upper_multiplier && expansion[k] != 0; k++)
                {
                  upper_size += intl_cp_to_utf8 (expansion[k], encoded);
                }
            }
          else
            {
              /* no casing data for this codepoint: it keeps its own encoding */
              upper_size += intl_cp_to_utf8 (cp, encoded);
            }

          cur = next;
        }
    }
#else
  (void) codeset;
  (void) cur;
#endif

  return upper_size;
}

/*
 * intl_identifier_upper() - convert given characters to uppercase characters
 *   return: always 0
 *   src(in):
 *   dst(out):
 *
 *  Note : 'dst' has always enough size;
 */
int
intl_identifier_upper (const char *src, char *dst)
{
  int d_size = 0;
  int length_in_bytes = 0;
  int length_in_chars = 0;
  unsigned char *d;
  const unsigned char *s;

  if (src)
    {
      length_in_bytes = strlen (src);
    }

  unsigned char *udst = REINTERPRET_CAST (unsigned char *, dst);
  const unsigned char *usrc = REINTERPRET_CAST (const unsigned char *, src);

  switch (lang_charset ())
    {
    case INTL_CODESET_UTF8:
      {
    const LANG_LOCALE_DATA *locale = lang_locale ();
    const ALPHABET_DATA *alphabet = &(locale->ident_alphabet);
    length_in_chars = intl_count_utf8_chars (usrc, length_in_bytes);
    (void) intl_toupper_utf8 (alphabet, usrc, udst, length_in_chars, &d_size);
    d = udst + d_size;
      }
      break;
    case INTL_CODESET_ISO88591:
      {
    for (d = udst, s = usrc; d < udst + length_in_bytes; d++, s++)
      {
        *d = char_toupper_iso8859 (*s);
      }
      }
      break;
    case INTL_CODESET_KSC5601_EUC:
    default:
      {
    for (d = udst, s = usrc; d < udst + length_in_bytes; d++, s++)
      {
        *d = char_toupper (*s);
      }
      }
      break;
    }

  *d = '\0';

  return 0;
}

/*
 * intl_identifier_fix - Checks if a string can be an identifier;
 *           Truncates the string to a desired size in bytes,
 *           while making sure that the last char is not truncated
 *           Checks that lower and upper case versions of string
 *           do not exceed maximum allowed size.
 *
 *   return: error code : ER_GENERIC_ERROR or NO_ERROR
 *   name(in/out): identifier name, nul-terminated C string; may be shortened
 *         in place
 *   ident_max_size(in): allowed size of this identifier, may be -1 in which
 *           case the maximum allowed system size is used
 *   error_on_case_overflow(in): if true, will return error if the lower or
 *               upper version of truncated identifier exceeds
 *               allowed size
 *
 *  Note : Identifier string may be truncated if lexer previously truncated it
 *     in the middle of the last character;
 *     No error message is output by this function - in case of error,
 *     the error message should be output by the caller.
 *     DB_MAX_IDENTIFIER_LENGTH is the buffer size for string identifier
 *     This includes the nul-terminator byte; the useful bytes are
 *     (DB_MAX_IDENTIFIER_LENGTH - 1).
 *     When the casing check fails and error_on_case_overflow is false, the
 *     allowed size is reduced and the truncation step is repeated.
 */
int
intl_identifier_fix (char *name, int ident_max_size, bool error_on_case_overflow)
{
  int truncated_size = 0, original_size = 0, char_size = 0;
  const unsigned char *cname = NULL;
  INTL_CODESET codeset = lang_charset ();

  assert (name != NULL);

  if (ident_max_size == -1)
    {
      ident_max_size = DB_MAX_IDENTIFIER_LENGTH - 1;
    }

  assert (ident_max_size > 0 && ident_max_size < DB_MAX_IDENTIFIER_LENGTH);

  original_size = strlen (name);
  if (INTL_CODESET_MULT (codeset) == 1)
    {
      /* single byte charset: a plain cut can never split a character */
      if (original_size > ident_max_size)
        {
          name[ident_max_size] = '\0';
        }
      return NO_ERROR;
    }

  assert (INTL_CODESET_MULT (codeset) > 1);

  /* we do not check contents of non-ASCII if codeset is UTF-8 or EUC; valid codeset sequences are checked with
   * 'intl_check_string' when enabled */

check_truncation:
  /* every pass must walk the identifier from its first byte; a previous pass leaves the cursor advanced */
  cname = (const unsigned char *) name;

  /* check if last char of identifier may have been truncated */
  if (original_size + INTL_CODESET_MULT (codeset) > ident_max_size)
    {
      if (ident_max_size < original_size)
        {
          original_size = ident_max_size;
        }

      /* count original size based on the size given by first byte of each char */
      for (truncated_size = 0; truncated_size < original_size;)
        {
          INTL_NEXT_CHAR (cname, cname, codeset, &char_size);
          truncated_size += char_size;
        }
      assert (truncated_size >= original_size);

      /* truncated_size == original_size means last character fit entirely in 'original_size'
       * otherwise assume the last character was truncated */
      if (truncated_size > original_size)
        {
          assert (truncated_size < original_size + INTL_CODESET_MULT (codeset));
          assert ((unsigned char) *(cname - char_size) > 0x80);
          /* truncate after the last full character */
          truncated_size -= char_size;
          original_size = truncated_size;
        }
      name[original_size] = '\0';
    }

  /* ensure that lower or upper versions of identifier do not exceed maximum allowed size of an identifier */
#if (INTL_IDENTIFIER_CASING_SIZE_MULTIPLIER > 1)
  if (intl_identifier_upper_string_size (name) > ident_max_size
      || intl_identifier_lower_string_size (name) > ident_max_size)
    {
      if (error_on_case_overflow)
        {
          /* this is grammar context : reject the identifier string */
          return ER_GENERIC_ERROR;
        }
      else
        {
          /* decrease the initial allowed size and try again */
          ident_max_size -= INTL_CODESET_MULT (codeset);
          if (ident_max_size <= INTL_CODESET_MULT (codeset))
            {
              /* we make sure we have room for at least one character */
              return ER_GENERIC_ERROR;
            }
          goto check_truncation;
        }
    }
#endif

  return NO_ERROR;
}

/*
 * intl_identifier_mht_1strhash - hash a identifier key (in lowercase)
 *   return: hash value
 *   key(in): key to hash
 *   ht_size(in): size of hash table
 *
 * Note: Charset dependent version of 'mht_1strlowerhashTaken' function
 */
unsigned int
intl_identifier_mht_1strlowerhash (const void *key, const unsigned int ht_size)
{
  unsigned int hash;
  unsigned const char *byte_p = (unsigned char *) key;
  unsigned int ch;

  assert (key != NULL);

  switch (lang_charset ())
    {
    case INTL_CODESET_UTF8:
      {
    const LANG_LOCALE_DATA *locale = lang_locale ();
    const ALPHABET_DATA *alphabet = &(locale->ident_alphabet);
    int key_size = strlen ((const char *) key);
    unsigned char *next;

    for (hash = 0; key_size > 0;)
      {
        ch = intl_utf8_to_cp (byte_p, key_size, &next);
        if (ch < (unsigned int) (alphabet->l_count))
          {
        assert (alphabet->lower_multiplier == 1);
        ch = alphabet->lower_cp[ch];
          }

        key_size -= CAST_STRLEN (next - byte_p);
        byte_p = next;

        hash = (hash << 5) - hash + ch;
      }
      }
      break;
    case INTL_CODESET_ISO88591:
      for (hash = 0; *byte_p; byte_p++)
    {
      ch = char_tolower_iso8859 (*byte_p);
      hash = (hash << 5) - hash + ch;
    }
      break;
    case INTL_CODESET_RAW_BYTES:
      for (hash = 0; *byte_p; byte_p++)
    {
      ch = *byte_p;
      hash = (hash << 5) - hash + ch;
    }
      break;
    case INTL_CODESET_KSC5601_EUC:
    default:
      for (hash = 0; *byte_p; byte_p++)
    {
      ch = char_tolower (*byte_p);
      hash = (hash << 5) - hash + ch;
    }
      break;
    }

  return hash % ht_size;
}

#if defined (ENABLE_UNUSED_FUNCTION)
/*
 * intl_strncat() - concatenates at most len characters from 'src' to 'dest'
 *   return: for UTF-8, the byte length of 'dest' after the append;
 *       for any other charset, 'len'
 *   dest(in/out): nul-terminated destination string
 *   src(in): nul-terminated source string
 *   len(in): length to concatenate (in chars)
 *
 *  Note : the NULL terminator is always appended to 'dest';
 *     it is assumed that 'dest' allocated size can fit appended chars
 *
 */
int
intl_strncat (unsigned char *dest, const unsigned char *src, int len)
{
  int result = 0;

  if (lang_charset () == INTL_CODESET_UTF8)
    {
      int copy_len = 0;
      unsigned char *p_dest = dest + strlen ((char *) dest);
      const unsigned char *p_char = NULL;
      int char_len;

      while (*src && copy_len < len)
        {
          if (*src < 0x80)
            {
              *p_dest++ = *src++;
            }
          else
            {
              /* remember the start of the character before the macro runs */
              /* NOTE(review): INTL_GET_NEXTCHAR_UTF8 presumably advances src past the character and stores its
               * byte size in char_len - confirm against the macro definition */
              p_char = src;
              INTL_GET_NEXTCHAR_UTF8 (src, char_len);
              memcpy (p_dest, p_char, char_len);
              p_dest += char_len;
            }
          copy_len++;
        }

      /* the copy loop overwrote the old terminator and never copies one from src */
      *p_dest = '\0';

      result = p_dest - dest;
    }
  else
    {
      strncat ((char *) dest, (char *) src, len);
      result = len;
    }

  return result;
}
#endif

/*
 * intl_put_char() - copies one character into a string buffer
 *   return: size in bytes of the copied character
 *   dest(in/out): destination buffer
 *   char_p(in): pointer to the first byte of the character
 *   codeset(in): codeset of character
 *
 *  Note : It is assumed that 'dest' buffer can fit the character.
 *     No terminator is written.
 *
 */
int
intl_put_char (unsigned char *dest, const unsigned char *char_p, const INTL_CODESET codeset)
{
  int char_len = 1;

  assert (char_p != NULL);

  if (codeset == INTL_CODESET_UTF8)
    {
      if (*char_p >= 0x80)
        {
          /* multi-byte sequence: its length is looked up from the lead byte */
          char_len = intl_Len_utf8_char[*char_p];
          memcpy (dest, char_p, char_len);
          return char_len;
        }
    }
  else if (codeset == INTL_CODESET_KSC5601_EUC)
    {
      (void) intl_nextchar_euc (char_p, &char_len);
      memcpy (dest, char_p, char_len);
      return char_len;
    }

  /* ASCII byte in UTF-8, or any single byte codeset */
  *dest = *char_p;
  return 1;
}


/*
 * intl_is_space() - checks if the character at 'str' is white-space
 *   return: true if it is a white-space character, false otherwise
 *       (also false when 'str' is not before a non-NULL 'str_end')
 *   str(in): pointer to the character to test
 *   str_end(in): end of string (pointer to first character after last
 *        character of string) or NULL if str is null terminated
 *   codeset(in): codeset of string
 *   space_size(out): size in bytes of 'whitespace' character; may be NULL.
 *        It is set to 1 on entry and to 2 for the EUC double byte
 *        space.
 *
 *  Note : White spaces are the characters accepted by char_isspace.
 *     If codeset is EUC also the double byte character space (A1 A1) is
 *     considered;
 *
 */
bool
intl_is_space (const char *str, const char *str_end, const INTL_CODESET codeset, int *space_size)
{
  const unsigned char *bytes = (const unsigned char *) str;

  assert (str != NULL);

  if (space_size != NULL)
    {
      *space_size = 1;
    }

  /* bounded string with nothing left to look at */
  if (str_end != NULL && str >= str_end)
    {
      return false;
    }

  /* EUC double byte space; the second byte is read only if it lies inside the bounds */
  if (codeset == INTL_CODESET_KSC5601_EUC && bytes[0] == 0xa1 && (str_end == NULL || str + 1 < str_end)
      && bytes[1] == 0xa1)
    {
      if (space_size != NULL)
        {
          *space_size = 2;
        }
      return true;
    }

  return char_isspace (*str) ? true : false;
}

/*
 * intl_skip_spaces() - skips leading white spaces in string
 *   return: beginning of non-whitespace characters or end of string
 *   str(in): start of the string
 *   str_end(in): end of string (pointer to first character after last
 *        character of string) or NULL if str is null terminated
 *   codeset(in): codeset of string
 *
 *  Note : White spaces are the characters accepted by char_isspace.
 *     If codeset is EUC also the double byte character space (A1 A1) is
 *     considered;
 *
 */
const char *
intl_skip_spaces (const char *str, const char *str_end, const INTL_CODESET codeset)
{
  const bool bounded = (str_end != NULL);

  assert (str != NULL);

  if (codeset == INTL_CODESET_KSC5601_EUC)
    {
      while (bounded ? (str < str_end) : (*str != '\0'))
        {
          const unsigned char *bytes = (const unsigned char *) str;

          if (bytes[0] == 0xa1 && (!bounded || str + 1 < str_end) && bytes[1] == 0xa1)
            {
              /* double byte space */
              str += 2;
            }
          else if (char_isspace (*str))
            {
              str += 1;
            }
          else
            {
              break;
            }
        }
    }
  else
    {
      /* UTF-8, ISO-8859-1, raw bytes and anything else: single byte spaces only */
      while ((!bounded || str < str_end) && char_isspace (*str))
        {
          str += 1;
        }
    }

  return str;
}

/*
 * intl_backskip_spaces() - skips trailing white spaces in end of string
 *   return: pointer to the last non-whitespace character, or str_begin
 *       when the scan reaches the start of the string
 *   str_begin(in): start of string
 *   str_end(in): end of string (pointer to last character)
 *   codeset(in): codeset of string
 *
 *  Note : White spaces are the characters accepted by char_isspace.
 *     If codeset is EUC also the double byte character space (A1 A1) is
 *     considered;
 *     The character at 'str_begin' itself is never examined: the scan stops
 *     as soon as the pointer reaches it.
 *
 */
const char *
intl_backskip_spaces (const char *str_begin, const char *str_end, const INTL_CODESET codeset)
{
  assert (str_begin != NULL);
  assert (str_end != NULL);

  switch (codeset)
    {
    case INTL_CODESET_KSC5601_EUC:
      while (str_end > str_begin)
        {
          if (*((const unsigned char *) str_end) == 0xa1 && str_end - 1 > str_begin
              && *((const unsigned char *) (str_end - 1)) == 0xa1)
            {
              /* double byte space: step back over both bytes */
              str_end--;
              str_end--;
            }
          else if (char_isspace (*str_end))
            {
              str_end--;
            }
          else
            {
              break;
            }
        }
      break;
    case INTL_CODESET_UTF8:
    case INTL_CODESET_ISO88591:
    case INTL_CODESET_RAW_BYTES:
    default:
      /* walk backwards; moving forward here would run past the end of the string */
      while (str_end > str_begin && char_isspace (*str_end))
        {
          str_end--;
        }
      break;
    }

  return str_end;
}

/*
 * intl_cp_to_utf8() - encodes a unicode codepoint as a UTF-8 sequence
 *  return: number of bytes written (1 to 4); a codepoint above 0x10ffff
 *      trips an assert and, when asserts are disabled, is written as the
 *      single byte '?'
 *  codepoint(in) : Unicode code point (32 bit value)
 *  utf8_seq(in/out) : pre-allocated buffer for UTF-8 sequence; no
 *             terminator is written
 *
 */
int
intl_cp_to_utf8 (const unsigned int codepoint, unsigned char *utf8_seq)
{
  assert (utf8_seq != NULL);

  if (codepoint < 0x80)
    {
      /* plain ASCII */
      utf8_seq[0] = (unsigned char) codepoint;
      return 1;
    }
  else if (codepoint < 0x800)
    {
      /* 110xxxxx 10xxxxxx */
      utf8_seq[0] = (unsigned char) (0xc0 | (codepoint >> 6));
      utf8_seq[1] = (unsigned char) (0x80 | (codepoint & 0x3f));
      return 2;
    }
  else if (codepoint < 0x10000)
    {
      /* 1110xxxx 10xxxxxx 10xxxxxx */
      utf8_seq[0] = (unsigned char) (0xe0 | (codepoint >> 12));
      utf8_seq[1] = (unsigned char) (0x80 | ((codepoint >> 6) & 0x3f));
      utf8_seq[2] = (unsigned char) (0x80 | (codepoint & 0x3f));
      return 3;
    }
  else if (codepoint < 0x110000)
    {
      /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
      utf8_seq[0] = (unsigned char) (0xf0 | (codepoint >> 18));
      utf8_seq[1] = (unsigned char) (0x80 | ((codepoint >> 12) & 0x3f));
      utf8_seq[2] = (unsigned char) (0x80 | ((codepoint >> 6) & 0x3f));
      utf8_seq[3] = (unsigned char) (0x80 | (codepoint & 0x3f));
      return 4;
    }

  /* outside the Unicode range */
  assert (false);
  utf8_seq[0] = '?';
  return 1;
}

/*
 * intl_cp_to_dbcs() - encodes a codepoint as a DBCS byte sequence
 *  return: number of bytes written (1 or 2); a codepoint above 0xffff
 *      trips an assert and, when asserts are disabled, is written as the
 *      single byte '?'
 *  codepoint(in) : code point (16 bit value)
 *  byte_flag(in): flag array, indexed by byte value (256 entries assumed) :
 *              0: single byte char,
 *              1: is a leading byte for DBCS,
 *              2: byte value not used
 *  seq(in/out) : pre-allocated buffer for DBCS sequence
 *
 */
int
intl_cp_to_dbcs (const unsigned int codepoint, const unsigned char *byte_flag, unsigned char *seq)
{
  assert (seq != NULL);

  /* byte_flag is assumed to have 256 elements */
  assert (byte_flag != NULL);

  if (codepoint > 0xffff)
    {
      /* does not fit in two bytes */
      assert (false);
      seq[0] = '?';
      return 1;
    }

  if (codepoint > 0xff)
    {
      /* high byte first, then low byte */
      seq[0] = (unsigned char) (0xff & (codepoint >> 8));
      seq[1] = (unsigned char) (codepoint & 0xff);
      return 2;
    }

  /* single byte value: only valid when the byte is flagged as a stand-alone character; lead bytes and unused
   * values are replaced */
  seq[0] = (byte_flag[codepoint] == 0) ? (unsigned char) codepoint : (unsigned char) '?';
  return 1;
}

/*
 * intl_utf8_to_cp() - decodes one UTF-8 encoded char to a unicode codepoint
 *  return: unicode code point; 0xffffffff means error (lead byte not
 *      recognized, or sequence longer than 'size')
 *  utf8(in) : buffer for UTF-8 char
 *  size(in) : size of buffer
 *  next_char(in/out): receives the pointer to the next character; on error
 *             it points one byte after 'utf8'
 *
 *  Note : only the lead byte selects the sequence length; continuation
 *     bytes are masked, not validated.
 */
unsigned int
intl_utf8_to_cp (const unsigned char *utf8, const int size, unsigned char **next_char)
{
  unsigned int cp = 0xffffffff;
  int char_len = 1;
  unsigned char lead;

  assert (utf8 != NULL);
  assert (size > 0);
  assert (next_char != NULL);

  lead = utf8[0];

  if (lead < 0x80)
    {
      cp = (unsigned int) lead;
    }
  else if (lead < 0xc0)
    {
      /* a continuation byte cannot start a character: error */
    }
  else if (lead < 0xe0)
    {
      if (size >= 2)
        {
          char_len = 2;
          cp = (unsigned int) (((lead & 0x1f) << 6) | (utf8[1] & 0x3f));
        }
    }
  else if (lead < 0xf0)
    {
      if (size >= 3)
        {
          char_len = 3;
          cp = (unsigned int) (((lead & 0x0f) << 12) | ((utf8[1] & 0x3f) << 6) | (utf8[2] & 0x3f));
        }
    }
  else if (lead < 0xf8)
    {
      if (size >= 4)
        {
          char_len = 4;
          cp = (unsigned int) (((lead & 0x07) << 18) | ((utf8[1] & 0x3f) << 12) | ((utf8[2] & 0x3f) << 6) |
                               (utf8[3] & 0x3f));
        }
    }
#if INTL_UTF8_MAX_CHAR_SIZE > 4
  else if (lead < 0xfc)
    {
      if (size >= 5)
        {
          char_len = 5;
          cp = (unsigned int) (((lead & 0x03) << 24) | ((utf8[1] & 0x3f) << 18) | ((utf8[2] & 0x3f) << 12) |
                               ((utf8[3] & 0x3f) << 6) | (utf8[4] & 0x3f));
        }
    }
  else if (lead < 0xfe)
    {
      if (size >= 6)
        {
          char_len = 6;
          cp = (unsigned int) (((lead & 0x01) << 30) | ((utf8[1] & 0x3f) << 24) | ((utf8[2] & 0x3f) << 18) |
                               ((utf8[3] & 0x3f) << 12) | ((utf8[4] & 0x3f) << 6) | (utf8[5] & 0x3f));
        }
    }
#endif

  /* on error char_len is still 1, so the caller advances by a single byte */
  *next_char = (unsigned char *) utf8 + char_len;
  return cp;
}

/*
 * intl_back_utf8_to_cp() - converts a UTF-8 encoded char to unicode codepoint
 *              but starting from the last byte of a character
 *  return: unicode code point; 0xffffffff means error
 *
 *  utf8_start(in) : start of buffer
 *  utf8_last(in) : pointer to last byte of buffer (and last byte of last
 *          character)
 *  last_byte__prev_char(in/out) : pointer to last byte of previous character;
 *          when the character starts at utf8_start (or the buffer
 *          start is reached while scanning) it is set to
 *          utf8_start - 1
 *
 */
unsigned int
intl_back_utf8_to_cp (const unsigned char *utf8_start, const unsigned char *utf8_last,
              unsigned char **last_byte__prev_char)
{
  int char_size = 1;
  unsigned char *dummy;

  assert (utf8_start != NULL);
  assert (utf8_last != NULL);
  assert (utf8_start <= utf8_last);
  assert (last_byte__prev_char != NULL);

  /* ASCII byte: a complete single byte character */
  if (*utf8_last < 0x80)
    {
      *last_byte__prev_char = ((unsigned char *) utf8_last) - 1;
      return *utf8_last;
    }

  /* multibyte character */
  /* walk backwards over continuation bytes (10xxxxxx) until the lead byte is found; char_size counts the bytes
   * of the character seen so far and bounds the scan to INTL_UTF8_MAX_CHAR_SIZE */
  do
    {
      /* the byte is tested first and the pointer moved afterwards, so when the loop breaks on the lead byte
       * utf8_last already points at the byte before the character */
      if (((*utf8_last--) & 0xc0) != 0x80)
    {
      break;
    }
      /* only continuation bytes were found down to the start of the buffer */
      if (utf8_last < utf8_start)
    {
      /* broken char, invalid CP */
      *last_byte__prev_char = ((unsigned char *) utf8_start) - 1;
      return 0xffffffff;
    }
    }
  while (++char_size < INTL_UTF8_MAX_CHAR_SIZE);

  *last_byte__prev_char = (unsigned char *) utf8_last;
  /* decode forward from the position following utf8_last, limited to the bytes counted above */
  return intl_utf8_to_cp (utf8_last + 1, char_size, &dummy);
}

/*
 * intl_dbcs_to_cp() - decodes one DBCS encoded char to a DBCS codepoint
 *  return: DBCS code point: (lead << 8 | trail) for a double byte char,
 *      the byte value otherwise
 *  seq(in) : buffer for DBCS char
 *  size(in) : size of buffer
 *  byte_flag(in) : array of flags for lead bytes, indexed by byte value
 *  next_char(in/out): receives the pointer to the next character
 *
 *  Note : a lead byte that is the last byte of the buffer is returned as a
 *     single byte codepoint.
 */
unsigned int
intl_dbcs_to_cp (const unsigned char *seq, const int size, const unsigned char *byte_flag, unsigned char **next_char)
{
  unsigned int cp;
  int consumed;

  assert (seq != NULL);
  assert (size > 0);
  assert (next_char != NULL);

  assert (byte_flag != NULL);

  cp = (unsigned int) (seq[0]);
  consumed = 1;

  if (size >= 2 && byte_flag[seq[0]] == 1)
    {
      /* lead byte followed by its trail byte */
      cp = (unsigned int) (((seq[0]) << 8) | (seq[1]));
      consumed = 2;
    }

  *next_char = (unsigned char *) seq + consumed;
  return cp;
}


/*
 * intl_utf8_to_cp_list() - decodes a UTF-8 encoded string into a list of
 *			    unicode codepoints
 *  return: number of codepoints found in string (may be greater than
 *	    max_array_size)
 *  utf8(in) : buffer holding the UTF-8 string
 *  size(in) : size of string buffer
 *  cp_array(in/out) : preallocated array to store computed codepoints list
 *  max_array_size(in) : maximum number of elements which can be stored in
 *			 cp_array
 *  array_count(out) : number of elements actually stored in cp_array
 *
 *  Note : the whole string is decoded even after cp_array gets full; the
 *	   codepoints which do not fit are only counted.
 */
int
intl_utf8_to_cp_list (const unsigned char *utf8, const int size, unsigned int *cp_array, const int max_array_size,
		      int *array_count)
{
  const unsigned char *const utf8_end = utf8 + size;
  unsigned char *after_char = NULL;
  int cp_total = 0;

  assert (utf8 != NULL);
  assert (size > 0);
  assert (cp_array != NULL);
  assert (max_array_size > 0);
  assert (array_count != NULL);

  *array_count = 0;

  while (utf8 < utf8_end)
    {
      unsigned int cp;

      assert (utf8_end - utf8 > 0);

      cp = intl_utf8_to_cp (utf8, CAST_STRLEN (utf8_end - utf8), &after_char);
      utf8 = after_char;

      /* store only while there is room, but keep counting */
      if (cp_total < max_array_size)
	{
	  cp_array[cp_total] = cp;
	  (*array_count)++;
	}

      cp_total++;
    }

  return cp_total;
}

/* non-zero when byte 'b' lies in the closed interval [r1, r2]; every argument is parenthesized so that compound
 * expressions expand with the intended precedence (note that 'b' may be evaluated twice) */
#define UTF8_BYTE_IN_RANGE(b, r1, r2) (!((b) < (r1) || (b) > (r2)))

/*
 * intl_check_utf8 - Checks if a string contains valid UTF-8 sequences
 *
 *   return: INTL_UTF8_VALID if valid,
 *	     INTL_UTF8_INVALID if contains an invalid byte in one char
 *	     INTL_UTF8_TRUNCATED if last char is truncated (missing bytes)
 *   buf(in): buffer
 *   size(in): size of buffer (negative values accepted, in this case buffer
 *	       is assumed to be NUL terminated)
 *   pos(out): pointer to beginning of invalid or truncated character; set to
 *	       NULL when the string is valid; the argument itself may be NULL
 *
 *  Valid ranges:
 *    - 1 byte : 00 - 7F
 *    - 2 bytes: C2 - DF , 80 - BF		       (U +80 .. U+7FF)
 *    - 3 bytes: E0	 , A0 - BF , 80 - BF	       (U +800 .. U+FFF)
 *		 E1 - EC , 80 - BF , 80 - BF	       (U +1000 .. +CFFF)
 *		 ED	 , 80 - 9F , 80 - BF	       (U +D000 .. +D7FF)
 *		 EE - EF , 80 - BF , 80 - BF	       (U +E000 .. +FFFF)
 *    - 4 bytes: F0	 , 90 - BF , 80 - BF , 80 - BF (U +10000 .. +3FFFF)
 *		 F1 - F3 , 80 - BF , 80 - BF , 80 - BF (U +40000 .. +FFFFF)
 *		 F4	 , 80 - 8F , 80 - BF , 80 - BF (U +100000 .. +10FFFF)
 *
 *  Note:
 *  This function should be used only when the UTF-8 string enters the CUBRID
 *  system.
 */
INTL_UTF8_VALIDITY
intl_check_utf8 (const unsigned char *buf, int size, char **pos)
{
  const unsigned char *p = buf;
  const unsigned char *p_end = NULL;
  const unsigned char *curr_char = NULL;

  if (pos != NULL)
    {
      *pos = NULL;
    }

  if (size < 0)
    {
      size = strlen ((char *) buf);
    }

  p_end = buf + size;

  while (p < p_end)
    {
      /* accepted range for the byte being checked; only the 2nd byte of a character has a range which depends on
       * the lead byte, all the following ones must be in 80 - BF */
      unsigned char byte_min = 0x80;
      unsigned char byte_max = 0xbf;
      int trail_bytes;
      int i;

      /* ASCII fast path : skip 8 bytes while MSB is clear; the remaining length is compared (instead of computing
       * p + 8) so that no pointer beyond the end of the buffer is ever formed */
      while (p_end - p >= 8)
	{
	  UINT64 word;

	  memcpy (&word, p, sizeof (word));
	  if ((word & UINT64_C (0x8080808080808080)) != 0)
	    {
	      break;
	    }
	  p += 8;
	}

      if (p >= p_end)
	{
	  break;
	}

      curr_char = p;

      if (*p < 0x80)
	{
	  p++;
	  continue;
	}

      /* range 80 - BF is not valid UTF-8 first byte */
      /* range C0 - C1 overlaps 1 byte 00 - 7F (overlong 2 bytes encoding) */
      /* range F5 - FF would encode values above U+10FFFF */
      if (*p < 0xc2 || *p > 0xf4)
	{
	  if (pos != NULL)
	    {
	      *pos = (char *) curr_char;
	    }
	  return INTL_UTF8_INVALID;
	}

      if (*p <= 0xdf)
	{
	  /* 2 bytes sequence : C2 - DF , 80 - BF */
	  trail_bytes = 1;
	}
      else if (*p <= 0xef)
	{
	  /* 3 bytes sequence */
	  trail_bytes = 2;
	  if (*p == 0xe0)
	    {
	      /* E0 , A0 - BF , 80 - BF */
	      byte_min = 0xa0;
	    }
	  else if (*p == 0xed)
	    {
	      /* ED , 80 - 9F , 80 - BF */
	      byte_max = 0x9f;
	    }
	}
      else
	{
	  /* 4 bytes sequence */
	  trail_bytes = 3;
	  if (*p == 0xf0)
	    {
	      /* F0 , 90 - BF , 80 - BF , 80 - BF */
	      byte_min = 0x90;
	    }
	  else if (*p == 0xf4)
	    {
	      /* F4 , 80 - 8F , 80 - BF , 80 - BF */
	      byte_max = 0x8f;
	    }
	}

      for (i = 0; i < trail_bytes; i++)
	{
	  p++;
	  if (p >= p_end)
	    {
	      if (pos != NULL)
		{
		  *pos = (char *) curr_char;
		}
	      return INTL_UTF8_TRUNCATED;
	    }

	  if (!UTF8_BYTE_IN_RANGE (*p, byte_min, byte_max))
	    {
	      if (pos != NULL)
		{
		  *pos = (char *) curr_char;
		}
	      return INTL_UTF8_INVALID;
	    }

	  /* bytes after the 2nd one are plain continuation bytes */
	  byte_min = 0x80;
	  byte_max = 0xbf;
	}

      /* step over the last byte of the character */
      p++;
    }

  return INTL_UTF8_VALID;
}

/*
 * intl_check_euckr - Checks if a string contains valid EUC-KR sequences
 *
 *
 *   return: INTL_UTF8_VALID if valid,
 *	     INTL_UTF8_INVALID if contains an invalid byte in one char
 *	     INTL_UTF8_TRUNCATED if last char is truncated (missing bytes)
 *   buf(in): buffer
 *   size(in): size of buffer (negative values accepted, in this case buffer is assumed to be NUL terminated)
 *   pos(out): pointer to beginning of invalid or truncated character; set to NULL when the string is valid; the
 *	       argument itself may be NULL
 *
 *  Accepted sequences (only the first byte of a character is inspected):
 *    - 1 byte : 00 - 7F
 *    - 2 bytes: A1 - FE , 00 - FF
 *    - 3 bytes: 8F (SS3), 00 - FF , 00 - FF
 *  Any other first byte (80 - 8E, 90 - A0, FF) is reported as invalid.
 */
INTL_UTF8_VALIDITY
intl_check_euckr (const unsigned char *buf, int size, char **pos)
{
  const unsigned char *p = buf;
  const unsigned char *p_end = NULL;

  if (pos != NULL)
    {
      *pos = NULL;
    }

  if (size < 0)
    {
      size = strlen ((char *) buf);
    }

  p_end = buf + size;

  while (p < p_end)
    {
      int char_size;

      if (*p < 0x80)
	{
	  p++;
	  continue;
	}

      if (*p == SS3)
	{
	  /* SS3 byte value starts a 3 bytes character */
	  char_size = 3;
	}
      else if (UTF8_BYTE_IN_RANGE (*p, 0xa1, 0xfe))
	{
	  /* 2 bytes sequence */
	  char_size = 2;
	}
      else
	{
	  if (pos != NULL)
	    {
	      *pos = (char *) p;
	    }
	  return INTL_UTF8_INVALID;
	}

      /* check the remaining length before advancing, so that 'p' never moves beyond the end of the buffer */
      if (p_end - p < char_size)
	{
	  if (pos != NULL)
	    {
	      *pos = (char *) p;
	    }
	  return INTL_UTF8_TRUNCATED;
	}

      p += char_size;
    }

  return INTL_UTF8_VALID;
}

/*
 * intl_check_string - Checks if a string contains valid sequences in the given codeset
 *
 *   return: INTL_UTF8_VALID if valid (or if no check is performed), otherwise the result of the codeset specific
 *	     checker : INTL_UTF8_INVALID - if invalid byte in char, INTL_UTF8_TRUNCATED - if last char is truncated
 *   buf(in): buffer
 *   size(in): size of buffer (negative values accepted, in this case buffer
 *	       is assumed to be NUL terminated)
 *   pos(out): passed to the codeset specific checker; left untouched when no check is performed
 *   codeset(in): codeset assumed for buf
 *
 *  Note : only UTF-8 and EUC-KR strings are checked, and only when 'intl_String_validation' is set.
 */
INTL_UTF8_VALIDITY
intl_check_string (const char *buf, int size, char **pos, const INTL_CODESET codeset)
{
  const unsigned char *bytes = (const unsigned char *) buf;

  // this function is currently used either in client-modes or for loaddb. if it will be used in other server-mode
  // contexts, that can impact the result of queries, global variable should be replaced with a session parameter.
  if (intl_String_validation)
    {
      if (codeset == INTL_CODESET_UTF8)
	{
	  return intl_check_utf8 (bytes, size, pos);
	}

      if (codeset == INTL_CODESET_KSC5601_EUC)
	{
	  return intl_check_euckr (bytes, size, pos);
	}
    }

  /* validation disabled, raw bytes or any other codeset : nothing to check */
  return INTL_UTF8_VALID;
}

#if !defined (SERVER_MODE)
/*
 * intl_is_bom_magic - Checks if the buffer starts with the UTF-8 BOM magic
 *		       (bytes EF BB BF)
 *
 *   return: true if BOM, false otherwise
 *   buf(in): buffer
 *   size(in): size of buffer (negative means buffer is NUL terminated)
 */
bool
intl_is_bom_magic (const char *buf, const int size)
{
  static const unsigned char utf8_bom[3] = { 0xef, 0xbb, 0xbf };
  int i;

  if (size >= 0)
    {
      /* known size : at least 3 bytes are required */
      return (size >= 3 && memcmp (buf, utf8_bom, 3) == 0);
    }

  /* NUL terminated buffer : compare byte by byte and stop at first mismatch, so that nothing is read beyond the
   * terminator of a short string */
  for (i = 0; i < 3; i++)
    {
      if ((unsigned char) buf[i] != utf8_bom[i])
	{
	  return false;
	}
    }

  return true;
}
#endif /* SERVER_MODE */

/* UTF-8 to console routines */

/*
 * intl_text_single_byte_to_utf8() - converts a buffer containing text with ISO
 *                   8859-X encoding to UTF-8
 *
 *   return: error code
 *   in_buf(in): buffer
 *   in_size(in): size of input string (NUL terminator not included)
 *   out_buf(int/out) : output buffer : uses the pre-allocated buffer passed
 *          as input or a new allocated buffer; NULL if conversion
 *          is not required
 *   out_size(out): size of string (NUL terminator not included)
 */
int
intl_text_single_byte_to_utf8 (const char *in_buf, const int in_size, char **out_buf, int *out_size)
{
  return intl_text_single_byte_to_utf8_ext (lang_get_txt_conv (), (const unsigned char *) in_buf, in_size,
                        (unsigned char **) out_buf, out_size);
}

/*
 * intl_text_single_byte_to_utf8_ext() - converts a buffer containing text
 *					 with ISO 8859-X encoding to UTF-8
 *
 *   return: NO_ERROR, ER_OUT_OF_VIRTUAL_MEMORY if the output buffer cannot
 *	     be allocated, ER_GENERIC_ERROR if the supplied output buffer is
 *	     too small
 *   t(in): text conversion data (TEXT_CONVERSION)
 *   in_buf(in): buffer
 *   in_size(in): size of input string (NUL terminator not included)
 *   out_buf(in/out) : output buffer : uses the pre-allocated buffer passed
 *		    as input or a new buffer allocated with malloc when
 *		    *out_buf is NULL; set to NULL if conversion is not
 *		    required (input has only ASCII bytes)
 *   out_size(in/out): on input, size of the pre-allocated buffer (must be at
 *		    least 2 * in_size + 1); on output, size of converted
 *		    string (NUL terminator not included). Not changed when
 *		    conversion is not required.
 */
int
intl_text_single_byte_to_utf8_ext (void *t, const unsigned char *in_buf, const int in_size, unsigned char **out_buf,
				   int *out_size)
{
  TEXT_CONVERSION *conv;
  const unsigned char *src;
  const unsigned char *src_end;
  unsigned char *dest;
  int required_size;

  assert (in_buf != NULL);
  assert (out_buf != NULL);
  assert (out_size != NULL);
  assert (t != NULL);

  conv = (TEXT_CONVERSION *) t;
  src_end = in_buf + in_size;

  /* look for the first byte outside of ASCII range */
  for (src = in_buf; src < src_end; src++)
    {
      if (*src >= 0x80)
	{
	  break;
	}
    }

  if (src >= src_end)
    {
      /* only ASCII bytes : text is already valid UTF-8 */
      *out_buf = NULL;
      return NO_ERROR;
    }

  /* a ISO8859-X character is encoded on maximum 2 bytes in UTF-8 */
  required_size = in_size * 2 + 1;

  if (*out_buf != NULL)
    {
      /* caller supplied buffer : it must be able to hold the worst case */
      if (*out_size < required_size)
	{
	  er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_GENERIC_ERROR, 0);
	  return ER_GENERIC_ERROR;
	}
    }
  else
    {
      *out_buf = (unsigned char *) malloc (required_size);
      if (*out_buf == NULL)
	{
	  er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_OUT_OF_VIRTUAL_MEMORY, 1, (size_t) (required_size));
	  return ER_OUT_OF_VIRTUAL_MEMORY;
	}
    }

  assert (conv->text_last_cp > 0);

  dest = *out_buf;
  for (src = in_buf; src < src_end; src++)
    {
      const unsigned char ch = *src;

      if (ch >= conv->text_first_cp && ch <= conv->text_last_cp)
	{
	  /* byte has an entry in conversion table : copy its UTF-8 bytes (at least one byte is always copied) */
	  unsigned char *mapped_bytes = conv->text_to_utf8[ch - conv->text_first_cp].bytes;
	  int remaining = conv->text_to_utf8[ch - conv->text_first_cp].size;

	  do
	    {
	      *dest++ = *mapped_bytes++;
	    }
	  while (--remaining > 0);
	}
      else if (ch < 0x80)
	{
	  /* ASCII byte outside the table */
	  *dest++ = ch;
	}
      else
	{
	  /* non-ASCII byte outside the table : not expected */
	  assert (false);
	  *dest++ = '?';
	}
    }

  *dest = '\0';
  *out_size = CAST_STRLEN (dest - *(out_buf));

  return NO_ERROR;
}

/*
 * intl_text_utf8_to_single_byte() - converts a buffer containing UTF-8 text
 *				     to ISO 8859-X encoding, using the text
 *				     conversion returned by
 *				     lang_get_txt_conv ()
 *
 *   return: NO_ERROR, ER_OUT_OF_VIRTUAL_MEMORY if the output buffer cannot
 *	     be allocated, ER_GENERIC_ERROR if the supplied output buffer is
 *	     too small
 *   in_buf(in): buffer
 *   in_size(in): size of input string (NUL terminator not included)
 *   out_buf(in/out) : output buffer : uses the pre-allocated buffer passed
 *		    as input or a new buffer allocated with malloc when
 *		    *out_buf is NULL; set to NULL if conversion is not
 *		    required (input has only ASCII bytes)
 *   out_size(in/out): on input, size of the pre-allocated buffer (must be at
 *		    least in_size + 1); on output, size of output string
 *		    (NUL terminator not counted). Not changed when conversion
 *		    is not required.
 *
 *  Note : codepoints which cannot be stored on one byte (including the error
 *	   value returned for malformed UTF-8) are replaced with '?'.
 */
int
intl_text_utf8_to_single_byte (const char *in_buf, const int in_size, char **out_buf, int *out_size)
{
  TEXT_CONVERSION *conv = lang_get_txt_conv ();
  const unsigned char *src;
  const unsigned char *src_end;
  unsigned char *dest;
  unsigned char *after_char = NULL;

  assert (in_buf != NULL);
  assert (out_buf != NULL);
  assert (out_size != NULL);
  assert (conv != NULL);

  src_end = (const unsigned char *) in_buf + in_size;

  /* look for the first byte outside of ASCII range */
  for (src = (const unsigned char *) in_buf; src < src_end; src++)
    {
      if (*src >= 0x80)
	{
	  break;
	}
    }

  if (src >= src_end)
    {
      /* only ASCII bytes : nothing to convert */
      *out_buf = NULL;
      return NO_ERROR;
    }

  if (*out_buf != NULL)
    {
      /* caller supplied buffer : output never needs more bytes than input */
      if (*out_size < in_size + 1)
	{
	  er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_GENERIC_ERROR, 0);
	  return ER_GENERIC_ERROR;
	}
    }
  else
    {
      *out_buf = (char *) malloc (in_size + 1);
      if (*out_buf == NULL)
	{
	  er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_OUT_OF_VIRTUAL_MEMORY, 1, (size_t) (in_size + 1));
	  return ER_OUT_OF_VIRTUAL_MEMORY;
	}
    }

  src = (const unsigned char *) in_buf;
  dest = (unsigned char *) *out_buf;

  while (src < src_end)
    {
      unsigned int cp = 0;

      if (*src < 0x80)
	{
	  /* ASCII : copied as it is */
	  *dest++ = *src++;
	  continue;
	}

      cp = intl_utf8_to_cp (src, CAST_STRLEN (src_end - src), &after_char);
      src = after_char;

      if (cp >= conv->utf8_first_cp && cp <= conv->utf8_last_cp)
	{
	  /* codepoint has an entry in the conversion table : use the byte value stored there */
	  assert (conv->utf8_to_text[cp - conv->utf8_first_cp].size == 1);
	  cp = (unsigned int) *(conv->utf8_to_text[cp - conv->utf8_first_cp].bytes);
	}

      /* anything which does not fit on one byte cannot be represented */
      *dest++ = (cp > 0xff) ? '?' : (unsigned char) cp;
    }

  *dest = '\0';
  *out_size = CAST_STRLEN (dest - (unsigned char *) *(out_buf));

  return NO_ERROR;
}

/*
 * intl_init_conv_iso8859_1_to_utf8() - fills the ISO 8859-1 (Latin 1) to
 *					UTF-8 conversion table and sets up
 *					'con_Iso_8859_1_conv'
 *  return:
 *
 *  Note : mapping of byte values :
 *	   00 - 7E : same ASCII value (1 byte)
 *	   7F - 9F : not mapped, replaced with '?'
 *	   A0 - FF : UTF-8 encoding of Unicode codepoint with the same value
 */
static void
intl_init_conv_iso8859_1_to_utf8 (void)
{
  unsigned int ch;

  for (ch = 0; ch <= 0xff; ch++)
    {
      if (ch >= 0xa0)
	{
	  iso8859_1_To_utf8_conv[ch].size = intl_cp_to_utf8 (ch, iso8859_1_To_utf8_conv[ch].bytes);
	  continue;
	}

      iso8859_1_To_utf8_conv[ch].size = 1;
      if (ch <= 0x7e)
	{
	  *((unsigned char *) (iso8859_1_To_utf8_conv[ch].bytes)) = (unsigned char) ch;
	}
      else
	{
	  *((unsigned char *) (iso8859_1_To_utf8_conv[ch].bytes)) = (unsigned char) '?';
	}
    }

  /* text to UTF-8 : all byte values are covered by the table */
  con_Iso_8859_1_conv.text_to_utf8 = iso8859_1_To_utf8_conv;
  con_Iso_8859_1_conv.text_first_cp = 0;
  con_Iso_8859_1_conv.text_last_cp = 0xff;

  /* UTF-8 to text : no specific mapping here, Unicode codepoints in range 00-FF map directly onto ISO-8859-1 */
  con_Iso_8859_1_conv.utf8_to_text = NULL;
  con_Iso_8859_1_conv.utf8_first_cp = 0;
  con_Iso_8859_1_conv.utf8_last_cp = 0;
}

/*
 * intl_init_conv_iso8859_9_to_utf8() - fills the conversion tables between
 *					ISO 8859-9 (turkish) and UTF-8 and
 *					sets up 'con_Iso_8859_9_conv'
 *  return:
 *
 *  Note : mapping of byte values :
 *	   00 - 7E : same ASCII value (1 byte)
 *	   7F - 9F : not mapped, replaced with '?'
 *	   A0 - FF : UTF-8 encoding of Unicode codepoint with the same value,
 *		     except for the six turkish letters listed below
 *	   The reverse table (codepoints ISO_8859_9_FIRST_CP ..
 *	   ISO_8859_9_LAST_CP) holds '?' for all codepoints, except for the
 *	   same six letters.
 */
static void
intl_init_conv_iso8859_9_to_utf8 (void)
{
  /* pairs of { ISO 8859-9 byte value, Unicode codepoint } */
  const unsigned int turkish_letters[][2] = {
    {0xd0, 0x11e},		/* capital G with breve */
    {0xdd, 0x130},		/* capital I with dot above */
    {0xde, 0x15e},		/* capital S with cedilla */
    {0xf0, 0x11f},		/* small g with breve */
    {0xfd, 0x131},		/* small i dotless */
    {0xfe, 0x15f}		/* small s with cedilla */
  };
  unsigned int k;

  /* default mapping for text to UTF-8 */
  for (k = 0; k <= 0xff; k++)
    {
      if (k >= 0xa0)
	{
	  iso8859_9_To_utf8_conv[k].size = intl_cp_to_utf8 (k, iso8859_9_To_utf8_conv[k].bytes);
	  continue;
	}

      iso8859_9_To_utf8_conv[k].size = 1;
      if (k <= 0x7e)
	{
	  *((unsigned char *) (iso8859_9_To_utf8_conv[k].bytes)) = (unsigned char) k;
	}
      else
	{
	  *((unsigned char *) (iso8859_9_To_utf8_conv[k].bytes)) = (unsigned char) '?';
	}
    }

  /* default mapping for UTF-8 to text : nothing is mapped */
  for (k = ISO_8859_9_FIRST_CP; k <= ISO_8859_9_LAST_CP; k++)
    {
      utf8_Cp_to_iso_8859_9_conv[k - ISO_8859_9_FIRST_CP].size = 1;
      *(utf8_Cp_to_iso_8859_9_conv[k - ISO_8859_9_FIRST_CP].bytes) = '?';
    }

  /* special mapping : overrides the defaults in both tables */
  for (k = 0; k < DIM (turkish_letters); k++)
    {
      unsigned int byte_val = turkish_letters[k][0];
      unsigned int codepoint = turkish_letters[k][1];

      iso8859_9_To_utf8_conv[byte_val].size = intl_cp_to_utf8 (codepoint, iso8859_9_To_utf8_conv[byte_val].bytes);

      *(utf8_Cp_to_iso_8859_9_conv[codepoint - ISO_8859_9_FIRST_CP].bytes) = byte_val;

      assert (utf8_Cp_to_iso_8859_9_conv[codepoint - ISO_8859_9_FIRST_CP].size == 1);
    }

  con_Iso_8859_9_conv.text_to_utf8 = iso8859_9_To_utf8_conv;
  con_Iso_8859_9_conv.text_first_cp = 0;
  con_Iso_8859_9_conv.text_last_cp = 0xff;

  con_Iso_8859_9_conv.utf8_to_text = utf8_Cp_to_iso_8859_9_conv;
  con_Iso_8859_9_conv.utf8_first_cp = ISO_8859_9_FIRST_CP;
  con_Iso_8859_9_conv.utf8_last_cp = ISO_8859_9_LAST_CP;
}

/*
 * intl_text_dbcs_to_utf8() - converts a buffer containing text with DBCS
 *                encoding to UTF-8
 *
 *   return: error code
 *   in_buf(in): buffer
 *   in_size(in): size of input string (NUL terminator not included)
 *   out_buf(int/out) : output buffer : uses the pre-allocated buffer passed
 *          as input or a new allocated buffer; NULL if conversion
 *          is not required
 *   out_size(out): size of string (NUL terminator not included)
 */
int
intl_text_dbcs_to_utf8 (const char *in_buf, const int in_size, char **out_buf, int *out_size)
{
  return intl_text_dbcs_to_utf8_ext (lang_get_txt_conv (), (const unsigned char *) in_buf, in_size,
                     (unsigned char **) out_buf, out_size);
}

/*
 * intl_text_dbcs_to_utf8_ext() - converts a buffer containing text with DBCS
 *				  encoding to UTF-8
 *
 *   return: NO_ERROR, ER_OUT_OF_VIRTUAL_MEMORY if the output buffer cannot
 *	     be allocated, ER_GENERIC_ERROR if the supplied output buffer is
 *	     too small
 *   t(in): text conversion data (TEXT_CONVERSION)
 *   in_buf(in): buffer
 *   in_size(in): size of input string (NUL terminator not included)
 *   out_buf(in/out) : output buffer : uses the pre-allocated buffer passed
 *		    as input or a new buffer allocated with malloc when
 *		    *out_buf is NULL; set to NULL if conversion is not
 *		    required (input has only ASCII bytes)
 *   out_size(in/out): on input, size of the pre-allocated buffer (must be at
 *		    least 3 * in_size + 1); on output, size of converted
 *		    string (NUL terminator not included). Not changed when
 *		    conversion is not required.
 */
int
intl_text_dbcs_to_utf8_ext (void *t, const unsigned char *in_buf, const int in_size, unsigned char **out_buf,
			    int *out_size)
{
  TEXT_CONVERSION *conv;
  const unsigned char *src;
  const unsigned char *src_end;
  unsigned char *dest;
  int required_size;

  assert (in_buf != NULL);
  assert (out_buf != NULL);
  assert (out_size != NULL);
  assert (t != NULL);

  conv = (TEXT_CONVERSION *) t;
  src_end = in_buf + in_size;

  /* look for the first byte outside of ASCII range */
  for (src = in_buf; src < src_end; src++)
    {
      if (*src >= 0x80)
	{
	  break;
	}
    }

  if (src >= src_end)
    {
      /* only ASCII bytes : nothing to convert */
      *out_buf = NULL;
      return NO_ERROR;
    }

  /* a DBCS text may contain ASCII characters (encoded with 1 byte) which may expand to maximum 2 bytes in UTF-8
   * and DBCS characters (2 bytes) which may expand to maximum 3 bytes in UTF-8; Also it may contain single byte
   * characters which may expand to 3 bytes characters in UTF-8 Apply a safe expansion of 3 */
  required_size = in_size * 3 + 1;

  if (*out_buf != NULL)
    {
      /* caller supplied buffer : it must be able to hold the worst case */
      if (*out_size < required_size)
	{
	  er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_GENERIC_ERROR, 0);
	  return ER_GENERIC_ERROR;
	}
    }
  else
    {
      *out_buf = (unsigned char *) malloc (required_size);
      if (*out_buf == NULL)
	{
	  er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_OUT_OF_VIRTUAL_MEMORY, 1, (size_t) (required_size));
	  return ER_OUT_OF_VIRTUAL_MEMORY;
	}
    }

  assert (conv->text_last_cp > 0);

  src = in_buf;
  dest = *out_buf;

  while (src < src_end)
    {
      unsigned char *after_char;
      unsigned int text_cp;

      text_cp = intl_dbcs_to_cp (src, CAST_STRLEN (src_end - src), conv->byte_flag, &after_char);

      if (text_cp >= conv->text_first_cp && text_cp <= conv->text_last_cp)
	{
	  /* codepoint has an entry in conversion table : copy its UTF-8 bytes (at least one byte is always copied) */
	  unsigned char *mapped_bytes = conv->text_to_utf8[text_cp - conv->text_first_cp].bytes;
	  int remaining = conv->text_to_utf8[text_cp - conv->text_first_cp].size;

	  do
	    {
	      *dest++ = *mapped_bytes++;
	    }
	  while (--remaining > 0);
	}
      else if (text_cp < 0x80)
	{
	  /* ASCII outside the table : copied as it is */
	  *dest++ = *src;
	}
      else
	{
	  /* no mapping available */
	  *dest++ = '?';
	}

      assert (after_char <= src_end);
      src = after_char;
    }

  *dest = '\0';
  *out_size = CAST_STRLEN (dest - *(out_buf));

  return NO_ERROR;
}

/*
 * intl_text_utf8_to_dbcs() - converts a buffer containing UTF-8 text
 *			      to DBCS encoding, using the text conversion
 *			      returned by lang_get_txt_conv ()
 *
 *   return: NO_ERROR, ER_OUT_OF_VIRTUAL_MEMORY if the output buffer cannot
 *	     be allocated, ER_GENERIC_ERROR if the supplied output buffer is
 *	     too small
 *   in_buf(in): buffer
 *   in_size(in): size of input string (NUL terminator not included)
 *   out_buf(in/out) : output buffer : uses the pre-allocated buffer passed
 *		    as input or a new buffer allocated with malloc when
 *		    *out_buf is NULL; set to NULL if conversion is not
 *		    required (input has only ASCII bytes)
 *   out_size(in/out): on input, size of the pre-allocated buffer (must be at
 *		    least in_size + 1); on output, size of output string
 *		    (NUL terminator not counted). Not changed when conversion
 *		    is not required.
 */
int
intl_text_utf8_to_dbcs (const char *in_buf, const int in_size, char **out_buf, int *out_size)
{
  TEXT_CONVERSION *conv = lang_get_txt_conv ();
  const unsigned char *src;
  const unsigned char *src_end;
  unsigned char *dest;
  unsigned char *after_char = NULL;

  assert (in_buf != NULL);
  assert (out_buf != NULL);
  assert (out_size != NULL);
  assert (conv != NULL);

  src_end = (const unsigned char *) in_buf + in_size;

  /* look for the first byte outside of ASCII range */
  for (src = (const unsigned char *) in_buf; src < src_end; src++)
    {
      if (*src >= 0x80)
	{
	  break;
	}
    }

  if (src >= src_end)
    {
      /* only ASCII bytes : nothing to convert */
      *out_buf = NULL;
      return NO_ERROR;
    }

  if (*out_buf != NULL)
    {
      /* caller supplied buffer : must hold as many bytes as the input, plus terminator */
      if (*out_size < in_size + 1)
	{
	  er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_GENERIC_ERROR, 0);
	  return ER_GENERIC_ERROR;
	}
    }
  else
    {
      *out_buf = (char *) malloc (in_size + 1);
      if (*out_buf == NULL)
	{
	  er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_OUT_OF_VIRTUAL_MEMORY, 1, (size_t) (in_size + 1));
	  return ER_OUT_OF_VIRTUAL_MEMORY;
	}
    }

  assert (conv->utf8_last_cp > 0);

  src = (const unsigned char *) in_buf;
  dest = (unsigned char *) *out_buf;

  while (src < src_end)
    {
      unsigned int cp = 0;

      if (*src < 0x80)
	{
	  /* ASCII : copied as it is */
	  *dest++ = *src++;
	  continue;
	}

      cp = intl_utf8_to_cp (src, CAST_STRLEN (src_end - src), &after_char);
      src = after_char;

      if (cp >= conv->utf8_first_cp && cp <= conv->utf8_last_cp)
	{
	  /* codepoint has an entry in conversion table : copy its DBCS bytes */
	  unsigned char *mapped_bytes = conv->utf8_to_text[cp - conv->utf8_first_cp].bytes;
	  int remaining = conv->utf8_to_text[cp - conv->utf8_first_cp].size;

	  assert (remaining >= 1);
	  do
	    {
	      *dest++ = *mapped_bytes++;
	    }
	  while (--remaining > 0);
	}
      else if (cp > 0x80)
	{
	  /* no mapping available (this includes the error value returned for malformed UTF-8) */
	  *dest++ = '?';
	}
      else
	{
	  /* NOTE(review): an unmapped U+0080 is written as raw byte 0x80 - confirm this boundary is intended */
	  *dest++ = (unsigned char) cp;
	}
    }

  *dest = '\0';
  *out_size = CAST_STRLEN (dest - (unsigned char *) *(out_buf));

  return NO_ERROR;
}

/*
 * intl_fast_iso88591_to_utf8() - converts a buffer containing text with ISO
 *				  8859-1 encoding to UTF-8
 *
 *   return: 0 conversion ok, 1 conversion done, but invalid characters were
 *	     found (each of them is replaced with '?')
 *   in_buf(in): buffer
 *   in_size(in): size of input string (NUL terminator not included)
 *   out_buf(in) : pointer to the output buffer; the buffer must be supplied
 *		   by caller (nothing is allocated here); up to 2 bytes are
 *		   written for each input byte
 *   out_size(out): size of converted string; no NUL terminator is appended
 *
 *  Note : bytes 00 - 7E are copied, bytes 7F - 9F are considered invalid,
 *	   bytes A0 - FF are encoded on 2 bytes.
 */
int
intl_fast_iso88591_to_utf8 (const unsigned char *in_buf, const int in_size, unsigned char **out_buf, int *out_size)
{
  unsigned char *dest = NULL;
  int invalid_found = 0;
  int i;

  assert (in_size > 0);
  assert (in_buf != NULL);
  assert (out_buf != NULL);
  assert (out_size != NULL);

  dest = (unsigned char *) *out_buf;

  for (i = 0; i < in_size; i++)
    {
      const unsigned char ch = in_buf[i];

      if (ch >= 0xa0)
	{
	  /* 2 bytes UTF-8 sequence : 110000xx 10xxxxxx */
	  *dest++ = (unsigned char) (0xc0 | (ch >> 6));
	  *dest++ = (unsigned char) (0x80 | (ch & 0x3f));
	}
      else if (ch >= 0x7f)
	{
	  /* ISO 8859-1 characters in this range are not valid */
	  *dest++ = '?';
	  invalid_found = 1;
	}
      else
	{
	  *dest++ = ch;
	}
    }

  *out_size = CAST_STRLEN (dest - *(out_buf));

  return invalid_found;
}

/*
 * intl_euckr_to_iso88591() - converts a buffer containing EUCKR text to
 *			      ISO88591
 *
 *   return: 0 conversion ok, 1 conversion done, but invalid characters were
 *	     found (each of them is replaced with '?')
 *   in_buf(in): buffer
 *   in_size(in): size of input string (NUL terminator not included)
 *   out_buf(in) : pointer to the output buffer; the buffer must be supplied
 *		   by caller (nothing is allocated here); one byte is written
 *		   for each input character
 *   out_size(out): size of converted string; no NUL terminator is appended
 *
 *  Note : a character is kept only if its Unicode codepoint is in ranges
 *	   20 - 7E or A0 - FF; any other character, any byte which cannot
 *	   start a character and any truncated character is replaced.
 */
int
intl_euckr_to_iso88591 (const unsigned char *in_buf, const int in_size, unsigned char **out_buf, int *out_size)
{
  const unsigned char *src = NULL;
  const unsigned char *src_end = NULL;
  unsigned char *dest = NULL;
  int status = 0;

  assert (in_size > 0);
  assert (in_buf != NULL);
  assert (out_buf != NULL);
  assert (out_size != NULL);

  src = in_buf;
  src_end = in_buf + in_size;
  dest = (unsigned char *) *out_buf;

  while (src < src_end)
    {
      unsigned int unicode_cp = 0;
      bool has_cp = false;
      int consumed = 1;

      if (*src < 0x80)
	{
	  /* ASCII : copied as it is */
	  *dest++ = *src++;
	  continue;
	}

      if (*src >= 0xa1 && *src < 0xff && src_end - src >= 2)
	{
	  /* two bytes are consumed, even when the second one is not valid */
	  consumed = 2;

	  if (src[1] >= 0xa1 && src[1] < 0xff)
	    {
	      /* KSC5601 two-bytes character */
	      unsigned char ksc_buf[2];

	      ksc_buf[0] = src[0] - 0x80;
	      ksc_buf[1] = src[1] - 0x80;

	      has_cp = (ksc5601_mbtowc (&unicode_cp, ksc_buf, 2) > 0);
	    }
	}
      else if (*src == 0x8f && src_end - src >= 3)
	{
	  /* three bytes are consumed, even when the trailing ones are not valid */
	  consumed = 3;

	  if (src[1] >= 0xa1 && src[1] < 0xff && src[2] >= 0xa1 && src[2] < 0xff)
	    {
	      /* JISX0212 three bytes character */
	      unsigned char jis_buf[2];

	      jis_buf[0] = src[1] - 0x80;
	      jis_buf[1] = src[2] - 0x80;

	      has_cp = (jisx0212_mbtowc (&unicode_cp, jis_buf, 2) > 0);
	    }
	}
      /* else : EUC-KR byte not valid, or character truncated at end of buffer */

      if (has_cp && unicode_cp > 0x1F && unicode_cp <= 0xFF && (unicode_cp < 0x7F || unicode_cp > 0x9F))
	{
	  *dest++ = (unsigned char) unicode_cp;
	}
      else
	{
	  *dest++ = '?';
	  status = 1;
	}

      src += consumed;
    }

  *out_size = CAST_STRLEN (dest - *(out_buf));

  return status;
}

/*
 * intl_euckr_to_utf8() - transcodes EUC-KR text (optionally containing
 *			  three-byte JISX0212 sequences) into UTF-8
 *
 *   return: 0 when every character was converted, 1 when at least one
 *	     input sequence was replaced with '?'
 *   in_buf(in): input bytes
 *   in_size(in): number of input bytes (NUL terminator not included)
 *   out_buf(in/out): *out_buf is the destination buffer; this function only
 *		      writes through it (no allocation, no bounds check)
 *   out_size(out): number of bytes written (NUL terminator not included)
 *
 *  Note: a lead byte without enough following bytes is replaced by a single
 *	  '?' and only that byte is consumed; a complete but invalid 2 or 3
 *	  byte sequence is replaced by one '?' and consumed entirely.
 */
int
intl_euckr_to_utf8 (const unsigned char *in_buf, const int in_size, unsigned char **out_buf, int *out_size)
{
  const unsigned char *src = NULL;
  const unsigned char *src_end = NULL;
  unsigned char *dst = NULL;
  unsigned char code_buf[2];
  unsigned int code_point;
  int consumed;
  int is_converted;
  int written;
  int found_invalid = 0;

  assert (in_size > 0);
  assert (in_buf != NULL);
  assert (out_buf != NULL);
  assert (out_size != NULL);

  src = in_buf;
  src_end = in_buf + in_size;
  dst = (unsigned char *) *out_buf;

  while (src < src_end)
    {
      /* ASCII is copied as it is */
      if (*src < 0x80)
	{
	  *dst++ = *src++;
	  continue;
	}

      is_converted = 0;

      if (*src >= 0xa1 && *src < 0xff && src_end - src >= 2)
	{
	  /* candidate KSC5601 two-bytes character */
	  consumed = 2;
	  if (src[1] >= 0xa1 && src[1] < 0xff)
	    {
	      code_buf[0] = src[0] - 0x80;
	      code_buf[1] = src[1] - 0x80;
	      is_converted = (ksc5601_mbtowc (&code_point, code_buf, 2) > 0);
	    }
	}
      else if (*src == 0x8f && src_end - src >= 3)
	{
	  /* candidate JISX0212 three-bytes character (0x8f prefix) */
	  consumed = 3;
	  if (src[1] >= 0xa1 && src[1] < 0xff && src[2] >= 0xa1 && src[2] < 0xff)
	    {
	      code_buf[0] = src[1] - 0x80;
	      code_buf[1] = src[2] - 0x80;
	      is_converted = (jisx0212_mbtowc (&code_point, code_buf, 2) > 0);
	    }
	}
      else
	{
	  /* byte cannot start a character here */
	  consumed = 1;
	}

      if (is_converted)
	{
	  written = intl_cp_to_utf8 (code_point, dst);
	  dst += written;
	}
      else
	{
	  *dst++ = '?';
	  found_invalid = 1;
	}

      src += consumed;
    }

  *out_size = CAST_STRLEN (dst - *(out_buf));

  return found_invalid;
}

/*
 * intl_utf8_to_iso88591() - transcodes UTF-8 text into ISO-8859-1
 *
 *   return: 0 when every character was converted, 1 when at least one
 *	     character was replaced with '?'
 *   in_buf(in): input bytes
 *   in_size(in): number of input bytes (NUL terminator not included)
 *   out_buf(in/out): *out_buf is the destination buffer; this function only
 *		      writes through it (no allocation, no bounds check)
 *   out_size(out): number of bytes written (NUL terminator not included)
 *
 *  Note: exactly one output byte is produced for each decoded character.
 *	  Codepoints above 0xFF and codepoints in 0x7F - 0x9F are replaced.
 */
int
intl_utf8_to_iso88591 (const unsigned char *in_buf, const int in_size, unsigned char **out_buf, int *out_size)
{
  const unsigned char *src = NULL;
  const unsigned char *src_end = NULL;
  unsigned char *dst = NULL;
  unsigned char *next_char;
  unsigned int code_point = 0;
  int found_invalid = 0;

  assert (in_size > 0);
  assert (in_buf != NULL);
  assert (out_buf != NULL);
  assert (out_size != NULL);

  src = in_buf;
  src_end = in_buf + in_size;
  dst = (unsigned char *) *out_buf;

  while (src < src_end)
    {
      code_point = intl_utf8_to_cp (src, CAST_STRLEN (src_end - src), &next_char);

      if (code_point <= 0xFF && (code_point < 0x7F || code_point > 0x9F))
	{
	  /* representable as a single ISO-8859-1 byte */
	  *dst++ = code_point;
	}
      else
	{
	  *dst++ = '?';
	  found_invalid = 1;
	}

      src = next_char;
    }

  *out_size = CAST_STRLEN (dst - *(out_buf));

  return found_invalid;
}

/*
 * intl_utf8_to_euckr() - transcodes UTF-8 text into EUC-KR, falling back to
 *			  three-byte JISX0212 sequences (0x8f prefix)
 *
 *   return: 0 when every character was converted, 1 when at least one
 *	     character was replaced with '?'
 *   in_buf(in): input bytes
 *   in_size(in): number of input bytes (NUL terminator not included)
 *   out_buf(in/out): *out_buf is the destination buffer; this function only
 *		      writes through it (no allocation, no bounds check)
 *   out_size(out): number of bytes written (NUL terminator not included)
 */
int
intl_utf8_to_euckr (const unsigned char *in_buf, const int in_size, unsigned char **out_buf, int *out_size)
{
  const unsigned char *src = NULL;
  const unsigned char *src_end = NULL;
  unsigned char *dst = NULL;
  int found_invalid = 0;

  assert (in_size > 0);
  assert (in_buf != NULL);
  assert (out_buf != NULL);
  assert (out_size != NULL);

  src = in_buf;
  src_end = in_buf + in_size;
  dst = (unsigned char *) *out_buf;

  while (src < src_end)
    {
      unsigned char euc_buf[2];
      unsigned char *next_char;
      unsigned int code_point;
      int conv_result;

      /* ASCII is copied as it is */
      if (*src < 0x80)
	{
	  *dst++ = *src++;
	  continue;
	}

      code_point = intl_utf8_to_cp (src, CAST_STRLEN (src_end - src), &next_char);
      if (code_point != 0xffffffff)
	{
	  /* first choice : KSC5601 */
	  conv_result = ksc5601_wctomb (euc_buf, code_point, CAST_STRLEN (next_char - src));

	  assert (conv_result != 0);
	  if (conv_result == 2)
	    {
	      *dst++ = euc_buf[0] + 0x80;
	      *dst++ = euc_buf[1] + 0x80;
	      src = next_char;
	      continue;
	    }

	  if (conv_result == RET_ILUNI)
	    {
	      /* no KSC5601 mapping : second choice is JISX0212 */
	      conv_result = jisx0212_wctomb (euc_buf, code_point, CAST_STRLEN (next_char - src));

	      assert (conv_result != 0);
	      if (conv_result == 2)
		{
		  *dst++ = 0x8f;
		  *dst++ = euc_buf[0] + 0x80;
		  *dst++ = euc_buf[1] + 0x80;
		  src = next_char;
		  continue;
		}
	    }
	}

      /* illegal Unicode or impossible to convert to EUC */
      src = next_char;
      *dst++ = '?';
      found_invalid = 1;
    }

  *out_size = CAST_STRLEN (dst - *(out_buf));

  return found_invalid;
}

/*
 * intl_iso88591_to_euckr() - transcodes ISO-8859-1 text into EUC-KR, falling
 *			      back to three-byte JISX0212 sequences
 *
 *   return: 0 when every character was converted, 1 when at least one
 *	     character was replaced with '?'
 *   in_buf(in): input bytes
 *   in_size(in): number of input bytes (NUL terminator not included)
 *   out_buf(in/out): *out_buf is the destination buffer; this function only
 *		      writes through it (no allocation, no bounds check)
 *   out_size(out): number of bytes written (NUL terminator not included)
 *
 *  Note: one input byte may produce up to three output bytes.
 *	  Bytes in range 0x80 - 0x9F are always replaced with '?'.
 */
int
intl_iso88591_to_euckr (const unsigned char *in_buf, const int in_size, unsigned char **out_buf, int *out_size)
{
  const unsigned char *src = NULL;
  const unsigned char *src_end = NULL;
  unsigned char *dst = NULL;
  int found_invalid = 0;

  assert (in_size > 0);
  assert (in_buf != NULL);
  assert (out_buf != NULL);
  assert (out_size != NULL);

  src = in_buf;
  src_end = in_buf + in_size;
  dst = (unsigned char *) *out_buf;

  while (src < src_end)
    {
      unsigned char euc_buf[2];
      unsigned char ch;
      int conv_result;

      ch = *src++;

      /* ASCII is copied as it is */
      if (ch < 0x80)
	{
	  *dst++ = ch;
	  continue;
	}

      if (ch >= 0xa0)
	{
	  /* first choice : KSC5601 */
	  conv_result = ksc5601_wctomb (euc_buf, ch, 2);

	  assert (conv_result != 0);
	  if (conv_result == 2)
	    {
	      *dst++ = euc_buf[0] + 0x80;
	      *dst++ = euc_buf[1] + 0x80;
	      continue;
	    }

	  if (conv_result == RET_ILUNI)
	    {
	      /* no KSC5601 mapping : second choice is JISX0212 */
	      conv_result = jisx0212_wctomb (euc_buf, ch, 2);

	      assert (conv_result != 0);
	      if (conv_result == 2)
		{
		  *dst++ = 0x8f;
		  *dst++ = euc_buf[0] + 0x80;
		  *dst++ = euc_buf[1] + 0x80;
		  continue;
		}
	    }
	}

      /* illegal ISO8859-1 or impossible to convert to EUC */
      *dst++ = '?';
      found_invalid = 1;
    }

  *out_size = CAST_STRLEN (dst - *(out_buf));

  return found_invalid;
}

/* monetary symbols */

/*
 * UTF-8 encoding of money symbols - maps to DB_CURRENCY enum type.
 * The row index is used as the DB_CURRENCY value by intl_is_currency_symbol ()
 * and intl_get_money_UTF8_symbol (). Each row holds at most 3 bytes plus the
 * NUL terminator.
 * NOTE(review): the last row is presumably the one at index DB_CURRENCY_NULL;
 * confirm against the enum definition before adding new rows.
 */
static char moneysymbols_utf8[][4] = {
  "$",              /* dollar sign */
  "\xc2\xa5",           /* U+00A5 yen sign - Japanese money symbol */
  "\xc2\xa3",           /* U+00A3 pound sign - British money symbol */
  "\xe2\x82\xa9",       /* U+20A9 won sign - Korean money symbol */
  "TL",             /* TL - Turkish money symbol */
  "KHR",            /* KHR - Cambodian money symbol */
  "CNY",            /* Chinese money symbol */
  "INR",            /* Indian money symbol */
  "RUB",            /* Russian money symbol */
  "AUD",            /* Australian money symbol */
  "CAD",            /* Canadian money symbol */
  "BRL",            /* Brazilian money symbol */
  "RON",            /* Romanian money symbol */
  "EUR",            /* euro symbol */
  "CHF",            /* Swiss money symbol */
  "DKK",            /* Danish money symbol */
  "NOK",            /* Norwegian money symbol */
  "BGN",            /* Bulgarian money symbol */
  "VND",            /* Vietnamese dong symbol */
  "CZK",            /* Czech koruna symbol */
  "PLN",            /* Polish zloty symbol */
  "SEK",            /* Swedish krona symbol */
  "HRK",            /* Croatian kuna symbol */
  "RSD",            /* Serbian dinar symbol */
  "\xc2\xa4"            /* U+00A4 generic currency symbol (last row) */
};

/*
 * Encoding (for console output) of money symbols - maps to DB_CURRENCY enum
 * type. Used for values printing in CSQL.
 * The row index is used as the DB_CURRENCY value by intl_is_currency_symbol ()
 * and intl_get_money_symbol_console (). Every row is plain ASCII, at most
 * 3 characters plus the NUL terminator.
 */
static char moneysymbols_console[][4] = {
  "$",              /* dollar sign */
  "Y",              /* Japanese yen */
  "&",              /* British pound */
  "\\",             /* Korean won (a single backslash character) */
  "TL",             /* Turkish lira */
  "KHR",            /* Cambodian riel */
  "CNY",            /* Chinese renminbi */
  "INR",            /* Indian rupee */
  "RUB",            /* Russian ruble */
  "AUD",            /* Australian dollar */
  "CAD",            /* Canadian dollar */
  "BRL",            /* Brazilian real */
  "RON",            /* Romanian leu */
  "EUR",            /* euro */
  "CHF",            /* Swiss franc */
  "DKK",            /* Danish krone */
  "NOK",            /* Norwegian krone */
  "BGN",            /* Bulgarian lev */
  "VND",            /* Vietnamese dong */
  "CZK",            /* Czech koruna */
  "PLN",            /* Polish zloty */
  "SEK",            /* Swedish krona */
  "HRK",            /* Croatian kuna */
  "RSD",            /* Serbian dinar */
  ""                /* generic currency symbol - add new symbols before this */
};

/*
 * Encoding (for grammars) of money symbols - maps to DB_CURRENCY enum type.
 * The row index is used as the DB_CURRENCY value by intl_is_currency_symbol ()
 * (CURRENCY_CHECK_MODE_GRAMMAR) and intl_get_money_symbol_grammar ().
 * Most rows are a backslash followed by a 2 or 3 letter code, hence the row
 * width of 5 (4 characters plus the NUL terminator).
 */
static char moneysymbols_grammar[][5] = {
  "$",              /* dollar sign */
  "\xa1\xef",           /* Japanese yen : two bytes, 0xA1 0xEF (presumably an EUC encoded yen sign - confirm) */
  "\\GBP",          /* British pound */
  "\\KRW",          /* Korean won */
  "\\TL",           /* Turkish lira */
  "\\KHR",          /* Cambodian riel */
  "\\CNY",          /* Chinese renminbi */
  "\\INR",          /* Indian rupee */
  "\\RUB",          /* Russian ruble */
  "\\AUD",          /* Australian dollar */
  "\\CAD",          /* Canadian dollar */
  "\\BRL",          /* Brazilian real */
  "\\RON",          /* Romanian leu */
  "\\EUR",          /* euro */
  "\\CHF",          /* Swiss franc */
  "\\DKK",          /* Danish krone */
  "\\NOK",          /* Norwegian krone */
  "\\BGN",          /* Bulgarian lev */
  "\\VND",          /* Vietnamese dong */
  "\\CZK",          /* Czech koruna */
  "\\PLN",          /* Polish zloty */
  "\\SEK",          /* Swedish krona */
  "\\HRK",          /* Croatian kuna */
  "\\RSD",          /* Serbian dinar */
  ""                /* generic currency symbol - add new symbols before this */
};

/*
 * ISO encoding of money symbols (3 letter codes) - maps to DB_CURRENCY enum
 * type.
 * The row index is used as the DB_CURRENCY value by intl_is_currency_symbol ()
 * (CURRENCY_CHECK_MODE_ISO) and intl_get_money_ISO_symbol ().
 */
static char moneysymbols_iso_codes[][4] = {
  "USD",            /* US dollar */
  "JPY",            /* Japanese yen */
  "GBP",            /* British pound */
  "KRW",            /* Korean won */
  "TRY",            /* Turkish lira */
  "KHR",            /* Cambodian riel */
  "CNY",            /* Chinese renminbi */
  "INR",            /* Indian rupee */
  "RUB",            /* Russian ruble */
  "AUD",            /* Australian dollar */
  "CAD",            /* Canadian dollar */
  "BRL",            /* Brazilian real */
  "RON",            /* Romanian leu */
  "EUR",            /* euro */
  "CHF",            /* Swiss franc */
  "DKK",            /* Danish krone */
  "NOK",            /* Norwegian krone */
  "BGN",            /* Bulgarian lev */
  "VND",            /* Vietnamese dong */
  "CZK",            /* Czech koruna */
  "PLN",            /* Polish zloty */
  "SEK",            /* Swedish krona */
  "HRK",            /* Croatian kuna */
  "RSD",            /* Serbian dinar */
  ""                /* generic currency symbol - add new symbols before this */
};

/*
 * Escaped ISO encoding of money symbols (backslash followed by the 3 letter
 * ISO code) - maps to DB_CURRENCY enum type.
 * The row index is used as the DB_CURRENCY value by intl_is_currency_symbol ()
 * (CURRENCY_CHECK_MODE_ESC_ISO) and intl_get_money_esc_ISO_symbol ().
 */
static char moneysymbols_esc_iso_codes[][5] = {
  "\\USD",          /* US dollar */
  "\\JPY",          /* Japanese yen */
  "\\GBP",          /* British pound */
  "\\KRW",          /* Korean won */
  "\\TRY",          /* Turkish lira */
  "\\KHR",          /* Cambodian riel */
  "\\CNY",          /* Chinese renminbi */
  "\\INR",          /* Indian rupee */
  "\\RUB",          /* Russian ruble */
  "\\AUD",          /* Australian dollar */
  "\\CAD",          /* Canadian dollar */
  "\\BRL",          /* Brazilian real */
  "\\RON",          /* Romanian leu */
  "\\EUR",          /* euro */
  "\\CHF",          /* Swiss franc */
  "\\DKK",          /* Danish krone */
  "\\NOK",          /* Norwegian krone */
  "\\BGN",          /* Bulgarian lev */
  "\\VND",          /* Vietnamese dong */
  "\\CZK",          /* Czech koruna */
  "\\PLN",          /* Polish zloty */
  "\\SEK",          /* Swedish krona */
  "\\HRK",          /* Croatian kuna */
  "\\RSD",          /* Serbian dinar */
  ""                /* generic currency symbol - add new symbols before this */
};

/*
 * ISO88591 encoding of money symbols - maps to DB_CURRENCY enum type.
 * The row index is used as the DB_CURRENCY value by intl_is_currency_symbol ()
 * (CURRENCY_CHECK_MODE_ISO88591) and intl_get_money_ISO88591_symbol ().
 * Only yen and pound use a dedicated single byte sign; the other rows are
 * ASCII letters.
 */
static char moneysymbols_iso88591_codes[][4] = {
  "$",              /* dollar sign */
  "\xa5",           /* Japanese yen : byte 0xA5 (yen sign) */
  "\xa3",           /* British pound : byte 0xA3 (pound sign) */
  "KRW",            /* Korean won */
  "TL",             /* Turkish lira */
  "KHR",            /* Cambodian riel */
  "CNY",            /* Chinese renminbi */
  "INR",            /* Indian rupee */
  "RUB",            /* Russian ruble */
  "AUD",            /* Australian dollar */
  "CAD",            /* Canadian dollar */
  "BRL",            /* Brazilian real */
  "RON",            /* Romanian leu */
  "EUR",            /* euro */
  "CHF",            /* Swiss franc */
  "DKK",            /* Danish krone */
  "NOK",            /* Norwegian krone */
  "BGN",            /* Bulgarian lev */
  "VND",            /* Vietnamese dong */
  "CZK",            /* Czech koruna */
  "PLN",            /* Polish zloty */
  "SEK",            /* Swedish krona */
  "HRK",            /* Croatian kuna */
  "RSD",            /* Serbian dinar */
  ""                /* generic currency symbol - add new symbols before this */
};

/*
 * intl_currency_prefix_len() - checks if a currency symbol is a prefix of a
 *				string
 *   return: length of symbol when it is not empty and src starts with it,
 *	     0 otherwise
 *   src(in): string to inspect
 *   src_len(in): length in bytes of src
 *   symbol(in): NUL terminated currency symbol
 */
static int
intl_currency_prefix_len (const char *src, const int src_len, const char *symbol)
{
  int symbol_len = strlen (symbol);

  if (symbol_len <= 0 || src_len < symbol_len)
    {
      return 0;
    }

  return (memcmp (src, symbol, symbol_len) == 0) ? symbol_len : 0;
}

/*
 * intl_is_currency_symbol() - check if a string starts with a currency
 *                             symbol
 *   return: true if a symbol of a currency other than DB_CURRENCY_NULL is
 *	     found at the start of src
 *   src(in): NUL terminated string
 *   currency(out): currency found, or DB_CURRENCY_NULL
 *   symbol_size(out): size in bytes of the matched symbol, or 0
 *   check_mode(in): bit mask selecting the symbol tables to search
 *
 *  Note: tables are searched in this order : ISO, escaped ISO, UTF-8,
 *	  console, grammar, ISO88591. The search stops at the first symbol
 *	  matched, even when that symbol belongs to DB_CURRENCY_NULL (in which
 *	  case false is returned, with the outputs set for that symbol).
 */
bool
intl_is_currency_symbol (const char *src, DB_CURRENCY * currency, int *symbol_size,
			 const CURRENCY_CHECK_MODE check_mode)
{
  int idx;
  int matched_len;
  int src_len = strlen (src);

  assert (currency != NULL);
  assert (symbol_size != NULL);

  *currency = DB_CURRENCY_NULL;
  *symbol_size = 0;

  if (src_len <= 0)
    {
      /* nothing can match an empty string */
      return false;
    }

  if (check_mode & CURRENCY_CHECK_MODE_ISO)
    {
      for (idx = 0; idx < (int) DIM (moneysymbols_iso_codes); idx++)
	{
	  matched_len = intl_currency_prefix_len (src, src_len, moneysymbols_iso_codes[idx]);
	  if (matched_len > 0)
	    {
	      *currency = (DB_CURRENCY) idx;
	      *symbol_size = matched_len;
	      return *currency != DB_CURRENCY_NULL;
	    }
	}
    }

  if (check_mode & CURRENCY_CHECK_MODE_ESC_ISO)
    {
      for (idx = 0; idx < (int) DIM (moneysymbols_esc_iso_codes); idx++)
	{
	  matched_len = intl_currency_prefix_len (src, src_len, moneysymbols_esc_iso_codes[idx]);
	  if (matched_len > 0)
	    {
	      *currency = (DB_CURRENCY) idx;
	      *symbol_size = matched_len;
	      return *currency != DB_CURRENCY_NULL;
	    }
	}
    }

  if (check_mode & CURRENCY_CHECK_MODE_UTF8)
    {
      for (idx = 0; idx < (int) DIM (moneysymbols_utf8); idx++)
	{
	  matched_len = intl_currency_prefix_len (src, src_len, moneysymbols_utf8[idx]);
	  if (matched_len > 0)
	    {
	      *currency = (DB_CURRENCY) idx;
	      *symbol_size = matched_len;
	      return *currency != DB_CURRENCY_NULL;
	    }
	}
    }

  if (check_mode & CURRENCY_CHECK_MODE_CONSOLE)
    {
      for (idx = 0; idx < (int) DIM (moneysymbols_console); idx++)
	{
	  matched_len = intl_currency_prefix_len (src, src_len, moneysymbols_console[idx]);
	  if (matched_len > 0)
	    {
	      *currency = (DB_CURRENCY) idx;
	      *symbol_size = matched_len;
	      return *currency != DB_CURRENCY_NULL;
	    }
	}
    }

  /* grammar symbols are searched from the last to the first; the original rationale was that "\TL" (turkish lira)
   * could be miss-interpreted as "\" (korean won), although no grammar symbol is currently a prefix of another */
  if (check_mode & CURRENCY_CHECK_MODE_GRAMMAR)
    {
      for (idx = (int) DIM (moneysymbols_grammar) - 1; idx >= 0; idx--)
	{
	  matched_len = intl_currency_prefix_len (src, src_len, moneysymbols_grammar[idx]);
	  if (matched_len > 0)
	    {
	      *currency = (DB_CURRENCY) idx;
	      *symbol_size = matched_len;
	      return *currency != DB_CURRENCY_NULL;
	    }
	}
    }

  if (check_mode & CURRENCY_CHECK_MODE_ISO88591)
    {
      for (idx = 0; idx < (int) DIM (moneysymbols_iso88591_codes); idx++)
	{
	  matched_len = intl_currency_prefix_len (src, src_len, moneysymbols_iso88591_codes[idx]);
	  if (matched_len > 0)
	    {
	      *currency = (DB_CURRENCY) idx;
	      *symbol_size = matched_len;
	      return *currency != DB_CURRENCY_NULL;
	    }
	}
    }

  return false;
}

/*
 * intl_get_money_symbol() - returns the currency symbol encoded for the
 *			     requested codeset
 *   return: currency symbol
 *   currency(in): currency code
 *   codeset(in): required codeset
 *
 *  Note: the console symbol is returned for any codeset other than ISO88591
 *	  and UTF8.
 */
char *
intl_get_money_symbol (const DB_CURRENCY currency, INTL_CODESET codeset)
{
  if (codeset == INTL_CODESET_ISO88591)
    {
      return intl_get_money_ISO88591_symbol (currency);
    }

  if (codeset == INTL_CODESET_UTF8)
    {
      return intl_get_money_UTF8_symbol (currency);
    }

  return intl_get_money_symbol_console (currency);
}

/*
 * intl_get_money_symbol_console() - returns the currency symbol printable on
 *				     console
 *   return: currency symbol; the symbol of DB_CURRENCY_NULL when the
 *	     currency code is beyond the end of the symbols table
 *   currency(in): currency code
 */
char *
intl_get_money_symbol_console (const DB_CURRENCY currency)
{
  const int is_beyond_table = (currency >= (int) DIM (moneysymbols_console));
  const int row = is_beyond_table ? (int) DB_CURRENCY_NULL : (int) currency;

  return moneysymbols_console[row];
}

/*
 * intl_get_money_symbol_grammar() - returns the currency symbol recognizable
 *				     by grammar
 *   return: currency symbol; the symbol of DB_CURRENCY_NULL when the
 *	     currency code is beyond the end of the symbols table
 *   currency(in): currency code
 */
char *
intl_get_money_symbol_grammar (const DB_CURRENCY currency)
{
  const int is_beyond_table = (currency >= (int) DIM (moneysymbols_grammar));
  const int row = is_beyond_table ? (int) DB_CURRENCY_NULL : (int) currency;

  return moneysymbols_grammar[row];
}

/*
 * intl_get_currency_symbol_position() - tells where the currency symbol is
 *					 placed when a monetary value is
 *					 printed
 *   return: position indicator : 0 : before value, 1 : after value
 *   currency(in): currency code
 *
 *  Note : currently only the turkish lira is printed after the value
 */
int
intl_get_currency_symbol_position (const DB_CURRENCY currency)
{
  return (currency == DB_CURRENCY_TL) ? 1 : 0;
}

/*
 * intl_get_money_ISO_symbol() - returns the ISO code of a currency, as a
 *				 3 letter string
 *   return: currency ISO symbol; the symbol of DB_CURRENCY_NULL when the
 *	     currency code is beyond the end of the symbols table
 *   currency(in): currency code
 */
char *
intl_get_money_ISO_symbol (const DB_CURRENCY currency)
{
  const int is_beyond_table = (currency >= (int) DIM (moneysymbols_iso_codes));
  const int row = is_beyond_table ? (int) DB_CURRENCY_NULL : (int) currency;

  return moneysymbols_iso_codes[row];
}

/*
 * intl_get_money_esc_ISO_symbol() - returns the escaped ISO code of a
 *				     currency (backslash + ISO code)
 *   return: currency escaped ISO symbol; the symbol of DB_CURRENCY_NULL when
 *	     the currency code is beyond the end of the symbols table
 *   currency(in): currency code
 */
char *
intl_get_money_esc_ISO_symbol (const DB_CURRENCY currency)
{
  const int is_beyond_table = (currency >= (int) DIM (moneysymbols_esc_iso_codes));
  const int row = is_beyond_table ? (int) DB_CURRENCY_NULL : (int) currency;

  return moneysymbols_esc_iso_codes[row];
}

/*
 * intl_get_money_UTF8_symbol() - returns the UTF-8 encoded symbol of a
 *				  currency (1 to 3 bytes)
 *   return: currency UTF8 symbol; the symbol of DB_CURRENCY_NULL when the
 *	     currency code is beyond the end of the symbols table
 *   currency(in): currency code
 */
char *
intl_get_money_UTF8_symbol (const DB_CURRENCY currency)
{
  const int is_beyond_table = (currency >= (int) DIM (moneysymbols_utf8));
  const int row = is_beyond_table ? (int) DB_CURRENCY_NULL : (int) currency;

  return moneysymbols_utf8[row];
}

/*
 * intl_get_money_ISO88591_symbol() - returns the ISO88591 encoded symbol of a
 *				      currency (1 to 3 bytes)
 *   return: currency ISO88591 symbol; the symbol of DB_CURRENCY_NULL when the
 *	     currency code is beyond the end of the symbols table
 *   currency(in): currency code
 */
char *
intl_get_money_ISO88591_symbol (const DB_CURRENCY currency)
{
  const int is_beyond_table = (currency >= (int) DIM (moneysymbols_iso88591_codes));
  const int row = is_beyond_table ? (int) DB_CURRENCY_NULL : (int) currency;

  return moneysymbols_iso88591_codes[row];
}

/*
 * intl_binary_utf8_lead_info() - classifies the first byte of a multi-byte
 *				  UTF-8 sequence
 *   return: total number of bytes of the sequence (2, 3 or 4), or 0 when
 *	     the byte cannot start a valid multi-byte sequence
 *   lead(in): first byte of the sequence
 *   second_min(out): lowest value accepted for the second byte
 *   second_max(out): highest value accepted for the second byte
 *
 *  Note: the third and fourth bytes (when present) are always restricted to
 *	  80 - BF; only the second byte has a range depending on the lead.
 */
static int
intl_binary_utf8_lead_info (const unsigned char lead, unsigned char *second_min, unsigned char *second_max)
{
  *second_min = 0x80;
  *second_max = 0xbf;

  /* 80 - BF : continuation bytes; C0 - C1 : 2 byte overlongs of 00 - 7F; F5 - FF : never used by UTF-8 */
  if (lead < 0xc2 || lead > 0xf4)
    {
      return 0;
    }

  if (lead <= 0xdf)
    {
      return 2;
    }

  if (lead <= 0xef)
    {
      if (lead == 0xe0)
	{
	  /* exclude 3 byte overlongs */
	  *second_min = 0xa0;
	}
      else if (lead == 0xed)
	{
	  /* exclude surrogates U+D800 .. U+DFFF */
	  *second_max = 0x9f;
	}
      return 3;
    }

  if (lead == 0xf0)
    {
      /* exclude 4 byte overlongs */
      *second_min = 0x90;
    }
  else if (lead == 0xf4)
    {
      /* exclude codepoints above U+10FFFF */
      *second_max = 0x8f;
    }
  return 4;
}

/*
 * intl_binary_to_utf8 - converts a buffer from binary to utf8, replacing
 *			 invalid UTF-8 sequences with '?'
 *
 *   in_buf(in): buffer
 *   in_size(in): size of input string (NUL terminator not included)
 *   out_buf(in/out): *out_buf is the destination buffer; this function only
 *		      writes through it (no allocation). The output is never
 *		      longer than the input.
 *   out_size(out): size of string (NUL terminator not included)
 *
 *  Valid ranges:
 *    - 1 byte : 00 - 7F
 *    - 2 bytes: C2 - DF , 80 - BF		       (U+80 .. U+7FF)
 *    - 3 bytes: E0	 , A0 - BF , 80 - BF	       (U+800 .. U+FFF)
 *		 E1 - EC , 80 - BF , 80 - BF	       (U+1000 .. U+CFFF)
 *		 ED	 , 80 - 9F , 80 - BF	       (U+D000 .. U+D7FF)
 *		 EE - EF , 80 - BF , 80 - BF	       (U+E000 .. U+FFFF)
 *    - 4 bytes: F0	 , 90 - BF , 80 - BF , 80 - BF (U+10000 .. U+3FFFF)
 *		 F1 - F3 , 80 - BF , 80 - BF , 80 - BF (U+40000 .. U+FFFFF)
 *		 F4	 , 80 - 8F , 80 - BF , 80 - BF (U+100000 .. U+10FFFF)
 *
 *  Replacement rules:
 *    - a byte which cannot start a sequence (80 - C1, F5 - FF) is replaced
 *	by one '?'
 *    - a sequence cut by the end of the buffer is replaced by one '?'
 *    - a sequence broken by an out of range byte is replaced by one '?';
 *	the offending byte is consumed together with the bytes before it
 */
void
intl_binary_to_utf8 (const unsigned char *in_buf, const int in_size, unsigned char **out_buf, int *out_size)
{
  const unsigned char *p = in_buf;
  const unsigned char *p_end = in_buf + in_size;
  unsigned char *p_out = (unsigned char *) *out_buf;

  while (p < p_end)
    {
      unsigned char range_min;
      unsigned char range_max;
      int seq_len;
      int consumed;
      int is_valid;
      int i;

      if (*p < 0x80)
	{
	  *p_out++ = *p++;
	  continue;
	}

      seq_len = intl_binary_utf8_lead_info (*p, &range_min, &range_max);
      if (seq_len == 0)
	{
	  /* includes F5 - FF, which must be consumed to guarantee progress */
	  *p_out++ = '?';
	  p++;
	  continue;
	}

      is_valid = 1;
      for (consumed = 1; consumed < seq_len; consumed++)
	{
	  if (p_end - p <= consumed)
	    {
	      /* sequence is cut by the end of buffer */
	      is_valid = 0;
	      break;
	    }

	  if (p[consumed] < range_min || p[consumed] > range_max)
	    {
	      /* the offending byte is dropped with the sequence */
	      consumed++;
	      is_valid = 0;
	      break;
	    }

	  /* bytes after the second one are plain continuation bytes */
	  range_min = 0x80;
	  range_max = 0xbf;
	}

      if (is_valid)
	{
	  /* byte by byte copy : stays correct even if output overlaps input */
	  for (i = 0; i < seq_len; i++)
	    {
	      *p_out++ = p[i];
	    }
	}
      else
	{
	  *p_out++ = '?';
	}

      p += consumed;
    }

  *out_size = (int) (p_out - *out_buf);
}

/*
 * intl_binary_to_euckr - converts a buffer from binary to euckr, replacing
 *			  invalid euckr sequences with '?'
 *
 *   in_buf(in): buffer
 *   in_size(in): size of input string (NUL terminator not included)
 *   out_buf(in/out): *out_buf is the destination buffer; this function only
 *		      writes through it (no allocation). The output is never
 *		      longer than the input.
 *   out_size(out): size of string (NUL terminator not included)
 *
 *  Accepted sequences:
 *    - 1 byte : 00 - 7F
 *    - 2 bytes: A1 - FE , 00 - FF
 *    - 3 bytes: 8F (SS3) , 00 - FF , 00 - FF
 *
 *  Replacement rules:
 *    - any other byte (80 - 8E, 90 - A0, FF) is replaced by one '?'
 *    - a 2 or 3 byte sequence cut by the end of the buffer is replaced by
 *	one '?'
 *
 *  Note: the bytes following the first one are copied without validation.
 */
void
intl_binary_to_euckr (const unsigned char *in_buf, const int in_size, unsigned char **out_buf, int *out_size)
{
  const unsigned char *p = in_buf;
  const unsigned char *p_end = in_buf + in_size;
  unsigned char *p_out = (unsigned char *) *out_buf;

  while (p < p_end)
    {
      int char_len;
      int i;

      if (*p < 0x80)
	{
	  *p_out++ = *p++;
	  continue;
	}

      if (*p == SS3)
	{
	  /* SS3 byte value starts a 3 bytes character */
	  char_len = 3;
	}
      else if (UTF8_BYTE_IN_RANGE (*p, 0xa1, 0xfe))
	{
	  char_len = 2;
	}
      else
	{
	  *p_out++ = '?';
	  p++;
	  continue;
	}

      /* check the remaining size before moving the pointer, so that it never goes beyond the end of buffer */
      if (p_end - p < char_len)
	{
	  *p_out++ = '?';
	  break;
	}

      for (i = 0; i < char_len; i++)
	{
	  *p_out++ = *p++;
	}
    }

  *out_size = CAST_STRLEN (p_out - *(out_buf));
}