Skip to content

File unicode_support.c

File List > base > unicode_support.c

Go to the documentation of this file

/*
 * Copyright 2008 Search Solution Corporation
 * Copyright 2016 CUBRID Corporation
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 */

/*
 * unicode_support.c : Unicode support
 */
#include "config.h"

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>

#include "porting.h"

#include "locale_support.h"
#include "intl_support.h"
#include "language_support.h"
#include "error_manager.h"
#include "utility.h"
#include "environment_variable.h"
#include "system_parameter.h"
#include "unicode_support.h"
// XXX: SHOULD BE THE LAST INCLUDE HEADER
#include "memory_wrapper.hpp"

#if defined(SA_MODE)

/* Name of the default Unicode data file; it is resolved against the locale
 * data directory with envvar_localedatadir_file (). */
#define UNICODEDATA_FILE "unicodedata.txt"

/* Unicode data file constants */
/* Size of the buffer used to read one line of the data file with fgets (). */
#define UNICODE_FILE_LINE_SIZE 512
/* Exclusive upper bound of the field index visited by the line parser. */
#define UNICODE_FILE_FIELDS 14

/* Field position : starting from 0 (fields are separated by ';') */
#define UNICODE_FILE_GENERAL_CAT_POS        2
#define UNICODE_FILE_CHAR_DECOMPOSITION_MAPPING 5
#define UNICODE_FILE_UPPER_CASE_MAP     12
#define UNICODE_FILE_LOWER_CASE_MAP     13

/*
 * General category identifiers recognized by the data file loader.
 * CAT_Cn is zero, so it is the category of every entry of the zero-filled
 * unicode_data table whose category string is not found in list_gen_cat.
 */
typedef enum
{
  CAT_Cn = 0,           /* other, not assigned */
  CAT_Lu,           /* Letter, uppercase */
  CAT_Ll,           /* Letter, lowercase */

  /* add new values here */
  CAT_MAX           /* maximum category value */
} GENERAL_CATEG_ID;

/* Association between a general category identifier and the string that is
 * compared against the general category field of the data file. */
typedef struct
{
  GENERAL_CATEG_ID id;		/* category identifier */
  const char *val;		/* category name as written in the data file */
} GENERAL_CATEGORY;


/* available list of general categories (id, name); the loader matches the
 * general category field against these names with strcmp (), categories that
 * are not listed here leave the character with the default CAT_Cn */
static const GENERAL_CATEGORY list_gen_cat[] = {
  {CAT_Lu, "Lu"},
  {CAT_Ll, "Ll"},
};

/* Canonical combining class descriptor.
 * NOTE(review): not referenced by the code visible in this part of the file -
 * confirm whether it is still needed. */
typedef struct
{
  int id;			/* numeric identifier of the class */
  char *std_val;        /* Standard value as defined by Unicode Consortium */
} CANONICAL_COMBINING_CLASS;

/* The maximum number of codepoints to which a single codepoint can be
 * rewritten in canonically fully decomposed form.
 * It is also the capacity of UNICODE_CHAR.unicode_mapping.
 */
#define UNICODE_DECOMP_MAP_CP_COUNT 4

/* Per-codepoint record of the data loaded from the Unicode data file; the
 * unicode_data table holds one record for each codepoint below
 * MAX_UNICODE_CHARS. */
typedef struct
{
  /* general category for this character */
  GENERAL_CATEG_ID gen_cat_id;

  /* lower/upper case forms; the loader initializes element 0 of both arrays
   * with the codepoint itself and overwrites them from the case mapping
   * fields (lower case for CAT_Lu characters, upper case for CAT_Ll) */
  uint32 lower_cp[INTL_CASING_EXPANSION_MULTIPLIER];
  uint32 upper_cp[INTL_CASING_EXPANSION_MULTIPLIER];

  /* number of codepoints parsed from the decomposition mapping field; stays 0
   * when the field is empty or starts with '<' */
  char unicode_mapping_cp_count;

  /* codepoints of the decomposition mapping, as read from the file */
  uint32 unicode_mapping[UNICODE_DECOMP_MAP_CP_COUNT];
  /* codepoint count of the full decomposition, computed by
   * count_full_decomp_cp () while processing normalization */
  char unicode_full_decomp_cp_count;

} UNICODE_CHAR;

/* One decomposition step of a codepoint, kept as a heap-allocated list of
 * codepoints while the normalization data is being built. */
typedef struct
{
  uint32 cp;            /* Codepoint value */
  uint32 *map;          /* A fully decomposed canonical mapping stored as codepoints */

  int size;         /* The number of codepoints in the mapping */
  bool is_full_decomp;      /* true - if map is fully decomposed false - otherwise. */
} UNICODE_CP_MAPPING;

/* Table of MAX_UNICODE_CHARS records allocated and filled by
 * load_unicode_data (); released by unicode_free_data (). */
static UNICODE_CHAR *unicode_data = NULL;
/* Largest number of codepoints found in a lower/upper case mapping of the
 * loaded data file. */
static int unicode_data_lower_mult = 1;
static int unicode_data_upper_mult = 1;

/* Path of the data file currently held in unicode_data; used to skip
 * reloading when the same file is requested again. */
static char last_unicode_file[PATH_MAX] = { 0 };

/* Helpers private to this file. */
static int load_unicode_data (const LOCALE_DATA * ld);
static int create_alphabet (ALPHABET_DATA * a, const int max_letters, const int lower_multiplier,
                const int upper_multiplier);
static int count_full_decomp_cp (int cp);
static int count_decomp_steps (int cp);
static int unicode_make_normalization_data (UNICODE_CP_MAPPING * decomp_maps, LOCALE_DATA * ld);
/* qsort () comparators used while building the normalization data. */
static int comp_func_unicode_cp_mapping (const void *arg1, const void *arg2);
static int comp_func_grouping_unicode_cp_mapping (const void *arg1, const void *arg2);


/*
 * unicode_process_alphabet() - Process alphabet (casing) data for given
 *				locale
 *
 * Returns: error code (NO_ERROR or ER_LOC_GEN)
 * ld(in/out) : locale data structure
 * is_verbose(in) : when true, progress messages are printed to stdout
 *
 *  Note : two alphabets are built : the user alphabet (ld->alphabet), which
 *	   also receives the tailoring rules, and the identifier alphabet
 *	   (ld->identif_alphabet), which never does.
 *	   The alphabet mode selects the data source : 2 - ASCII only,
 *	   1 - user supplied Unicode data file, 0 - default Unicode data file.
 *	   Case tables are laid out as 'multiplier' consecutive slots per
 *	   codepoint, so the entry of codepoint cp starts at cp * multiplier.
 */
int
unicode_process_alphabet (LOCALE_DATA * ld, bool is_verbose)
{
  ALPHABET_DATA *a = NULL;
  ALPHABET_DATA *i_a = NULL;
  ALPHABET_TAILORING *a_tailoring = NULL;
  char unicode_file[PATH_MAX];
  char err_msg[ERR_MSG_SIZE];
  int er_status = NO_ERROR;
  uint32 cp;
  int lower_mult = 1;
  int upper_mult = 1;
  int i;

  assert (ld != NULL);

  a = &(ld->alphabet);
  i_a = &(ld->identif_alphabet);
  a_tailoring = &(ld->alpha_tailoring);

  /* compute lower and upper multiplier from rules */
  for (i = 0; i < a_tailoring->count_rules; i++)
    {
      TRANSFORM_RULE *tf_rule = &(a_tailoring->rules[i]);
      uint32 dummy_array;
      int dummy;
      int dest_len;

      /* only the number of destination codepoints is of interest here */
      dest_len = intl_utf8_to_cp_list ((unsigned char *) (tf_rule->dest), tf_rule->dest_size, &dummy_array, 1, &dummy);

      if (dest_len > INTL_CASING_EXPANSION_MULTIPLIER)
        {
          snprintf (err_msg, sizeof (err_msg) - 1,
                    "Invalid alphabet rule :%d" ". Destination buffer contains more than 2 characters", i);
          LOG_LOCALE_ERROR (err_msg, ER_LOC_GEN, true);
          er_status = ER_LOC_GEN;
          goto error;
        }
      if (tf_rule->type == TR_UPPER)
        {
          upper_mult = MAX (upper_mult, dest_len);
        }
      else
        {
          assert (tf_rule->type == TR_LOWER);
          lower_mult = MAX (lower_mult, dest_len);
        }
    }

  if (a_tailoring->alphabet_mode == 2)
    {
      if (is_verbose)
        {
          printf ("Creating ASCII alphabet\n");
        }

      /* ASCII alphabet */
      er_status = create_alphabet (a, a_tailoring->sett_max_letters, lower_mult, upper_mult);
      if (er_status != NO_ERROR)
        {
          goto error;
        }

      er_status = create_alphabet (i_a, a_tailoring->sett_max_letters, 1, 1);
      if (er_status != NO_ERROR)
        {
          goto error;
        }

      /* Each codepoint maps to itself by default. The user alphabet is indexed with its multipliers, so that the
       * layout agrees with the one used below when the tailoring rules are applied; the remaining slots of an
       * entry stay zero, as left by create_alphabet (). */
      for (cp = 0; (int) cp < a->l_count; cp++)
        {
          a->upper_cp[cp * upper_mult] = cp;
          a->lower_cp[cp * lower_mult] = cp;

          i_a->upper_cp[cp] = cp;
          i_a->lower_cp[cp] = cp;
        }

      /* ASCII letters : link each lower case letter with its upper case pair; letters outside the configured
       * alphabet size are skipped to avoid writing past the tables */
      for (cp = (uint32) 'a'; cp <= (uint32) 'z' && (int) cp < a->l_count; cp++)
        {
          const uint32 upper_letter = cp - ('a' - 'A');

          a->upper_cp[cp * upper_mult] = upper_letter;
          a->lower_cp[upper_letter * lower_mult] = cp;

          i_a->upper_cp[cp] = upper_letter;
          i_a->lower_cp[upper_letter] = cp;
        }

      i_a->a_type = a->a_type = ALPHABET_ASCII;
    }
  else
    {
      if (a_tailoring->alphabet_mode == 1)
        {
          strncpy (unicode_file, a_tailoring->unicode_data_file, sizeof (unicode_file));
          unicode_file[sizeof (unicode_file) - 1] = '\0';

          /* a user defined unicode file is handled as a tailored alphabet */
          a->a_type = ALPHABET_TAILORED;
        }
      else
        {
          assert (a_tailoring->alphabet_mode == 0);
          envvar_localedatadir_file (unicode_file, sizeof (unicode_file), UNICODEDATA_FILE);

          a->a_type = ALPHABET_UNICODE;
        }

      /* unicode_file is used only for the message below; the file that is actually read is selected by
       * load_unicode_data () from ld */
      if (is_verbose)
        {
          printf ("Creating UNICODE alphabet from: %s\n", unicode_file);
        }

      er_status = load_unicode_data (ld);
      if (er_status != NO_ERROR)
        {
          goto error;
        }

      /* the user alphabet must fit both the rules and the data file */
      lower_mult = MAX (lower_mult, unicode_data_lower_mult);
      upper_mult = MAX (upper_mult, unicode_data_upper_mult);

      er_status = create_alphabet (a, a_tailoring->sett_max_letters, lower_mult, upper_mult);
      if (er_status != NO_ERROR)
        {
          goto error;
        }

      er_status =
        create_alphabet (i_a, a_tailoring->sett_max_letters, unicode_data_lower_mult, unicode_data_upper_mult);
      if (er_status != NO_ERROR)
        {
          goto error;
        }

      for (cp = 0; (int) cp < a_tailoring->sett_max_letters; cp++)
        {
          /* set lower and upper case of each codepoint to itself */
          a->lower_cp[cp * lower_mult] = cp;
          a->upper_cp[cp * upper_mult] = cp;

          i_a->lower_cp[cp * unicode_data_lower_mult] = cp;
          i_a->upper_cp[cp * unicode_data_upper_mult] = cp;

          /* overwrite with UnicodeData */
          if (unicode_data[cp].gen_cat_id == CAT_Lu)
            {
              memcpy (&(a->lower_cp[cp * lower_mult]), &(unicode_data[cp].lower_cp),
                      sizeof (uint32) * MIN (unicode_data_lower_mult, lower_mult));

              memcpy (&(i_a->lower_cp[cp * unicode_data_lower_mult]), &(unicode_data[cp].lower_cp),
                      sizeof (uint32) * unicode_data_lower_mult);
            }
          else if (unicode_data[cp].gen_cat_id == CAT_Ll)
            {
              memcpy (&(a->upper_cp[cp * upper_mult]), &(unicode_data[cp].upper_cp),
                      sizeof (uint32) * MIN (unicode_data_upper_mult, upper_mult));

              memcpy (&(i_a->upper_cp[cp * unicode_data_upper_mult]), &(unicode_data[cp].upper_cp),
                      sizeof (uint32) * unicode_data_upper_mult);
            }
        }
    }

  if (a_tailoring->count_rules > 0)
    {
      a->a_type = ALPHABET_TAILORED;
    }

  if (is_verbose && a_tailoring->count_rules > 0)
    {
      printf ("Applying %d alphabet tailoring rules\n", a_tailoring->count_rules);
    }
  /* apply tailoring rules on user-alphabet only */
  for (i = 0; i < a_tailoring->count_rules; i++)
    {
      TRANSFORM_RULE *tf_rule = &(a_tailoring->rules[i]);
      uint32 cp_src;
      uint32 cp_dest[INTL_CASING_EXPANSION_MULTIPLIER];
      int src_cp_count = 0;
      int src_len = 0;
      int dest_cp_count = 0;
      int dest_len = 0;

      /* source codepoints */
      /* TODO : allow casing compression (many CPs for source) */
      src_len = intl_utf8_to_cp_list ((unsigned char *) (tf_rule->src), tf_rule->src_size, &cp_src, 1, &src_cp_count);

      if (src_len != 1 || src_len != src_cp_count)
        {
          LOG_LOCALE_ERROR ("Invalid source buffer for alphabet rule", ER_LOC_GEN, true);
          er_status = ER_LOC_GEN;
          goto error;
        }

      if ((int) cp_src >= a_tailoring->sett_max_letters)
        {
          LOG_LOCALE_ERROR ("Codepoint for casing rule exceeds maximum" " allowed value", ER_LOC_GEN, true);
          er_status = ER_LOC_GEN;
          goto error;
        }

      /* destination codepoints */
      dest_len =
        intl_utf8_to_cp_list ((unsigned char *) (tf_rule->dest), tf_rule->dest_size, cp_dest,
                              INTL_CASING_EXPANSION_MULTIPLIER, &dest_cp_count);

      if (dest_len < 1 || dest_len != dest_cp_count)
        {
          LOG_LOCALE_ERROR ("Invalid destination buffer for alphabet rule", ER_LOC_GEN, true);
          er_status = ER_LOC_GEN;
          goto error;
        }

      /* replace the whole entry of the source codepoint : clear all its slots, then store the new casing */
      if (tf_rule->type == TR_UPPER)
        {
          assert (dest_cp_count <= upper_mult);
          memset (&(a->upper_cp[cp_src * upper_mult]), 0, upper_mult * sizeof (uint32));
          memcpy (&(a->upper_cp[cp_src * upper_mult]), cp_dest, sizeof (uint32) * MIN (dest_cp_count, upper_mult));
        }
      else
        {
          assert (tf_rule->type == TR_LOWER);

          assert (dest_cp_count <= lower_mult);
          memset (&(a->lower_cp[cp_src * lower_mult]), 0, lower_mult * sizeof (uint32));
          memcpy (&(a->lower_cp[cp_src * lower_mult]), cp_dest, sizeof (uint32) * MIN (dest_cp_count, lower_mult));
        }
    }

error:

  /* common exit : er_status is NO_ERROR unless one of the steps above failed */
  return er_status;
}

/*
 * load_unicode_data() - Loads the UNICODEDATA file (standardised
 *           and availabe at Unicode.org).
 * Returns: error code
 * ld(in) : locale data
 */
static int
load_unicode_data (const LOCALE_DATA * ld)
{
  FILE *fp = NULL;
  char err_msg[ERR_MSG_SIZE];
  int status = NO_ERROR;
  char str[UNICODE_FILE_LINE_SIZE];
  int line_count = 0;

  assert (ld != NULL);

  /* Build the full filepath to the selected (or default) Unicode data file */
  if (ld->unicode_mode == 0)
    {
      /* using default Unicode file */
      envvar_localedatadir_file ((char *) (ld->unicode_data_file), sizeof (ld->unicode_data_file), UNICODEDATA_FILE);
    }
  else
    {
      assert (ld->unicode_mode == 1);
    }

  if (strcmp (ld->unicode_data_file, last_unicode_file) == 0)
    {
      assert (unicode_data != NULL);
      return status;
    }

  unicode_free_data ();

  unicode_data = (UNICODE_CHAR *) malloc (MAX_UNICODE_CHARS * sizeof (UNICODE_CHAR));
  if (unicode_data == NULL)
    {
      LOG_LOCALE_ERROR ("memory allocation failed", ER_LOC_GEN, true);
      status = ER_LOC_GEN;
      goto error;
    }

  memset (unicode_data, 0, MAX_UNICODE_CHARS * sizeof (UNICODE_CHAR));

  fp = fopen_ex (ld->unicode_data_file, "rt");
  if (fp == NULL)
    {
      snprintf_dots_truncate (err_msg, sizeof (err_msg) - 1, "Cannot open file %s", ld->unicode_data_file);
      LOG_LOCALE_ERROR (err_msg, ER_LOC_GEN, true);
      status = ER_LOC_GEN;
      goto error;
    }

  while (fgets (str, sizeof (str), fp))
    {
      uint32 cp = 0;
      int result = 0;
      int i;
      char *s, *end, *end_p;
      UNICODE_CHAR *uc = NULL;

      line_count++;

      result = str_to_uint32 (&cp, &end_p, str, 16);
      /* skip Unicode values above 0xFFFF */
      if (result != 0 || cp >= MAX_UNICODE_CHARS)
    {
      continue;
    }

      s = str;
      uc = &(unicode_data[cp]);
      uc->lower_cp[0] = cp;
      uc->upper_cp[0] = cp;

      /* next field */
      s = strchr (s, ';');

      assert (s != NULL);
      if (s == NULL)
    {
      continue;
    }
      s++;

      for (i = 1; i < UNICODE_FILE_FIELDS; i++)
    {
      char str_p[UNICODE_FILE_LINE_SIZE];
      char *save;
      int cp_count;

      strcpy (str_p, s);

      end = strtok_r (str_p, ";", &save);

      /* check generic category */
      if (i == UNICODE_FILE_GENERAL_CAT_POS)
        {
          int cat_idx;

          for (cat_idx = 0; cat_idx < (int) (sizeof (list_gen_cat) / sizeof (list_gen_cat[0])); cat_idx++)
        {
          if (strcmp (list_gen_cat[cat_idx].val, str_p) == 0)
            {
              uc->gen_cat_id = list_gen_cat[cat_idx].id;
              break;
            }
        }
        }
      else if (i == UNICODE_FILE_UPPER_CASE_MAP && uc->gen_cat_id == CAT_Ll)
        {
          /* lower case codepoints */
          cp_count = string_to_int_array (str_p, uc->upper_cp, INTL_CASING_EXPANSION_MULTIPLIER, " ");
          if (cp_count > INTL_CASING_EXPANSION_MULTIPLIER)
        {
          snprintf_dots_truncate (err_msg, sizeof (err_msg) - 1,
                      "Invalid line %d" " of file %s contains more than 2 characters for "
                      "upper case definition", line_count, ld->unicode_data_file);
          LOG_LOCALE_ERROR (err_msg, ER_LOC_GEN, true);
          status = ER_LOC_GEN;
          goto error;
        }

          unicode_data_upper_mult = (cp_count > unicode_data_upper_mult) ? cp_count : unicode_data_upper_mult;
        }
      else if (i == UNICODE_FILE_LOWER_CASE_MAP && uc->gen_cat_id == CAT_Lu)
        {
          /* lower case codepoints */
          cp_count = string_to_int_array (str_p, uc->lower_cp, INTL_CASING_EXPANSION_MULTIPLIER, " ");

          if (cp_count > INTL_CASING_EXPANSION_MULTIPLIER)
        {
          snprintf_dots_truncate (err_msg, sizeof (err_msg) - 1,
                      "Invalid line %d" " of file %s contains more than 2 characters for "
                      "lower case definition", line_count, ld->unicode_data_file);
          LOG_LOCALE_ERROR (err_msg, ER_LOC_GEN, true);
          status = ER_LOC_GEN;
          goto error;
        }

          unicode_data_lower_mult = (cp_count > unicode_data_lower_mult) ? cp_count : unicode_data_lower_mult;
        }
      else if (i == UNICODE_FILE_CHAR_DECOMPOSITION_MAPPING)
        {
          uc->unicode_mapping_cp_count = 0; /* init */

          do
        {
          /* if no decomposition available, or decomposition is a compatibility one, discard the specified
           * decomposition */
          if (str_p[0] == ';' || str_p[0] == '<')
            {
              break;
            }

          if (str_p != NULL)
            {
              uc->unicode_mapping_cp_count =
            string_to_int_array (str_p, uc->unicode_mapping, UNICODE_DECOMP_MAP_CP_COUNT, " ");
            }
          break;
        }
          while (0);
        }

      s = strchr (s, ';');
      if (s == NULL)
        {
          break;
        }

      s++;
    }
    }

  assert (fp != NULL);
  fclose (fp);

  strncpy (last_unicode_file, ld->unicode_data_file, sizeof (last_unicode_file) - 1);
  last_unicode_file[sizeof (last_unicode_file) - 1] = '\0';

  return status;

error:

  if (fp != NULL)
    {
      fclose (fp);
    }

  unicode_free_data ();

  return status;
}

/*
 * unicode_free_data() - Releases the table loaded from the Unicode data
 *			 file and forgets the name of the loaded file.
 * Returns: nothing
 */
void
unicode_free_data (void)
{
  /* free () accepts NULL, so the table can be released unconditionally */
  free (unicode_data);
  unicode_data = NULL;

  /* an empty name means that no file is currently loaded */
  last_unicode_file[0] = '\0';
}

/*
 * create_alphabet () - resets an alphabet and allocates its zero-filled
 *			lower and upper case tables
 * Returns: error code (NO_ERROR or ER_LOC_GEN)
 * a(in/out) : alphabet
 * max_letters(in) : number of letters in alphabet
 * lower_multiplier(in) : lower case multiplier
 * upper_multiplier(in) : upper case multiplier
 *
 *  Note : each table holds 'max_letters' entries of 'multiplier' codepoints.
 *	   At most one of the multipliers may be greater than 1.
 */
static int
create_alphabet (ALPHABET_DATA * a, const int max_letters, const int lower_multiplier, const int upper_multiplier)
{
  size_t lower_bytes;
  size_t upper_bytes;

  assert (a != NULL);
  assert (lower_multiplier > 0 && lower_multiplier <= INTL_CASING_EXPANSION_MULTIPLIER);
  assert (upper_multiplier > 0 && upper_multiplier <= INTL_CASING_EXPANSION_MULTIPLIER);

  /* this combination is rejected before the alphabet is touched */
  if (lower_multiplier > 1 && upper_multiplier > 1)
    {
      LOG_LOCALE_ERROR ("CUBRID does not support collations with both lower "
                        "and upper multipliers with values above 1.", ER_LOC_GEN, true);
      return ER_LOC_GEN;
    }

  memset (a, 0, sizeof (ALPHABET_DATA));

  if (max_letters <= 0 || max_letters > MAX_UNICODE_CHARS)
    {
      LOG_LOCALE_ERROR ("invalid number of letters", ER_LOC_GEN, true);
      return ER_LOC_GEN;
    }

  /* max_letters is known to be positive from here on */
  lower_bytes = max_letters * lower_multiplier * sizeof (uint32);
  upper_bytes = max_letters * upper_multiplier * sizeof (uint32);

  a->lower_cp = (uint32 *) malloc (lower_bytes);
  a->upper_cp = (uint32 *) malloc (upper_bytes);

  if (a->lower_cp == NULL || a->upper_cp == NULL)
    {
      LOG_LOCALE_ERROR ("memory allocation failed", ER_LOC_GEN, true);

      /* release whichever table was obtained; free () ignores NULL */
      free (a->lower_cp);
      a->lower_cp = NULL;
      free (a->upper_cp);
      a->upper_cp = NULL;

      return ER_LOC_GEN;
    }

  memset (a->lower_cp, 0, lower_bytes);
  memset (a->upper_cp, 0, upper_bytes);

  a->l_count = max_letters;
  a->lower_multiplier = lower_multiplier;
  a->upper_multiplier = upper_multiplier;

  return NO_ERROR;
}

/*
 * string_to_int_array() - builds a list of codepoints from a string
 *
 * Returns: count of codepoints found (0 when 's' is NULL)
 * s(in): nul-terminated string
 * cp_list(out): array of codepoints
 * cp_list_size(in): maximum allowed size of codepoint list
 * delims(in) : possible delimiters between values
 *
 *  Note : the string contains unsigned integers in hexadecimal.
 *	   Parsing stops at the first position that is not a valid value.
 *	   All the values found are counted, but only the first
 *	   'cp_list_size' are stored; the case of returned number of
 *	   codepoints greater than 'cp_list_size' should be handled as error.
 *
 */
int
string_to_int_array (char *s, uint32 * cp_list, const int cp_list_size, const char *delims)
{
  int count = 0;
  char *str;
  char *str_end;
  char *str_cursor = NULL;

  assert (cp_list != NULL);
  assert (delims != NULL);

  /* nothing to parse; checked before the string is measured */
  if (s == NULL)
    {
      return 0;
    }

  str = s;
  str_end = s + strlen (s);

  while (str < str_end)
    {
      int result = 0;
      uint32 val;

      result = str_to_uint32 (&val, &str_cursor, str, 16);
      /* stop on a conversion failure or when no character was consumed */
      if (result != 0 || str_cursor <= str)
        {
          break;
        }

      /* store while there is room, but keep counting so that the caller can detect the overflow */
      if (count < cp_list_size)
        {
          *cp_list++ = val;
        }
      count++;

      /* skip the delimiters that follow the value */
      while (str_cursor < str_end && strchr (delims, *str_cursor) != NULL)
        {
          str_cursor++;
        }
      str = str_cursor;
    }

  return count;
}

/*
 * unicode_process_normalization() - Process character decomposition mappings
 *			  imported from the Unicode data file, and prepare
 *			  the data structures required for converting strings
 *			  to fully composed.
 *
 * Returns: error code (NO_ERROR or ER_LOC_GEN)
 * ld(in/out) :	   locale data structure
 * is_verbose(in): enable or disable verbose mode
 *
 *  Note : the steps are : load the Unicode data, count the decomposition
 *	   steps of each codepoint, generate every decomposition step,
 *	   group the steps by their first codepoint, sort each group and
 *	   export the result into ld->unicode_normalization.
 *	   The step count is added to norm->unicode_mappings_count, which is
 *	   expected to be zero on entry.
 */
int
unicode_process_normalization (LOCALE_DATA * ld, bool is_verbose)
{
  int i, orig_mapping_count, curr_mapping, mapping_cursor;
  UNICODE_CP_MAPPING *um;
  UNICODE_CP_MAPPING *new_map;
  UNICODE_CHAR *uc;
  int mapping_start, mapping_count;
  UNICODE_NORMALIZATION *norm;
  uint32 cp, old_cp, j;
  int err_status = NO_ERROR;

  /* unicode_decomp_map_count[cp] = the number of decomposition steps of cp */
  int *unicode_decomp_map_count = NULL;
  /* all the decomposition steps of all the codepoints */
  UNICODE_CP_MAPPING *temp_list_unicode_decomp_maps = NULL;

  assert (ld != NULL);
  norm = &(ld->unicode_normalization);

  err_status = load_unicode_data (ld);
  if (err_status != NO_ERROR)
    {
      goto exit;
    }

  unicode_decomp_map_count = (int *) malloc (MAX_UNICODE_CHARS * sizeof (int));
  if (unicode_decomp_map_count == NULL)
    {
      LOG_LOCALE_ERROR ("memory allocation failed", ER_LOC_GEN, true);
      err_status = ER_LOC_GEN;
      goto exit;
    }
  memset (unicode_decomp_map_count, 0, MAX_UNICODE_CHARS * sizeof (int));

  /* Count the number of steps (buffers) necessary for the decomposition of each codepoint. */
  for (cp = 0; cp < MAX_UNICODE_CHARS; cp++)
    {
      uc = &(unicode_data[cp]);

      /* Only mappings of at least two codepoints, starting with a codepoint of the loaded table, are used. The
       * first codepoint indexes tables of MAX_UNICODE_CHARS entries, so MAX_UNICODE_CHARS itself is excluded. */
      if (uc->unicode_mapping_cp_count <= 1 || uc->unicode_mapping[0] >= MAX_UNICODE_CHARS)
        {
          unicode_decomp_map_count[cp] = 0;
        }
      else
        {
          uc->unicode_full_decomp_cp_count = count_full_decomp_cp (cp);
          unicode_decomp_map_count[cp] = count_decomp_steps (cp);
        }
      if (is_verbose)
        {
          printf ("CP : %04X\t\tDeco CP count: %2d\t\tDeco steps: %2d\n", cp, uc->unicode_full_decomp_cp_count,
                  unicode_decomp_map_count[cp]);
        }
      norm->unicode_mappings_count += unicode_decomp_map_count[cp];
    }

  if (is_verbose)
    {
      printf ("\nTotal number of composition maps (sum of deco steps) : %d\n", norm->unicode_mappings_count);
    }

  /* Prepare the generation of all decomposition steps for all codepoints */
  temp_list_unicode_decomp_maps =
    (UNICODE_CP_MAPPING *) malloc (norm->unicode_mappings_count * sizeof (UNICODE_CP_MAPPING));
  if (temp_list_unicode_decomp_maps == NULL)
    {
      LOG_LOCALE_ERROR ("memory allocation failed", ER_LOC_GEN, true);
      err_status = ER_LOC_GEN;
      goto exit;
    }
  memset (temp_list_unicode_decomp_maps, 0, norm->unicode_mappings_count * sizeof (UNICODE_CP_MAPPING));

  /* Copy mappings loaded from UnicodeData.txt */
  orig_mapping_count = 0;
  for (cp = 0; cp < MAX_UNICODE_CHARS; cp++)
    {
      if (unicode_decomp_map_count[cp] <= 0)
        {
          continue;
        }

      um = &(temp_list_unicode_decomp_maps[orig_mapping_count]);
      um->cp = cp;
      um->size = unicode_data[cp].unicode_mapping_cp_count;
      um->map = (uint32 *) malloc (um->size * sizeof (uint32));
      if (um->map == NULL)
        {
          LOG_LOCALE_ERROR ("memory allocation failed", ER_LOC_GEN, true);
          err_status = ER_LOC_GEN;
          goto exit;
        }
      memcpy (um->map, unicode_data[cp].unicode_mapping, um->size * sizeof (uint32));
      orig_mapping_count++;
    }

  /* Decompose each mapping, top-down, until no mapping can be further decomposed. Each mapping whose first
   * codepoint has a decomposition produces a new mapping, appended at mapping_cursor, in which that first
   * codepoint is replaced by its own mapping. The total number of mappings was computed above, as the sum of
   * unicode_decomp_map_count[cp]. */
  mapping_cursor = orig_mapping_count;
  curr_mapping = 0;
  while (curr_mapping < mapping_cursor)
    {
      if (mapping_cursor >= norm->unicode_mappings_count)
        {
          /* the list is full */
          break;
        }
      um = &(temp_list_unicode_decomp_maps[curr_mapping]);
      new_map = &(temp_list_unicode_decomp_maps[mapping_cursor]);

      if (um->size > 0 && um->map[0] < MAX_UNICODE_CHARS && unicode_decomp_map_count[um->map[0]] > 0)
        {
          const UNICODE_CHAR *first_uc = &(unicode_data[um->map[0]]);

          new_map->size = um->size - 1 + first_uc->unicode_mapping_cp_count;
          new_map->cp = um->cp;
          new_map->map = (uint32 *) malloc (new_map->size * sizeof (uint32));
          if (new_map->map == NULL)
            {
              LOG_LOCALE_ERROR ("memory allocation failed", ER_LOC_GEN, true);
              err_status = ER_LOC_GEN;
              goto exit;
            }

          /* the mapping of the first codepoint, followed by the rest of the current mapping */
          for (i = 0; i < new_map->size; i++)
            {
              if (i < first_uc->unicode_mapping_cp_count)
                {
                  new_map->map[i] = first_uc->unicode_mapping[i];
                }
              else
                {
                  new_map->map[i] = um->map[1 + i - first_uc->unicode_mapping_cp_count];
                }
            }
          mapping_cursor++;
          if (is_verbose)
            {
              printf ("\nNew mapping step : %04X -> ", um->cp);
              for (i = 0; i < new_map->size; i++)
                {
                  printf ("%04X ", new_map->map[i]);
                }
            }
        }
      curr_mapping++;
    }

  /* Every slot of the list must have been filled; the sorting and grouping below read the first codepoint of
   * each mapping, which does not exist for an unused slot. */
  if (mapping_cursor < norm->unicode_mappings_count)
    {
      LOG_LOCALE_ERROR ("inconsistent number of decomposition mappings", ER_LOC_GEN, true);
      err_status = ER_LOC_GEN;
      goto exit;
    }

  for (i = 0; i < norm->unicode_mappings_count; i++)
    {
      um = &(temp_list_unicode_decomp_maps[i]);
      if (um->size > 0 && (um->map[0] >= MAX_UNICODE_CHARS || unicode_decomp_map_count[um->map[0]] == 0))
        {
          /* This means that for um->cp, the um->map can't be further decomposed, thus being the fully decomposed
           * representation for um->cp. It will be marked as such. */
          um->is_full_decomp = true;
        }
    }

  /* Sort/group the decompositions in list_unicode_decomp_maps by the value of the first codepoint in each mapping. The
   * grouping is necessary for optimizing the future search for possible decompositions when putting a string in fully
   * composed form. */
  qsort (temp_list_unicode_decomp_maps, norm->unicode_mappings_count, sizeof (UNICODE_CP_MAPPING),
         comp_func_grouping_unicode_cp_mapping);

  /* Build starting indexes for each cp which is the first cp in a compact group of mappings */
  norm->unicode_mapping_index = (int *) malloc ((MAX_UNICODE_CHARS + 1) * sizeof (int));
  if (norm->unicode_mapping_index == NULL)
    {
      LOG_LOCALE_ERROR ("memory allocation failed", ER_LOC_GEN, true);
      err_status = ER_LOC_GEN;
      goto exit;
    }
  memset (norm->unicode_mapping_index, 0, (MAX_UNICODE_CHARS + 1) * sizeof (int));

  /* with no mapping at all there is no first group to start from, and the index stays zero-filled */
  if (norm->unicode_mappings_count > 0)
    {
      cp = temp_list_unicode_decomp_maps[0].map[0];
      mapping_start = 0;
      mapping_count = 1;
      for (i = 1; i < norm->unicode_mappings_count; i++)
        {
          if (temp_list_unicode_decomp_maps[i].map[0] == (uint32) cp)
            {
              mapping_count++;
            }
          else
            {
              /* a new group starts : close the current one and set the index of the codepoints in between */
              SET_MAPPING_INDEX (norm->unicode_mapping_index[cp], true, mapping_start);
              old_cp = cp;
              cp = (uint32) temp_list_unicode_decomp_maps[i].map[0];
              mapping_count = 1;
              mapping_start = i;
              for (j = old_cp + 1; j < cp; j++)
                {
                  SET_MAPPING_INDEX (norm->unicode_mapping_index[j], false, mapping_start);
                }
            }
        }
      SET_MAPPING_INDEX (norm->unicode_mapping_index[cp], true, mapping_start);
      SET_MAPPING_INDEX (norm->unicode_mapping_index[cp + 1], false, (mapping_start + mapping_count));
    }

  /* Sort descending each range of UNICODE_MAPPINGs from list_unicode_decomp_maps, having the same codepoint value in
   * UNICODE_MAPPING.map[0], using memcmp. The sorting is necessary for optimizing the future search for possible
   * decompositions when putting a string in fully composed form. */
  for (cp = 0; cp < MAX_UNICODE_CHARS; cp++)
    {
      int group_start = 0;
      int group_count = 0;

      if (!CP_HAS_MAPPINGS (norm->unicode_mapping_index[cp]))
        {
          continue;
        }
      /* a group ends where the offset stored for the next codepoint begins */
      group_start = GET_MAPPING_OFFSET (norm->unicode_mapping_index[cp]);
      group_count = GET_MAPPING_OFFSET (norm->unicode_mapping_index[cp + 1]) - group_start;

      qsort (temp_list_unicode_decomp_maps + group_start, group_count, sizeof (UNICODE_CP_MAPPING),
             comp_func_unicode_cp_mapping);
    }

  err_status = unicode_make_normalization_data (temp_list_unicode_decomp_maps, ld);

exit:
  /* the counters and the temporary list are needed only while building; free () ignores NULL */
  free (unicode_decomp_map_count);
  unicode_decomp_map_count = NULL;

  if (temp_list_unicode_decomp_maps != NULL)
    {
      /* the list was zero-filled, so the slots that were never used hold a NULL map */
      for (i = 0; i < norm->unicode_mappings_count; i++)
        {
          free (temp_list_unicode_decomp_maps[i].map);
        }
      free (temp_list_unicode_decomp_maps);
      temp_list_unicode_decomp_maps = NULL;
    }

  return err_status;
}

/*
 * count_full_decomp_cp() - Counts the number of codepoints needed to store
 *			  the full decomposition representation for a
 *			  codepoint.
 *
 * Returns: codepoint count
 * cp(in) : codepoint
 *
 *  Note : this is a recursive function : only the first codepoint of a
 *	   mapping is decomposed further.
 *	   A codepoint outside the loaded table counts as one codepoint.
 */
static int
count_full_decomp_cp (int cp)
{
  const UNICODE_CHAR *uc;

  /* Validate before touching the table; a negative value is what a mapping codepoint above INT_MAX turns into
   * when it is passed through the 'int' parameter. */
  if (cp < 0 || cp >= MAX_UNICODE_CHARS)
    {
      return 1;
    }

  uc = &(unicode_data[cp]);

  /* no decomposition : the codepoint stands for itself */
  if (uc->unicode_mapping_cp_count == 0)
    {
      return 1;
    }

  /* the first codepoint of the mapping is replaced by its own full decomposition */
  return uc->unicode_mapping_cp_count - 1 + count_full_decomp_cp ((int) uc->unicode_mapping[0]);
}

/*
 * count_decomp_steps() - Counts the number of steps for putting a codepoint
 *			into fully decomposed form, by replacing one
 *			decomposable codepoint at every step.
 *
 * Returns: step count
 * cp(in) : codepoint
 *
 *  Note : this is a recursive function : it follows the first codepoint of
 *	   each mapping.
 *	   A codepoint outside the loaded table has no steps.
 */
static int
count_decomp_steps (int cp)
{
  const UNICODE_CHAR *uc;

  /* the first codepoint of a mapping is not guaranteed to be part of the loaded table */
  if (cp < 0 || cp >= MAX_UNICODE_CHARS)
    {
      return 0;
    }

  uc = &(unicode_data[cp]);
  if (uc->unicode_mapping_cp_count == 0)
    {
      return 0;
    }

  /* a mapping of a single codepoint counts only if that codepoint belongs to the table */
  if ((uc->unicode_mapping_cp_count == 1 && uc->unicode_mapping[0] < MAX_UNICODE_CHARS)
      || (uc->unicode_mapping_cp_count > 1))
    {
      return 1 + count_decomp_steps ((int) uc->unicode_mapping[0]);
    }

  return 0;
}

/*
 * unicode_make_normalization_data() - takes the data loaded from UnicodeData,
 *	    which was previously sorted, and puts it into optimized form
 *	    into the locale data structure, ready to be exported into
 *	    a shared library.
 *
 * Returns: ER_LOC_GEN if error
 *	    NO_ERROR otherwise
 * decomp_maps(in): variable holding the loaded and partially processed
 *		    unicode data; it must hold
 *		    ld->unicode_normalization.unicode_mappings_count entries
 * ld(in/out): locale data
 *
 *  Note : each mapping is stored as a UTF-8 buffer, together with its size
 *	   in bytes and its original codepoint; list_full_decomp[cp] is the
 *	   index of the fully decomposed mapping of cp, or -1 if none.
 *	   A mapping that does not fit the fixed size buffers is reported
 *	   as error.
 */
static int
unicode_make_normalization_data (UNICODE_CP_MAPPING * decomp_maps, LOCALE_DATA * ld)
{
  int err_status = NO_ERROR;
  int i, j;
  UNICODE_CP_MAPPING *um_cp;
  UNICODE_MAPPING *um;
  unsigned char str_buf[INTL_UTF8_MAX_CHAR_SIZE * UNICODE_DECOMP_MAP_CP_COUNT];
  unsigned char *cur_pos;
  char cur_size, byte_count;
  UNICODE_NORMALIZATION *norm;

  assert (ld != NULL);
  assert (decomp_maps != NULL);

  norm = &(ld->unicode_normalization);

  /* Prepare the unicode_mappings array for storing the data from decomp_maps as utf8 buffers + length + original
   * codepoint. */
  norm->unicode_mappings = (UNICODE_MAPPING *) malloc (norm->unicode_mappings_count * sizeof (UNICODE_MAPPING));
  if (norm->unicode_mappings == NULL)
    {
      LOG_LOCALE_ERROR ("memory allocation failed", ER_LOC_GEN, true);
      err_status = ER_LOC_GEN;
      goto exit;
    }
  memset (norm->unicode_mappings, 0, norm->unicode_mappings_count * sizeof (UNICODE_MAPPING));

  /* Prepare the index list for fully decomposed mappings */
  norm->list_full_decomp = (int *) malloc (MAX_UNICODE_CHARS * sizeof (int));
  if (norm->list_full_decomp == NULL)
    {
      LOG_LOCALE_ERROR ("memory allocation failed", ER_LOC_GEN, true);
      err_status = ER_LOC_GEN;
      goto exit;
    }
  for (i = 0; i < MAX_UNICODE_CHARS; i++)
    {
      norm->list_full_decomp[i] = -1;
    }

  /* Start importing data from decomp_maps into unicode_mappings. */
  for (i = 0; i < norm->unicode_mappings_count; i++)
    {
      um_cp = &(decomp_maps[i]);
      um = &(norm->unicode_mappings[i]);

      /* the temporary buffer has room for UNICODE_DECOMP_MAP_CP_COUNT codepoints only */
      if (um_cp->size < 0 || um_cp->size > UNICODE_DECOMP_MAP_CP_COUNT)
        {
          LOG_LOCALE_ERROR ("decomposition mapping exceeds the maximum number of codepoints", ER_LOC_GEN, true);
          err_status = ER_LOC_GEN;
          goto exit;
        }

      um->cp = um_cp->cp;

      /* Empty temporary utf8 buffer */
      memset (str_buf, 0, sizeof (str_buf));

      /* Convert the list of codepoints into a utf8 buffer */
      cur_pos = str_buf;
      cur_size = 0;
      byte_count = 0;

      for (j = 0; j < um_cp->size; j++)
        {
          byte_count = intl_cp_to_utf8 (um_cp->map[j], cur_pos);
          cur_size += byte_count;
          cur_pos += byte_count;
        }

      /* the encoded mapping must also fit the buffer of the exported structure */
      if (cur_size < 0 || (size_t) cur_size > sizeof (um->buffer))
        {
          LOG_LOCALE_ERROR ("decomposition mapping exceeds the normalization buffer size", ER_LOC_GEN, true);
          err_status = ER_LOC_GEN;
          goto exit;
        }

      memset (um->buffer, 0, sizeof (um->buffer));

      /* Make the final utf8 buffer used for normalization */
      memcpy (um->buffer, str_buf, cur_size);
      um->size = cur_size;

      /* If um_cp is a fully decomposed representation for cp, mark it as such. */
      if (um_cp->is_full_decomp)
        {
          norm->list_full_decomp[um_cp->cp] = i;
        }
    }

exit:

  /* NOTE(review): on error the arrays allocated above stay attached to 'norm'; releasing them is presumably done
   * by the owner of the locale data - confirm */
  return err_status;
}

/*
 * comp_func_unicode_cp_mapping() - compare function for sorting a group of
 *				    unicode decompositions starting with the
 *				    same codepoint
 *
 * Returns: compare result (negative, zero or positive, as expected by qsort)
 * arg1(in) : pointer to the first UNICODE_CP_MAPPING
 * arg2(in) : pointer to the second UNICODE_CP_MAPPING
 *
 *  Note : mappings are ordered descending by the memcmp () result of their
 *	   common prefix; when the prefixes are equal the longer mapping
 *	   comes first, then the lower codepoint.
 *	   Zero is returned only for mappings that are equal in all these.
 */
static int
comp_func_unicode_cp_mapping (const void *arg1, const void *arg2)
{
  const UNICODE_CP_MAPPING *um1 = (const UNICODE_CP_MAPPING *) arg1;
  const UNICODE_CP_MAPPING *um2 = (const UNICODE_CP_MAPPING *) arg2;
  int min_size, result;

  min_size = (um1->size < um2->size) ? um1->size : um2->size;
  result = memcmp (um1->map, um2->map, min_size * sizeof (uint32));
  if (result != 0)
    {
      /* Result is reverted to obtain reverse ordering */
      return -result;
    }

  /* equal prefixes : the longer mapping is placed first */
  if (um1->size > min_size)
    {
      return -1;
    }
  if (um2->size > min_size)
    {
      return 1;
    }

  /* same mapping : order by the original codepoint */
  if (um1->cp < um2->cp)
    {
      return -1;
    }
  if (um1->cp > um2->cp)
    {
      return 1;
    }

  /* equal elements must compare as equal, otherwise the ordering handed to qsort () is not consistent */
  return 0;
}

/*
 * comp_func_grouping_unicode_cp_mapping() - compare function for sorting
 *                  all decompositions
 *
 * Returns: -1, 0 or 1 (compare result)
 * arg1(in) : pointer to the first UNICODE_CP_MAPPING to compare
 * arg2(in) : pointer to the second UNICODE_CP_MAPPING to compare
 *
 *  Note : Only the first codepoint of each decomposition (map[0]) is compared,
 *     in ascending order. Entries sharing the same first codepoint compare
 *     equal, which keeps the function consistent regardless of the order
 *     in which the two arguments are passed.
 */
static int
comp_func_grouping_unicode_cp_mapping (const void *arg1, const void *arg2)
{
  const UNICODE_CP_MAPPING *um1 = (const UNICODE_CP_MAPPING *) arg1;
  const UNICODE_CP_MAPPING *um2 = (const UNICODE_CP_MAPPING *) arg2;

  if (um1->map[0] > um2->map[0])
    {
      return 1;
    }
  if (um1->map[0] < um2->map[0])
    {
      return -1;
    }

  /* Same leading codepoint: both entries belong to the same group. */
  return 0;
}
#endif //#if defined(SA_MODE)

#if !defined (SERVER_MODE)
/*
 * unicode_string_need_compose() - Checks if a string may need composition
 *                 and returns the size to reserve for the
 *                 composed form.
 *
 * Returns: true if the string contains at least one non-ASCII byte and
 *      input normalization is enabled, false otherwise
 * str_in(in) : string to check
 * size_in(in) : size in bytes of string
 * size_out(out) : size_in when true is returned, 0 otherwise
 * norm(in) : the unicode data for normalization
 *
 *  Note : this is a light check, since a full check requires more complex
 *     processing - same as the composing algorithm.
 *     All input is assumed to be in the UTF-8 character set.
 */
bool
unicode_string_need_compose (const char *str_in, const int size_in, int *size_out, const UNICODE_NORMALIZATION * norm)
{
  int pos;

  assert (size_out != NULL);

  *size_out = 0;

  /* Nothing to do when input normalization is turned off ... */
  if (!prm_get_bool_value (PRM_ID_UNICODE_INPUT_NORMALIZATION))
    {
      return false;
    }

  /* ... or when there is no normalization data or no input. */
  if (norm == NULL || size_in == 0 || str_in == NULL)
    {
      return false;
    }

  /* A string made only of bytes in the range 0-127 is plain ASCII and needs no unicode processing; the first byte
   * with the high bit set is enough to report that composition may be required. */
  for (pos = 0; pos < size_in; pos++)
    {
      if (((unsigned char) str_in[pos]) & 0x80)
        {
          *size_out = size_in;
          return true;
        }
    }

  return false;
}

/*
 * unicode_compose_string() - Put a string into fully composed form.
 *
 * Returns: void
 * str_in(in) : string to normalize
 * size_in(in) : size in bytes of string
 * str_out(out) : preallocated buffer to store composed string, output string
 *        is not null terminated
 * size_out(out) : actual size in bytes of composed string
 * is_composed(out) : set to true if at least one composition was applied;
 *            left untouched otherwise
 * norm(in) : the unicode data for normalization
 *
 *  Note : The input is scanned one codepoint at a time. For each codepoint
 *     having mappings, the mapping list is searched in order and the
 *     first mapping whose buffer matches the input at the current
 *     position is replaced by its composed codepoint. Anything else is
 *     copied unchanged.
 */
void
unicode_compose_string (const char *str_in, const int size_in, char *str_out, int *size_out, bool * is_composed,
                        const UNICODE_NORMALIZATION * norm)
{
  int composed_index, remaining_bytes;
  int map_start, map_end, i, byte_count;
  const char *str_next;
  const char *str_cursor;
  const char *str_end;
  unsigned int cp;
  bool match_found, composition_found;
  const UNICODE_MAPPING *um;

  assert (prm_get_bool_value (PRM_ID_UNICODE_INPUT_NORMALIZATION) && norm != NULL && size_in > 0 && str_in != NULL);

  composed_index = 0;
  composition_found = false;
  str_next = str_in;
  str_cursor = str_in;
  str_end = str_in + size_in;

  /* Build composed string */
  while (str_cursor < str_end)
    {
      /* Derive the number of unread bytes from the cursor itself, so that it remains exact after a match consumed
       * more than one codepoint; a stale (too large) value would let the decoder and memcmp read past str_end. */
      remaining_bytes = CAST_STRLEN (str_end - str_cursor);

      cp = intl_utf8_to_cp ((unsigned char *) str_cursor, remaining_bytes, (unsigned char **) &str_next);

      match_found = false;

      /* The upper bound on cp also keeps the [cp + 1] index lookup below within the same limit. */
      if (cp < MAX_UNICODE_CHARS - 2 && CP_HAS_MAPPINGS (norm->unicode_mapping_index[cp]))
        {
          map_start = GET_MAPPING_OFFSET (norm->unicode_mapping_index[cp]);
          map_end = GET_MAPPING_OFFSET (norm->unicode_mapping_index[cp + 1]);

          /* Search the mapping list for a possible match */
          for (i = map_start; i < map_end; i++)
            {
              um = &(norm->unicode_mappings[i]);

              /* A mapping longer than what is left of the input cannot match. */
              if (um->size > remaining_bytes)
                {
                  continue;
                }

              if (memcmp (um->buffer, str_cursor, um->size) == 0)
                {
                  /* If a composition matches, apply it. */
                  composed_index += intl_cp_to_utf8 (um->cp, (unsigned char *) (&(str_out[composed_index])));
                  str_cursor += um->size;
                  match_found = true;
                  composition_found = true;
                  break;
                }
            }
        }

      /* If no composition can be matched to start with the decoded codepoint, just copy the bytes corresponding to the
       * codepoint from the input string to the output, adjust pointers and loop again. */
      if (!match_found)
        {
          byte_count = CAST_STRLEN (str_next - str_cursor);
          memcpy (&(str_out[composed_index]), str_cursor, byte_count);
          composed_index += byte_count;
          str_cursor += byte_count;
        }
    }

  /* Set output variables */
  *size_out = composed_index;
  if (composition_found)
    {
      *is_composed = true;
    }
}

/*
 * unicode_string_need_decompose() - Checks if a string needs
 *                   decomposition and returns the size
 *                   required by decomposed form.
 *
 * Returns: true if decomposition is required
 * str_in(in) : string to normalize
 * size_in(in) : size of string in bytes
 * decomp_size(out) : size in bytes required by the decomposed form when true
 *            is returned, size_in otherwise
 * norm(in) : the unicode context in which the normalization is performed
 *
 *  Note : Input string is assumed to be in the UTF-8 character set.
 */
bool
unicode_string_need_decompose (const char *str_in, const int size_in, int *decomp_size,
                               const UNICODE_NORMALIZATION * norm)
{
  const char *cursor;
  const char *next_char;
  const char *input_end;
  unsigned int cp;
  int char_size, map_idx;
  int total_size = 0;
  bool has_non_ascii = false;
  bool has_decomposition = false;

  /* Output normalization disabled or no normalization data: report the input size. */
  if (!prm_get_bool_value (PRM_ID_UNICODE_OUTPUT_NORMALIZATION) || norm == NULL)
    {
      *decomp_size = size_in;
      return false;
    }

  assert (str_in != NULL);

  input_end = str_in + size_in;

  /* Quick pass: a string with no byte above 127 is plain ASCII and is left as it is. */
  for (cursor = str_in; cursor < input_end && !has_non_ascii; cursor++)
    {
      has_non_ascii = ((unsigned char) (*cursor) >= 0x80);
    }
  if (!has_non_ascii)
    {
      *decomp_size = size_in;
      return false;
    }

  /* Decode every codepoint and accumulate either the size of its full decomposition or its own encoded size. */
  next_char = str_in;
  for (cursor = str_in; cursor < input_end; cursor = next_char)
    {
      cp = intl_utf8_to_cp ((unsigned char *) cursor, CAST_STRLEN (input_end - cursor), (unsigned char **) &next_char);
      char_size = CAST_STRLEN (next_char - cursor);

      map_idx = -1;
      if (cp < MAX_UNICODE_CHARS)
        {
          map_idx = norm->list_full_decomp[cp];
        }

      if (map_idx < 0)
        {
          total_size += char_size;
        }
      else
        {
          total_size += norm->unicode_mappings[map_idx].size;
          has_decomposition = true;
        }
    }

  /* No codepoint has a full decomposition: same size as the input string. */
  if (!has_decomposition)
    {
      *decomp_size = size_in;
      return false;
    }

  *decomp_size = total_size;
  return true;
}

/*
 * unicode_decompose_string() - Put a string into fully decomposed form.
 *
 * Returns: void
 * str_in(in) : string to normalize
 * size_in(in) : size in bytes of string
 * str_out(out): preallocated buffer for string in decomposed form
 * size_out(out): actual size of decomposed form in bytes
 * norm(in) : the unicode context in which the normalization is performed
 *
 *  Note : Each codepoint having an entry in norm->list_full_decomp is
 *     replaced by the buffer of the corresponding mapping; any other
 *     codepoint is copied unchanged.
 */
void
unicode_decompose_string (const char *str_in, const int size_in, char *str_out, int *size_out,
                          const UNICODE_NORMALIZATION * norm)
{
  const char *in_pos;
  const char *in_next;
  const char *in_end;
  char *out_pos;
  unsigned int cp;
  int map_idx;

  assert (prm_get_bool_value (PRM_ID_UNICODE_OUTPUT_NORMALIZATION) && norm != NULL);

  assert (str_in != NULL);
  assert (str_out != NULL);
  assert (size_out != NULL);

  in_next = str_in;
  in_end = str_in + size_in;
  out_pos = str_out;

  for (in_pos = str_in; in_pos < in_end; in_pos = in_next)
    {
      const void *chunk;
      int chunk_size;

      cp = intl_utf8_to_cp ((unsigned char *) in_pos, CAST_STRLEN (in_end - in_pos), (unsigned char **) &in_next);

      /* Codepoints outside the table range never have a decomposition. */
      map_idx = -1;
      if (cp < MAX_UNICODE_CHARS)
        {
          map_idx = norm->list_full_decomp[cp];
        }

      if (map_idx < 0)
        {
          /* No full decomposition: emit the original bytes of this codepoint. */
          chunk = in_pos;
          chunk_size = CAST_STRLEN (in_next - in_pos);
        }
      else
        {
          /* Emit the stored decomposed form instead of the codepoint. */
          chunk = norm->unicode_mappings[map_idx].buffer;
          chunk_size = norm->unicode_mappings[map_idx].size;
        }

      memcpy (out_pos, chunk, chunk_size);
      out_pos += chunk_size;
    }

  *size_out = CAST_STRLEN (out_pos - str_out);
}
#endif /* !SERVER_MODE */