cubrid-doxygen/unicode__support_8c_source.html

 /*
  * Copyright 2008 Search Solution Corporation
  * Copyright 2016 CUBRID Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
  *  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  *  Unless required by applicable law or agreed to in writing, software
  *  distributed under the License is distributed on an "AS IS" BASIS,
  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  *  See the License for the specific language governing permissions and
  *  limitations under the License.
  *
  */

 /*
  * unicode_support.c : Unicode support
  */
 #include "config.h"

 #include <stdio.h>
 #include <stdlib.h>
 #include <assert.h>

 #include "porting.h"

 #include "locale_support.h"
 #include "intl_support.h"
 #include "language_support.h"
 #include "error_manager.h"
 #include "utility.h"
 #include "environment_variable.h"
 #include "system_parameter.h"
 #include "unicode_support.h"


 #define UNICODEDATA_FILE "unicodedata.txt"

 /* Unicode data file constants */
 #define UNICODE_FILE_LINE_SIZE 512
 #define UNICODE_FILE_FIELDS 14

 /* Field position : starting from 0 */
 #define UNICODE_FILE_GENERAL_CAT_POS        2
 #define UNICODE_FILE_CHAR_DECOMPOSITION_MAPPING 5
 #define UNICODE_FILE_UPPER_CASE_MAP     12
 #define UNICODE_FILE_LOWER_CASE_MAP     13

 typedef enum
 {
   CAT_Cn = 0,           /* other, not assigned */
   CAT_Lu,           /* Letter, uppercase */
   CAT_Ll,           /* Letter, lowercase */

   /* add new values here */
   CAT_MAX           /* maximum category value */
 } GENERAL_CATEG_ID;

 typedef struct
 {
   GENERAL_CATEG_ID id;
   const char *val;
 } GENERAL_CATEGORY;


 /* available list of general categories (id, name) */
 GENERAL_CATEGORY list_gen_cat[] = {
   {CAT_Lu, "Lu"},
   {CAT_Ll, "Ll"},
 };

 typedef struct
 {
   int id;
   char *std_val;        /* Standard value as defined by Unicode Consortium */
 } CANONICAL_COMBINING_CLASS;

 /* The maximum number of codepoints to which a single codepoint can be
  * rewritten in canonically fully decomposed form.
  */
 #define UNICODE_DECOMP_MAP_CP_COUNT 4

 typedef struct
 {
   /* general category for this character */
   GENERAL_CATEG_ID gen_cat_id;

   uint32 lower_cp[INTL_CASING_EXPANSION_MULTIPLIER];
   uint32 upper_cp[INTL_CASING_EXPANSION_MULTIPLIER];

   char unicode_mapping_cp_count;

   uint32 unicode_mapping[UNICODE_DECOMP_MAP_CP_COUNT];
   char unicode_full_decomp_cp_count;

 } UNICODE_CHAR;

 typedef struct
 {
   uint32 cp;            /* Codepoint value */
   uint32 *map;          /* A fully decomposed canonical mapping stored as codepoints */

   int size;         /* The number of codepoints in the mapping */
   bool is_full_decomp;      /* true - if map is fully decomposed false - otherwise. */
 } UNICODE_CP_MAPPING;

 static UNICODE_CHAR *unicode_data = NULL;
 static int unicode_data_lower_mult = 1;
 static int unicode_data_upper_mult = 1;

 static char last_unicode_file[PATH_MAX] = { 0 };

 static int load_unicode_data (const LOCALE_DATA * ld);
 static int create_alphabet (ALPHABET_DATA * a, const int max_letters, const int lower_multiplier,
                 const int upper_multiplier);
 static int count_full_decomp_cp (int cp);
 static int count_decomp_steps (int cp);
 static int unicode_make_normalization_data (UNICODE_CP_MAPPING * decomp_maps, LOCALE_DATA * ld);
 static int comp_func_unicode_cp_mapping (const void *arg1, const void *arg2);
 static int comp_func_grouping_unicode_cp_mapping (const void *arg1, const void *arg2);


 /*
  * unicode_process_alphabet() - Process alphabet (casing) data for given
  *              locale
  *
  * Returns: error code
  * ld(in/out) : locale data structure
  */
 int
 unicode_process_alphabet (LOCALE_DATA * ld, bool is_verbose)
 {
   ALPHABET_DATA *a = NULL;
   ALPHABET_DATA *i_a = NULL;
   ALPHABET_TAILORING *a_tailoring = NULL;
   char unicode_file[PATH_MAX];
   char err_msg[ERR_MSG_SIZE];
   int er_status = NO_ERROR;
   uint32 cp;
   int lower_mult = 1;
   int upper_mult = 1;
   int i;

   assert (ld != NULL);

   a = &(ld->alphabet);
   i_a = &(ld->identif_alphabet);
   a_tailoring = &(ld->alpha_tailoring);

   /* compute lower and upper multiplier from rules */
   for (i = 0; i < a_tailoring->count_rules; i++)
     {
       TRANSFORM_RULE *tf_rule = &(a_tailoring->rules[i]);
       uint32 dummy_array;
       int dummy;
       int dest_len;

       dest_len = intl_utf8_to_cp_list ((unsigned char *) (tf_rule->dest), tf_rule->dest_size, &dummy_array, 1, &dummy);

       if (dest_len > INTL_CASING_EXPANSION_MULTIPLIER)
     {
       snprintf (err_msg, sizeof (err_msg) - 1,
             "Invalid alphabet rule :%d" ". Destination buffer contains more than 2 characters", i);
       LOG_LOCALE_ERROR (err_msg, ER_LOC_GEN, true);
       er_status = ER_LOC_GEN;
       goto error;
     }
       if (tf_rule->type == TR_UPPER)
     {
       upper_mult = MAX (upper_mult, dest_len);
     }
       else
     {
       assert (tf_rule->type == TR_LOWER);
       lower_mult = MAX (lower_mult, dest_len);
     }
     }

   if (a_tailoring->alphabet_mode == 2)
     {
       if (is_verbose)
     {
       printf ("Creating ASCII alphabet\n");
     }

       /* ASCII alphabet */
       er_status = create_alphabet (a, a_tailoring->sett_max_letters, lower_mult, upper_mult);
       if (er_status != NO_ERROR)
     {
       goto error;
     }

       er_status = create_alphabet (i_a, a_tailoring->sett_max_letters, 1, 1);
       if (er_status != NO_ERROR)
     {
       goto error;
     }

       for (cp = 0; (int) cp < a->l_count; cp++)
     {
       i_a->upper_cp[cp] = a->upper_cp[cp] = cp;
       i_a->lower_cp[cp] = a->lower_cp[cp] = cp;
     }

       for (cp = (int) 'a'; cp <= (int) 'z'; cp++)
     {
       i_a->upper_cp[cp] = a->upper_cp[cp] = cp - ('a' - 'A');
       i_a->lower_cp[cp - ('a' - 'A')] = a->lower_cp[cp - ('a' - 'A')] = cp;
     }

       i_a->a_type = a->a_type = ALPHABET_ASCII;
     }
   else
     {
       if (a_tailoring->alphabet_mode == 1)
     {
       strncpy (unicode_file, a_tailoring->unicode_data_file, sizeof (unicode_file));
       unicode_file[sizeof (unicode_file) - 1] = '\0';

       /* a user defined unicode file is handled as a tailored alphabet */
       a->a_type = ALPHABET_TAILORED;
     }
       else
     {
       assert (a_tailoring->alphabet_mode == 0);
       envvar_localedatadir_file (unicode_file, sizeof (unicode_file), UNICODEDATA_FILE);

       a->a_type = ALPHABET_UNICODE;
     }

       if (is_verbose)
     {
       printf ("Creating UNICODE alphabet from: %s\n", unicode_file);
     }

       er_status = load_unicode_data (ld);
       if (er_status != NO_ERROR)
     {
       goto error;
     }

       lower_mult = MAX (lower_mult, unicode_data_lower_mult);
       upper_mult = MAX (upper_mult, unicode_data_upper_mult);

       er_status = create_alphabet (a, a_tailoring->sett_max_letters, lower_mult, upper_mult);
       if (er_status != NO_ERROR)
     {
       goto error;
     }

       er_status =
     create_alphabet (i_a, a_tailoring->sett_max_letters, unicode_data_lower_mult, unicode_data_upper_mult);
       if (er_status != NO_ERROR)
     {
       goto error;
     }

       for (cp = 0; (int) cp < a_tailoring->sett_max_letters; cp++)
     {
       /* set lower and upper case of each codepoint to itself */
       a->lower_cp[cp * lower_mult] = cp;
       a->upper_cp[cp * upper_mult] = cp;

       i_a->lower_cp[cp * unicode_data_lower_mult] = cp;
       i_a->upper_cp[cp * unicode_data_upper_mult] = cp;

       /* overwrite with UnicodeData */
       if (unicode_data[cp].gen_cat_id == CAT_Lu)
         {
           memcpy (&(a->lower_cp[cp * lower_mult]), &(unicode_data[cp].lower_cp),
               sizeof (uint32) * MIN (unicode_data_lower_mult, lower_mult));

           memcpy (&(i_a->lower_cp[cp * unicode_data_lower_mult]), &(unicode_data[cp].lower_cp),
               sizeof (uint32) * unicode_data_lower_mult);
         }
       else if (unicode_data[cp].gen_cat_id == CAT_Ll)
         {
           memcpy (&(a->upper_cp[cp * upper_mult]), &(unicode_data[cp].upper_cp),
               sizeof (uint32) * MIN (unicode_data_upper_mult, upper_mult));

           memcpy (&(i_a->upper_cp[cp * unicode_data_upper_mult]), &(unicode_data[cp].upper_cp),
               sizeof (uint32) * unicode_data_upper_mult);
         }
     }
     }

   if (a_tailoring->count_rules > 0)
     {
       a->a_type = ALPHABET_TAILORED;
     }

   if (is_verbose && a_tailoring->count_rules > 0)
     {
       printf ("Applying %d alphabet tailoring rules\n", a_tailoring->count_rules);
     }
   /* apply tailoring rules on user-alphabet only */
   for (i = 0; i < a_tailoring->count_rules; i++)
     {
       TRANSFORM_RULE *tf_rule = &(a_tailoring->rules[i]);
       uint32 cp_src;
       uint32 cp_dest[INTL_CASING_EXPANSION_MULTIPLIER];
       int src_cp_count = 0;
       int src_len = 0;
       int dest_cp_count = 0;
       int dest_len = 0;

       /* source codepoints */
       /* TODO : allow casing compression (many CPs for source) */
       src_len = intl_utf8_to_cp_list ((unsigned char *) (tf_rule->src), tf_rule->src_size, &cp_src, 1, &src_cp_count);

       if (src_len != 1 || src_len != src_cp_count)
     {
       LOG_LOCALE_ERROR ("Invalid source buffer for alphabet rule", ER_LOC_GEN, true);
       er_status = ER_LOC_GEN;
       goto error;
     }

       if ((int) cp_src >= a_tailoring->sett_max_letters)
     {
       LOG_LOCALE_ERROR ("Codepoint for casing rule exceeds maximum" " allowed value", ER_LOC_GEN, true);
       er_status = ER_LOC_GEN;
       goto error;
     }

       /* destination codepoints */
       dest_len =
     intl_utf8_to_cp_list ((unsigned char *) (tf_rule->dest), tf_rule->dest_size, cp_dest,
                   INTL_CASING_EXPANSION_MULTIPLIER, &dest_cp_count);

       if (dest_len < 1 || dest_len != dest_cp_count)
     {
       LOG_LOCALE_ERROR ("Invalid destination buffer for alphabet rule", ER_LOC_GEN, true);
       er_status = ER_LOC_GEN;
       goto error;
     }

       if (tf_rule->type == TR_UPPER)
     {
       assert (dest_cp_count <= upper_mult);
       memset (&(a->upper_cp[cp_src * upper_mult]), 0, upper_mult * sizeof (uint32));
       memcpy (&(a->upper_cp[cp_src * upper_mult]), cp_dest, sizeof (uint32) * MIN (dest_cp_count, upper_mult));
     }
       else
     {
       assert (tf_rule->type == TR_LOWER);

       assert (dest_cp_count <= lower_mult);
       memset (&(a->lower_cp[cp_src * lower_mult]), 0, lower_mult * sizeof (uint32));
       memcpy (&(a->lower_cp[cp_src * lower_mult]), cp_dest, sizeof (uint32) * MIN (dest_cp_count, lower_mult));
     }
     }

   return er_status;

 error:

   return er_status;
 }

 /*
  * load_unicode_data() - Loads the UNICODEDATA file (standardised
  *           and availabe at Unicode.org).
  * Returns: error code
  * ld(in) : locale data
  */
 static int
 load_unicode_data (const LOCALE_DATA * ld)
 {
   FILE *fp = NULL;
   char err_msg[ERR_MSG_SIZE];
   int status = NO_ERROR;
   char str[UNICODE_FILE_LINE_SIZE];
   int line_count = 0;

   assert (ld != NULL);

   /* Build the full filepath to the selected (or default) Unicode data file */
   if (ld->unicode_mode == 0)
     {
       /* using default Unicode file */
       envvar_localedatadir_file ((char *) (ld->unicode_data_file), sizeof (ld->unicode_data_file), UNICODEDATA_FILE);
     }
   else
     {
       assert (ld->unicode_mode == 1);
     }

   if (strcmp (ld->unicode_data_file, last_unicode_file) == 0)
     {
       assert (unicode_data != NULL);
       return status;
     }

   unicode_free_data ();

   unicode_data = (UNICODE_CHAR *) malloc (MAX_UNICODE_CHARS * sizeof (UNICODE_CHAR));
   if (unicode_data == NULL)
     {
       LOG_LOCALE_ERROR ("memory allocation failed", ER_LOC_GEN, true);
       status = ER_LOC_GEN;
       goto error;
     }

   memset (unicode_data, 0, MAX_UNICODE_CHARS * sizeof (UNICODE_CHAR));

   fp = fopen_ex (ld->unicode_data_file, "rt");
   if (fp == NULL)
     {
       snprintf_dots_truncate (err_msg, sizeof (err_msg) - 1, "Cannot open file %s", ld->unicode_data_file);
       LOG_LOCALE_ERROR (err_msg, ER_LOC_GEN, true);
       status = ER_LOC_GEN;
       goto error;
     }

   while (fgets (str, sizeof (str), fp))
     {
       uint32 cp = 0;
       int result = 0;
       int i;
       char *s, *end, *end_p;
       UNICODE_CHAR *uc = NULL;

       line_count++;

       result = str_to_uint32 (&cp, &end_p, str, 16);
       /* skip Unicode values above 0xFFFF */
       if (result != 0 || cp >= MAX_UNICODE_CHARS)
     {
       continue;
     }

       s = str;
       uc = &(unicode_data[cp]);
       uc->lower_cp[0] = cp;
       uc->upper_cp[0] = cp;

       /* next field */
       s = strchr (s, ';');

       assert (s != NULL);
       if (s == NULL)
     {
       continue;
     }
       s++;

       for (i = 1; i < UNICODE_FILE_FIELDS; i++)
     {
       char str_p[UNICODE_FILE_LINE_SIZE];
       char *save;
       int cp_count;

       strcpy (str_p, s);

       end = strtok_r (str_p, ";", &save);

       /* check generic category */
       if (i == UNICODE_FILE_GENERAL_CAT_POS)
         {
           int cat_idx;

           for (cat_idx = 0; cat_idx < (int) (sizeof (list_gen_cat) / sizeof (list_gen_cat[0])); cat_idx++)
         {
           if (strcmp (list_gen_cat[cat_idx].val, str_p) == 0)
             {
               uc->gen_cat_id = list_gen_cat[cat_idx].id;
               break;
             }
         }
         }
       else if (i == UNICODE_FILE_UPPER_CASE_MAP && uc->gen_cat_id == CAT_Ll)
         {
           /* lower case codepoints */
           cp_count = string_to_int_array (str_p, uc->upper_cp, INTL_CASING_EXPANSION_MULTIPLIER, " ");
           if (cp_count > INTL_CASING_EXPANSION_MULTIPLIER)
         {
           snprintf_dots_truncate (err_msg, sizeof (err_msg) - 1,
                       "Invalid line %d" " of file %s contains more than 2 characters for "
                       "upper case definition", line_count, ld->unicode_data_file);
           LOG_LOCALE_ERROR (err_msg, ER_LOC_GEN, true);
           status = ER_LOC_GEN;
           goto error;
         }

           unicode_data_upper_mult = (cp_count > unicode_data_upper_mult) ? cp_count : unicode_data_upper_mult;
         }
       else if (i == UNICODE_FILE_LOWER_CASE_MAP && uc->gen_cat_id == CAT_Lu)
         {
           /* lower case codepoints */
           cp_count = string_to_int_array (str_p, uc->lower_cp, INTL_CASING_EXPANSION_MULTIPLIER, " ");

           if (cp_count > INTL_CASING_EXPANSION_MULTIPLIER)
         {
           snprintf_dots_truncate (err_msg, sizeof (err_msg) - 1,
                       "Invalid line %d" " of file %s contains more than 2 characters for "
                       "lower case definition", line_count, ld->unicode_data_file);
           LOG_LOCALE_ERROR (err_msg, ER_LOC_GEN, true);
           status = ER_LOC_GEN;
           goto error;
         }

           unicode_data_lower_mult = (cp_count > unicode_data_lower_mult) ? cp_count : unicode_data_lower_mult;
         }
       else if (i == UNICODE_FILE_CHAR_DECOMPOSITION_MAPPING)
         {
           uc->unicode_mapping_cp_count = 0; /* init */

           do
         {
           /* if no decomposition available, or decomposition is a compatibility one, discard the specified
            * decomposition */
           if (str_p[0] == ';' || str_p[0] == '<')
             {
               break;
             }

           if (str_p != NULL)
             {
               uc->unicode_mapping_cp_count =
             string_to_int_array (str_p, uc->unicode_mapping, UNICODE_DECOMP_MAP_CP_COUNT, " ");
             }
           break;
         }
           while (0);
         }

       s = strchr (s, ';');
       if (s == NULL)
         {
           break;
         }

       s++;
     }
     }

   assert (fp != NULL);
   fclose (fp);

   strncpy (last_unicode_file, ld->unicode_data_file, sizeof (last_unicode_file) - 1);
   last_unicode_file[sizeof (last_unicode_file) - 1] = '\0';

   return status;

 error:

   if (fp != NULL)
     {
       fclose (fp);
     }

   unicode_free_data ();

   return status;
 }

 /*
  * unicode_free_data() - Frees Unicode data structures.
  * Returns:
  */
 void
 unicode_free_data (void)
 {
   if (unicode_data != NULL)
     {
       free (unicode_data);
       unicode_data = NULL;
     }

   *last_unicode_file = '\0';
 }

 /*
  * create_alphabet () - allocated arrays for alphabet
  * Returns: error code
  * a(in/out) : alphabet
  * max_letters(in) : number of letters in alphabet
  * lower_multiplier(in) : lower case multlipier
  * upper_multiplier(in) : upper case multlipier
  */
 static int
 create_alphabet (ALPHABET_DATA * a, const int max_letters, const int lower_multiplier, const int upper_multiplier)
 {
   int er_status = NO_ERROR;

   assert (a != NULL);
   assert (lower_multiplier > 0 && lower_multiplier <= INTL_CASING_EXPANSION_MULTIPLIER);
   assert (upper_multiplier > 0 && upper_multiplier <= INTL_CASING_EXPANSION_MULTIPLIER);

   if (lower_multiplier > 1 && upper_multiplier > 1)
     {
       LOG_LOCALE_ERROR ("CUBRID does not support collations with both lower "
             "and upper multipliers with values above 1.", ER_LOC_GEN, true);
       return ER_LOC_GEN;
     }

   memset (a, 0, sizeof (ALPHABET_DATA));

   if (max_letters <= 0 || max_letters > MAX_UNICODE_CHARS)
     {
       LOG_LOCALE_ERROR ("invalid number of letters", ER_LOC_GEN, true);
       return ER_LOC_GEN;
     }

   if (max_letters > 0)
     {
       a->lower_cp = (uint32 *) malloc (max_letters * lower_multiplier * sizeof (uint32));
       a->upper_cp = (uint32 *) malloc (max_letters * upper_multiplier * sizeof (uint32));

       if (a->lower_cp == NULL || a->upper_cp == NULL)
     {
       LOG_LOCALE_ERROR ("memory allocation failed", ER_LOC_GEN, true);
       er_status = ER_LOC_GEN;
       goto er_exit;
     }

       memset (a->lower_cp, 0, max_letters * lower_multiplier * sizeof (uint32));
       memset (a->upper_cp, 0, max_letters * upper_multiplier * sizeof (uint32));
     }

   a->l_count = max_letters;
   a->lower_multiplier = lower_multiplier;
   a->upper_multiplier = upper_multiplier;

   return er_status;

 er_exit:
   if (a->lower_cp != NULL)
     {
       free (a->lower_cp);
       a->lower_cp = NULL;
     }

   if (a->upper_cp != NULL)
     {
       free (a->upper_cp);
       a->upper_cp = NULL;
     }

   return er_status;
 }

 /*
  * string_to_int_array() - builds a list of codepoints from a string
  *
  * Returns: count of codepoints found
  * s(in): nul-terminated string
  * cp_list(out): array of codepoints
  * cp_list_size(in): maximum allowed size of codepoint list
  * delims(in) : possible delimiters between values
  *
  *  Note : the string containts unsigned integers in hexadecimal.
  *     The case of returned number of codepoints is greater than
  *      'cp_list_size' should be handled as error.
  *
  */
 int
 string_to_int_array (char *s, uint32 * cp_list, const int cp_list_size, const char *delims)
 {
   int i = 0;
   char *str;
   char *str_end;
   char *str_cursor;

   assert (cp_list != NULL);

   str = s;
   str_end = s + strlen (s);

   while (str != NULL && str < str_end)
     {
       int result = 0;
       uint32 val;

       result = str_to_uint32 (&val, &str_cursor, str, 16);
       if (result != 0 || str_cursor <= str)
     {
       break;
     }

       if (i < cp_list_size)
     {
       *cp_list++ = val;
     }
       i++;

       while (str_cursor < str_end && strchr (delims, *str_cursor) != NULL)
     {
       str_cursor++;
     }
       str = str_cursor;
     }

   return i;
 }

 /*
  * unicode_process_normalization() - Process character decomposition mappings
  *            imported from the Unicode data file, and prepare
  *            the data structures required for converting strings
  *            to fully composed.
  *
  * Returns: error code
  * ld(in/out) :    locale data structure
  * is_verbose(in): enable or disable verbose mode
  */
 int
 unicode_process_normalization (LOCALE_DATA * ld, bool is_verbose)
 {
   int i, orig_mapping_count, curr_mapping, mapping_cursor;
   UNICODE_CP_MAPPING *um;
   UNICODE_CP_MAPPING *new_map;
   UNICODE_CHAR *uc;
   int mapping_start, mapping_count;
   UNICODE_NORMALIZATION *norm;
   uint32 cp, old_cp, j;
   int err_status = NO_ERROR;

   int *unicode_decomp_map_count = NULL;
   /* perm_unicode_mapping[cp] = the number of possible sorted permutations of the cp decomposition mapping */
   UNICODE_CP_MAPPING *temp_list_unicode_decomp_maps = NULL;

   assert (ld != NULL);
   norm = &(ld->unicode_normalization);

   err_status = load_unicode_data (ld);
   if (err_status != NO_ERROR)
     {
       goto exit;
     }

   unicode_decomp_map_count = (int *) malloc (MAX_UNICODE_CHARS * sizeof (int));
   if (unicode_decomp_map_count == NULL)
     {
       LOG_LOCALE_ERROR ("memory allocation failed", ER_LOC_GEN, true);
       err_status = ER_LOC_GEN;
       goto exit;
     }
   memset (unicode_decomp_map_count, 0, MAX_UNICODE_CHARS * sizeof (int));

   /* Count the number of steps (buffers) necessary for the decomposition of each codepoint. */
   for (cp = 0; cp < MAX_UNICODE_CHARS; cp++)
     {
       uc = &(unicode_data[cp]);

       if (uc->unicode_mapping_cp_count <= 1 || uc->unicode_mapping[0] > MAX_UNICODE_CHARS)
     {
       unicode_decomp_map_count[cp] = 0;
     }
       else
     {
       uc->unicode_full_decomp_cp_count = count_full_decomp_cp (cp);
       unicode_decomp_map_count[cp] = count_decomp_steps (cp);
     }
       if (is_verbose)
     {
       printf ("CP : %04X\t\tDeco CP count: %2d\t\tDeco steps: %2d\n", cp, uc->unicode_full_decomp_cp_count,
           unicode_decomp_map_count[cp]);
     }
       norm->unicode_mappings_count += unicode_decomp_map_count[cp];
     }

   if (is_verbose)
     {
       printf ("\nTotal number of composition maps (sum of deco steps) : %d\n", norm->unicode_mappings_count);
     }

   /* Prepare the generation of all decomposition steps for all codepoints */
   temp_list_unicode_decomp_maps =
     (UNICODE_CP_MAPPING *) malloc (norm->unicode_mappings_count * sizeof (UNICODE_CP_MAPPING));
   if (temp_list_unicode_decomp_maps == NULL)
     {
       LOG_LOCALE_ERROR ("memory allocation failed", ER_LOC_GEN, true);
       err_status = ER_LOC_GEN;
       goto exit;
     }
   memset (temp_list_unicode_decomp_maps, 0, norm->unicode_mappings_count * sizeof (UNICODE_CP_MAPPING));

   /* Copy mappings loaded from UnicodeData.txt */
   cp = 0;
   orig_mapping_count = 0;
   while (cp < MAX_UNICODE_CHARS)
     {
       if (unicode_decomp_map_count[cp] > 0)
     {
       um = &(temp_list_unicode_decomp_maps[orig_mapping_count]);
       um->cp = cp;
       um->size = unicode_data[cp].unicode_mapping_cp_count;
       um->map = (uint32 *) malloc (um->size * sizeof (uint32));
       if (um->map == NULL)
         {
           LOG_LOCALE_ERROR ("memory allocation failed", ER_LOC_GEN, true);
           err_status = ER_LOC_GEN;
           goto exit;
         }
       memcpy (um->map, unicode_data[cp].unicode_mapping, um->size * sizeof (uint32));
       orig_mapping_count++;
     }
       cp++;
     }

   /* Decompose each mapping, top-down, until no mapping can be further decomposed. Total number of decomposition
    * mappings(steps) was computed previously for each codepoint in unicode_decomp_map_count[cp] and their sum in
    * unicode_decomp_map_total. These constants will be used for validation (as assert args). */
   mapping_cursor = orig_mapping_count;
   curr_mapping = 0;
   while (curr_mapping < mapping_cursor)
     {
       if (mapping_cursor >= norm->unicode_mappings_count)
     {
       break;
     }
       um = &(temp_list_unicode_decomp_maps[curr_mapping]);
       new_map = &(temp_list_unicode_decomp_maps[mapping_cursor]);

       if (um->size > 0 && um->map[0] < MAX_UNICODE_CHARS)
     {
       if (unicode_decomp_map_count[um->map[0]] > 0)
         {
           new_map->size = um->size - 1 + unicode_data[um->map[0]].unicode_mapping_cp_count;
           new_map->cp = um->cp;
           new_map->map = (uint32 *) malloc (new_map->size * sizeof (uint32));
           if (new_map->map == NULL)
         {
           LOG_LOCALE_ERROR ("memory allocation failed", ER_LOC_GEN, true);
           err_status = ER_LOC_GEN;
           goto exit;
         }

           for (i = 0; i < new_map->size; i++)
         {
           if (i < unicode_data[um->map[0]].unicode_mapping_cp_count)
             {
               new_map->map[i] = unicode_data[um->map[0]].unicode_mapping[i];
             }
           else
             {
               new_map->map[i] = um->map[1 + i - unicode_data[um->map[0]].unicode_mapping_cp_count];
             }
         }
           mapping_cursor++;
           if (is_verbose)
         {
           printf ("\nNew mapping step : %04X -> ", um->cp);
           for (i = 0; i < new_map->size; i++)
             {
               printf ("%04X ", new_map->map[i]);
             }
         }
         }
     }
       curr_mapping++;
     }

   for (i = 0; i < norm->unicode_mappings_count; i++)
     {
       um = &(temp_list_unicode_decomp_maps[i]);
       if (um->size > 0 && unicode_decomp_map_count[um->map[0]] == 0)
     {
       /* This means that for um->cp, the um->map can't be further decomposed, thus being the fully decomposed
        * representation for um->cp. It will be marked as such. */
       um->is_full_decomp = true;
     }
     }

   /* Sort/group the decompositions in list_unicode_decomp_maps by the value of the first codepoint in each mapping. The
    * grouping is necessary for optimizing the future search for possible decompositions when putting a string in fully
    * composed form. */
   qsort (temp_list_unicode_decomp_maps, norm->unicode_mappings_count, sizeof (UNICODE_CP_MAPPING),
      comp_func_grouping_unicode_cp_mapping);

   /* Build starting indexes for each cp which is the first cp in a compact group of mappings */
   norm->unicode_mapping_index = (int *) malloc ((MAX_UNICODE_CHARS + 1) * sizeof (int));
   if (norm->unicode_mapping_index == NULL)
     {
       LOG_LOCALE_ERROR ("memory allocation failed", ER_LOC_GEN, true);
       err_status = ER_LOC_GEN;
       goto exit;
     }
   memset (norm->unicode_mapping_index, 0, (MAX_UNICODE_CHARS + 1) * sizeof (int));
   cp = temp_list_unicode_decomp_maps[0].map[0];
   mapping_start = 0;
   mapping_count = 1;
   for (i = 1; i < norm->unicode_mappings_count; i++)
     {
       if (temp_list_unicode_decomp_maps[i].map[0] == (uint32) cp)
     {
       mapping_count++;
     }
       else
     {
       SET_MAPPING_INDEX (norm->unicode_mapping_index[cp], true, mapping_start);
       old_cp = cp;
       cp = (uint32) temp_list_unicode_decomp_maps[i].map[0];
       mapping_count = 1;
       mapping_start = i;
       for (j = old_cp + 1; j < cp; j++)
         {
           SET_MAPPING_INDEX (norm->unicode_mapping_index[j], false, mapping_start);
         }
     }
     }
   SET_MAPPING_INDEX (norm->unicode_mapping_index[cp], true, mapping_start);
   SET_MAPPING_INDEX (norm->unicode_mapping_index[cp + 1], false, (mapping_start + mapping_count));

   /* Sort descending each range of UNICODE_MAPPINGs from list_unicode_decomp_maps, having the same codepoint value in
    * UNICODE_MAPPING.map[0], using memcmp. The sorting is necessary for optimizing the future search for possible
    * decompositions when putting a string in fully composed form. */
   for (cp = 0; cp < MAX_UNICODE_CHARS; cp++)
     {
       int mapping_start = 0;
       int mapping_count = 0;

       if (!CP_HAS_MAPPINGS (norm->unicode_mapping_index[cp]))
     {
       continue;
     }
       mapping_start = GET_MAPPING_OFFSET (norm->unicode_mapping_index[cp]);
       mapping_count = GET_MAPPING_OFFSET (norm->unicode_mapping_index[cp + 1]) - mapping_start;

       qsort (temp_list_unicode_decomp_maps + mapping_start, mapping_count, sizeof (UNICODE_CP_MAPPING),
          comp_func_unicode_cp_mapping);
     }

   err_status = unicode_make_normalization_data (temp_list_unicode_decomp_maps, ld);

 exit:
   if (unicode_decomp_map_count != NULL)
     {
       free (unicode_decomp_map_count);
       unicode_decomp_map_count = NULL;
     }

   if (temp_list_unicode_decomp_maps != NULL)
     {
       for (i = 0; i < norm->unicode_mappings_count; i++)
     {
       um = &(temp_list_unicode_decomp_maps[i]);
       if (um->map != NULL)
         {
           free (um->map);
           /* um->map = NULL not necessary, list_unicode_decomp_maps is freed afterwards. */
         }
     }
       free (temp_list_unicode_decomp_maps);
       temp_list_unicode_decomp_maps = NULL;
     }

   return err_status;
 }

 /*
  * count_full_decomp_cp() - Counts the number of codepoints to needed to store
  *            the full decomposition representation for a
  *            codepoint.
  *
  * Returns: codepoint count
  * cp(in) : codepoint
  *
  *  Note : this is a recursive function.
  */
 static int
 count_full_decomp_cp (int cp)
 {
   UNICODE_CHAR *uc;

   uc = &(unicode_data[cp]);
   if (cp >= MAX_UNICODE_CHARS)
     {
       return 1;
     }

   uc = &(unicode_data[cp]);

   if (uc->unicode_mapping_cp_count == 0)
     {
       return 1;
     }

   return uc->unicode_mapping_cp_count - 1 + count_full_decomp_cp ((int) uc->unicode_mapping[0]);
 }

 /*
  * count_decomp_steps() - Counts the number of steps for putting a codepoint
  *          into fully decomposed form, by replacing one
  *          decomposable codepoint at every step.
  *
  * Returns: step count
  * cp(in) : codepoint
  *
  *  Note : this is a recursive function.
  */
 static int
 count_decomp_steps (int cp)
 {
   UNICODE_CHAR *uc;

   uc = &(unicode_data[cp]);
   if (uc->unicode_mapping_cp_count == 0)
     {
       return 0;
     }
   if ((uc->unicode_mapping_cp_count == 1 && uc->unicode_mapping[0] < MAX_UNICODE_CHARS)
       || (uc->unicode_mapping_cp_count > 1))
     {
       return 1 + count_decomp_steps (uc->unicode_mapping[0]);
     }

   return 0;
 }

 /*
  * unicode_make_normalization_data() - takes the data loaded from UnicodeData,
  *      which was previously sorted, and puts it into optimized form
  *      into the locale data structure, ready to be exported into
  *      a shared library.
  *
  * Returns: ER_LOC_GEN if error
  *      NO_ERROR otherwise
  * decomp_maps(in): variable holding the loaded and partially processed
  *          unicode data
  * ld(in/out): locale data
  *
  */
 static int
 unicode_make_normalization_data (UNICODE_CP_MAPPING * decomp_maps, LOCALE_DATA * ld)
 {
   int err_status = NO_ERROR;
   int i, j;
   UNICODE_CP_MAPPING *um_cp;
   UNICODE_MAPPING *um;
   unsigned char str_buf[INTL_UTF8_MAX_CHAR_SIZE * UNICODE_DECOMP_MAP_CP_COUNT];
   unsigned char *cur_pos;
   char cur_size, byte_count;
   UNICODE_NORMALIZATION *norm;

   assert (ld != NULL);
   assert (decomp_maps != NULL);

   norm = &(ld->unicode_normalization);

   /* Prepare the unicode_mappings array for storing the data from decomp_maps as utf8 buffers + length + original
    * codepoint. */
   norm->unicode_mappings = (UNICODE_MAPPING *) malloc (norm->unicode_mappings_count * sizeof (UNICODE_MAPPING));
   if (norm->unicode_mappings == NULL)
     {
       LOG_LOCALE_ERROR ("memory allocation failed", ER_LOC_GEN, true);
       err_status = ER_LOC_GEN;
       goto exit;
     }
   memset (norm->unicode_mappings, 0, norm->unicode_mappings_count * sizeof (UNICODE_MAPPING));

   /* Prepare the index list for fully decomposed mappings */
   norm->list_full_decomp = (int *) malloc (MAX_UNICODE_CHARS * sizeof (int));
   if (norm->list_full_decomp == NULL)
     {
       LOG_LOCALE_ERROR ("memory allocation failed", ER_LOC_GEN, true);
       err_status = ER_LOC_GEN;
       goto exit;
     }
   for (i = 0; i < MAX_UNICODE_CHARS; i++)
     {
       norm->list_full_decomp[i] = -1;
     }

   /* Start importing data from decomp_maps into unicode_mappings. */
   for (i = 0; i < norm->unicode_mappings_count; i++)
     {
       um_cp = &(decomp_maps[i]);
       um = &(norm->unicode_mappings[i]);

       um->cp = um_cp->cp;

       /* Empty temporary utf8 buffer */
       memset (str_buf, 0, INTL_UTF8_MAX_CHAR_SIZE * UNICODE_DECOMP_MAP_CP_COUNT);

       /* Convert the list of codepoints into a utf8 buffer */
       cur_pos = str_buf;
       cur_size = 0;
       byte_count = 0;

       for (j = 0; j < um_cp->size; j++)
     {
       byte_count = intl_cp_to_utf8 (um_cp->map[j], cur_pos);
       cur_size += byte_count;
       cur_pos += byte_count;
     }

       memset (um->buffer, 0, sizeof (um->buffer));

       /* Make the final utf8 buffer used for normalization */
       memcpy (um->buffer, str_buf, cur_size);
       um->size = cur_size;

       /* If um_cp is a fully decomposed representation for cp, mark it as such. */
       if (um_cp->is_full_decomp)
     {
       norm->list_full_decomp[um_cp->cp] = i;
     }
     }

 exit:

   return err_status;
 }

 #if !defined (SERVER_MODE)
 /*
  * unicode_string_need_compose() - Checks if a string needs composition
  *                 and returns the size required by fully
  *                 composed form.
  *
  * Returns:
  * str_in(in) : string to normalize
  * size_in(in) : size in bytes of string
  * size_out(out) : size in bytes of composed string
  * need_compose(out) : true if composition is required, false otherwise
  * norm(in) : the unicode data for normalization
  *
  *  Note : this is light check, since full check requires more complex
  *     processing - same as composing algorithm.
  *     All input is assumed in UTF-8 character set
  */
 bool
 unicode_string_need_compose (const char *str_in, const int size_in, int *size_out, const UNICODE_NORMALIZATION * norm)
 {
   const char *pc;
   const char *p_end;

   assert (size_out != NULL);

   *size_out = 0;

   if (!prm_get_bool_value (PRM_ID_UNICODE_INPUT_NORMALIZATION) || norm == NULL || size_in == 0 || str_in == NULL)
     {
       return false;
     }

   assert (str_in != NULL);

   /* If all chars are in the range 0-127, then the string is ASCII and no unicode operations are neccessary e.g.
    * composition */
   /* Reuse match_found as validation flag. */
   p_end = str_in + size_in;

   for (pc = str_in; pc < p_end; pc++)
     {
       if ((unsigned char) (*pc) >= 0x80)
     {
       *size_out = size_in;
       return true;
     }
     }

   return false;
 }

 /*
  * unicode_compose_string() - Put a string into fully composed form.
  *
  * Returns:
  * str_in(in) : string to normalize
  * size_in(in) : size in bytes of string
  * str_out(out) : preallocated buffer to store composed string, output string
  *        is not null terminated
  * size_out(out) : actual size in bytes of composed string
  * is_composed (out) : true if the string required composition
  * norm(in) : the unicode data for normalization
  */
 void
 unicode_compose_string (const char *str_in, const int size_in, char *str_out, int *size_out, bool * is_composed,
             const UNICODE_NORMALIZATION * norm)
 {
   char *composed_str;
   int composed_index, remaining_bytes;
   const char *str_next = NULL;
   unsigned int cp;
   int map_start, map_end, i, byte_count;
   bool match_found = false, composition_found;
   UNICODE_MAPPING *um;
   const char *str_cursor;
   const char *str_end;

   assert (prm_get_bool_value (PRM_ID_UNICODE_INPUT_NORMALIZATION) && norm != NULL && size_in > 0 && str_in != NULL);

   composed_index = 0;

   /* Build composed string */
   str_next = str_in;
   str_cursor = str_in;
   remaining_bytes = size_in;
   composition_found = false;
   composed_str = str_out;
   str_end = str_in + size_in;

   while (str_cursor < str_end)
     {
       int first_cp_size;

       cp = intl_utf8_to_cp ((unsigned char *) str_cursor, remaining_bytes, (unsigned char **) &str_next);

       first_cp_size = CAST_STRLEN (str_next - str_cursor);
       remaining_bytes -= first_cp_size;

       match_found = false;

       if (cp >= MAX_UNICODE_CHARS - 2 || !CP_HAS_MAPPINGS (norm->unicode_mapping_index[cp]))
     {
       goto match_not_found;
     }

       map_start = GET_MAPPING_OFFSET (norm->unicode_mapping_index[cp]);
       map_end = GET_MAPPING_OFFSET (norm->unicode_mapping_index[cp + 1]);

       /* Search the mapping list for a possible match */
       for (i = map_start; i < map_end; i++)
     {
       um = &(norm->unicode_mappings[i]);
       if (um->size > remaining_bytes + first_cp_size)
         {
           continue;
         }

       if (memcmp (um->buffer, str_cursor, um->size) == 0)
         {
           /* If a composition matches, apply it. */
           composed_index += intl_cp_to_utf8 (um->cp, (unsigned char *) (&(composed_str[composed_index])));
           str_cursor += um->size;
           match_found = true;
           composition_found = true;
           break;
         }
     }

       /* If no composition can be matched to start with the decoded codepoint, just copy the bytes corresponding to the
        * codepoint from the input string to the output, adjust pointers and loop again. */
     match_not_found:
       if (!match_found)
     {
       byte_count = CAST_STRLEN (str_next - str_cursor);
       memcpy (&(composed_str[composed_index]), str_cursor, byte_count);
       composed_index += byte_count;
       str_cursor += byte_count;
     }
     }               /* while */

   /* Set output variables */
   *size_out = composed_index;
   if (composition_found)
     {
       *is_composed = true;
     }

   return;
 }

 /*
  * unicode_string_need_decompose() - Checks if a string needs
  *                   decomposition and returns the size
  *                   required by decomposed form.
  *
  * Returns: true if decomposition is required
  * str_in(in) : string to normalize
  * size_in(in) : size of string in bytes
  * decomp_size(out) : size required by decomposed form in bytes
  * norm(in) : the unicode context in which the normalization is performed
  *
  *  Note : Input string is assumed UTF-8 character set.
  */
 bool
 unicode_string_need_decompose (const char *str_in, const int size_in, int *decomp_size,
                    const UNICODE_NORMALIZATION * norm)
 {
   int bytes_read, decomp_index, decomposed_size = 0;
   unsigned int cp;
   const char *src_cursor;
   const char *src_end;
   const char *next;
   bool can_decompose;

   if (!prm_get_bool_value (PRM_ID_UNICODE_OUTPUT_NORMALIZATION) || norm == NULL)
     {
       goto no_decompose_cnt;
     }

   assert (str_in != NULL);

   /* check if ASCII */
   can_decompose = false;
   src_end = str_in + size_in;
   for (src_cursor = str_in; src_cursor < src_end; src_cursor++)
     {
       if ((unsigned char) (*src_cursor) >= 0x80)
     {
       can_decompose = true;
       break;
     }
     }
   if (!can_decompose)
     {
       goto no_decompose_cnt;
     }

   /* Read each codepoint and add its expanded size to the overall size */
   src_cursor = str_in;
   next = str_in;
   can_decompose = false;
   src_end = str_in + size_in;
   while (src_cursor < src_end)
     {
       cp = intl_utf8_to_cp ((unsigned char *) src_cursor, CAST_STRLEN (src_end - src_cursor), (unsigned char **) &next);
       bytes_read = CAST_STRLEN (next - src_cursor);

       decomp_index = (cp < MAX_UNICODE_CHARS) ? norm->list_full_decomp[cp] : -1;
       if (decomp_index > -1)
     {
       decomposed_size += norm->unicode_mappings[decomp_index].size;
       can_decompose = true;
     }
       else
     {
       decomposed_size += bytes_read;
     }

       src_cursor = next;
     }

   /* If no decomposition is needed, return the same size as the input string and exit. */
   if (!can_decompose)
     {
       goto no_decompose_cnt;
     }

   *decomp_size = decomposed_size;

   return true;

 no_decompose_cnt:
   *decomp_size = size_in;

   return false;
 }

 /*
  * unicode_decompose_string() - Put a string into fully decomposed form.
  *
  * Returns: ER_OUT_OF_VIRTUAL_MEMORY if internal memory allocation fails
  *      NO_ERROR if successfull
  * str_in(in) : string to normalize
  * size_in(in) : size in bytes of string
  * str_out(out): preallocated buffer for string in decomposed form
  * size_out(out): actual size of decomposed form in bytes
  * norm(in) : the unicode context in which the normalization is performed
  */
 void
 unicode_decompose_string (const char *str_in, const int size_in, char *str_out, int *size_out,
               const UNICODE_NORMALIZATION * norm)
 {
   int bytes_read, decomp_index;
   unsigned int cp;
   const char *src_cursor;
   const char *src_end;
   const char *next;
   char *dest_cursor;

   assert (prm_get_bool_value (PRM_ID_UNICODE_OUTPUT_NORMALIZATION) && norm != NULL);

   assert (str_in != NULL);
   assert (str_out != NULL);
   assert (size_out != NULL);

   src_cursor = str_in;
   dest_cursor = str_out;
   next = str_in;
   src_end = str_in + size_in;
   while (src_cursor < src_end)
     {
       cp = intl_utf8_to_cp ((unsigned char *) src_cursor, CAST_STRLEN (src_end - src_cursor), (unsigned char **) &next);
       bytes_read = CAST_STRLEN (next - src_cursor);
       decomp_index = (cp < MAX_UNICODE_CHARS) ? norm->list_full_decomp[cp] : -1;
       if (decomp_index > -1)
     {
       memcpy (dest_cursor, norm->unicode_mappings[decomp_index].buffer, norm->unicode_mappings[decomp_index].size);
       dest_cursor += norm->unicode_mappings[decomp_index].size;
     }
       else
     {
       memcpy (dest_cursor, src_cursor, bytes_read);
       dest_cursor += bytes_read;
     }
       src_cursor = next;
     }

   *size_out = CAST_STRLEN (dest_cursor - str_out);
 }
 #endif /* SERVER_MODE */
 /*
  * comp_func_unicode_cp_mapping() - compare function for sorting a group of
  *                  unicode decompositions starting with the
  *                  same codepoint
  *
  * Returns: compare result
  * arg1(in) :
  * arg2(in) :
  */
 static int
 comp_func_unicode_cp_mapping (const void *arg1, const void *arg2)
 {
   UNICODE_CP_MAPPING *um1, *um2;
   int min_size, result;

   um1 = (UNICODE_CP_MAPPING *) arg1;
   um2 = (UNICODE_CP_MAPPING *) arg2;

   min_size = (um1->size < um2->size) ? um1->size : um2->size;
   result = memcmp (um1->map, um2->map, min_size * sizeof (uint32));
   /* Result will be reverted to obtain reverse ordering */
   if (result == 0)
     {
       if (um1->size > min_size)
     {
       return -1;
     }
       if (um2->size > min_size)
     {
       return 1;
     }
       if (um1->cp < um2->cp)
     {
       return -1;
     }
       return 1;
     }

   return -result;
 }

 /*
  * comp_func_grouping_unicode_cp_mapping() - compare function for sorting
  *                  all decompositions
  *
  * Returns: compare result
  * arg1(in) :
  * arg2(in) :
  */
 static int
 comp_func_grouping_unicode_cp_mapping (const void *arg1, const void *arg2)
 {
   UNICODE_CP_MAPPING *um1, *um2;
   int result;

   um1 = (UNICODE_CP_MAPPING *) arg1;
   um2 = (UNICODE_CP_MAPPING *) arg2;

   if (um1->map[0] > um2->map[0])
     {
       result = 1;
     }
   else
     {
       result = -1;
     }

   return result;
 }
locale_data::unicode_mode
int unicode_mode
Definition: locale_support.h:610

comp_func_grouping_unicode_cp_mapping
static int comp_func_grouping_unicode_cp_mapping(const void *arg1, const void *arg2)
Definition: unicode_support.c:1450

unicode_support.h

UNICODE_FILE_CHAR_DECOMPOSITION_MAPPING
#define UNICODE_FILE_CHAR_DECOMPOSITION_MAPPING
Definition: unicode_support.c:48

UNICODE_FILE_LOWER_CASE_MAP
#define UNICODE_FILE_LOWER_CASE_MAP
Definition: unicode_support.c:50

GENERAL_CATEGORY
Definition: unicode_support.c:62

NO_ERROR
#define NO_ERROR
Definition: error_code.h:46

unicode_string_need_compose
bool unicode_string_need_compose(const char *str_in, const int size_in, int *size_out, const UNICODE_NORMALIZATION *norm)
Definition: unicode_support.c:1128

ER_LOC_GEN
#define ER_LOC_GEN
Definition: error_code.h:1371

unicode_mapping::buffer
unsigned char buffer[NORMALIZATION_MAX_BUF_SIZE]
Definition: locale_lib_common.h:98

locale_support.h

unicode_mapping::size
int size
Definition: locale_lib_common.h:96

UNICODE_FILE_UPPER_CASE_MAP
#define UNICODE_FILE_UPPER_CASE_MAP
Definition: unicode_support.c:49

UNICODE_CP_MAPPING::map
uint32 * map
Definition: unicode_support.c:104

list_gen_cat
GENERAL_CATEGORY list_gen_cat[]
Definition: unicode_support.c:70

CAT_Cn
Definition: unicode_support.c:54

PRM_ID_UNICODE_OUTPUT_NORMALIZATION
Definition: system_parameter.h:300

locale_data::identif_alphabet
ALPHABET_DATA identif_alphabet
Definition: locale_support.h:607

CAT_Lu
Definition: unicode_support.c:55

porting.h

create_alphabet
static int create_alphabet(ALPHABET_DATA *a, const int max_letters, const int lower_multiplier, const int upper_multiplier)
Definition: unicode_support.c:585

unicode_mapping
Definition: locale_lib_common.h:93

str_to_uint32
int str_to_uint32(unsigned int *ret_p, char **end_p, const char *str_p, int base)
Definition: porting.c:2382

alphabet_tailoring::alphabet_mode
int alphabet_mode
Definition: locale_support.h:480

CANONICAL_COMBINING_CLASS
Definition: unicode_support.c:75

TR_LOWER
Definition: locale_support.h:455

UNICODE_CHAR::unicode_mapping_cp_count
char unicode_mapping_cp_count
Definition: unicode_support.c:94

unicode_make_normalization_data
static int unicode_make_normalization_data(UNICODE_CP_MAPPING *decomp_maps, LOCALE_DATA *ld)
Definition: unicode_support.c:1029

unicode_compose_string
void unicode_compose_string(const char *str_in, const int size_in, char *str_out, int *size_out, bool *is_composed, const UNICODE_NORMALIZATION *norm)
Definition: unicode_support.c:1174

alphabet_data::a_type
ALPHABET_TYPE a_type
Definition: locale_support.h:439

CAST_STRLEN
#define CAST_STRLEN
Definition: porting.h:470

CAT_MAX
Definition: unicode_support.c:59

intl_cp_to_utf8
int intl_cp_to_utf8(const unsigned int codepoint, unsigned char *utf8_seq)
Definition: intl_support.c:3621

UNICODE_CHAR::lower_cp
uint32 lower_cp[INTL_CASING_EXPANSION_MULTIPLIER]
Definition: unicode_support.c:91

unicode_data
static UNICODE_CHAR * unicode_data
Definition: unicode_support.c:110

locale_data::alphabet
ALPHABET_DATA alphabet
Definition: locale_support.h:606

last_unicode_file
static char last_unicode_file[PATH_MAX]
Definition: unicode_support.c:114

GENERAL_CATEGORY::val
const char * val
Definition: unicode_support.c:65

MAX_UNICODE_CHARS
#define MAX_UNICODE_CHARS
Definition: locale_support.h:37

load_unicode_data
static int load_unicode_data(const LOCALE_DATA *ld)
Definition: unicode_support.c:370

UNICODE_CHAR::unicode_mapping
uint32 unicode_mapping[UNICODE_DECOMP_MAP_CP_COUNT]
Definition: unicode_support.c:96

unicode_mapping::cp
unsigned int cp
Definition: locale_lib_common.h:95

unicode_normalization::unicode_mappings
UNICODE_MAPPING * unicode_mappings
Definition: locale_support.h:544

comp_func_unicode_cp_mapping
static int comp_func_unicode_cp_mapping(const void *arg1, const void *arg2)
Definition: unicode_support.c:1410

unicode_decompose_string
void unicode_decompose_string(const char *str_in, const int size_in, char *str_out, int *size_out, const UNICODE_NORMALIZATION *norm)
Definition: unicode_support.c:1359

UNICODE_CHAR
Definition: unicode_support.c:86

envvar_localedatadir_file
char * envvar_localedatadir_file(char *path, size_t size, const char *filename)
Definition: environment_variable.c:579

assert
#define assert(x)
Definition: malloc_2_8_3.c:1204

unicode_normalization
Definition: locale_support.h:542

UNICODEDATA_FILE
#define UNICODEDATA_FILE
Definition: unicode_support.c:40

alphabet_tailoring::unicode_data_file
char unicode_data_file[PATH_MAX]
Definition: locale_support.h:483

transform_rule::dest_size
int dest_size
Definition: locale_support.h:469

transform_rule
Definition: locale_support.h:461

UNICODE_FILE_LINE_SIZE
#define UNICODE_FILE_LINE_SIZE
Definition: unicode_support.c:43

UNICODE_CHAR::unicode_full_decomp_cp_count
char unicode_full_decomp_cp_count
Definition: unicode_support.c:97

unicode_normalization::unicode_mapping_index
int * unicode_mapping_index
Definition: locale_support.h:546

ALPHABET_TAILORED
Definition: locale_support.h:432

unicode_free_data
void unicode_free_data(void)
Definition: unicode_support.c:565

GENERAL_CATEGORY::id
GENERAL_CATEG_ID id
Definition: unicode_support.c:64

count_decomp_steps
static int count_decomp_steps(int cp)
Definition: unicode_support.c:997

NULL
#define NULL
Definition: freelistheap.h:34

transform_rule::src_size
int src_size
Definition: locale_support.h:466

intl_utf8_to_cp
unsigned int intl_utf8_to_cp(const unsigned char *utf8, const int size, unsigned char **next_char)
Definition: intl_support.c:3715

transform_rule::src
char * src
Definition: locale_support.h:465

alphabet_data::lower_cp
unsigned int * lower_cp
Definition: locale_support.h:444

if
if(extra_options)
Definition: dynamic_load.c:958

unicode_data_upper_mult
static int unicode_data_upper_mult
Definition: unicode_support.c:112

ALPHABET_ASCII
Definition: locale_support.h:431

locale_data::unicode_data_file
char unicode_data_file[PATH_MAX]
Definition: locale_support.h:612

unicode_process_alphabet
int unicode_process_alphabet(LOCALE_DATA *ld, bool is_verbose)
Definition: unicode_support.c:134

GENERAL_CATEG_ID
GENERAL_CATEG_ID
Definition: unicode_support.c:52

transform_rule::type
TRANSFORM_TYPE type
Definition: locale_support.h:463

fopen_ex
FILE * fopen_ex(const char *filename, const char *type)
Definition: util_common.c:322

language_support.h

transform_rule::dest
char * dest
Definition: locale_support.h:468

INTL_UTF8_MAX_CHAR_SIZE
#define INTL_UTF8_MAX_CHAR_SIZE
Definition: locale_lib_common.h:35

unicode_process_normalization
int unicode_process_normalization(LOCALE_DATA *ld, bool is_verbose)
Definition: unicode_support.c:711

string_to_int_array
int string_to_int_array(char *s, uint32 *cp_list, const int cp_list_size, const char *delims)
Definition: unicode_support.c:661

error
static void error(const char *msg)
Definition: gencat.c:331

utility.h

ALPHABET_UNICODE
Definition: locale_support.h:430

intl_utf8_to_cp_list
int intl_utf8_to_cp_list(const unsigned char *utf8, const int size, unsigned int *cp_array, const int max_array_size, int *array_count)
Definition: intl_support.c:3851

locale_data::unicode_normalization
UNICODE_NORMALIZATION unicode_normalization
Definition: locale_support.h:615

alphabet_data::l_count
int l_count
Definition: locale_support.h:441

intl_support.h

snprintf_dots_truncate
#define snprintf_dots_truncate(dest, max_len,...)
Definition: porting.h:323

UNICODE_CP_MAPPING::size
int size
Definition: unicode_support.c:106

UNICODE_CHAR::upper_cp
uint32 upper_cp[INTL_CASING_EXPANSION_MULTIPLIER]
Definition: unicode_support.c:92

UNICODE_CP_MAPPING::is_full_decomp
bool is_full_decomp
Definition: unicode_support.c:107

str_out
static void str_out(const char *fmt,...)
Definition: broker_monitor.c:389

alphabet_data::lower_multiplier
int lower_multiplier
Definition: locale_support.h:443

alphabet_tailoring::count_rules
int count_rules
Definition: locale_support.h:485

strlen
#define strlen(s1)
Definition: intl_support.c:43

CANONICAL_COMBINING_CLASS::std_val
char * std_val
Definition: unicode_support.c:78

CP_HAS_MAPPINGS
#define CP_HAS_MAPPINGS(val)
Definition: locale_support.h:172

environment_variable.h

alphabet_data
Definition: locale_support.h:437

SET_MAPPING_INDEX
#define SET_MAPPING_INDEX(val, is_used, offset)
Definition: locale_support.h:163

alphabet_tailoring::rules
TRANSFORM_RULE * rules
Definition: locale_support.h:487

alphabet_tailoring::sett_max_letters
int sett_max_letters
Definition: locale_support.h:478

UNICODE_FILE_GENERAL_CAT_POS
#define UNICODE_FILE_GENERAL_CAT_POS
Definition: unicode_support.c:47

prm_get_bool_value
bool prm_get_bool_value(PARAM_ID prm_id)
Definition: system_parameter.c:10979

CAT_Ll
Definition: unicode_support.c:56

unicode_normalization::unicode_mappings_count
int unicode_mappings_count
Definition: locale_support.h:545

uint32
unsigned int uint32
Definition: unicode_support.h:34

unicode_string_need_decompose
bool unicode_string_need_decompose(const char *str_in, const int size_in, int *decomp_size, const UNICODE_NORMALIZATION *norm)
Definition: unicode_support.c:1274

error_manager.h

UNICODE_CP_MAPPING
Definition: unicode_support.c:101

unicode_normalization::list_full_decomp
int * list_full_decomp
Definition: locale_support.h:547

i
int i
Definition: dynamic_load.c:954

LOG_LOCALE_ERROR
#define LOG_LOCALE_ERROR(msg, er_status, do_print)
Definition: locale_support.h:151

system_parameter.h

CANONICAL_COMBINING_CLASS::id
int id
Definition: unicode_support.c:77

GET_MAPPING_OFFSET
#define GET_MAPPING_OFFSET(val)
Definition: locale_support.h:175

UNICODE_CHAR::gen_cat_id
GENERAL_CATEG_ID gen_cat_id
Definition: unicode_support.c:89

UNICODE_DECOMP_MAP_CP_COUNT
#define UNICODE_DECOMP_MAP_CP_COUNT
Definition: unicode_support.c:84

INTL_CASING_EXPANSION_MULTIPLIER
#define INTL_CASING_EXPANSION_MULTIPLIER
Definition: locale_support.h:42

UNICODE_FILE_FIELDS
#define UNICODE_FILE_FIELDS
Definition: unicode_support.c:44

locale_data::alpha_tailoring
ALPHABET_TAILORING alpha_tailoring
Definition: locale_support.h:605

PRM_ID_UNICODE_INPUT_NORMALIZATION
Definition: system_parameter.h:299

locale_data
Definition: locale_support.h:568

alphabet_tailoring
Definition: locale_support.h:474

TR_UPPER
Definition: locale_support.h:454

count_full_decomp_cp
static int count_full_decomp_cp(int cp)
Definition: unicode_support.c:966

UNICODE_CP_MAPPING::cp
uint32 cp
Definition: unicode_support.c:103

ERR_MSG_SIZE
#define ERR_MSG_SIZE
Definition: locale_support.h:149

alphabet_data::upper_multiplier
int upper_multiplier
Definition: locale_support.h:446

unicode_data_lower_mult
static int unicode_data_lower_mult
Definition: unicode_support.c:111

alphabet_data::upper_cp
unsigned int * upper_cp
Definition: locale_support.h:447