CUBRID Engine  latest
unicode_support.c
Go to the documentation of this file.
1 /*
2  * Copyright 2008 Search Solution Corporation
3  * Copyright 2016 CUBRID Corporation
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  */
18 
19 /*
20  * unicode_support.c : Unicode support
21  */
22 #include "config.h"
23 
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include <assert.h>
27 
28 #include "porting.h"
29 
30 #include "locale_support.h"
31 #include "intl_support.h"
32 #include "language_support.h"
33 #include "error_manager.h"
34 #include "utility.h"
35 #include "environment_variable.h"
36 #include "system_parameter.h"
37 #include "unicode_support.h"
38 
39 
40 #define UNICODEDATA_FILE "unicodedata.txt"
41 
42 /* Unicode data file constants */
43 #define UNICODE_FILE_LINE_SIZE 512
44 #define UNICODE_FILE_FIELDS 14
45 
46 /* Field position : starting from 0 */
47 #define UNICODE_FILE_GENERAL_CAT_POS 2
48 #define UNICODE_FILE_CHAR_DECOMPOSITION_MAPPING 5
49 #define UNICODE_FILE_UPPER_CASE_MAP 12
50 #define UNICODE_FILE_LOWER_CASE_MAP 13
51 
52 typedef enum
53 {
54  CAT_Cn = 0, /* other, not assigned */
55  CAT_Lu, /* Letter, uppercase */
56  CAT_Ll, /* Letter, lowercase */
57 
58  /* add new values here */
59  CAT_MAX /* maximum category value */
61 
62 typedef struct
63 {
65  const char *val;
67 
68 
69 /* available list of general categories (id, name) */
71  {CAT_Lu, "Lu"},
72  {CAT_Ll, "Ll"},
73 };
74 
75 typedef struct
76 {
77  int id;
78  char *std_val; /* Standard value as defined by Unicode Consortium */
80 
81 /* The maximum number of codepoints to which a single codepoint can be
82  * rewritten in canonically fully decomposed form.
83  */
84 #define UNICODE_DECOMP_MAP_CP_COUNT 4
85 
86 typedef struct
87 {
88  /* general category for this character */
90 
93 
95 
98 
99 } UNICODE_CHAR;
100 
101 typedef struct
102 {
103  uint32 cp; /* Codepoint value */
104  uint32 *map; /* A fully decomposed canonical mapping stored as codepoints */
105 
106  int size; /* The number of codepoints in the mapping */
107  bool is_full_decomp; /* true - if map is fully decomposed false - otherwise. */
109 
/* Casing multipliers kept alongside the loaded Unicode data. Both start at 1. unicode_process_alphabet () combines
 * them (MAX) with the multipliers computed from the tailoring rules and uses them as the per-codepoint stride of the
 * identifier alphabet arrays. NOTE(review): the statements that raise them above 1 are not visible in this listing. */
111 static int unicode_data_lower_mult = 1;
112 static int unicode_data_upper_mult = 1;
113 
/* Name of the Unicode data file most recently loaded by load_unicode_data (). It is compared with the requested file
 * name to skip a reload, and it is reset to the empty string by unicode_free_data (). */
114 static char last_unicode_file[PATH_MAX] = { 0 };
115 
/* Forward declarations of the file-local helpers defined further down */
116 static int load_unicode_data (const LOCALE_DATA * ld);
117 static int create_alphabet (ALPHABET_DATA * a, const int max_letters, const int lower_multiplier,
118  const int upper_multiplier);
119 static int count_full_decomp_cp (int cp);
120 static int count_decomp_steps (int cp);
121 static int unicode_make_normalization_data (UNICODE_CP_MAPPING * decomp_maps, LOCALE_DATA * ld);
/* qsort-style comparators (two const void * elements, int result) */
122 static int comp_func_unicode_cp_mapping (const void *arg1, const void *arg2);
123 static int comp_func_grouping_unicode_cp_mapping (const void *arg1, const void *arg2);
124 
125 
126 /*
127  * unicode_process_alphabet() - Process alphabet (casing) data for given
128  * locale
129  *
130  * Returns: error code
131  * ld(in/out) : locale data structure
132  */
133 int
134 unicode_process_alphabet (LOCALE_DATA * ld, bool is_verbose)
135 {
136  ALPHABET_DATA *a = NULL;
137  ALPHABET_DATA *i_a = NULL;
138  ALPHABET_TAILORING *a_tailoring = NULL;
139  char unicode_file[PATH_MAX];
140  char err_msg[ERR_MSG_SIZE];
141  int er_status = NO_ERROR;
142  uint32 cp;
143  int lower_mult = 1;
144  int upper_mult = 1;
145  int i;
146 
147  assert (ld != NULL);
148 
149  a = &(ld->alphabet);
150  i_a = &(ld->identif_alphabet);
151  a_tailoring = &(ld->alpha_tailoring);
152 
153  /* compute lower and upper multiplier from rules */
154  for (i = 0; i < a_tailoring->count_rules; i++)
155  {
156  TRANSFORM_RULE *tf_rule = &(a_tailoring->rules[i]);
157  uint32 dummy_array;
158  int dummy;
159  int dest_len;
160 
161  dest_len = intl_utf8_to_cp_list ((unsigned char *) (tf_rule->dest), tf_rule->dest_size, &dummy_array, 1, &dummy);
162 
163  if (dest_len > INTL_CASING_EXPANSION_MULTIPLIER)
164  {
165  snprintf (err_msg, sizeof (err_msg) - 1,
166  "Invalid alphabet rule :%d" ". Destination buffer contains more than 2 characters", i);
167  LOG_LOCALE_ERROR (err_msg, ER_LOC_GEN, true);
168  er_status = ER_LOC_GEN;
169  goto error;
170  }
171  if (tf_rule->type == TR_UPPER)
172  {
173  upper_mult = MAX (upper_mult, dest_len);
174  }
175  else
176  {
177  assert (tf_rule->type == TR_LOWER);
178  lower_mult = MAX (lower_mult, dest_len);
179  }
180  }
181 
182  if (a_tailoring->alphabet_mode == 2)
183  {
184  if (is_verbose)
185  {
186  printf ("Creating ASCII alphabet\n");
187  }
188 
189  /* ASCII alphabet */
190  er_status = create_alphabet (a, a_tailoring->sett_max_letters, lower_mult, upper_mult);
191  if (er_status != NO_ERROR)
192  {
193  goto error;
194  }
195 
196  er_status = create_alphabet (i_a, a_tailoring->sett_max_letters, 1, 1);
197  if (er_status != NO_ERROR)
198  {
199  goto error;
200  }
201 
202  for (cp = 0; (int) cp < a->l_count; cp++)
203  {
204  i_a->upper_cp[cp] = a->upper_cp[cp] = cp;
205  i_a->lower_cp[cp] = a->lower_cp[cp] = cp;
206  }
207 
208  for (cp = (int) 'a'; cp <= (int) 'z'; cp++)
209  {
210  i_a->upper_cp[cp] = a->upper_cp[cp] = cp - ('a' - 'A');
211  i_a->lower_cp[cp - ('a' - 'A')] = a->lower_cp[cp - ('a' - 'A')] = cp;
212  }
213 
214  i_a->a_type = a->a_type = ALPHABET_ASCII;
215  }
216  else
217  {
218  if (a_tailoring->alphabet_mode == 1)
219  {
220  strncpy (unicode_file, a_tailoring->unicode_data_file, sizeof (unicode_file));
221  unicode_file[sizeof (unicode_file) - 1] = '\0';
222 
223  /* a user defined unicode file is handled as a tailored alphabet */
225  }
226  else
227  {
228  assert (a_tailoring->alphabet_mode == 0);
229  envvar_localedatadir_file (unicode_file, sizeof (unicode_file), UNICODEDATA_FILE);
230 
232  }
233 
234  if (is_verbose)
235  {
236  printf ("Creating UNICODE alphabet from: %s\n", unicode_file);
237  }
238 
239  er_status = load_unicode_data (ld);
240  if (er_status != NO_ERROR)
241  {
242  goto error;
243  }
244 
245  lower_mult = MAX (lower_mult, unicode_data_lower_mult);
246  upper_mult = MAX (upper_mult, unicode_data_upper_mult);
247 
248  er_status = create_alphabet (a, a_tailoring->sett_max_letters, lower_mult, upper_mult);
249  if (er_status != NO_ERROR)
250  {
251  goto error;
252  }
253 
254  er_status =
256  if (er_status != NO_ERROR)
257  {
258  goto error;
259  }
260 
261  for (cp = 0; (int) cp < a_tailoring->sett_max_letters; cp++)
262  {
263  /* set lower and upper case of each codepoint to itself */
264  a->lower_cp[cp * lower_mult] = cp;
265  a->upper_cp[cp * upper_mult] = cp;
266 
267  i_a->lower_cp[cp * unicode_data_lower_mult] = cp;
268  i_a->upper_cp[cp * unicode_data_upper_mult] = cp;
269 
270  /* overwrite with UnicodeData */
271  if (unicode_data[cp].gen_cat_id == CAT_Lu)
272  {
273  memcpy (&(a->lower_cp[cp * lower_mult]), &(unicode_data[cp].lower_cp),
274  sizeof (uint32) * MIN (unicode_data_lower_mult, lower_mult));
275 
276  memcpy (&(i_a->lower_cp[cp * unicode_data_lower_mult]), &(unicode_data[cp].lower_cp),
277  sizeof (uint32) * unicode_data_lower_mult);
278  }
279  else if (unicode_data[cp].gen_cat_id == CAT_Ll)
280  {
281  memcpy (&(a->upper_cp[cp * upper_mult]), &(unicode_data[cp].upper_cp),
282  sizeof (uint32) * MIN (unicode_data_upper_mult, upper_mult));
283 
284  memcpy (&(i_a->upper_cp[cp * unicode_data_upper_mult]), &(unicode_data[cp].upper_cp),
285  sizeof (uint32) * unicode_data_upper_mult);
286  }
287  }
288  }
289 
290  if (a_tailoring->count_rules > 0)
291  {
293  }
294 
295  if (is_verbose && a_tailoring->count_rules > 0)
296  {
297  printf ("Applying %d alphabet tailoring rules\n", a_tailoring->count_rules);
298  }
299  /* apply tailoring rules on user-alphabet only */
300  for (i = 0; i < a_tailoring->count_rules; i++)
301  {
302  TRANSFORM_RULE *tf_rule = &(a_tailoring->rules[i]);
303  uint32 cp_src;
305  int src_cp_count = 0;
306  int src_len = 0;
307  int dest_cp_count = 0;
308  int dest_len = 0;
309 
310  /* source codepoints */
311  /* TODO : allow casing compression (many CPs for source) */
312  src_len = intl_utf8_to_cp_list ((unsigned char *) (tf_rule->src), tf_rule->src_size, &cp_src, 1, &src_cp_count);
313 
314  if (src_len != 1 || src_len != src_cp_count)
315  {
316  LOG_LOCALE_ERROR ("Invalid source buffer for alphabet rule", ER_LOC_GEN, true);
317  er_status = ER_LOC_GEN;
318  goto error;
319  }
320 
321  if ((int) cp_src >= a_tailoring->sett_max_letters)
322  {
323  LOG_LOCALE_ERROR ("Codepoint for casing rule exceeds maximum" " allowed value", ER_LOC_GEN, true);
324  er_status = ER_LOC_GEN;
325  goto error;
326  }
327 
328  /* destination codepoints */
329  dest_len =
330  intl_utf8_to_cp_list ((unsigned char *) (tf_rule->dest), tf_rule->dest_size, cp_dest,
331  INTL_CASING_EXPANSION_MULTIPLIER, &dest_cp_count);
332 
333  if (dest_len < 1 || dest_len != dest_cp_count)
334  {
335  LOG_LOCALE_ERROR ("Invalid destination buffer for alphabet rule", ER_LOC_GEN, true);
336  er_status = ER_LOC_GEN;
337  goto error;
338  }
339 
340  if (tf_rule->type == TR_UPPER)
341  {
342  assert (dest_cp_count <= upper_mult);
343  memset (&(a->upper_cp[cp_src * upper_mult]), 0, upper_mult * sizeof (uint32));
344  memcpy (&(a->upper_cp[cp_src * upper_mult]), cp_dest, sizeof (uint32) * MIN (dest_cp_count, upper_mult));
345  }
346  else
347  {
348  assert (tf_rule->type == TR_LOWER);
349 
350  assert (dest_cp_count <= lower_mult);
351  memset (&(a->lower_cp[cp_src * lower_mult]), 0, lower_mult * sizeof (uint32));
352  memcpy (&(a->lower_cp[cp_src * lower_mult]), cp_dest, sizeof (uint32) * MIN (dest_cp_count, lower_mult));
353  }
354  }
355 
356  return er_status;
357 
358 error:
359 
360  return er_status;
361 }
362 
363 /*
364  * load_unicode_data() - Loads the UNICODEDATA file (standardised
365  * and available at Unicode.org).
366  * Returns: error code
367  * ld(in) : locale data
368  */
369 static int
371 {
372  FILE *fp = NULL;
373  char err_msg[ERR_MSG_SIZE];
374  int status = NO_ERROR;
375  char str[UNICODE_FILE_LINE_SIZE];
376  int line_count = 0;
377 
378  assert (ld != NULL);
379 
380  /* Build the full filepath to the selected (or default) Unicode data file */
381  if (ld->unicode_mode == 0)
382  {
383  /* using default Unicode file */
385  }
386  else
387  {
388  assert (ld->unicode_mode == 1);
389  }
390 
391  if (strcmp (ld->unicode_data_file, last_unicode_file) == 0)
392  {
393  assert (unicode_data != NULL);
394  return status;
395  }
396 
398 
399  unicode_data = (UNICODE_CHAR *) malloc (MAX_UNICODE_CHARS * sizeof (UNICODE_CHAR));
400  if (unicode_data == NULL)
401  {
402  LOG_LOCALE_ERROR ("memory allocation failed", ER_LOC_GEN, true);
403  status = ER_LOC_GEN;
404  goto error;
405  }
406 
407  memset (unicode_data, 0, MAX_UNICODE_CHARS * sizeof (UNICODE_CHAR));
408 
409  fp = fopen_ex (ld->unicode_data_file, "rt");
410  if (fp == NULL)
411  {
412  snprintf_dots_truncate (err_msg, sizeof (err_msg) - 1, "Cannot open file %s", ld->unicode_data_file);
413  LOG_LOCALE_ERROR (err_msg, ER_LOC_GEN, true);
414  status = ER_LOC_GEN;
415  goto error;
416  }
417 
418  while (fgets (str, sizeof (str), fp))
419  {
420  uint32 cp = 0;
421  int result = 0;
422  int i;
423  char *s, *end, *end_p;
424  UNICODE_CHAR *uc = NULL;
425 
426  line_count++;
427 
428  result = str_to_uint32 (&cp, &end_p, str, 16);
429  /* skip Unicode values above 0xFFFF */
430  if (result != 0 || cp >= MAX_UNICODE_CHARS)
431  {
432  continue;
433  }
434 
435  s = str;
436  uc = &(unicode_data[cp]);
437  uc->lower_cp[0] = cp;
438  uc->upper_cp[0] = cp;
439 
440  /* next field */
441  s = strchr (s, ';');
442 
443  assert (s != NULL);
444  if (s == NULL)
445  {
446  continue;
447  }
448  s++;
449 
450  for (i = 1; i < UNICODE_FILE_FIELDS; i++)
451  {
452  char str_p[UNICODE_FILE_LINE_SIZE];
453  char *save;
454  int cp_count;
455 
456  strcpy (str_p, s);
457 
458  end = strtok_r (str_p, ";", &save);
459 
460  /* check generic category */
462  {
463  int cat_idx;
464 
465  for (cat_idx = 0; cat_idx < (int) (sizeof (list_gen_cat) / sizeof (list_gen_cat[0])); cat_idx++)
466  {
467  if (strcmp (list_gen_cat[cat_idx].val, str_p) == 0)
468  {
469  uc->gen_cat_id = list_gen_cat[cat_idx].id;
470  break;
471  }
472  }
473  }
474  else if (i == UNICODE_FILE_UPPER_CASE_MAP && uc->gen_cat_id == CAT_Ll)
475  {
476  /* lower case codepoints */
477  cp_count = string_to_int_array (str_p, uc->upper_cp, INTL_CASING_EXPANSION_MULTIPLIER, " ");
478  if (cp_count > INTL_CASING_EXPANSION_MULTIPLIER)
479  {
480  snprintf_dots_truncate (err_msg, sizeof (err_msg) - 1,
481  "Invalid line %d" " of file %s contains more than 2 characters for "
482  "upper case definition", line_count, ld->unicode_data_file);
483  LOG_LOCALE_ERROR (err_msg, ER_LOC_GEN, true);
484  status = ER_LOC_GEN;
485  goto error;
486  }
487 
489  }
490  else if (i == UNICODE_FILE_LOWER_CASE_MAP && uc->gen_cat_id == CAT_Lu)
491  {
492  /* lower case codepoints */
493  cp_count = string_to_int_array (str_p, uc->lower_cp, INTL_CASING_EXPANSION_MULTIPLIER, " ");
494 
495  if (cp_count > INTL_CASING_EXPANSION_MULTIPLIER)
496  {
497  snprintf_dots_truncate (err_msg, sizeof (err_msg) - 1,
498  "Invalid line %d" " of file %s contains more than 2 characters for "
499  "lower case definition", line_count, ld->unicode_data_file);
500  LOG_LOCALE_ERROR (err_msg, ER_LOC_GEN, true);
501  status = ER_LOC_GEN;
502  goto error;
503  }
504 
506  }
508  {
509  uc->unicode_mapping_cp_count = 0; /* init */
510 
511  do
512  {
513  /* if no decomposition available, or decomposition is a compatibility one, discard the specified
514  * decomposition */
515  if (str_p[0] == ';' || str_p[0] == '<')
516  {
517  break;
518  }
519 
520  if (str_p != NULL)
521  {
524  }
525  break;
526  }
527  while (0);
528  }
529 
530  s = strchr (s, ';');
531  if (s == NULL)
532  {
533  break;
534  }
535 
536  s++;
537  }
538  }
539 
540  assert (fp != NULL);
541  fclose (fp);
542 
543  strncpy (last_unicode_file, ld->unicode_data_file, sizeof (last_unicode_file) - 1);
544  last_unicode_file[sizeof (last_unicode_file) - 1] = '\0';
545 
546  return status;
547 
548 error:
549 
550  if (fp != NULL)
551  {
552  fclose (fp);
553  }
554 
556 
557  return status;
558 }
559 
560 /*
561  * unicode_free_data() - Frees Unicode data structures.
 *
 * Releases the unicode_data table when one is allocated and resets the
 * pointer to NULL, then empties last_unicode_file so that the cached
 * file name no longer refers to the released table.
 *
562  * Returns: nothing
563  */
564 void
566 {
567  if (unicode_data != NULL)
568  {
569  free (unicode_data);
570  unicode_data = NULL;
571  }
572 
 /* forget which file was loaded last: store an empty string */
573  *last_unicode_file = '\0';
574 }
575 
576 /*
577  * create_alphabet () - allocated arrays for alphabet
578  * Returns: error code
579  * a(in/out) : alphabet
580  * max_letters(in) : number of letters in alphabet
581  * lower_multiplier(in) : lower case multlipier
582  * upper_multiplier(in) : upper case multlipier
583  */
584 static int
585 create_alphabet (ALPHABET_DATA * a, const int max_letters, const int lower_multiplier, const int upper_multiplier)
586 {
587  int er_status = NO_ERROR;
588 
589  assert (a != NULL);
590  assert (lower_multiplier > 0 && lower_multiplier <= INTL_CASING_EXPANSION_MULTIPLIER);
591  assert (upper_multiplier > 0 && upper_multiplier <= INTL_CASING_EXPANSION_MULTIPLIER);
592 
593  if (lower_multiplier > 1 && upper_multiplier > 1)
594  {
595  LOG_LOCALE_ERROR ("CUBRID does not support collations with both lower "
596  "and upper multipliers with values above 1.", ER_LOC_GEN, true);
597  return ER_LOC_GEN;
598  }
599 
600  memset (a, 0, sizeof (ALPHABET_DATA));
601 
602  if (max_letters <= 0 || max_letters > MAX_UNICODE_CHARS)
603  {
604  LOG_LOCALE_ERROR ("invalid number of letters", ER_LOC_GEN, true);
605  return ER_LOC_GEN;
606  }
607 
608  if (max_letters > 0)
609  {
610  a->lower_cp = (uint32 *) malloc (max_letters * lower_multiplier * sizeof (uint32));
611  a->upper_cp = (uint32 *) malloc (max_letters * upper_multiplier * sizeof (uint32));
612 
613  if (a->lower_cp == NULL || a->upper_cp == NULL)
614  {
615  LOG_LOCALE_ERROR ("memory allocation failed", ER_LOC_GEN, true);
616  er_status = ER_LOC_GEN;
617  goto er_exit;
618  }
619 
620  memset (a->lower_cp, 0, max_letters * lower_multiplier * sizeof (uint32));
621  memset (a->upper_cp, 0, max_letters * upper_multiplier * sizeof (uint32));
622  }
623 
624  a->l_count = max_letters;
625  a->lower_multiplier = lower_multiplier;
626  a->upper_multiplier = upper_multiplier;
627 
628  return er_status;
629 
630 er_exit:
631  if (a->lower_cp != NULL)
632  {
633  free (a->lower_cp);
634  a->lower_cp = NULL;
635  }
636 
637  if (a->upper_cp != NULL)
638  {
639  free (a->upper_cp);
640  a->upper_cp = NULL;
641  }
642 
643  return er_status;
644 }
645 
646 /*
647  * string_to_int_array() - builds a list of codepoints from a string
648  *
649  * Returns: count of codepoints found
650  * s(in): nul-terminated string
651  * cp_list(out): array of codepoints
652  * cp_list_size(in): maximum allowed size of codepoint list
653  * delims(in) : possible delimiters between values
654  *
655  * Note : the string containts unsigned integers in hexadecimal.
656  * The case of returned number of codepoints is greater than
657  * 'cp_list_size' should be handled as error.
658  *
659  */
660 int
661 string_to_int_array (char *s, uint32 * cp_list, const int cp_list_size, const char *delims)
662 {
663  int i = 0;
664  char *str;
665  char *str_end;
666  char *str_cursor;
667 
668  assert (cp_list != NULL);
669 
670  str = s;
671  str_end = s + strlen (s);
672 
673  while (str != NULL && str < str_end)
674  {
675  int result = 0;
676  uint32 val;
677 
678  result = str_to_uint32 (&val, &str_cursor, str, 16);
679  if (result != 0 || str_cursor <= str)
680  {
681  break;
682  }
683 
684  if (i < cp_list_size)
685  {
686  *cp_list++ = val;
687  }
688  i++;
689 
690  while (str_cursor < str_end && strchr (delims, *str_cursor) != NULL)
691  {
692  str_cursor++;
693  }
694  str = str_cursor;
695  }
696 
697  return i;
698 }
699 
700 /*
701  * unicode_process_normalization() - Process character decomposition mappings
702  * imported from the Unicode data file, and prepare
703  * the data structures required for converting strings
704  * to fully composed.
705  *
706  * Returns: error code
707  * ld(in/out) : locale data structure
708  * is_verbose(in): enable or disable verbose mode
709  */
710 int
712 {
713  int i, orig_mapping_count, curr_mapping, mapping_cursor;
714  UNICODE_CP_MAPPING *um;
715  UNICODE_CP_MAPPING *new_map;
716  UNICODE_CHAR *uc;
717  int mapping_start, mapping_count;
718  UNICODE_NORMALIZATION *norm;
719  uint32 cp, old_cp, j;
720  int err_status = NO_ERROR;
721 
722  int *unicode_decomp_map_count = NULL;
723  /* perm_unicode_mapping[cp] = the number of possible sorted permutations of the cp decomposition mapping */
724  UNICODE_CP_MAPPING *temp_list_unicode_decomp_maps = NULL;
725 
726  assert (ld != NULL);
727  norm = &(ld->unicode_normalization);
728 
729  err_status = load_unicode_data (ld);
730  if (err_status != NO_ERROR)
731  {
732  goto exit;
733  }
734 
735  unicode_decomp_map_count = (int *) malloc (MAX_UNICODE_CHARS * sizeof (int));
736  if (unicode_decomp_map_count == NULL)
737  {
738  LOG_LOCALE_ERROR ("memory allocation failed", ER_LOC_GEN, true);
739  err_status = ER_LOC_GEN;
740  goto exit;
741  }
742  memset (unicode_decomp_map_count, 0, MAX_UNICODE_CHARS * sizeof (int));
743 
744  /* Count the number of steps (buffers) necessary for the decomposition of each codepoint. */
745  for (cp = 0; cp < MAX_UNICODE_CHARS; cp++)
746  {
747  uc = &(unicode_data[cp]);
748 
749  if (uc->unicode_mapping_cp_count <= 1 || uc->unicode_mapping[0] > MAX_UNICODE_CHARS)
750  {
751  unicode_decomp_map_count[cp] = 0;
752  }
753  else
754  {
756  unicode_decomp_map_count[cp] = count_decomp_steps (cp);
757  }
758  if (is_verbose)
759  {
760  printf ("CP : %04X\t\tDeco CP count: %2d\t\tDeco steps: %2d\n", cp, uc->unicode_full_decomp_cp_count,
761  unicode_decomp_map_count[cp]);
762  }
763  norm->unicode_mappings_count += unicode_decomp_map_count[cp];
764  }
765 
766  if (is_verbose)
767  {
768  printf ("\nTotal number of composition maps (sum of deco steps) : %d\n", norm->unicode_mappings_count);
769  }
770 
771  /* Prepare the generation of all decomposition steps for all codepoints */
772  temp_list_unicode_decomp_maps =
773  (UNICODE_CP_MAPPING *) malloc (norm->unicode_mappings_count * sizeof (UNICODE_CP_MAPPING));
774  if (temp_list_unicode_decomp_maps == NULL)
775  {
776  LOG_LOCALE_ERROR ("memory allocation failed", ER_LOC_GEN, true);
777  err_status = ER_LOC_GEN;
778  goto exit;
779  }
780  memset (temp_list_unicode_decomp_maps, 0, norm->unicode_mappings_count * sizeof (UNICODE_CP_MAPPING));
781 
782  /* Copy mappings loaded from UnicodeData.txt */
783  cp = 0;
784  orig_mapping_count = 0;
785  while (cp < MAX_UNICODE_CHARS)
786  {
787  if (unicode_decomp_map_count[cp] > 0)
788  {
789  um = &(temp_list_unicode_decomp_maps[orig_mapping_count]);
790  um->cp = cp;
791  um->size = unicode_data[cp].unicode_mapping_cp_count;
792  um->map = (uint32 *) malloc (um->size * sizeof (uint32));
793  if (um->map == NULL)
794  {
795  LOG_LOCALE_ERROR ("memory allocation failed", ER_LOC_GEN, true);
796  err_status = ER_LOC_GEN;
797  goto exit;
798  }
799  memcpy (um->map, unicode_data[cp].unicode_mapping, um->size * sizeof (uint32));
800  orig_mapping_count++;
801  }
802  cp++;
803  }
804 
805  /* Decompose each mapping, top-down, until no mapping can be further decomposed. Total number of decomposition
806  * mappings(steps) was computed previously for each codepoint in unicode_decomp_map_count[cp] and their sum in
807  * unicode_decomp_map_total. These constants will be used for validation (as assert args). */
808  mapping_cursor = orig_mapping_count;
809  curr_mapping = 0;
810  while (curr_mapping < mapping_cursor)
811  {
812  if (mapping_cursor >= norm->unicode_mappings_count)
813  {
814  break;
815  }
816  um = &(temp_list_unicode_decomp_maps[curr_mapping]);
817  new_map = &(temp_list_unicode_decomp_maps[mapping_cursor]);
818 
819  if (um->size > 0 && um->map[0] < MAX_UNICODE_CHARS)
820  {
821  if (unicode_decomp_map_count[um->map[0]] > 0)
822  {
823  new_map->size = um->size - 1 + unicode_data[um->map[0]].unicode_mapping_cp_count;
824  new_map->cp = um->cp;
825  new_map->map = (uint32 *) malloc (new_map->size * sizeof (uint32));
826  if (new_map->map == NULL)
827  {
828  LOG_LOCALE_ERROR ("memory allocation failed", ER_LOC_GEN, true);
829  err_status = ER_LOC_GEN;
830  goto exit;
831  }
832 
833  for (i = 0; i < new_map->size; i++)
834  {
835  if (i < unicode_data[um->map[0]].unicode_mapping_cp_count)
836  {
837  new_map->map[i] = unicode_data[um->map[0]].unicode_mapping[i];
838  }
839  else
840  {
841  new_map->map[i] = um->map[1 + i - unicode_data[um->map[0]].unicode_mapping_cp_count];
842  }
843  }
844  mapping_cursor++;
845  if (is_verbose)
846  {
847  printf ("\nNew mapping step : %04X -> ", um->cp);
848  for (i = 0; i < new_map->size; i++)
849  {
850  printf ("%04X ", new_map->map[i]);
851  }
852  }
853  }
854  }
855  curr_mapping++;
856  }
857 
858  for (i = 0; i < norm->unicode_mappings_count; i++)
859  {
860  um = &(temp_list_unicode_decomp_maps[i]);
861  if (um->size > 0 && unicode_decomp_map_count[um->map[0]] == 0)
862  {
863  /* This means that for um->cp, the um->map can't be further decomposed, thus being the fully decomposed
864  * representation for um->cp. It will be marked as such. */
865  um->is_full_decomp = true;
866  }
867  }
868 
869  /* Sort/group the decompositions in list_unicode_decomp_maps by the value of the first codepoint in each mapping. The
870  * grouping is necessary for optimizing the future search for possible decompositions when putting a string in fully
871  * composed form. */
872  qsort (temp_list_unicode_decomp_maps, norm->unicode_mappings_count, sizeof (UNICODE_CP_MAPPING),
874 
875  /* Build starting indexes for each cp which is the first cp in a compact group of mappings */
876  norm->unicode_mapping_index = (int *) malloc ((MAX_UNICODE_CHARS + 1) * sizeof (int));
877  if (norm->unicode_mapping_index == NULL)
878  {
879  LOG_LOCALE_ERROR ("memory allocation failed", ER_LOC_GEN, true);
880  err_status = ER_LOC_GEN;
881  goto exit;
882  }
883  memset (norm->unicode_mapping_index, 0, (MAX_UNICODE_CHARS + 1) * sizeof (int));
884  cp = temp_list_unicode_decomp_maps[0].map[0];
885  mapping_start = 0;
886  mapping_count = 1;
887  for (i = 1; i < norm->unicode_mappings_count; i++)
888  {
889  if (temp_list_unicode_decomp_maps[i].map[0] == (uint32) cp)
890  {
891  mapping_count++;
892  }
893  else
894  {
895  SET_MAPPING_INDEX (norm->unicode_mapping_index[cp], true, mapping_start);
896  old_cp = cp;
897  cp = (uint32) temp_list_unicode_decomp_maps[i].map[0];
898  mapping_count = 1;
899  mapping_start = i;
900  for (j = old_cp + 1; j < cp; j++)
901  {
902  SET_MAPPING_INDEX (norm->unicode_mapping_index[j], false, mapping_start);
903  }
904  }
905  }
906  SET_MAPPING_INDEX (norm->unicode_mapping_index[cp], true, mapping_start);
907  SET_MAPPING_INDEX (norm->unicode_mapping_index[cp + 1], false, (mapping_start + mapping_count));
908 
909  /* Sort descending each range of UNICODE_MAPPINGs from list_unicode_decomp_maps, having the same codepoint value in
910  * UNICODE_MAPPING.map[0], using memcmp. The sorting is necessary for optimizing the future search for possible
911  * decompositions when putting a string in fully composed form. */
912  for (cp = 0; cp < MAX_UNICODE_CHARS; cp++)
913  {
914  int mapping_start = 0;
915  int mapping_count = 0;
916 
917  if (!CP_HAS_MAPPINGS (norm->unicode_mapping_index[cp]))
918  {
919  continue;
920  }
921  mapping_start = GET_MAPPING_OFFSET (norm->unicode_mapping_index[cp]);
922  mapping_count = GET_MAPPING_OFFSET (norm->unicode_mapping_index[cp + 1]) - mapping_start;
923 
924  qsort (temp_list_unicode_decomp_maps + mapping_start, mapping_count, sizeof (UNICODE_CP_MAPPING),
926  }
927 
928  err_status = unicode_make_normalization_data (temp_list_unicode_decomp_maps, ld);
929 
930 exit:
931  if (unicode_decomp_map_count != NULL)
932  {
933  free (unicode_decomp_map_count);
934  unicode_decomp_map_count = NULL;
935  }
936 
937  if (temp_list_unicode_decomp_maps != NULL)
938  {
939  for (i = 0; i < norm->unicode_mappings_count; i++)
940  {
941  um = &(temp_list_unicode_decomp_maps[i]);
942  if (um->map != NULL)
943  {
944  free (um->map);
945  /* um->map = NULL not necessary, list_unicode_decomp_maps is freed afterwards. */
946  }
947  }
948  free (temp_list_unicode_decomp_maps);
949  temp_list_unicode_decomp_maps = NULL;
950  }
951 
952  return err_status;
953 }
954 
955 /*
956  * count_full_decomp_cp() - Counts the number of codepoints to needed to store
957  * the full decomposition representation for a
958  * codepoint.
959  *
960  * Returns: codepoint count
961  * cp(in) : codepoint
962  *
963  * Note : this is a recursive function.
964  */
965 static int
967 {
968  UNICODE_CHAR *uc;
969 
970  uc = &(unicode_data[cp]);
971  if (cp >= MAX_UNICODE_CHARS)
972  {
973  return 1;
974  }
975 
976  uc = &(unicode_data[cp]);
977 
978  if (uc->unicode_mapping_cp_count == 0)
979  {
980  return 1;
981  }
982 
983  return uc->unicode_mapping_cp_count - 1 + count_full_decomp_cp ((int) uc->unicode_mapping[0]);
984 }
985 
986 /*
987  * count_decomp_steps() - Counts the number of steps for putting a codepoint
988  * into fully decomposed form, by replacing one
989  * decomposable codepoint at every step.
990  *
991  * Returns: step count
992  * cp(in) : codepoint
993  *
994  * Note : this is a recursive function.
995  */
996 static int
998 {
999  UNICODE_CHAR *uc;
1000 
1001  uc = &(unicode_data[cp]);
1002  if (uc->unicode_mapping_cp_count == 0)
1003  {
1004  return 0;
1005  }
1007  || (uc->unicode_mapping_cp_count > 1))
1008  {
1009  return 1 + count_decomp_steps (uc->unicode_mapping[0]);
1010  }
1011 
1012  return 0;
1013 }
1014 
1015 /*
1016  * unicode_make_normalization_data() - takes the data loaded from UnicodeData,
1017  * which was previously sorted, and puts it into optimized form
1018  * into the locale data structure, ready to be exported into
1019  * a shared library.
1020  *
1021  * Returns: ER_LOC_GEN if error
1022  * NO_ERROR otherwise
1023  * decomp_maps(in): variable holding the loaded and partially processed
1024  * unicode data
1025  * ld(in/out): locale data
1026  *
1027  */
1028 static int
1030 {
1031  int err_status = NO_ERROR;
1032  int i, j;
1033  UNICODE_CP_MAPPING *um_cp;
1034  UNICODE_MAPPING *um;
1035  unsigned char str_buf[INTL_UTF8_MAX_CHAR_SIZE * UNICODE_DECOMP_MAP_CP_COUNT];
1036  unsigned char *cur_pos;
1037  char cur_size, byte_count;
1038  UNICODE_NORMALIZATION *norm;
1039 
1040  assert (ld != NULL);
1041  assert (decomp_maps != NULL);
1042 
1043  norm = &(ld->unicode_normalization);
1044 
1045  /* Prepare the unicode_mappings array for storing the data from decomp_maps as utf8 buffers + length + original
1046  * codepoint. */
1047  norm->unicode_mappings = (UNICODE_MAPPING *) malloc (norm->unicode_mappings_count * sizeof (UNICODE_MAPPING));
1048  if (norm->unicode_mappings == NULL)
1049  {
1050  LOG_LOCALE_ERROR ("memory allocation failed", ER_LOC_GEN, true);
1051  err_status = ER_LOC_GEN;
1052  goto exit;
1053  }
1054  memset (norm->unicode_mappings, 0, norm->unicode_mappings_count * sizeof (UNICODE_MAPPING));
1055 
1056  /* Prepare the index list for fully decomposed mappings */
1057  norm->list_full_decomp = (int *) malloc (MAX_UNICODE_CHARS * sizeof (int));
1058  if (norm->list_full_decomp == NULL)
1059  {
1060  LOG_LOCALE_ERROR ("memory allocation failed", ER_LOC_GEN, true);
1061  err_status = ER_LOC_GEN;
1062  goto exit;
1063  }
1064  for (i = 0; i < MAX_UNICODE_CHARS; i++)
1065  {
1066  norm->list_full_decomp[i] = -1;
1067  }
1068 
1069  /* Start importing data from decomp_maps into unicode_mappings. */
1070  for (i = 0; i < norm->unicode_mappings_count; i++)
1071  {
1072  um_cp = &(decomp_maps[i]);
1073  um = &(norm->unicode_mappings[i]);
1074 
1075  um->cp = um_cp->cp;
1076 
1077  /* Empty temporary utf8 buffer */
1079 
1080  /* Convert the list of codepoints into a utf8 buffer */
1081  cur_pos = str_buf;
1082  cur_size = 0;
1083  byte_count = 0;
1084 
1085  for (j = 0; j < um_cp->size; j++)
1086  {
1087  byte_count = intl_cp_to_utf8 (um_cp->map[j], cur_pos);
1088  cur_size += byte_count;
1089  cur_pos += byte_count;
1090  }
1091 
1092  memset (um->buffer, 0, sizeof (um->buffer));
1093 
1094  /* Make the final utf8 buffer used for normalization */
1095  memcpy (um->buffer, str_buf, cur_size);
1096  um->size = cur_size;
1097 
1098  /* If um_cp is a fully decomposed representation for cp, mark it as such. */
1099  if (um_cp->is_full_decomp)
1100  {
1101  norm->list_full_decomp[um_cp->cp] = i;
1102  }
1103  }
1104 
1105 exit:
1106 
1107  return err_status;
1108 }
1109 
1110 #if !defined (SERVER_MODE)
1111 /*
1112  * unicode_string_need_compose() - Checks if a string needs composition
1113  * and returns the size required by fully
1114  * composed form.
1115  *
1116  * Returns:
1117  * str_in(in) : string to normalize
1118  * size_in(in) : size in bytes of string
1119  * size_out(out) : size in bytes of composed string
1120  * need_compose(out) : true if composition is required, false otherwise
1121  * norm(in) : the unicode data for normalization
1122  *
1123  * Note : this is light check, since full check requires more complex
1124  * processing - same as composing algorithm.
1125  * All input is assumed in UTF-8 character set
1126  */
1127 bool
1128 unicode_string_need_compose (const char *str_in, const int size_in, int *size_out, const UNICODE_NORMALIZATION * norm)
1129 {
1130  const char *pc;
1131  const char *p_end;
1132 
1133  assert (size_out != NULL);
1134 
1135  *size_out = 0;
1136 
1137  if (!prm_get_bool_value (PRM_ID_UNICODE_INPUT_NORMALIZATION) || norm == NULL || size_in == 0 || str_in == NULL)
1138  {
1139  return false;
1140  }
1141 
1142  assert (str_in != NULL);
1143 
1144  /* If all chars are in the range 0-127, then the string is ASCII and no unicode operations are neccessary e.g.
1145  * composition */
1146  /* Reuse match_found as validation flag. */
1147  p_end = str_in + size_in;
1148 
1149  for (pc = str_in; pc < p_end; pc++)
1150  {
1151  if ((unsigned char) (*pc) >= 0x80)
1152  {
1153  *size_out = size_in;
1154  return true;
1155  }
1156  }
1157 
1158  return false;
1159 }
1160 
1161 /*
1162  * unicode_compose_string() - Put a string into fully composed form.
1163  *
1164  * Returns:
1165  * str_in(in) : string to normalize
1166  * size_in(in) : size in bytes of string
1167  * str_out(out) : preallocated buffer to store composed string, output string
1168  * is not null terminated
1169  * size_out(out) : actual size in bytes of composed string
1170  * is_composed (out) : true if the string required composition
1171  * norm(in) : the unicode data for normalization
1172  */
1173 void
1174 unicode_compose_string (const char *str_in, const int size_in, char *str_out, int *size_out, bool * is_composed,
1175  const UNICODE_NORMALIZATION * norm)
1176 {
1177  char *composed_str;
1178  int composed_index, remaining_bytes;
1179  const char *str_next = NULL;
1180  unsigned int cp;
1181  int map_start, map_end, i, byte_count;
1182  bool match_found = false, composition_found;
1183  UNICODE_MAPPING *um;
1184  const char *str_cursor;
1185  const char *str_end;
1186 
1187  assert (prm_get_bool_value (PRM_ID_UNICODE_INPUT_NORMALIZATION) && norm != NULL && size_in > 0 && str_in != NULL);
1188 
1189  composed_index = 0;
1190 
1191  /* Build composed string */
1192  str_next = str_in;
1193  str_cursor = str_in;
1194  remaining_bytes = size_in;
1195  composition_found = false;
1196  composed_str = str_out;
1197  str_end = str_in + size_in;
1198 
1199  while (str_cursor < str_end)
1200  {
1201  int first_cp_size;
1202 
1203  cp = intl_utf8_to_cp ((unsigned char *) str_cursor, remaining_bytes, (unsigned char **) &str_next);
1204 
1205  first_cp_size = CAST_STRLEN (str_next - str_cursor);
1206  remaining_bytes -= first_cp_size;
1207 
1208  match_found = false;
1209 
1210  if (cp >= MAX_UNICODE_CHARS - 2 || !CP_HAS_MAPPINGS (norm->unicode_mapping_index[cp]))
1211  {
1212  goto match_not_found;
1213  }
1214 
1215  map_start = GET_MAPPING_OFFSET (norm->unicode_mapping_index[cp]);
1216  map_end = GET_MAPPING_OFFSET (norm->unicode_mapping_index[cp + 1]);
1217 
1218  /* Search the mapping list for a possible match */
1219  for (i = map_start; i < map_end; i++)
1220  {
1221  um = &(norm->unicode_mappings[i]);
1222  if (um->size > remaining_bytes + first_cp_size)
1223  {
1224  continue;
1225  }
1226 
1227  if (memcmp (um->buffer, str_cursor, um->size) == 0)
1228  {
1229  /* If a composition matches, apply it. */
1230  composed_index += intl_cp_to_utf8 (um->cp, (unsigned char *) (&(composed_str[composed_index])));
1231  str_cursor += um->size;
1232  match_found = true;
1233  composition_found = true;
1234  break;
1235  }
1236  }
1237 
1238  /* If no composition can be matched to start with the decoded codepoint, just copy the bytes corresponding to the
1239  * codepoint from the input string to the output, adjust pointers and loop again. */
1240  match_not_found:
1241  if (!match_found)
1242  {
1243  byte_count = CAST_STRLEN (str_next - str_cursor);
1244  memcpy (&(composed_str[composed_index]), str_cursor, byte_count);
1245  composed_index += byte_count;
1246  str_cursor += byte_count;
1247  }
1248  } /* while */
1249 
1250  /* Set output variables */
1251  *size_out = composed_index;
1252  if (composition_found)
1253  {
1254  *is_composed = true;
1255  }
1256 
1257  return;
1258 }
1259 
1260 /*
1261  * unicode_string_need_decompose() - Checks if a string needs
1262  * decomposition and returns the size
1263  * required by decomposed form.
1264  *
1265  * Returns: true if decomposition is required
1266  * str_in(in) : string to normalize
1267  * size_in(in) : size of string in bytes
1268  * decomp_size(out) : size required by decomposed form in bytes
1269  * norm(in) : the unicode context in which the normalization is performed
1270  *
1271  * Note : Input string is assumed UTF-8 character set.
1272  */
1273 bool
1274 unicode_string_need_decompose (const char *str_in, const int size_in, int *decomp_size,
1275  const UNICODE_NORMALIZATION * norm)
1276 {
1277  int bytes_read, decomp_index, decomposed_size = 0;
1278  unsigned int cp;
1279  const char *src_cursor;
1280  const char *src_end;
1281  const char *next;
1282  bool can_decompose;
1283 
1285  {
1286  goto no_decompose_cnt;
1287  }
1288 
1289  assert (str_in != NULL);
1290 
1291  /* check if ASCII */
1292  can_decompose = false;
1293  src_end = str_in + size_in;
1294  for (src_cursor = str_in; src_cursor < src_end; src_cursor++)
1295  {
1296  if ((unsigned char) (*src_cursor) >= 0x80)
1297  {
1298  can_decompose = true;
1299  break;
1300  }
1301  }
1302  if (!can_decompose)
1303  {
1304  goto no_decompose_cnt;
1305  }
1306 
1307  /* Read each codepoint and add its expanded size to the overall size */
1308  src_cursor = str_in;
1309  next = str_in;
1310  can_decompose = false;
1311  src_end = str_in + size_in;
1312  while (src_cursor < src_end)
1313  {
1314  cp = intl_utf8_to_cp ((unsigned char *) src_cursor, CAST_STRLEN (src_end - src_cursor), (unsigned char **) &next);
1315  bytes_read = CAST_STRLEN (next - src_cursor);
1316 
1317  decomp_index = (cp < MAX_UNICODE_CHARS) ? norm->list_full_decomp[cp] : -1;
1318  if (decomp_index > -1)
1319  {
1320  decomposed_size += norm->unicode_mappings[decomp_index].size;
1321  can_decompose = true;
1322  }
1323  else
1324  {
1325  decomposed_size += bytes_read;
1326  }
1327 
1328  src_cursor = next;
1329  }
1330 
1331  /* If no decomposition is needed, return the same size as the input string and exit. */
1332  if (!can_decompose)
1333  {
1334  goto no_decompose_cnt;
1335  }
1336 
1337  *decomp_size = decomposed_size;
1338 
1339  return true;
1340 
1341 no_decompose_cnt:
1342  *decomp_size = size_in;
1343 
1344  return false;
1345 }
1346 
1347 /*
1348  * unicode_decompose_string() - Put a string into fully decomposed form.
1349  *
1350  * Returns: ER_OUT_OF_VIRTUAL_MEMORY if internal memory allocation fails
1351  * NO_ERROR if successfull
1352  * str_in(in) : string to normalize
1353  * size_in(in) : size in bytes of string
1354  * str_out(out): preallocated buffer for string in decomposed form
1355  * size_out(out): actual size of decomposed form in bytes
1356  * norm(in) : the unicode context in which the normalization is performed
1357  */
1358 void
1359 unicode_decompose_string (const char *str_in, const int size_in, char *str_out, int *size_out,
1360  const UNICODE_NORMALIZATION * norm)
1361 {
1362  int bytes_read, decomp_index;
1363  unsigned int cp;
1364  const char *src_cursor;
1365  const char *src_end;
1366  const char *next;
1367  char *dest_cursor;
1368 
1370 
1371  assert (str_in != NULL);
1372  assert (str_out != NULL);
1373  assert (size_out != NULL);
1374 
1375  src_cursor = str_in;
1376  dest_cursor = str_out;
1377  next = str_in;
1378  src_end = str_in + size_in;
1379  while (src_cursor < src_end)
1380  {
1381  cp = intl_utf8_to_cp ((unsigned char *) src_cursor, CAST_STRLEN (src_end - src_cursor), (unsigned char **) &next);
1382  bytes_read = CAST_STRLEN (next - src_cursor);
1383  decomp_index = (cp < MAX_UNICODE_CHARS) ? norm->list_full_decomp[cp] : -1;
1384  if (decomp_index > -1)
1385  {
1386  memcpy (dest_cursor, norm->unicode_mappings[decomp_index].buffer, norm->unicode_mappings[decomp_index].size);
1387  dest_cursor += norm->unicode_mappings[decomp_index].size;
1388  }
1389  else
1390  {
1391  memcpy (dest_cursor, src_cursor, bytes_read);
1392  dest_cursor += bytes_read;
1393  }
1394  src_cursor = next;
1395  }
1396 
1397  *size_out = CAST_STRLEN (dest_cursor - str_out);
1398 }
1399 #endif /* SERVER_MODE */
1400 /*
1401  * comp_func_unicode_cp_mapping() - compare function for sorting a group of
1402  * unicode decompositions starting with the
1403  * same codepoint
1404  *
1405  * Returns: compare result
1406  * arg1(in) :
1407  * arg2(in) :
1408  */
1409 static int
1410 comp_func_unicode_cp_mapping (const void *arg1, const void *arg2)
1411 {
1412  UNICODE_CP_MAPPING *um1, *um2;
1413  int min_size, result;
1414 
1415  um1 = (UNICODE_CP_MAPPING *) arg1;
1416  um2 = (UNICODE_CP_MAPPING *) arg2;
1417 
1418  min_size = (um1->size < um2->size) ? um1->size : um2->size;
1419  result = memcmp (um1->map, um2->map, min_size * sizeof (uint32));
1420  /* Result will be reverted to obtain reverse ordering */
1421  if (result == 0)
1422  {
1423  if (um1->size > min_size)
1424  {
1425  return -1;
1426  }
1427  if (um2->size > min_size)
1428  {
1429  return 1;
1430  }
1431  if (um1->cp < um2->cp)
1432  {
1433  return -1;
1434  }
1435  return 1;
1436  }
1437 
1438  return -result;
1439 }
1440 
1441 /*
1442  * comp_func_grouping_unicode_cp_mapping() - compare function for sorting
1443  * all decompositions
1444  *
1445  * Returns: compare result
1446  * arg1(in) :
1447  * arg2(in) :
1448  */
1449 static int
1450 comp_func_grouping_unicode_cp_mapping (const void *arg1, const void *arg2)
1451 {
1452  UNICODE_CP_MAPPING *um1, *um2;
1453  int result;
1454 
1455  um1 = (UNICODE_CP_MAPPING *) arg1;
1456  um2 = (UNICODE_CP_MAPPING *) arg2;
1457 
1458  if (um1->map[0] > um2->map[0])
1459  {
1460  result = 1;
1461  }
1462  else
1463  {
1464  result = -1;
1465  }
1466 
1467  return result;
1468 }
static int comp_func_grouping_unicode_cp_mapping(const void *arg1, const void *arg2)
#define UNICODE_FILE_CHAR_DECOMPOSITION_MAPPING
#define UNICODE_FILE_LOWER_CASE_MAP
#define NO_ERROR
Definition: error_code.h:46
bool unicode_string_need_compose(const char *str_in, const int size_in, int *size_out, const UNICODE_NORMALIZATION *norm)
#define ER_LOC_GEN
Definition: error_code.h:1371
unsigned char buffer[NORMALIZATION_MAX_BUF_SIZE]
#define UNICODE_FILE_UPPER_CASE_MAP
GENERAL_CATEGORY list_gen_cat[]
ALPHABET_DATA identif_alphabet
static int create_alphabet(ALPHABET_DATA *a, const int max_letters, const int lower_multiplier, const int upper_multiplier)
int str_to_uint32(unsigned int *ret_p, char **end_p, const char *str_p, int base)
Definition: porting.c:2382
char unicode_mapping_cp_count
static int unicode_make_normalization_data(UNICODE_CP_MAPPING *decomp_maps, LOCALE_DATA *ld)
void unicode_compose_string(const char *str_in, const int size_in, char *str_out, int *size_out, bool *is_composed, const UNICODE_NORMALIZATION *norm)
ALPHABET_TYPE a_type
#define CAST_STRLEN
Definition: porting.h:470
int intl_cp_to_utf8(const unsigned int codepoint, unsigned char *utf8_seq)
uint32 lower_cp[INTL_CASING_EXPANSION_MULTIPLIER]
static UNICODE_CHAR * unicode_data
ALPHABET_DATA alphabet
static char last_unicode_file[PATH_MAX]
const char * val
#define MAX_UNICODE_CHARS
static int load_unicode_data(const LOCALE_DATA *ld)
uint32 unicode_mapping[UNICODE_DECOMP_MAP_CP_COUNT]
UNICODE_MAPPING * unicode_mappings
static int comp_func_unicode_cp_mapping(const void *arg1, const void *arg2)
void unicode_decompose_string(const char *str_in, const int size_in, char *str_out, int *size_out, const UNICODE_NORMALIZATION *norm)
char * envvar_localedatadir_file(char *path, size_t size, const char *filename)
#define assert(x)
#define UNICODEDATA_FILE
char unicode_data_file[PATH_MAX]
#define UNICODE_FILE_LINE_SIZE
char unicode_full_decomp_cp_count
void unicode_free_data(void)
GENERAL_CATEG_ID id
static int count_decomp_steps(int cp)
#define NULL
Definition: freelistheap.h:34
unsigned int intl_utf8_to_cp(const unsigned char *utf8, const int size, unsigned char **next_char)
unsigned int * lower_cp
if(extra_options)
Definition: dynamic_load.c:958
static int unicode_data_upper_mult
char unicode_data_file[PATH_MAX]
int unicode_process_alphabet(LOCALE_DATA *ld, bool is_verbose)
GENERAL_CATEG_ID
TRANSFORM_TYPE type
FILE * fopen_ex(const char *filename, const char *type)
Definition: util_common.c:322
#define INTL_UTF8_MAX_CHAR_SIZE
int unicode_process_normalization(LOCALE_DATA *ld, bool is_verbose)
int string_to_int_array(char *s, uint32 *cp_list, const int cp_list_size, const char *delims)
static void error(const char *msg)
Definition: gencat.c:331
int intl_utf8_to_cp_list(const unsigned char *utf8, const int size, unsigned int *cp_array, const int max_array_size, int *array_count)
UNICODE_NORMALIZATION unicode_normalization
#define snprintf_dots_truncate(dest, max_len,...)
Definition: porting.h:323
uint32 upper_cp[INTL_CASING_EXPANSION_MULTIPLIER]
static void str_out(const char *fmt,...)
#define strlen(s1)
Definition: intl_support.c:43
#define CP_HAS_MAPPINGS(val)
#define SET_MAPPING_INDEX(val, is_used, offset)
TRANSFORM_RULE * rules
#define UNICODE_FILE_GENERAL_CAT_POS
bool prm_get_bool_value(PARAM_ID prm_id)
unsigned int uint32
bool unicode_string_need_decompose(const char *str_in, const int size_in, int *decomp_size, const UNICODE_NORMALIZATION *norm)
int i
Definition: dynamic_load.c:954
#define LOG_LOCALE_ERROR(msg, er_status, do_print)
#define GET_MAPPING_OFFSET(val)
GENERAL_CATEG_ID gen_cat_id
#define UNICODE_DECOMP_MAP_CP_COUNT
#define INTL_CASING_EXPANSION_MULTIPLIER
#define UNICODE_FILE_FIELDS
ALPHABET_TAILORING alpha_tailoring
static int count_full_decomp_cp(int cp)
#define ERR_MSG_SIZE
static int unicode_data_lower_mult
unsigned int * upper_cp