CUBRID Engine  latest
intl_support.c
Go to the documentation of this file.
1 /*
2  * Copyright 2008 Search Solution Corporation
3  * Copyright 2016 CUBRID Corporation
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  */
18 
19 /*
20  * intl_support.c : platform independent internationalization functions.
21  */
22 
23 #ident "$Id$"
24 
25 #include "config.h"
26 
27 #include <assert.h>
28 #include <stdio.h>
29 #include <string.h>
30 #include <errno.h>
31 #include <locale.h>
32 #include <ctype.h>
33 #include <wctype.h>
34 
35 #include "error_manager.h"
36 #include "intl_support.h"
37 #include "language_support.h"
38 #include "chartype.h"
39 #include "system_parameter.h"
40 #include "charset_converters.h"
41 
/*
 * When SUPPRESS_STRLEN_WARNING is defined, every strlen() call in this file
 * is wrapped in an (int) cast, so its result is an int rather than a size_t.
 */
#if defined (SUPPRESS_STRLEN_WARNING)
#define strlen(s1) ((int) strlen(s1))
#endif /* defined (SUPPRESS_STRLEN_WARNING) */

/*
 * IS_8BIT(c) - shifts c right by 7 bits: the result is zero for values
 * 0..127 and non-zero when bit 7 is set. The EUC helpers in this file apply
 * it to unsigned char bytes to tell ASCII bytes from multi-byte ones.
 */
#define IS_8BIT(c) ((c) >> 7)
/* Special values for EUC encodings */
/* SS3 (143, i.e. 0x8f): intl_nextchar_euc() treats a byte equal to SS3 as
 * the first byte of a 3-byte character. Only defined here if the build has
 * not already provided a value. */
#ifndef SS3
#define SS3 143
#endif

/* Locale name strings; the Korean locale name differs when AIX is defined. */
#define LOCALE_C "C"
#if defined(AIX)
#define LOCALE_KOREAN "ko_KR.IBM-eucKR"
#else
#define LOCALE_KOREAN "korean"
#endif
58 
#if defined (ENABLE_UNUSED_FUNCTION)
/* EUC-KR characters may be used with the ISO-8859-1 charset when
 * PRM_SINGLE_BYTE_COMPARE is 'no'.
 * An EUC-KR character has either three bytes (when the first byte is SS3) or
 * two bytes; this macro checks whether a byte lies in the 0xa1..0xfe range.
 * The argument is parenthesized so that the (unsigned char) cast applies to
 * the whole argument expression, not just to its first operand. */
#define IS_PSEUDO_KOREAN(ch) \
  ( ((unsigned char) (ch) >= (unsigned char) 0xa1) \
    && ((unsigned char) (ch) <= (unsigned char) 0xfe) )
#endif
68 
/*
 * Case conversion by a fixed offset ('a' - 'A'). Neither macro checks that
 * its argument is a letter of the expected case; callers in this file test
 * the byte first (char_isupper_iso8859 / char_islower_iso8859).
 */
#define CHAR_BYTE_TO_LOWER(c) ((c) + ('a' - 'A'))

#define CHAR_BYTE_TO_UPPER(c) ((c) - ('a' - 'A'))

/* conversion from turkish ISO 8859-9 to UTF-8 */
/* NOTE(review): presumably the lowest and highest Unicode code points that
 * need a dedicated mapping for ISO 8859-9; their use is outside this view. */
#define ISO_8859_9_FIRST_CP 0x11e
#define ISO_8859_9_LAST_CP 0x15f
76 
79 
80 /* conversion from Latin 1 ISO 8859-1 to UTF-8: */
82 
83 
/* identifiers : support for multibyte chars in INTL_CODESET_ISO88591 codeset
 * (default legacy codeset) */
/* When this flag is false, the intl_mbs_*() routines below bypass the
 * multibyte decoding and call the byte-oriented C library functions
 * (strchr, strlen, strspn, strcasecmp, ...) instead. */
bool intl_Mbs_support = true;

/* General EUC string manipulations */
/* File-local helpers; their definitions appear later in this file. */
static int intl_tolower_euc (const unsigned char *src, unsigned char *d, int byte_size);
static int intl_toupper_euc (const unsigned char *src, unsigned char *d, int byte_size);
static int intl_count_euc_chars (const unsigned char *s, int length_in_bytes);
static int intl_count_euc_bytes (const unsigned char *s, int length_in_chars);
#if defined (ENABLE_UNUSED_FUNCTION)
static wchar_t *intl_copy_lowercase (const wchar_t * ws, size_t n);
static int intl_is_korean (unsigned char ch);
#endif /* ENABLE_UNUSED_FUNCTION */

/* UTF-8 string manipulations */
/* File-local UTF-8 helpers and the init hooks of the built-in ISO 8859-9 /
 * ISO 8859-1 text converters. */
static int intl_tolower_utf8 (const ALPHABET_DATA * a, const unsigned char *s, unsigned char *d, int length_in_chars,
                              int *d_size);
static int intl_toupper_utf8 (const ALPHABET_DATA * a, const unsigned char *s, unsigned char *d, int length_in_chars,
                              int *d_size);
static int intl_count_utf8_bytes (const unsigned char *s, int length_in_chars);
static int intl_char_tolower_utf8 (const ALPHABET_DATA * a, const unsigned char *s, const int size, unsigned char *d,
                                   unsigned char **next);
static int intl_char_toupper_utf8 (const ALPHABET_DATA * a, const unsigned char *s, const int size, unsigned char *d,
                                   unsigned char **next);
static int intl_strcasecmp_utf8_one_cp (const ALPHABET_DATA * alphabet, unsigned char *str1, unsigned char *str2,
                                        const int size_str1, const int size_str2, unsigned int cp1, unsigned int cp2,
                                        int *skip_size1, int *skip_size2);
static void intl_init_conv_iso8859_9_to_utf8 (void);
static void intl_init_conv_iso8859_1_to_utf8 (void);
114 
115 
117  TEXT_CONV_ISO_88599_BUILTIN, /* type */
118  (char *) "28599", /* Windows Code page */
119  (char *) "iso88599", /* Linux charset identifiers */
120  {0}, /* byte flags : not used for ISO */
121  0, 0, NULL, /* UTF-8 to console : filled by init function */
122  0, 0, NULL, /* console to UTF-8 : filled by init function */
123  intl_text_utf8_to_single_byte, /* UTF-8 to console conversion function */
124  intl_text_single_byte_to_utf8, /* console to UTF-8 conversion function */
125  intl_init_conv_iso8859_9_to_utf8, /* init function */
126 };
127 
129  TEXT_CONV_ISO_88591_BUILTIN, /* type */
130  (char *) "28591", /* Windows Code page */
131  (char *) "iso88591", /* Linux charset identifiers */
132  {0}, /* byte flags : not used for ISO */
133  0, 0, NULL, /* UTF-8 to console : filled by init function */
134  0, 0, NULL, /* console to UTF-8 : filled by init function */
135  intl_text_utf8_to_single_byte, /* UTF-8 to console conversion function */
136  intl_text_single_byte_to_utf8, /* console to UTF-8 conversion function */
137  intl_init_conv_iso8859_1_to_utf8, /* init function */
138 };
139 
140 
141 /*
142  * intl_mbs_chr() - find first occurrence of the given character
143  * return: a pointer to the first occurrence of the given character in
144  * the given multibyte string, or NULL if no occurrence is found
145  * mbs(in)
146  * wc(in)
147  */
148 char *
149 intl_mbs_chr (const char *mbs, wchar_t wc)
150 {
151  int nbytes;
152  wchar_t cur_wc;
153 
154  assert (mbs != NULL);
155 
156  if (!intl_Mbs_support)
157  {
158  return (char *) (strchr (mbs, (int) wc));
159  }
160 
161  for (nbytes = 0; (nbytes = mbtowc (&cur_wc, mbs, MB_LEN_MAX)) > 0 && cur_wc != L'\0' && cur_wc != wc; mbs += nbytes)
162  {
163  continue;
164  }
165 
166  if (!*mbs && wc)
167  {
168  return NULL;
169  }
170 
171  return (char *) mbs;
172 }
173 
174 /*
175  * intl_mbs_len() - computes the number of multibyte character sequences in the multibyte
176  * character string, not including the terminating zero byte
177  * return: number of characters if success.
178  * On error, 0 is returned and errno is set.
179  * EINVAL : mbs contains an invalid byte sequence.
180  * mbs(in)
181  */
182 int
183 intl_mbs_len (const char *mbs)
184 {
185  int num_of_chars;
186  int clen;
187 
188  assert (mbs != NULL);
189 
190  if (!intl_Mbs_support)
191  {
192  return strlen (mbs);
193  }
194 
195  for (num_of_chars = 0; (clen = mblen (mbs, MB_LEN_MAX)) > 0 && *mbs; mbs += clen, num_of_chars++)
196  {
197  continue;
198  }
199 
200  if (clen < 0)
201  {
202  errno = EINVAL;
203  num_of_chars = 0;
204  }
205 
206  return num_of_chars;
207 }
208 
209 /*
210  * intl_mbs_nth() - finds the nth multibyte character in the multibyte string
211  * return: a pointer to the nth character in n.
212  * NULL if either an error occurs or there are not n characters
213  * in the string
214  * mbs(in)
215  * n(in)
216  */
217 
218 const char *
219 intl_mbs_nth (const char *mbs, size_t n)
220 {
221  size_t num_of_chars;
222  int clen;
223 
224  assert (mbs != NULL);
225  if (mbs == NULL)
226  {
227  return NULL;
228  }
229 
230  if (!intl_Mbs_support)
231  {
232  if (strlen (mbs) < (int) n)
233  {
234  errno = EINVAL;
235  return NULL;
236  }
237  return &mbs[n];
238  }
239 
240  for (num_of_chars = 0, clen = 0; num_of_chars < n && (clen = mblen (mbs, MB_LEN_MAX)) > 0 && *mbs;
241  mbs += clen, num_of_chars++)
242  {
243  continue;
244  }
245 
246  if (clen < 0)
247  {
248  errno = EINVAL;
249  mbs = NULL;
250  }
251  else if (num_of_chars < n)
252  {
253  mbs = NULL;
254  }
255 
256  return mbs;
257 }
258 
259 /*
260  * intl_mbs_spn() - return the size of the prefix of the given multibyte string
261  * consisting of the given wide characters.
262  * return: size in bytes.
263  * If mbs contains an invalid byte sequence,
264  * errno is set and 0 is returned.
265  * mbs(in)
266  * chars(in)
267  */
268 int
269 intl_mbs_spn (const char *mbs, const wchar_t * chars)
270 {
271  int clen;
272  wchar_t wc;
273  int size;
274 
275  assert (mbs != NULL && chars != NULL);
276 
277  if (!intl_Mbs_support)
278  {
279  return (int) strspn (mbs, (const char *) chars);
280  }
281 
282  for (size = 0; (clen = mbtowc (&wc, mbs, MB_LEN_MAX)) > 0 && *mbs && wcschr (chars, wc); mbs += clen, size += clen)
283  {
284  continue;
285  }
286 
287  if (clen < 0)
288  {
289  errno = EINVAL;
290  size = 0;
291  }
292 
293  return size;
294 }
295 
296 #if defined (ENABLE_UNUSED_FUNCTION)
/*
 * intl_mbs_namecmp() - case-insensitive comparison of two multi-byte
 *                      identifier strings, ignoring enclosing brackets
 *   return: 0 if the identifiers are the "same",
 *           positive number if mbs1 is greater than mbs2,
 *           negative number otherwise.
 *   mbs1(in)
 *   mbs2(in)
 *
 * Note: "same" means that a leading '[' is skipped and two bytes are
 *       subtracted from the length for the bracket pair,
 *       so mbs1 = "[value]" and mbs2 = "value" returns 0
 */
int
intl_mbs_namecmp (const char *mbs1, const char *mbs2)
{
  const char *name1 = mbs1;
  const char *name2 = mbs2;
  int len1, len2;

  assert (mbs1 != NULL && mbs2 != NULL);

  len1 = strlen (name1);
  len2 = strlen (name2);

  if (*name1 == '[')
    {
      name1++;
      len1 -= 2;
    }

  if (*name2 == '[')
    {
      name2++;
      len2 -= 2;
    }

  /* equal lengths: compare just that many characters; otherwise the
   * full comparison yields the (non-matching) ordering */
  return (len1 == len2) ? intl_mbs_ncasecmp (name1, name2, len1) : intl_mbs_casecmp (name1, name2);
}
341 #endif
342 
343 /*
344  * intl_mbs_casecmp() - compares successive multi-byte character elements
345  * from two multi-byte strings
346  * return: 0 if all the multi-byte character elements are the same,
347  * positive number if mbs1 is greater than mbs2,
348  * negative number otherwise.
349  * mbs1(in)
350  * mbs2(in)
351  *
352  * Note: This function does not use the collating sequences specified
353  * in the LC_COLLATE category of the current locale.
354  * This function set errno if mbs1 or mbs2 contain one or more
355  * invalid multi-byte characters.
356  */
357 int
358 intl_mbs_casecmp (const char *mbs1, const char *mbs2)
359 {
360  wchar_t wc1, wc2;
361  int mb1_len, mb2_len;
362 
363  assert (mbs1 != NULL && mbs2 != NULL);
364 
365  if (!intl_Mbs_support)
366  {
367 #if defined(WINDOWS)
368  return _stricmp (mbs1, mbs2);
369 #else
370  return strcasecmp (mbs1, mbs2);
371 #endif
372  }
373 
374  for (mb1_len = mbtowc (&wc1, mbs1, MB_LEN_MAX), mb2_len = mbtowc (&wc2, mbs2, MB_LEN_MAX);
375  mb1_len > 0 && mb2_len > 0 && wc1 && wc2 && !(towlower (wc1) - towlower (wc2));)
376  {
377  mbs1 += mb1_len;
378  mbs2 += mb2_len;
379 
380  mb1_len = mbtowc (&wc1, mbs1, MB_LEN_MAX);
381  mb2_len = mbtowc (&wc2, mbs2, MB_LEN_MAX);
382  }
383 
384  if (mb1_len < 0 || mb2_len < 0)
385  {
386  errno = EINVAL;
387  }
388 
389  return (int) (towlower (wc1) - towlower (wc2));
390 }
391 
392 #if defined (ENABLE_UNUSED_FUNCTION)
393 int
394 intl_mbs_cmp (const char *mbs1, const char *mbs2)
395 {
396  wchar_t wc1, wc2;
397  int mb1_len, mb2_len;
398 
399  assert (mbs1 != NULL && mbs2 != NULL);
400 
401  if (!intl_Mbs_support)
402  {
403  return strcmp (mbs1, mbs2);
404  }
405 
406  for (mb1_len = mbtowc (&wc1, mbs1, MB_LEN_MAX), mb2_len = mbtowc (&wc2, mbs2, MB_LEN_MAX);
407  mb1_len > 0 && mb2_len > 0 && wc1 && wc2 && !(wc1 - wc2);)
408  {
409  mbs1 += mb1_len;
410  mbs2 += mb2_len;
411 
412  mb1_len = mbtowc (&wc1, mbs1, MB_LEN_MAX);
413  mb2_len = mbtowc (&wc2, mbs2, MB_LEN_MAX);
414  }
415 
416  if (mb1_len < 0 || mb2_len < 0)
417  {
418  errno = EINVAL;
419  }
420 
421  return (int) (wc1 - wc2);
422 }
423 #endif
424 
425 /*
426  * intl_mbs_ncasecmp() - compares the first n successive multi-byte character elements
427  * from two multi-byte strings
428  * return: 0 if the first n multi-byte character elements are the same,
429  * positive number if mbs1 is greater than mbs2,
430  * negative number otherwise.
431  * mbs1(in)
432  * mbs2(in)
433  * n (in)
434  *
435  * Note: This function does not use the collating sequences specified
436  * in the LC_COLLATE category of the current locale.
437  * This function set errno if mbs1 or mbs2 contain one or more
438  * invalid multi-byte characters.
439  */
440 int
441 intl_mbs_ncasecmp (const char *mbs1, const char *mbs2, size_t n)
442 {
443  wchar_t wc1, wc2;
444  int mb1_len, mb2_len;
445  size_t num_of_chars;
446 
447  assert (mbs1 != NULL && mbs2 != NULL);
448 
449  if (!intl_Mbs_support)
450  {
451 #if defined(WINDOWS)
452  return _strnicmp (mbs1, mbs2, n);
453 #else
454  return strncasecmp (mbs1, mbs2, n);
455 #endif
456  }
457 
458  for (num_of_chars = 1, mb1_len = mbtowc (&wc1, mbs1, MB_LEN_MAX), mb2_len = mbtowc (&wc2, mbs2, MB_LEN_MAX);
459  mb1_len > 0 && mb2_len > 0 && wc1 && wc2 && num_of_chars < n && !(towlower (wc1) - towlower (wc2));
460  num_of_chars++)
461  {
462  mbs1 += mb1_len;
463  mbs2 += mb2_len;
464 
465  mb1_len = mbtowc (&wc1, mbs1, MB_LEN_MAX);
466  mb2_len = mbtowc (&wc2, mbs2, MB_LEN_MAX);
467  }
468 
469  if (mb1_len < 0 || mb2_len < 0)
470  {
471  errno = EINVAL;
472  }
473 
474  return (int) (towlower (wc1) - towlower (wc2));
475 }
476 
477 /*
478  * intl_mbs_ncpy() - Copy characters from mbs2 to mbs1 at most (n-1) bytes
479  * return: mbs1, null-terminated string.
480  * mbs1(out)
481  * mbs2(in)
482  * n(in): size of destination buffer, including null-terminator
483  *
484  * Note: If mbs2 contains an invalid multi-byte character, errno is set and the
485  * function returns NULL. In this case, the contents of mbs1 are undefined.
486  */
487 
488 char *
489 intl_mbs_ncpy (char *mbs1, const char *mbs2, size_t n)
490 {
491  size_t num_of_bytes;
492  int clen, i;
493  char *dest;
494 
495  assert (mbs1 != NULL && mbs2 != NULL);
496 
497  if (!intl_Mbs_support)
498  {
499  size_t src_len = strlen (mbs2);
500 
501  strncpy (mbs1, mbs2, n - 1);
502  if (src_len < n)
503  {
504  mbs1[src_len] = '\0';
505  }
506  else
507  {
508  mbs1[n - 1] = '\0';
509  }
510 
511  return mbs1;
512  }
513 
514  for (num_of_bytes = 0, clen = mblen (mbs2, MB_LEN_MAX), dest = mbs1; clen > 0 && (num_of_bytes + clen) <= n - 1;
515  clen = mblen (mbs2, MB_LEN_MAX))
516  {
517  /* copy the next multi-byte char */
518  for (i = 0; i < clen; i++)
519  {
520  *dest++ = *mbs2++;
521  }
522 
523  /* advance the byte counter */
524  num_of_bytes += clen;
525  }
526 
527  if (clen < 0)
528  {
529  errno = EINVAL;
530  mbs1 = NULL;
531  }
532  else
533  {
534  *dest = '\0';
535  }
536 
537  return mbs1;
538 }
539 
540 #if defined (ENABLE_UNUSED_FUNCTION)
541 /*
542  * intl_mbs_lower() - convert given characters to lowercase characters
543  * return: always 0
544  * mbs1(in)
545  * mbs2(out)
546  */
547 int
548 intl_mbs_lower (const char *mbs1, char *mbs2)
549 {
550  int char_count = 0;
551  int length_in_bytes = 0;
552 
553  if (!intl_Mbs_support)
554  {
555  char *s;
556  s = strcpy (mbs2, mbs1);
557  while (*s)
558  {
559  *s = char_tolower (*s);
560  s++;
561  }
562  return 0;
563  }
564 
565  if (mbs1)
566  {
567  length_in_bytes = strlen (mbs1);
568  }
569 
570  if (length_in_bytes)
571  {
572  intl_char_count ((unsigned char *) mbs1, length_in_bytes, lang_charset (), &char_count);
573  intl_lower_string ((unsigned char *) mbs1, (unsigned char *) mbs2, char_count, lang_charset ());
574  mbs2[length_in_bytes] = '\0';
575  }
576  else
577  {
578  mbs2[0] = '\0';
579  }
580 
581  return 0;
582 }
583 
584 /*
585  * intl_mbs_nlower() - convert given characters to lowercase characters
586  * return: always 0
587  * dest(out) : destination buffer
588  * src(in) : source buffer
589  * max_len(in) : maximum buffer length
590  */
591 
592 int
593 intl_mbs_nlower (char *dest, const char *src, const int max_len)
594 {
595  int char_count = 0;
596  int length_in_bytes = 0;
597 
598  if (src == NULL)
599  {
600  dest[0] = '\0';
601  return 0;
602  }
603 
604  if (!intl_Mbs_support)
605  {
606  int i = 0;
607  for (i = 0; (src[i] != '\0') && (i < max_len - 1); ++i)
608  {
609  dest[i] = char_tolower (src[i]);
610  }
611  dest[i] = '\0';
612  return 0;
613  }
614 
615  length_in_bytes = strlen (src);
616 
617  if (length_in_bytes >= max_len)
618  {
619  /* include null */
620  length_in_bytes = max_len - 1;
621  }
622 
623  if (length_in_bytes > 0)
624  {
625  intl_char_count ((unsigned char *) src, length_in_bytes, lang_charset (), &char_count);
626  intl_lower_string ((unsigned char *) src, (unsigned char *) dest, char_count, lang_charset ());
627  dest[length_in_bytes] = '\0';
628  }
629  else
630  {
631  dest[0] = '\0';
632  }
633 
634  return 0;
635 }
636 
637 /*
638  * intl_mbs_upper() - convert given characters to uppercase characters
639  * return: always 0
640  * mbs1(in)
641  * mbs2(out)
642  */
643 int
644 intl_mbs_upper (const char *mbs1, char *mbs2)
645 {
646  int char_count = 0;
647  int length_in_bytes = 0;
648 
649  if (!intl_Mbs_support)
650  {
651  char *s;
652 
653  for (s = strcpy (mbs2, mbs1); *s; s++)
654  {
655  *s = char_toupper (*s);
656  }
657  return 0;
658  }
659 
660  if (mbs1)
661  {
662  length_in_bytes = strlen (mbs1);
663  }
664 
665  if (length_in_bytes)
666  {
667  intl_char_count ((unsigned char *) mbs1, length_in_bytes, lang_charset (), &char_count);
668  intl_upper_string ((unsigned char *) mbs1, (unsigned char *) mbs2, char_count, lang_charset ());
669  mbs2[length_in_bytes] = '\0';
670  }
671  else
672  {
673  mbs2[0] = '\0';
674  }
675  return 0;
676 }
677 
/*
 * intl_copy_lowercase() - converts the given wide character string to
 *                         a newly allocated lowercase wide character string
 *   return: new wide character string, or NULL if memory could not be
 *           allocated (or n is too large to size the buffer).
 *           At most n wide characters will be converted and the new wide
 *           character string is null terminated.
 *   ws(in): source wide string; need not be null-terminated when it holds
 *           at least n characters
 *   n (in): maximum number of wide characters to convert
 *
 * Note: The buffer comes from malloc(). The original note says it must be
 *       released with wcs_delete() - NOTE(review): confirm that routine
 *       wraps free().
 */
static wchar_t *
intl_copy_lowercase (const wchar_t * ws, size_t n)
{
  size_t i;
  wchar_t *lower_ws;

  /* reject sizes for which (n + 1) * sizeof (wchar_t) would wrap around */
  if (n >= ((size_t) -1) / sizeof (wchar_t))
    {
      return NULL;
    }

  lower_ws = (wchar_t *) malloc (sizeof (wchar_t) * (n + 1));
  if (lower_ws == NULL)
    {
      return NULL;
    }

  /* test the bound first so ws[n] is never read */
  for (i = 0; i < n && ws[i] != L'\0'; i++)
    {
      lower_ws[i] = towlower (ws[i]);
    }
  lower_ws[i] = L'\0';

  return lower_ws;
}
707 #endif /* ENABLE_UNUSED_FUNCTION */
708 
709 /*
710  * ISO 8859-1 encoding functions
711  */
712 
713 /*
714  * intl_tolower_iso8859() - replaces all upper case ISO88591 characters
715  * with their lower case codes.
716  * return: character counts
717  * s(in/out): string to lowercase
718  * length(in): length of the string
719  */
720 int
721 intl_tolower_iso8859 (unsigned char *s, int length)
722 {
723  int char_count = length;
724  unsigned char *end;
725 
726  assert (s != NULL);
727 
728  for (end = s + length; s < end; s++)
729  {
730  if (char_isupper_iso8859 (*s))
731  {
732  *s = CHAR_BYTE_TO_LOWER (*s);
733  }
734  }
735 
736  return char_count;
737 }
738 
739 /*
740  * intl_toupper_iso8859() - replaces all lower case ISO88591 characters
741  * with their upper case codes.
742  * return: character counts
743  * s(in/out): string to uppercase
744  * length(in): length of the string
745  */
746 int
747 intl_toupper_iso8859 (unsigned char *s, int length)
748 {
749  int char_count = length;
750  unsigned char *end;
751 
752  assert (s != NULL);
753 
754  for (end = s + length; s < end; s++)
755  {
756  if (char_islower_iso8859 (*s))
757  {
758  *s = CHAR_BYTE_TO_UPPER (*s);
759  }
760  }
761 
762  return char_count;
763 }
764 
765 /*
766  * general routines for EUC encoding
767  */
768 
769 /*
770  * intl_nextchar_euc() - returns a pointer to the next character in the EUC encoded
771  * string.
772  * return: pointer to the next EUC character in the string.
773  * s(in): string
774  * curr_char_length(out): length of the character at s
775  */
776 const unsigned char *
777 intl_nextchar_euc (const unsigned char *s, int *curr_char_length)
778 {
779  assert (s != NULL);
780 
781  if (!IS_8BIT (*s)) /* Detected ASCII character */
782  {
783  *curr_char_length = 1;
784  }
785  else if (*s == SS3) /* Detected Code Set 3 character */
786  {
787  *curr_char_length = 3;
788  }
789  else /* Detected 2 byte character (CS1 or CS2) */
790  {
791  *curr_char_length = 2;
792  }
793 
794  return (s + (*curr_char_length));
795 }
796 
797 /*
798  * intl_prevchar_euc() - returns a pointer to the previous character in the EUC
799  * encoded string.
800  * return: pointer to the previous EUC character in the string s.
801  * s(in): string
802  * s_start(in) : start of buffer string
803  * prev_char_length(out): length of the previous character
804  */
805 const unsigned char *
806 intl_prevchar_euc (const unsigned char *s, const unsigned char *s_start, int *prev_char_length)
807 {
808  assert (s != NULL);
809  assert (s > s_start);
810 
811  if (s - 3 >= s_start && *(s - 3) == SS3)
812  {
813  *prev_char_length = 3;
814  return s - 3;
815  }
816  else if (s - 2 >= s_start && IS_8BIT (*(s - 2)))
817  {
818  *prev_char_length = 2;
819  return s - 2;
820  }
821 
822  *prev_char_length = 1;
823  return --s;
824 }
825 
826 /*
827  * intl_tolower_euc() - Replaces all upper case ASCII characters inside an EUC
828  * encoded string with their lower case codes.
829  * return: character counts
830  * src(in): EUC string to lowercase
831  * byte_size(in): size in bytes of source string
832  */
833 static int
834 intl_tolower_euc (const unsigned char *src, unsigned char *d, int byte_size)
835 {
836  int byte_count;
837  const unsigned char *s = src;
838 
839  assert (src != NULL);
840 
841  for (byte_count = 0; byte_count < byte_size; byte_count++)
842  {
843  *d = char_tolower (*s);
844  s++;
845  d++;
846  }
847 
848  return intl_count_euc_chars (src, byte_size);
849 }
850 
851 /*
852  * intl_toupper_euc() - Replaces all upper case ASCII characters inside an EUC
853  * encoded string with their upper case codes.
854  * return: character counts
855  * src(in): EUC string to uppercase
856  * byte_size(in): size in bytes of source string
857  */
858 static int
859 intl_toupper_euc (const unsigned char *src, unsigned char *d, int byte_size)
860 {
861  int byte_count;
862  const unsigned char *s = src;
863 
864  assert (src != NULL);
865 
866  for (byte_count = 0; byte_count < byte_size; byte_count++)
867  {
868  *d = char_toupper (*s);
869  s++;
870  d++;
871  }
872 
873  return intl_count_euc_chars (src, byte_size);;
874 }
875 
876 /*
877  * intl_count_euc_chars() - Counts the number of EUC encoded characters in the
878  * string. Embedded NULL characters are counted.
879  * return: none
880  * s(in): string
881  * length_in_bytes(in): length of the string
882  * char_count(out): number of EUC encoded characters found
883  *
884  * Note: Only whole characters are counted.
885  * if s[length_in_bytes-1] is not the last byte of a multi-byte
886  * character or a single byte character, then that character is not
887  * counted.
888  */
889 static int
890 intl_count_euc_chars (const unsigned char *s, int length_in_bytes)
891 {
892  const unsigned char *end;
893  int dummy;
894  int char_count;
895 
896  assert (s != NULL);
897 
898  for (end = s + length_in_bytes, char_count = 0; s < end;)
899  {
900  s = intl_nextchar_euc (s, &dummy);
901  if (s <= end)
902  {
903  char_count++;
904  }
905  }
906 
907  return char_count;
908 }
909 
910 /*
911  * intl_count_euc_bytes() - Counts the number of bytes it takes to encode the
912  * next <length_in_chars> EUC characters in the string
913  * return: byte counts
914  * s(in): EUC encoded string
915  * lenth_in_chars(in): length of the string in characters
916  * byte_count(out): number of bytes used for encode
917  */
918 static int
919 intl_count_euc_bytes (const unsigned char *s, int length_in_chars)
920 {
921  int char_count;
922  int char_width;
923  int byte_count;
924 
925  assert (s != NULL);
926 
927  for (char_count = 0, byte_count = 0; char_count < length_in_chars; char_count++)
928  {
929  s = intl_nextchar_euc (s, &char_width);
930  byte_count += char_width;
931  }
932 
933  return byte_count;
934 }
935 
936 /*
937  * string handling functions
938  */
939 
/*
 * intl_convert_charset() - converts a character string from one codeset to another
 *   return: error code; the code shown here sets ER_QSTR_BAD_SRC_CODESET
 *           for every source codeset
 *   src(in): string to convert (not read by the code shown)
 *   length_in_chars(in): number of characters from src to convert (not read)
 *   src_codeset(IN): enumeration of src codeset
 *   dest(out): string of converted characters (not written)
 *   dest_codeset(in): enumeration of dest codeset (not read)
 *   unconverted(out): number of chars that could not be converted (not written)
 *
 * Note: Currently, codeset conversion is not supported
 */
int
intl_convert_charset (const unsigned char *src, int length_in_chars, INTL_CODESET src_codeset, unsigned char *dest,
                      INTL_CODESET dest_codeset, int *unconverted)
{
  int error_code = NO_ERROR;

  switch (src_codeset)
    {
      /* NOTE(review): the listing this was taken from skips lines around
       * these labels; further case labels may exist here - confirm against
       * the repository. As shown, every codeset reaches the same error. */
    case INTL_CODESET_UTF8:
    default:
      error_code = ER_QSTR_BAD_SRC_CODESET;
      break;
    }

  return (error_code);
}
971 
/*
 * intl_char_count() - Counts the number of characters in the string
 *   return: number of characters found (same value as *char_count)
 *   src(in): string of characters to count
 *   length_in_bytes(in): length of the string
 *   src_codeset(in): enumeration of src codeset
 *   char_count(out): number of characters found; 0 for an unknown codeset
 *
 * Note: Embedded NULL characters are counted.
 */
int
intl_char_count (const unsigned char *src, int length_in_bytes, INTL_CODESET src_codeset, int *char_count)
{
  switch (src_codeset)
    {
      /* NOTE(review): no case label precedes the next statement in this
       * listing (line numbers jump here); presumably the labels of the
       * single-byte codesets were dropped - confirm against the repository. */
      *char_count = length_in_bytes;
      break;

      /* NOTE(review): label missing here as well; presumably the EUC
       * codeset - confirm. */
      *char_count = intl_count_euc_chars (src, length_in_bytes);
      break;

    case INTL_CODESET_UTF8:
      *char_count = intl_count_utf8_chars (src, length_in_bytes);
      break;

    default:
      /* unknown codeset: flagged in debug builds, reported as 0 characters */
      assert (false);
      *char_count = 0;
      break;
    }

  return *char_count;
}
1008 
/*
 * intl_char_size() - returns the number of bytes in a string given the
 *                    start and character length of the string
 *   return: number of bytes (same value as *byte_count)
 *   src(in): start of the string
 *   length_in_chars(in): length of the string in characters
 *   src_codeset(in): enumeration of src codeset
 *   byte_count(out): number of bytes used to encode the number of
 *                    characters specified; 0 for an unknown codeset
 *
 * Note: Embedded NULL's are counted as characters.
 */
int
intl_char_size (const unsigned char *src, int length_in_chars, INTL_CODESET src_codeset, int *byte_count)
{
  switch (src_codeset)
    {
    case INTL_CODESET_ISO88591:
      /* one byte per character */
      *byte_count = length_in_chars;
      break;

      /* NOTE(review): no case label precedes the next statement in this
       * listing (line numbers jump here); presumably the EUC codeset label
       * was dropped - confirm against the repository. */
      *byte_count = intl_count_euc_bytes (src, length_in_chars);
      break;

    case INTL_CODESET_UTF8:
      *byte_count = intl_count_utf8_bytes (src, length_in_chars);
      break;

    default:
      /* unknown codeset: flagged in debug builds, reported as 0 bytes */
      assert (false);
      *byte_count = 0;
      break;
    }

  return *byte_count;
}
1047 
1048 #if defined (ENABLE_UNUSED_FUNCTION)
/*
 * intl_char_size_pseudo_kor() - returns the number of bytes in a string given
 *                               the start and character length of the string
 *
 *   return: number of bytes (same value as *byte_count)
 *   src(in): start of the string
 *   length_in_chars(in): length of the string in characters
 *   src_codeset(in): enumeration of src codeset
 *   byte_count(out): number of bytes used to encode the number of
 *                    characters specified; 0 for an unknown codeset
 *
 * Note: Embedded NULL's are counted as characters.
 *       This is similar to 'intl_char_size' except with INTL_CODESET_ISO88591
 *       codeset, some bytes are considered korean characters
 *       This function is used in context of some specific string functions.
 */
int
intl_char_size_pseudo_kor (const unsigned char *src, int length_in_chars, INTL_CODESET src_codeset, int *byte_count)
{
  switch (src_codeset)
    {
    case INTL_CODESET_ISO88591:
      if (!prm_get_bool_value (PRM_ID_SINGLE_BYTE_COMPARE))
        {
          /* walk the string: SS3 starts a 3-byte character, a byte in the
           * pseudo-korean range a 2-byte one, anything else is 1 byte */
          int b_count = 0;
          while (length_in_chars-- > 0)
            {
              if (*src == SS3)
                {
                  b_count += 3;
                  src += 3;
                }
              else if (IS_PSEUDO_KOREAN (*src))
                {
                  b_count += 2;
                  src += 2;
                }
              else
                {
                  b_count++;
                  src++;
                }
            }
          *byte_count = b_count;
        }
      else
        {
          /* plain single-byte comparison: one byte per character */
          *byte_count = length_in_chars;
        }
      break;

      /* NOTE(review): no case label precedes the next statement in this
       * listing (line numbers jump here); presumably the EUC codeset label
       * was dropped - confirm against the repository. */
      *byte_count = intl_count_euc_bytes (src, length_in_chars);
      break;

    case INTL_CODESET_UTF8:
      *byte_count = intl_count_utf8_bytes (src, length_in_chars);
      break;

    default:
      assert (false);
      *byte_count = 0;
      break;
    }

  return *byte_count;
}
1116 #endif
1117 
/*
 * intl_prev_char() - returns pointer to the previous char in string
 *
 *   return : pointer to previous character
 *   s(in) : string; must be greater than s_start
 *   s_start(in) : start of buffer string
 *   codeset(in) : enumeration of src codeset
 *   prev_char_size(out) : size of previous character
 */
const unsigned char *
intl_prev_char (const unsigned char *s, const unsigned char *s_start, INTL_CODESET codeset, int *prev_char_size)
{
  assert (s > s_start);

  switch (codeset)
    {
      /* NOTE(review): no case label precedes the next statement in this
       * listing (line numbers jump here); presumably the EUC codeset label
       * was dropped - confirm against the repository. */
      return intl_prevchar_euc (s, s_start, prev_char_size);

    case INTL_CODESET_UTF8:
      return intl_prevchar_utf8 (s, s_start, prev_char_size);

    case INTL_CODESET_ISO88591:
      /* single-byte codeset: handled by the common code after the switch */
      break;
    default:
      assert (false);
    }

  /* single-byte step back (also the fallback for an unknown codeset) */
  *prev_char_size = 1;
  return --s;
}
1150 
1151 #if defined (ENABLE_UNUSED_FUNCTION)
/*
 * intl_prev_char_pseudo_kor() - returns pointer to the previous char in
 *                               string
 *
 *   return : pointer to previous character
 *   s(in) : string; must be greater than s_start
 *   s_start(in) : start of buffer string
 *   codeset(in) : enumeration of src codeset
 *   prev_char_size(out) : size of previous character
 *
 * Note: This is similar to 'intl_prev_char' except with INTL_CODESET_ISO88591
 *       codeset, some bytes are considered korean characters
 *       This function is used in context of some specific string functions.
 *
 * NOTE(review): the return type is a non-const pointer although every
 *       returned value is derived from the const parameter s.
 */
unsigned char *
intl_prev_char_pseudo_kor (const unsigned char *s, const unsigned char *s_start, INTL_CODESET codeset,
                           int *prev_char_size)
{
  assert (s > s_start);

  switch (codeset)
    {
    case INTL_CODESET_ISO88591:
      if (!prm_get_bool_value (PRM_ID_SINGLE_BYTE_COMPARE) && IS_PSEUDO_KOREAN (*(s - 1)))
        {
          /* NOTE(review): the byte at s - 2 is tested for SS3 and only
           * s - 2 is bounds-checked, yet s - 3 is returned - looks like an
           * offset mismatch; confirm the intended layout. */
          if (s - 2 >= s_start && *(s - 2) == SS3)
            {
              *prev_char_size = 3;
              return s - 3;
            }
          /* NOTE(review): this repeats the outer IS_PSEUDO_KOREAN test on
           * s - 1 and returns s - 2 after bounds-checking only s - 1. */
          else if (s - 1 >= s_start && IS_PSEUDO_KOREAN (*(s - 1)))
            {
              *prev_char_size = 2;
              return s - 2;
            }
        }

      break;

      /* NOTE(review): no case label precedes the next statement in this
       * listing (line numbers jump here); presumably the EUC codeset label
       * was dropped - confirm against the repository. */
      return intl_prevchar_euc (s, s_start, prev_char_size);

    case INTL_CODESET_UTF8:
      return intl_prevchar_utf8 (s, s_start, prev_char_size);

    default:
      assert (false);
    }

  /* single-byte step back */
  *prev_char_size = 1;
  return --s;
}
1204 #endif
1205 
/*
 * intl_next_char () - returns pointer to the next char in string
 *
 *   return: Pointer to the next character in the string
 *           (s itself for an unknown codeset).
 *   s(in) : string
 *   codeset(in) : enumeration of the codeset of s
 *   current_char_size(out) : length in bytes of the character at s
 *                            (0 for an unknown codeset)
 */
const unsigned char *
intl_next_char (const unsigned char *s, INTL_CODESET codeset, int *current_char_size)
{
  switch (codeset)
    {
    case INTL_CODESET_ISO88591:
      /* one byte per character */
      *current_char_size = 1;
      return ++s;

      /* NOTE(review): no case label precedes the next statement in this
       * listing (line numbers jump here); presumably the EUC codeset label
       * was dropped - confirm against the repository. */
      return intl_nextchar_euc (s, current_char_size);

    case INTL_CODESET_UTF8:
      return intl_nextchar_utf8 (s, current_char_size);

    default:
      /* unknown codeset: flagged in debug builds, position left unchanged */
      assert (false);
      *current_char_size = 0;
      return s;
    }
}
1239 
1240 #if defined (ENABLE_UNUSED_FUNCTION)
1241 /*
1242  * intl_next_char_pseudo_kor () - returns pointer to the next char in string
1243  *
1244  * return: Pointer to the next character in the string.
1245  * s(in) : string
1246  * codeset(in) : enumeration of the codeset of s
1247  * current_char_size(out) : length of the character at s
1248  *
1249  * Note: This is similar to 'intl_next_char' except with INTL_CODESET_ISO88591
1250  * codeset, some bytes are considered korean characters
1251  * This function should be used only in context of string functions
1252  * where korean characters are expected to be handled.
1253  */
1254 unsigned char *
1255 intl_next_char_pseudo_kor (const unsigned char *s, INTL_CODESET codeset, int *current_char_size)
1256 {
1257  switch (codeset)
1258  {
1259  case INTL_CODESET_ISO88591:
1260  if (!prm_get_bool_value (PRM_ID_SINGLE_BYTE_COMPARE) && IS_PSEUDO_KOREAN (*s))
1261  {
1262  if (*s == SS3)
1263  {
1264  *current_char_size = 3;
1265  return s + 3;
1266  }
1267  else if (IS_PSEUDO_KOREAN (*s))
1268  {
1269  *current_char_size = 2;
1270  return s + 2;
1271  }
1272  }
1273 
1274  *current_char_size = 1;
1275  return ++s;
1276 
1278  return intl_nextchar_euc (s, current_char_size);
1279 
1280  case INTL_CODESET_UTF8:
1281  return intl_nextchar_utf8 (s, current_char_size);
1282 
1283  default:
1284  assert (false);
1285  *current_char_size = 0;
1286  return s;
1287  }
1288 }
1289 #endif
1290 
1291 /*
1292  * intl_cmp_char() - compares the first character of two strings
1293  * return: zero if character are equal, non-zero otherwise
1294  * s1(in):
1295  * s2(in):
1296  * codeset:
1297  * char_size(in): size of char in bytes of the first character in s1
1298  *
1299  * Note: it is assumed that both strings contain at least one character of
1300  * the given codeset.
1301  *
1302  */
1303 int
1304 intl_cmp_char (const unsigned char *s1, const unsigned char *s2, INTL_CODESET codeset, int *char_size)
1305 {
1306 
1307  switch (codeset)
1308  {
1309  case INTL_CODESET_ISO88591:
1311  *char_size = 1;
1312  return *s1 - *s2;
1313 
1315  (void) intl_nextchar_euc (s1, char_size);
1316  return memcmp (s1, s2, *char_size);
1317 
1318  case INTL_CODESET_UTF8:
1319  *char_size = intl_Len_utf8_char[*s1];
1320  return memcmp (s1, s2, *char_size);
1321 
1322  default:
1323  assert (false);
1324  *char_size = 1;
1325  return 0;
1326  }
1327 
1328  return 0;
1329 }
1330 
1331 #if defined (ENABLE_UNUSED_FUNCTION)
1332 /*
1333  * intl_cmp_char_pseudo_kor() - compares the first character of two strings
1334  * return: zero if character are equal, non-zero otherwise
1335  * s1(in):
1336  * s2(in):
1337  * codeset:
1338  * char_size(out): size of char in bytes of the first character in s1
1339  *
1340  * Note: same as intl_cmp_char, except that with ISO-8859-1 codeset, some
1341  * bytes are handled as Korean characters.
1342  *
1343  */
1344 int
1345 intl_cmp_char_pseudo_kor (const unsigned char *s1, const unsigned char *s2, INTL_CODESET codeset, int *char_size)
1346 {
1347  switch (codeset)
1348  {
1349  case INTL_CODESET_ISO88591:
1350  if (!prm_get_bool_value (PRM_ID_SINGLE_BYTE_COMPARE) && IS_PSEUDO_KOREAN (*s1))
1351  {
1352  if (*s1 == SS3)
1353  {
1354  *char_size = 3;
1355  return memcmp (s1, s2, 3);
1356  }
1357  else if (IS_PSEUDO_KOREAN (*s1))
1358  {
1359  *char_size = 2;
1360  return memcmp (s1, s2, 2);
1361  }
1362  }
1363  *char_size = 1;
1364  return *s1 - *s2;
1365 
1367  (void) intl_nextchar_euc ((unsigned char *) s1, char_size);
1368  return memcmp (s1, s2, *char_size);
1369 
1370  case INTL_CODESET_UTF8:
1371  *char_size = intl_Len_utf8_char[*s1];
1372  return memcmp (s1, s2, *char_size);
1373 
1374  default:
1375  assert (false);
1376  *char_size = 1;
1377  return 0;
1378  }
1379 
1380  return 0;
1381 }
1382 
1383 /*
1384  * intl_kor_cmp() - compares first characters of two strings
1385  * return: required size
1386  * s1(in):
1387  * s2(in):
1388  * size(in): max size in bytes to compare
1389  *
1390  * Note: this function is used only in context of 'replace' string function
1391  * strncmp function should be used.
1392  */
1393 int
1394 intl_kor_cmp (unsigned char *s1, unsigned char *s2, int size)
1395 {
1396  int r;
1397  while (size > 0)
1398  {
1399  if (!prm_get_bool_value (PRM_ID_SINGLE_BYTE_COMPARE) && IS_PSEUDO_KOREAN (*s1) && IS_PSEUDO_KOREAN (*s2))
1400  {
1401  r = memcmp (s1, s2, 2);
1402  if (r == 0)
1403  {
1404  s1 += 2;
1405  s2 += 2;
1406  size -= 2;
1407  }
1408  else
1409  {
1410  return r;
1411  }
1412  }
1413  else if ((prm_get_bool_value (PRM_ID_SINGLE_BYTE_COMPARE) || !IS_PSEUDO_KOREAN (*s1)) && *s1 == *s2)
1414  {
1415  s1++;
1416  s2++;
1417  size--;
1418  }
1419  else
1420  {
1421  return (*s1 - *s2);
1422  }
1423  }
1424  return 0;
1425 }
1426 #endif
1427 
1428 /*
1429  * intl_pad_char() - returns the pad character of requested codeset
1430  * return: none
1431  * codeset(in): International codeset.
1432  * pad_char(in/out): Pointer to array which will be filled with
1433  * the pad character.
1434  * pad_size(out): Size of pad character.
1435  *
1436  * Note:
1437  * There is a pad character associated with every character code
1438  * set. This function will retrieve the pad character for a given
1439  * code set. The pad character is written into an array that must
1440  * allocated by the caller.
1441  *
1442  */
1443 void
1444 intl_pad_char (const INTL_CODESET codeset, unsigned char *pad_char, int *pad_size)
1445 {
1446  switch (codeset)
1447  {
1448  case INTL_CODESET_RAW_BITS:
1450  pad_char[0] = '\0';
1451  *pad_size = 1;
1452  break;
1453 
1455  pad_char[0] = pad_char[1] = '\241';
1456  *pad_size = 2;
1457  break;
1458 
1459  case INTL_CODESET_ASCII:
1460  case INTL_CODESET_ISO88591:
1461  case INTL_CODESET_UTF8:
1462  pad_char[0] = ' ';
1463  *pad_size = 1;
1464  break;
1465 
1466  default:
1467  assert (false);
1468  break;
1469  }
1470 }
1471 
1472 /*
1473  * intl_pad_size() - Returns the byte size of the pad character for the given
1474  * codeset.
1475  * return: size of pading char
1476  * codeset(in): International codeset.
1477  *
1478  * Note:
1479  * There is a pad character associated with every character code
1480  * set. This function will retrieve the pad character for a given
1481  * code set. The pad character is written into an array that must
1482  * allocated by the caller.
1483  *
1484  */
1485 int
1487 {
1488  int size;
1489 
1490  switch (codeset)
1491  {
1493  size = 2;
1494  break;
1495  case INTL_CODESET_ISO88591:
1496  case INTL_CODESET_UTF8:
1498  default:
1499  size = 1;
1500  break;
1501  }
1502 
1503  return size;
1504 }
1505 
1506 /*
1507  * intl_upper_string_size() - determine the size required for holding
1508  * upper case of the input string
1509  * return: required size
1510  * alphabet(in): alphabet data
1511  * src(in): string to uppercase
1512  * src_size(in): buffer size
1513  * src_length(in): length of the string measured in characters
1514  */
1515 int
1516 intl_upper_string_size (const ALPHABET_DATA * alphabet, const unsigned char *src, int src_size, int src_length)
1517 {
1518  int char_count;
1519  int req_size = src_size;
1520 
1521  assert (alphabet != NULL);
1522 
1523  switch (alphabet->codeset)
1524  {
1525  case INTL_CODESET_ISO88591:
1527  break;
1528 
1530  break;
1531 
1532  case INTL_CODESET_UTF8:
1533  {
1534  unsigned char upper[INTL_UTF8_MAX_CHAR_SIZE];
1535  unsigned char *next = NULL;
1536 
1537  req_size = 0;
1538  for (char_count = 0; char_count < src_length && src_size > 0; char_count++)
1539  {
1540  req_size += intl_char_toupper_utf8 (alphabet, src, src_size, upper, &next);
1541  src_size -= CAST_STRLEN (next - src);
1542  src = next;
1543  }
1544  }
1545  break;
1546 
1547  default:
1548  assert (false);
1549  break;
1550  }
1551 
1552  return req_size;
1553 }
1554 
1555 /*
1556  * intl_upper_string() - replace all lower case characters with their
1557  * upper case characters
1558  * return: character counts
1559  * alphabet(in): alphabet data
1560  * src(in/out): string source to uppercase
1561  * dst(in/out): output string
1562  * length_in_chars(in): length of the string measured in characters
1563  */
1564 int
1565 intl_upper_string (const ALPHABET_DATA * alphabet, const unsigned char *src, unsigned char *dst, int length_in_chars)
1566 {
1567  int char_count = 0;
1568 
1569  assert (alphabet != NULL);
1570 
1571  switch (alphabet->codeset)
1572  {
1574  memcpy (dst, src, length_in_chars);
1575  char_count = length_in_chars;
1576  break;
1577 
1578  case INTL_CODESET_ISO88591:
1579  {
1580  unsigned char *d;
1581  const unsigned char *s;
1582 
1583  for (d = dst, s = src; d < dst + length_in_chars; d++, s++)
1584  {
1585  *d = char_toupper_iso8859 (*s);
1586  }
1587  char_count = length_in_chars;
1588  }
1589  break;
1590 
1592  {
1593  int byte_count;
1594  intl_char_size (src, length_in_chars, INTL_CODESET_KSC5601_EUC, &byte_count);
1595  if (byte_count > 0)
1596  {
1597  char_count = intl_toupper_euc (src, dst, byte_count);
1598  }
1599  }
1600  break;
1601 
1602  case INTL_CODESET_UTF8:
1603  {
1604  int dummy_size;
1605  char_count = intl_toupper_utf8 (alphabet, src, dst, length_in_chars, &dummy_size);
1606  }
1607  break;
1608 
1609  default:
1610  assert (false);
1611  break;
1612  }
1613 
1614  return char_count;
1615 }
1616 
1617 /*
1618  * intl_lower_string_size() - determine the size required for holding
1619  * lower case of the input string
1620  * return: required size
1621  * alphabet(in): alphabet data
1622  * src(in): string to lowercase
1623  * src_size(in): buffer size
1624  * src_length(in): length of the string measured in characters
1625  */
1626 int
1627 intl_lower_string_size (const ALPHABET_DATA * alphabet, const unsigned char *src, int src_size, int src_length)
1628 {
1629  int char_count;
1630  int req_size = src_size;
1631 
1632  assert (alphabet != NULL);
1633 
1634  switch (alphabet->codeset)
1635  {
1636  case INTL_CODESET_ISO88591:
1638  break;
1639 
1641  break;
1642 
1643  case INTL_CODESET_UTF8:
1644  {
1645  unsigned char lower[INTL_UTF8_MAX_CHAR_SIZE];
1646  unsigned char *next;
1647 
1648  req_size = 0;
1649  for (char_count = 0; char_count < src_length && src_size > 0; char_count++)
1650  {
1651  req_size += intl_char_tolower_utf8 (alphabet, src, src_size, lower, &next);
1652  src_size -= CAST_STRLEN (next - src);
1653  src = next;
1654  }
1655  }
1656  break;
1657 
1658  default:
1659  assert (false);
1660  break;
1661  }
1662 
1663  return req_size;
1664 }
1665 
1666 /*
1667  * intl_lower_string() - replace all upper case characters with their
1668  * lower case characters
1669  * return: character counts
1670  * alphabet(in): alphabet data
1671  * src(in/out): string to lowercase
1672  * dst(out): output string
1673  * length_in_chars(in): length of the string measured in characters
1674  */
1675 int
1676 intl_lower_string (const ALPHABET_DATA * alphabet, const unsigned char *src, unsigned char *dst, int length_in_chars)
1677 {
1678  int char_count = 0;
1679 
1680  assert (alphabet != NULL);
1681 
1682  switch (alphabet->codeset)
1683  {
1684  case INTL_CODESET_ISO88591:
1685  {
1686  unsigned char *d;
1687  const unsigned char *s;
1688 
1689  for (d = dst, s = src; d < dst + length_in_chars; d++, s++)
1690  {
1691  *d = char_tolower_iso8859 (*s);
1692  }
1693  char_count = length_in_chars;
1694  }
1695  break;
1696 
1698  memcpy (dst, src, length_in_chars);
1699  break;
1700 
1702  {
1703  int byte_count;
1704  intl_char_size (src, length_in_chars, INTL_CODESET_KSC5601_EUC, &byte_count);
1705  if (byte_count > 0)
1706  {
1707  char_count = intl_tolower_euc (src, dst, byte_count);
1708  }
1709  }
1710  break;
1711 
1712  case INTL_CODESET_UTF8:
1713  {
1714  int dummy_size;
1715  char_count = intl_tolower_utf8 (alphabet, src, dst, length_in_chars, &dummy_size);
1716  }
1717  break;
1718 
1719  default:
1720  assert (false);
1721  break;
1722  }
1723 
1724  return char_count;
1725 }
1726 
1727 #if defined (ENABLE_UNUSED_FUNCTION)
1728 /*
1729  * intl_is_korean() - test for a korean character
1730  * return: non-zero if ch is a korean character,
1731  * 0 otherwise.
1732  * ch(in): the character to be tested
1733  */
1734 static int
1735 intl_is_korean (unsigned char ch)
1736 {
1737  if (prm_get_bool_value (PRM_ID_SINGLE_BYTE_COMPARE))
1738  {
1739  return 0;
1740  }
1741  return (ch >= 0xb0 && ch <= 0xc8) || (ch >= 0xa1 && ch <= 0xfe);
1742 }
1743 
1744 /*
1745  * intl_language() - Returns the language for the given category of the
1746  * current locale
1747  * return: INTL_LANG enumeration
1748  * category(in): category argument to setlocale()
1749  */
1750 INTL_LANG
1751 intl_language (int category)
1752 {
1753  char *loc = setlocale (category, NULL);
1754 
1755 #if defined(WINDOWS) || defined(SOLARIS)
1756  return INTL_LANG_ENGLISH;
1757 #else /* !WINDOWS && !SOLARIS */
1758  if (loc != NULL && strcmp (loc, LOCALE_KOREAN) == 0)
1759  {
1760  return INTL_LANG_KOREAN;
1761  }
1762  else
1763  {
1764  return INTL_LANG_ENGLISH;
1765  }
1766 #endif
1767 }
1768 #endif /* ENABLE_UNUSED_FUNCTION */
1769 
1770 /*
1771  * intl_zone() - Return the zone for the given category of the
1772  * current locale
1773  * return: INTL_ZONE enumeration
1774  * lang_id(in): language identifier
1775  */
1776 INTL_ZONE
1777 intl_zone (int category)
1778 {
1779  switch (lang_id ())
1780  {
1781  case INTL_LANG_ENGLISH:
1782  return INTL_ZONE_US;
1783  case INTL_LANG_KOREAN:
1784  return INTL_ZONE_KR;
1785  default:
1786  return INTL_ZONE_US;
1787  }
1788  return INTL_ZONE_US;
1789 }
1790 
1791 /*
1792  * intl_reverse_string() - reverse characters of source string,
1793  * into destination string
1794  * return: character counts
1795  * src(in): source string
1796  * dst(out): destination string
1797  * length_in_chars(in): length of the string measured in characters
1798  * size_in_bytes(in): size of the string in bytes
1799  * codeset(in): enumeration of source string
1800  */
1801 int
1802 intl_reverse_string (const unsigned char *src, unsigned char *dst, int length_in_chars, int size_in_bytes,
1803  INTL_CODESET codeset)
1804 {
1805  const unsigned char *end, *s;
1806  unsigned char *d;
1807  int char_count = 0;
1808  int char_size, i;
1809 
1810  assert (src != NULL);
1811  assert (dst != NULL);
1812 
1813  s = src;
1814 
1815  switch (codeset)
1816  {
1817  case INTL_CODESET_ISO88591:
1819  d = dst + length_in_chars - 1;
1820  end = src + length_in_chars;
1821  for (; s < end; char_count++)
1822  {
1823  *d = *s;
1824  s++;
1825  d--;
1826  }
1827  break;
1828 
1830  {
1831  d = dst + size_in_bytes - 1;
1832  end = src + size_in_bytes;
1833  for (; s < end && char_count < length_in_chars; char_count++)
1834  {
1835  if (!IS_8BIT (*s)) /* ASCII character */
1836  {
1837  *d-- = *s++;
1838  }
1839  else if (*s == SS3) /* Code Set 3 character */
1840  {
1841  *(d - 2) = *s;
1842  *(d - 1) = *(s + 1);
1843  *d = *(s + 2);
1844  s += 3;
1845  d -= 3;
1846  }
1847  else /* 2 byte character (CS1 or CS2) */
1848  {
1849  *(d - 1) = *s;
1850  *d = *(s + 1);
1851  s += 2;
1852  d -= 2;
1853  }
1854  }
1855  }
1856  break;
1857 
1858  case INTL_CODESET_UTF8:
1859  {
1860  d = dst + size_in_bytes - 1;
1861  end = src + size_in_bytes;
1862  for (; s < end && char_count < length_in_chars; char_count++)
1863  {
1864  intl_nextchar_utf8 (s, &char_size);
1865 
1866  i = char_size;
1867  while (i > 0)
1868  {
1869  i--;
1870  *(d - i) = *s;
1871  s++;
1872  }
1873  d -= char_size;
1874  }
1875  }
1876  break;
1877 
1878  default:
1879  assert (false);
1880  break;
1881  }
1882 
1883  return char_count;
1884 }
1885 
1886 /*
1887  * intl_is_max_bound_chr () -
1888  *
1889  * return: check if chr points to a char representing the upper bound
1890  * codepoint in the selected codeset, for LIKE index optimization.
1891  *
1892  * codeset(in) : the codeset to consider
1893  * chr(in) : upper bound, as bytes
1894  */
1895 bool
1896 intl_is_max_bound_chr (INTL_CODESET codeset, const unsigned char *chr)
1897 {
1898  switch (codeset)
1899  {
1900  case INTL_CODESET_UTF8:
1901  if ((*chr == 0xf4) && (*(chr + 1) == 0x8f) && (*(chr + 2) == 0xbf) && (*(chr + 3) == 0xbf))
1902  {
1903  return true;
1904  }
1905  return false;
1907  if (((*chr == 0xff) && (*(chr + 1) == 0xff)))
1908  {
1909  return true;
1910  }
1911  return false;
1912  case INTL_CODESET_ISO88591:
1914  default:
1915  if (*chr == 0xff)
1916  {
1917  return true;
1918  }
1919  return false;
1920  }
1921 
1922  return false;
1923 }
1924 
1925 /*
1926  * intl_is_min_bound_chr () -
1927  *
1928  * return: check if chr points to a ISO char / UTF-8 codepoint representing
1929  * the lower bound codepoint in the selected codeset, for LIKE
1930  * index optimization.
1931  *
1932  * codeset(in) : the codeset to consider
1933  * chr(in) : upper bound, as UTF-8 bytes
1934  *
1935  * Note: 'chr' buffer should be able to store at least 1 more byte, for
1936  * one space char.
1937  */
1938 bool
1939 intl_is_min_bound_chr (INTL_CODESET codeset, const unsigned char *chr)
1940 {
1941  if (*chr == ' ')
1942  {
1943  return true;
1944  }
1945 
1946  return false;
1947 }
1948 
1949 /*
1950  * intl_set_min_bound_chr () - sets chr to a byte array representing
1951  * the lowest bound codepoint in the selected
1952  * codeset, for LIKE index optimization.
1953  *
1954  * return: the number of bytes added to chr
1955  *
1956  * codeset(in) : the codeset to consider
1957  * chr(in) : char pointer where to place the bound, as UTF-8 bytes
1958  */
1959 int
1961 {
1962  *chr = ' ';
1963 
1964  return 1;
1965 }
1966 
1967 /*
1968  * intl_set_max_bound_chr () - sets chr to a byte array representing
1969  * the up-most bound codepoint in the selected
1970  * codeset, for LIKE index optimization.
1971  *
1972  * return: the number of bytes added to chr
1973  *
1974  * codeset(in) : the codeset to consider
1975  * chr(in) : char pointer where to place the bound
1976  *
1977  * Note: 'chr' buffer should be able to store at least one more char:
1978  * 4 bytes (UTF-8), 2 bytes (EUC-KR), 1 byte (ISO-8859-1).
1979  *
1980  */
1981 int
1983 {
1984  switch (codeset)
1985  {
1986  case INTL_CODESET_UTF8:
1987  *chr = (char) 0xf4;
1988  *(chr + 1) = (char) 0x8f;
1989  *(chr + 2) = (char) 0xbf;
1990  *(chr + 3) = (char) 0xbf;
1991  return 4;
1993  *chr = (char) 0xff;
1994  *(chr + 1) = (char) 0xff;
1995  return 2;
1996  case INTL_CODESET_ISO88591:
1998  default:
1999  *chr = (char) 0xff;
2000  return 1;
2001  }
2002 
2003  return 1;
2004 }
2005 
2006 /*
2007  * general routines for UTF-8 encoding
2008  */
2009 
/* Length in bytes of a UTF-8 sequence, indexed by the value of its first byte. */
static const unsigned char len_utf8_char[256] = {
  /* 0x00 - 0x0f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x10 - 0x1f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x20 - 0x2f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x30 - 0x3f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x40 - 0x4f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 - 0x5f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 - 0x6f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 - 0x7f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80 - 0x8f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x90 - 0x9f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xa0 - 0xaf */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xb0 - 0xbf */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xc0 - 0xcf */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xd0 - 0xdf */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xe0 - 0xef */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  /* 0xf0 - 0xff */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
};

/* exported, read-only view of the table above */
const unsigned char *const intl_Len_utf8_char = len_utf8_char;
2030 
/*
 * intl_nextchar_utf8() - returns a pointer to the next character in the
 *                        UTF-8 encoded string.
 *
 * return: pointer to the next character
 * s(in): input string
 * curr_char_length(out): length of the character at s
 *
 * Note: thin function wrapper around the INTL_GET_NEXTCHAR_UTF8 macro.
 */
const unsigned char *
intl_nextchar_utf8 (const unsigned char *s, int *curr_char_length)
{
  int char_length;

  INTL_GET_NEXTCHAR_UTF8 (s, char_length);
  *curr_char_length = char_length;

  return s;
}
2044 
/*
 * intl_prevchar_utf8() - returns a pointer to the previous character in the
 *                        UTF-8 encoded string.
 *
 * return: pointer to the previous character
 * s(in): string; must point past s_start
 * s_start(in) : start of buffer string
 * prev_char_length(out): length of the previous character
 *
 * Note: The function walks back over continuation bytes (10xxxxxx), looking
 *       at no more than 6 bytes and never before 's_start'. When no lead
 *       byte is found (6 continuation bytes in a row, or the buffer starts
 *       with continuation bytes), the text is not valid UTF-8 and the
 *       function steps back a single byte.
 */
const unsigned char *
intl_prevchar_utf8 (const unsigned char *s, const unsigned char *s_start, int *prev_char_length)
{
  const long avail = (long) (s - s_start);      /* bytes which may be examined before 's' */
  int l = 1;

  while (l < 6 && l <= avail && (s[-l] & 0xc0) == 0x80)
    {
      l++;
    }

  /* s[-l] is a lead byte candidate only if it is still inside the buffer; reading it otherwise would
   * access memory before 's_start' and could return a pointer outside of the string */
  if (l > avail || (s[-l] & 0xc0) == 0x80)
    {
      l = 1;
    }

  *prev_char_length = l;

  return s - l;
}
2070 
2071 /*
2072  * intl_tolower_utf8() - Replaces all upper case characters inside an UTF-8
2073  * encoded string with their lower case codes.
2074  * return: character counts
2075  * alphabet(in): alphabet to use
2076  * s(in): UTF-8 string to lowercase
2077  * d(out): output string
2078  * length_in_chars(in): length of the string measured in characters
2079  * d_size(out): size in bytes of destination
2080  */
2081 static int
2082 intl_tolower_utf8 (const ALPHABET_DATA * alphabet, const unsigned char *s, unsigned char *d, int length_in_chars,
2083  int *d_size)
2084 {
2085  int char_count, size;
2086  int s_size;
2087  unsigned char *next = NULL;
2088 
2089  assert (s != NULL);
2090  assert (d_size != NULL);
2091 
2092  intl_char_size (s, length_in_chars, INTL_CODESET_UTF8, &s_size);
2093  *d_size = 0;
2094 
2095  for (char_count = 0; char_count < length_in_chars; char_count++)
2096  {
2097  if (s_size <= 0)
2098  {
2099  break;
2100  }
2101  size = intl_char_tolower_utf8 (alphabet, s, s_size, d, &next);
2102  d += size;
2103  *d_size += size;
2104 
2105  s_size -= CAST_STRLEN (next - s);
2106  s = next;
2107  }
2108 
2109  return char_count;
2110 }
2111 
2112 /*
2113  * intl_toupper_utf8() - Replaces all lower case characters inside an UTF-8
2114  * encoded string with their upper case codes.
2115  * return: character counts
2116  * alphabet(in): alphabet to use
2117  * s(in): UTF-8 string to uppercase
2118  * d(out): output string
2119  * length_in_chars(in): length of the string measured in characters
2120  * d_size(out): size in bytes of destination
2121  */
2122 static int
2123 intl_toupper_utf8 (const ALPHABET_DATA * alphabet, const unsigned char *s, unsigned char *d, int length_in_chars,
2124  int *d_size)
2125 {
2126  int char_count, size;
2127  int s_size;
2128  unsigned char *next = NULL;
2129 
2130  assert (s != NULL);
2131  assert (d_size != NULL);
2132 
2133  intl_char_size (s, length_in_chars, INTL_CODESET_UTF8, &s_size);
2134  *d_size = 0;
2135 
2136  for (char_count = 0; char_count < length_in_chars; char_count++)
2137  {
2138  if (s_size <= 0)
2139  {
2140  break;
2141  }
2142  size = intl_char_toupper_utf8 (alphabet, s, s_size, d, &next);
2143  d += size;
2144  *d_size += size;
2145 
2146  s_size -= CAST_STRLEN (next - s);
2147  s = next;
2148  }
2149 
2150  return char_count;
2151 }
2152 
/*
 * intl_count_utf8_chars() - Counts the number of UTF-8 encoded characters in
 *                           the string. Embedded NULL characters are counted.
 *
 * return: number of UTF-8 encoded characters found
 * s(in): string
 * length_in_bytes(in): length of the string
 *
 * Note: Only whole characters are counted.
 *       if s[length_in_bytes-1] is not the last byte of a multi-byte
 *       character or a single byte character, then that character is not
 *       counted.
 */
int
intl_count_utf8_chars (const unsigned char *s, int length_in_bytes)
{
  const unsigned char *limit;
  int unused_size;
  int count = 0;

  assert (s != NULL);

  limit = s + length_in_bytes;
  while (s < limit)
    {
      s = intl_nextchar_utf8 (s, &unused_size);
      if (s > limit)
        {
          /* the last character extends past the given length */
          continue;
        }
      count++;
    }

  return count;
}
2186 
/*
 * intl_count_utf8_bytes() - Counts the number of bytes it takes to encode the
 *                           next <length_in_chars> UTF-8 characters in the
 *                           string
 *
 * return: number of bytes used by the first 'length_in_chars' characters
 * s(in): UTF-8 encoded string
 * length_in_chars(in): length of the string in characters
 */
static int
intl_count_utf8_bytes (const unsigned char *s, int length_in_chars)
{
  int remaining_chars;
  int width;
  int total_bytes = 0;

  assert (s != NULL);

  for (remaining_chars = length_in_chars; remaining_chars > 0; remaining_chars--)
    {
      s = intl_nextchar_utf8 (s, &width);
      total_bytes += width;
    }

  return total_bytes;
}
2212 
2213 /*
2214  * intl_char_tolower_utf8() - convert uppercase character to lowercase
2215  * return: size of UTF-8 lowercase character corresponding to the argument
2216  * alphabet(in): alphabet to use
2217  * s (in): the UTF-8 buffer holding character to be converted
2218  * size(in): size of UTF-8 buffer
2219  * d (out): output buffer
2220  * next (out): pointer to next character
2221  *
2222  * Note : allocated size of 'd' is assumed to be large enough to fit any
2223  * UTF-8 character
2224  */
2225 static int
2226 intl_char_tolower_utf8 (const ALPHABET_DATA * alphabet, const unsigned char *s, const int size, unsigned char *d,
2227  unsigned char **next)
2228 {
2229  unsigned int cp = intl_utf8_to_cp (s, size, next);
2230 
2231  assert (alphabet != NULL);
2232 
2233  if (cp < (unsigned int) (alphabet->l_count))
2234  {
2235  if (alphabet->lower_multiplier == 1)
2236  {
2237  unsigned int lower_cp = alphabet->lower_cp[cp];
2238 
2239  return intl_cp_to_utf8 (lower_cp, d);
2240  }
2241  else
2242  {
2243  const unsigned int *case_p;
2244  int count = 0;
2245  int bytes;
2246  int total_bytes = 0;
2247 
2249 
2250  case_p = &(alphabet->lower_cp[cp * alphabet->lower_multiplier]);
2251 
2252  do
2253  {
2254  bytes = intl_cp_to_utf8 (*case_p, d);
2255  d += bytes;
2256  total_bytes += bytes;
2257  case_p++;
2258  count++;
2259  }
2260  while (count < alphabet->lower_multiplier && *case_p != 0);
2261 
2262  return total_bytes;
2263  }
2264  }
2265  else if (cp == 0xffffffff)
2266  {
2267  /* this may happen when UTF-8 text validation is disabled (by default) */
2268  *d = *s;
2269  return 1;
2270  }
2271 
2272  return intl_cp_to_utf8 (cp, d);
2273 }
2274 
2275 /*
2276  * intl_char_toupper_utf8() - convert lowercase character to uppercase
2277  * return: size of UTF-8 uppercase character corresponding to the argument
2278  * alphabet(in): alphabet to use
2279  * s (in): the UTF-8 buffer holding character to be converted
2280  * size(in): size of UTF-8 buffer
2281  * d (out): output buffer
2282  * next (out): pointer to next character
2283  *
2284  * Note : allocated size of 'd' is assumed to be large enough to fit any
2285  * UTF-8 character
2286  */
2287 static int
2288 intl_char_toupper_utf8 (const ALPHABET_DATA * alphabet, const unsigned char *s, const int size, unsigned char *d,
2289  unsigned char **next)
2290 {
2291  unsigned int cp = intl_utf8_to_cp (s, size, next);
2292 
2293  assert (alphabet != NULL);
2294 
2295  if (cp < (unsigned int) (alphabet->l_count))
2296  {
2297  if (alphabet->upper_multiplier == 1)
2298  {
2299  unsigned upper_cp = alphabet->upper_cp[cp];
2300 
2301  return intl_cp_to_utf8 (upper_cp, d);
2302  }
2303  else
2304  {
2305  const unsigned int *case_p;
2306  int count = 0;
2307  int bytes;
2308  int total_bytes = 0;
2309 
2311 
2312  case_p = &(alphabet->upper_cp[cp * alphabet->upper_multiplier]);
2313  do
2314  {
2315  bytes = intl_cp_to_utf8 (*case_p, d);
2316  d += bytes;
2317  total_bytes += bytes;
2318  case_p++;
2319  count++;
2320  }
2321  while (count < alphabet->upper_multiplier && *case_p != 0);
2322 
2323  return total_bytes;
2324  }
2325  }
2326  else if (cp == 0xffffffff)
2327  {
2328  /* this may happen when UTF-8 text validation is disabled (by default) */
2329  *d = *s;
2330  return 1;
2331  }
2332 
2333  return intl_cp_to_utf8 (cp, d);
2334 }
2335 
2336 /*
2337  * intl_identifier_casecmp_w_size()
2338  * return: 0 if strings are equal, -1 if str1 < str2 , 1 if str1 > str2
2339  * str1(in):
2340  * str2(in):
2341  * size_str1(in): size in bytes of str1
2342  * size_str2(in): size in bytes of str2
2343  *
2344  */
2345 int
2346 intl_identifier_casecmp_w_size (const INTL_LANG lang_id, unsigned char *str1, unsigned char *str2, const int size_str1,
2347  const int size_str2)
2348 {
2349 #if INTL_IDENTIFIER_CASING_SIZE_MULTIPLIER <= 1
2350  if (size_str1 != size_str2)
2351  {
2352  return (size_str1 < size_str2) ? -1 : 1;
2353  }
2354 #endif
2355 
2356  switch (lang_charset ())
2357  {
2358  case INTL_CODESET_UTF8:
2359  {
2360  unsigned char *str1_end, *str2_end;
2361  unsigned char *dummy;
2362  unsigned int cp1, cp2;
2364  const ALPHABET_DATA *alphabet;
2365 
2366  assert (loc != NULL);
2367 
2368  alphabet = &(loc->ident_alphabet);
2369 
2370  str1_end = str1 + size_str1;
2371  str2_end = str2 + size_str2;
2372 
2373  for (; str1 < str1_end && str2 < str2_end;)
2374  {
2375  int skip_size1 = 0, skip_size2 = 0;
2376  int res;
2377 
2378  cp1 = intl_utf8_to_cp (str1, CAST_STRLEN (str1_end - str1), &dummy);
2379  cp2 = intl_utf8_to_cp (str2, CAST_STRLEN (str2_end - str2), &dummy);
2380 
2381  res =
2382  intl_strcasecmp_utf8_one_cp (alphabet, str1, str2, CAST_STRLEN (str1_end - str1),
2383  CAST_STRLEN (str2_end - str2), cp1, cp2, &skip_size1, &skip_size2);
2384 
2385  if (res != 0)
2386  {
2387  return res;
2388  }
2389 
2390  str1 += skip_size1;
2391  str2 += skip_size2;
2392  }
2393 
2394  return (str1 < str1_end) ? 1 : ((str2 < str2_end) ? -1 : 0);
2395  }
2396  break;
2397 
2398  case INTL_CODESET_ISO88591:
2399  {
2400  unsigned char *str1_end, *str2_end;
2401  unsigned char lower1, lower2;
2402 
2403  if (size_str1 != size_str2)
2404  {
2405  return (size_str1 < size_str2) ? -1 : 1;
2406  }
2407 
2408  str1_end = str1 + size_str1;
2409  str2_end = str2 + size_str2;
2410 
2411  for (; str1 < str1_end && str2 < str2_end; str1++, str2++)
2412  {
2413  if (*str1 != *str2)
2414  {
2415  lower1 = char_tolower_iso8859 (*str1);
2416  lower2 = char_tolower_iso8859 (*str2);
2417  if (lower1 != lower2)
2418  {
2419  return (lower1 < lower2) ? -1 : 1;
2420  }
2421  }
2422  }
2423 
2424  return (str1 < str1_end) ? 1 : ((str2 < str2_end) ? -1 : 0);
2425  }
2427  default:
2428  /* ASCII */
2429  if (size_str1 != size_str2)
2430  {
2431  return (size_str1 < size_str2) ? -1 : 1;
2432  }
2433 
2434  return strncasecmp ((char *) str1, (char *) str2, size_str1);
2435  }
2436 
2437  return 0;
2438 }
2439 
2440 /*
2441  * intl_is_case_match() - performs case insensitive matching
2442  * return: 0 if strings are equal, -1 if str1 < str2 , 1 if str1 > str2
2443  * lang_id(in):
2444  * codeset(in):
2445  * tok(in): token to check
2446  * src(in): string to check for token
2447  * size_tok(in): size in bytes of token
2448  * size_src(in): size in bytes of source string
2449  * matched_size_src(out): size in bytes of matched token in source
2450  *
2451  * Note : Matching is performed by folding to LOWER case;
2452  * it takes into account case expansion (length in chars may differ).
2453  */
2454 int
2455 intl_case_match_tok (const INTL_LANG lang_id, const INTL_CODESET codeset, unsigned char *tok, unsigned char *src,
2456  const int size_tok, const int size_src, int *matched_size_src)
2457 {
2458  assert (tok != NULL);
2459  assert (src != NULL);
2460 
2461  assert (size_tok > 0);
2462  assert (size_src >= 0);
2463 
2464  assert (matched_size_src != NULL);
2465 
2466  *matched_size_src = 0;
2467 
2468  switch (codeset)
2469  {
2470  case INTL_CODESET_UTF8:
2471  {
2472  unsigned char *tok_end, *src_end;
2473  unsigned char *dummy;
2474  unsigned int cp1, cp2;
2476  const ALPHABET_DATA *alphabet;
2477 
2478  assert (loc != NULL);
2479 
2480  alphabet = &(loc->alphabet);
2481 
2482  tok_end = tok + size_tok;
2483  src_end = src + size_src;
2484 
2485  for (; tok < tok_end && src < src_end;)
2486  {
2487  int skip_size_tok = 0, skip_size_src = 0;
2488  int res;
2489 
2490  cp1 = intl_utf8_to_cp (tok, CAST_STRLEN (tok_end - tok), &dummy);
2491  cp2 = intl_utf8_to_cp (src, CAST_STRLEN (src_end - src), &dummy);
2492 
2493  res =
2494  intl_strcasecmp_utf8_one_cp (alphabet, tok, src, CAST_STRLEN (tok_end - tok), CAST_STRLEN (src_end - src),
2495  cp1, cp2, &skip_size_tok, &skip_size_src);
2496 
2497  if (res != 0)
2498  {
2499  return res;
2500  }
2501 
2502  tok += skip_size_tok;
2503  src += skip_size_src;
2504  *matched_size_src += skip_size_src;
2505  }
2506 
2507  return (tok < tok_end) ? 1 : 0;
2508  }
2509  break;
2510 
2511  case INTL_CODESET_ISO88591:
2512  {
2513  unsigned char *tok_end, *src_end;
2514  unsigned char lower1, lower2;
2515  tok_end = tok + size_tok;
2516  src_end = src + size_src;
2517 
2518  if (size_tok > size_src)
2519  {
2520  return 1;
2521  }
2522 
2523  *matched_size_src = size_tok;
2524  for (; tok < tok_end && src < src_end; tok++, src++)
2525  {
2526  if (*tok != *src)
2527  {
2528  lower1 = char_tolower_iso8859 (*tok);
2529  lower2 = char_tolower_iso8859 (*src);
2530  if (lower1 != lower2)
2531  {
2532  return (lower1 < lower2) ? -1 : 1;
2533  }
2534  }
2535  }
2536  }
2537  break;
2538 
2540  default:
2541  if (size_tok > size_src)
2542  {
2543  return 1;
2544  }
2545 
2546  *matched_size_src = size_tok;
2547  return strncasecmp ((char *) tok, (char *) src, size_tok);
2548  }
2549 
2550  return 0;
2551 }
2552 
2553 /*
2554  * intl_strcasecmp_utf8_one_cp() - compares the first codepoints from two
2555  * strings case insensitive
2556  * return: 0 if strings are equal, -1 if cp1 < cp2 , 1 if cp1 > cp2
2557  * str1(in):
2558  * str2(in):
2559  * size_str1(in): size in bytes of str1
2560  * size_str2(in): size in bytes of str2
2561  * cp1(in): first codepoint in str1
2562  * cp2(in): first codepoint in str2
2563  * skip_size1(out): bytes to skip from str1
2564  * skip_size2(out): bytes to skip from str2
2565  * identifier_mode(in): true if compares identifiers, false otherwise
2566  *
2567  * Note : skip_size1, skip_size2 are valid only when strings are equal
2568  * (returned value is zero).
2569  */
2570 static int
2571 intl_strcasecmp_utf8_one_cp (const ALPHABET_DATA * alphabet, unsigned char *str1, unsigned char *str2,
2572  const int size_str1, const int size_str2, unsigned int cp1, unsigned int cp2,
2573  int *skip_size1, int *skip_size2)
2574 {
2575  int alpha_cnt;
2576  unsigned int l_array_1[INTL_CASING_EXPANSION_MULTIPLIER];
2577  unsigned int l_array_2[INTL_CASING_EXPANSION_MULTIPLIER];
2578  int skip_len1 = 1, skip_len2 = 1;
2579  int l_count_1 = 0, l_count_2 = 0, l_count = 0;
2580  int res;
2581  bool use_original_str1, use_original_str2;
2582 
2583  unsigned int *casing_arr;
2584  int casing_multiplier;
2585 
2586  assert (alphabet != NULL);
2587  assert (str1 != NULL);
2588  assert (str2 != NULL);
2589  assert (skip_size1 != NULL);
2590  assert (skip_size2 != NULL);
2591 
2592  if (cp1 == cp2)
2593  {
2594  (void) intl_char_size (str1, 1, INTL_CODESET_UTF8, skip_size1);
2595  (void) intl_char_size (str2, 1, INTL_CODESET_UTF8, skip_size2);
2596 
2597  return 0;
2598  }
2599 
2600  alpha_cnt = alphabet->l_count;
2601 
2602  if (alphabet->lower_multiplier == 1 && alphabet->upper_multiplier == 1)
2603  {
2604  if (cp1 < (unsigned int) alpha_cnt)
2605  {
2606  cp1 = alphabet->lower_cp[cp1];
2607  }
2608 
2609  if (cp2 < (unsigned int) alpha_cnt)
2610  {
2611  cp2 = alphabet->lower_cp[cp2];
2612  }
2613 
2614  if (cp1 != cp2)
2615  {
2616  return (cp1 < cp2) ? (-1) : 1;
2617  }
2618 
2619  (void) intl_char_size (str1, 1, INTL_CODESET_UTF8, skip_size1);
2620  (void) intl_char_size (str2, 1, INTL_CODESET_UTF8, skip_size2);
2621 
2622  return 0;
2623  }
2624 
2625  /*
2626  * Multipliers can be either 1 or 2, as imposed by the LDML parsing code.
2627  * Currently, alphabets with both multipliers equal to 2 are not supported
2628  * for case sensitive comparisons.
2629  */
2630  assert (alphabet->lower_multiplier == 1 || alphabet->upper_multiplier == 1);
2631  if (alphabet->lower_multiplier > alphabet->upper_multiplier)
2632  {
2633  casing_arr = alphabet->lower_cp;
2634  casing_multiplier = alphabet->lower_multiplier;
2635  }
2636  else
2637  {
2638  casing_arr = alphabet->upper_cp;
2639  casing_multiplier = alphabet->upper_multiplier;
2640  }
2641 
2642  use_original_str1 = true;
2643  if (cp1 < (unsigned int) alpha_cnt)
2644  {
2645  memcpy (l_array_1, &(casing_arr[cp1 * casing_multiplier]), casing_multiplier * sizeof (unsigned int));
2646 
2647  if (cp1 != l_array_1[0])
2648  {
2649  l_count_1 = casing_multiplier;
2650  while (l_count_1 > 1 && l_array_1[l_count_1 - 1] == 0)
2651  {
2652  l_count_1--;
2653  }
2654 
2655  use_original_str1 = false;
2656  }
2657  }
2658 
2659  use_original_str2 = true;
2660  if (cp2 < (unsigned int) alpha_cnt)
2661  {
2662  memcpy (l_array_2, &(casing_arr[cp2 * casing_multiplier]), casing_multiplier * sizeof (unsigned int));
2663 
2664  if (cp2 != l_array_2[0])
2665  {
2666  l_count_2 = casing_multiplier;
2667  while (l_count_2 > 1 && l_array_2[l_count_2 - 1] == 0)
2668  {
2669  l_count_2--;
2670  }
2671 
2672  use_original_str2 = false;
2673  }
2674  }
2675 
2676  if (use_original_str1)
2677  {
2678  (void) intl_utf8_to_cp_list (str1, size_str1, l_array_1, casing_multiplier, &l_count_1);
2679  }
2680 
2681  if (use_original_str2)
2682  {
2683  (void) intl_utf8_to_cp_list (str2, size_str2, l_array_2, casing_multiplier, &l_count_2);
2684  }
2685 
2686  l_count = MIN (l_count_1, l_count_2);
2687 
2688  if (use_original_str1)
2689  {
2690  l_count_1 = MIN (l_count, l_count_1);
2691  skip_len1 = l_count_1;
2692  }
2693  else
2694  {
2695  skip_len1 = 1;
2696  }
2697 
2698  if (use_original_str2)
2699  {
2700  l_count_2 = MIN (l_count, l_count_2);
2701  skip_len2 = l_count_2;
2702  }
2703  else
2704  {
2705  skip_len2 = 1;
2706  }
2707 
2708  if (l_count_1 != l_count_2)
2709  {
2710  return (l_count_1 < l_count_2) ? (-1) : (1);
2711  }
2712 
2713  assert (l_count_1 == l_count_2);
2714 
2715  /* compare lower codepoints */
2716  res = memcmp (l_array_1, l_array_2, l_count * sizeof (unsigned int));
2717  if (res != 0)
2718  {
2719  return res;
2720  }
2721 
2722  /* convert supplementary characters in bytes size to skip */
2723  (void) intl_char_size (str1, skip_len1, INTL_CODESET_UTF8, skip_size1);
2724  (void) intl_char_size (str2, skip_len2, INTL_CODESET_UTF8, skip_size2);
2725 
2726  return 0;
2727 }
2728 
/*
 * intl_identifier_casecmp() - case insensitive comparison of two
 *                             nul-terminated identifier strings
 *   return: 0 if strings are equal, -1 if str1 < str2 , 1 if str1 > str2
 *   str1(in):
 *   str2(in):
 *
 * NOTE: identifier comparison is special, see intl_identifier_casecmp_w_size
 *       for details on comparing identifiers of different length.
 */
int
intl_identifier_casecmp (const char *str1, const char *str2)
{
  int first_bytes, second_bytes;

  assert (str1 != NULL);
  assert (str2 != NULL);

  /* the sized variant does the actual work; both strings are passed whole */
  first_bytes = strlen (str1);
  second_bytes = strlen (str2);

  return intl_identifier_casecmp_w_size (lang_id (), (unsigned char *) str1, (unsigned char *) str2, first_bytes,
                                         second_bytes);
}
2754 
/*
 * intl_identifier_ncasecmp() - case insensitive comparison of the leading
 *                              characters of two identifier strings
 *   return: result of intl_identifier_casecmp_w_size on the two prefixes
 *   str1(in):
 *   str2(in):
 *   len(in): number of chars to compare
 *
 */
int
intl_identifier_ncasecmp (const char *str1, const char *str2, const int len)
{
  unsigned char *first = (unsigned char *) str1;
  unsigned char *second = (unsigned char *) str2;
  int first_bytes, second_bytes;

  /* translate the limit in characters into a size in bytes for each string */
  (void) intl_char_size (first, len, lang_charset (), &first_bytes);
  (void) intl_char_size (second, len, lang_charset (), &second_bytes);

  return intl_identifier_casecmp_w_size (lang_id (), first, second, first_bytes, second_bytes);
}
2775 
/*
 * intl_identifier_cmp() - case sensitive comparison of two nul-terminated
 *                         identifier strings
 *   return: value returned by strcmp (0 when the identifiers are identical)
 *   str1(in):
 *   str2(in):
 *
 */
int
intl_identifier_cmp (const char *str1, const char *str2)
{
  /* identifiers are compared byte by byte; the order of the current collation is not important here */
  const int byte_order = strcmp (str1, str2);

  return byte_order;
}
2790 
/*
 * intl_identifier_namecmp() - compares two identifier string
 *   return: 0 if the identifiers are the "same",
 *           positive number if str1 is greater than str2,
 *           negative number otherwise.
 *   str1(in)
 *   str2(in)
 *
 * Note: "same" means that this function ignores bracket '[', ']'
 *       so str1 = "[value]" and str2 = "value" returns 0
 *       The closing bracket is dropped only when it is actually present,
 *       so a malformed name such as "[" or "[value" never produces a
 *       negative size or loses its last character.
 */
int
intl_identifier_namecmp (const char *str1, const char *str2)
{
  const char *cp1 = str1;
  const char *cp2 = str2;
  int str1_size, str2_size;

  assert (str1 != NULL && str2 != NULL);

  str1_size = strlen (cp1);
  str2_size = strlen (cp2);

  if (cp1[0] == '[')
    {
      /* skip the opening bracket ... */
      cp1++;
      str1_size--;
      /* ... and the matching closing one, if any */
      if (str1_size > 0 && cp1[str1_size - 1] == ']')
        {
          str1_size--;
        }
    }

  if (cp2[0] == '[')
    {
      cp2++;
      str2_size--;
      if (str2_size > 0 && cp2[str2_size - 1] == ']')
        {
          str2_size--;
        }
    }

  return intl_identifier_casecmp_w_size (lang_id (), (unsigned char *) cp1, (unsigned char *) cp2, str1_size,
                                         str2_size);
}
2829 
2830 /*
2831  * intl_identifier_lower_string_size() - determine the size required for holding
2832  * lower case of the input string
2833  * return: required size
2834  * src(in): string to lowercase
2835  */
2836 int
2838 {
2839  int src_size, src_lower_size;
2840  INTL_CODESET codeset = lang_charset ();
2841 
2842  src_size = strlen (src);
2843 
2844  switch (codeset)
2845  {
2846  case INTL_CODESET_UTF8:
2847 #if (INTL_IDENTIFIER_CASING_SIZE_MULTIPLIER > 1)
2848  {
2849  unsigned char lower[INTL_UTF8_MAX_CHAR_SIZE];
2850  unsigned char *next;
2851  const unsigned char *s;
2852  const LANG_LOCALE_DATA *locale = lang_locale ();
2853  const ALPHABET_DATA *alphabet = &(locale->ident_alphabet);
2854  int s_size = src_size;
2855  unsigned int cp;
2856  int src_len;
2857 
2858  const unsigned char *usrc = REINTERPRET_CAST (const unsigned char *, src);
2859  intl_char_count (usrc, src_size, codeset, &src_len);
2860 
2861  src_lower_size = 0;
2862 
2863  for (s = usrc; s < usrc + src_size;)
2864  {
2865  assert (s_size > 0);
2866 
2867  cp = intl_utf8_to_cp (s, s_size, &next);
2868 
2869  if (cp < (unsigned int) (alphabet->l_count))
2870  {
2871  int lower_cnt;
2872  unsigned int *lower_cp = &(alphabet->lower_cp[cp * alphabet->lower_multiplier]);
2873 
2874  for (lower_cnt = 0; lower_cnt < alphabet->lower_multiplier && *lower_cp != 0; lower_cnt++, lower_cp++)
2875  {
2876  src_lower_size += intl_cp_to_utf8 (*lower_cp, lower);
2877  }
2878  }
2879  else
2880  {
2881  src_lower_size += intl_cp_to_utf8 (cp, lower);
2882  }
2883 
2884  s_size -= CAST_STRLEN (next - s);
2885  s = next;
2886  }
2887  }
2888 #else
2889  src_lower_size = src_size;
2890 #endif
2891  break;
2892 
2894  case INTL_CODESET_ISO88591:
2896  default:
2897  src_lower_size = src_size;
2898  break;
2899  }
2900 
2901  return src_lower_size;
2902 }
2903 
2904 /*
2905  * intl_identifier_lower() - convert given characters to lowercase characters
2906  * return: always 0
2907  * src(in) : source buffer
2908  * dst(out) : destination buffer
2909  *
2910  * Note : 'dst' has always enough size
2911  */
2912 int
2913 intl_identifier_lower (const char *src, char *dst)
2914 {
2915  int d_size = 0;
2916  int length_in_bytes = 0;
2917  int length_in_chars = 0;
2918  unsigned char *d;
2919  const unsigned char *s;
2920 
2921  if (src)
2922  {
2923  length_in_bytes = strlen (src);
2924  }
2925 
2926  unsigned char *udst = REINTERPRET_CAST (unsigned char *, dst);
2927  const unsigned char *usrc = REINTERPRET_CAST (const unsigned char *, src);
2928 
2929  switch (lang_charset ())
2930  {
2931  case INTL_CODESET_UTF8:
2932  {
2933  const LANG_LOCALE_DATA *locale = lang_locale ();
2934  const ALPHABET_DATA *alphabet = &(locale->ident_alphabet);
2935  length_in_chars = intl_count_utf8_chars (usrc, length_in_bytes);
2936  (void) intl_tolower_utf8 (alphabet, usrc, udst, length_in_chars, &d_size);
2937  d = udst + d_size;
2938  }
2939  break;
2940 
2941  case INTL_CODESET_ISO88591:
2942  {
2943  for (d = udst, s = usrc; d < udst + length_in_bytes; d++, s++)
2944  {
2945  *d = char_tolower_iso8859 (*s);
2946  }
2947  }
2948  break;
2949 
2951  default:
2952  {
2953  for (d = udst, s = usrc; d < udst + length_in_bytes; d++, s++)
2954  {
2955  *d = char_tolower (*s);
2956  }
2957  }
2958  break;
2959  }
2960 
2961  *d = '\0';
2962 
2963  return 0;
2964 }
2965 
2966 /*
2967  * intl_identifier_upper_string_size() - determine the size required for holding
2968  * upper case of the input string
2969  * return: required size
2970  * src(in): string to lowercase
2971  */
2972 int
2974 {
2975  int src_size, src_upper_size;
2976  INTL_CODESET codeset = lang_charset ();
2977 
2978  src_size = strlen (src);
2979 
2980  const unsigned char *usrc = REINTERPRET_CAST (const unsigned char *, src);
2981 
2982  switch (codeset)
2983  {
2984  case INTL_CODESET_UTF8:
2985 #if (INTL_IDENTIFIER_CASING_SIZE_MULTIPLIER > 1)
2986  {
2987  unsigned char upper[INTL_UTF8_MAX_CHAR_SIZE];
2988  unsigned char *next;
2989  const unsigned char *s;
2990  const LANG_LOCALE_DATA *locale = lang_locale ();
2991  const ALPHABET_DATA *alphabet = &(locale->ident_alphabet);
2992  int s_size = src_size;
2993  unsigned int cp;
2994  int src_len;
2995 
2996  intl_char_count (usrc, src_size, codeset, &src_len);
2997 
2998  src_upper_size = 0;
2999 
3000  for (s = usrc; s < usrc + src_size;)
3001  {
3002  assert (s_size > 0);
3003 
3004  cp = intl_utf8_to_cp (s, s_size, &next);
3005 
3006  if (cp < (unsigned int) (alphabet->l_count))
3007  {
3008  int upper_cnt;
3009  unsigned int *upper_cp = &(alphabet->upper_cp[cp * alphabet->upper_multiplier]);
3010 
3011  for (upper_cnt = 0; upper_cnt < alphabet->upper_multiplier && *upper_cp != 0; upper_cnt++, upper_cp++)
3012  {
3013  src_upper_size += intl_cp_to_utf8 (*upper_cp, upper);
3014  }
3015  }
3016  else
3017  {
3018  src_upper_size += intl_cp_to_utf8 (cp, upper);
3019  }
3020 
3021  s_size -= CAST_STRLEN (next - s);
3022  s = next;
3023  }
3024  }
3025 #else
3026  src_upper_size = src_size;
3027 #endif
3028  break;
3029 
3031  case INTL_CODESET_ISO88591:
3033  default:
3034  src_upper_size = src_size;
3035  break;
3036  }
3037 
3038  return src_upper_size;
3039 }
3040 
3041 /*
3042  * intl_identifier_upper() - convert given characters to uppercase characters
3043  * return: always 0
3044  * src(in):
3045  * dst(out):
3046  *
3047  * Note : 'dst' has always enough size;
3048  */
3049 int
3050 intl_identifier_upper (const char *src, char *dst)
3051 {
3052  int d_size = 0;
3053  int length_in_bytes = 0;
3054  int length_in_chars = 0;
3055  unsigned char *d;
3056  const unsigned char *s;
3057 
3058  if (src)
3059  {
3060  length_in_bytes = strlen (src);
3061  }
3062 
3063  unsigned char *udst = REINTERPRET_CAST (unsigned char *, dst);
3064  const unsigned char *usrc = REINTERPRET_CAST (const unsigned char *, src);
3065 
3066  switch (lang_charset ())
3067  {
3068  case INTL_CODESET_UTF8:
3069  {
3070  const LANG_LOCALE_DATA *locale = lang_locale ();
3071  const ALPHABET_DATA *alphabet = &(locale->ident_alphabet);
3072  length_in_chars = intl_count_utf8_chars (usrc, length_in_bytes);
3073  (void) intl_toupper_utf8 (alphabet, usrc, udst, length_in_chars, &d_size);
3074  d = udst + d_size;
3075  }
3076  break;
3077  case INTL_CODESET_ISO88591:
3078  {
3079  for (d = udst, s = usrc; d < udst + length_in_bytes; d++, s++)
3080  {
3081  *d = char_toupper_iso8859 (*s);
3082  }
3083  }
3084  break;
3086  default:
3087  {
3088  for (d = udst, s = usrc; d < udst + length_in_bytes; d++, s++)
3089  {
3090  *d = char_toupper (*s);
3091  }
3092  }
3093  break;
3094  }
3095 
3096  *d = '\0';
3097 
3098  return 0;
3099 }
3100 
3101 /*
3102  * intl_identifier_fix - Checks if a string can be an identifier;
3103  * Truncates the string to a desired size in bytes,
3104  * while making sure that the last char is not truncated
3105  * Checks that lower and upper case versions of string
3106  * do not exceed maximum allowed size.
3107  *
3108  * return: error code : ER_GENERIC_ERROR or NO_ERROR
3109  * name(in): identifier name, nul-terminated C string
3110  * ident_max_size(in): allowed size of this identifier, may be -1 in which
3111  * case the maximum allowed system size is used
3112  * error_on_case_overflow(in): if true, will return error if the lower or
3113  * upper version of truncated identifier exceeds
3114  * allowed size
3115  *
3116  * Note : Identifier string may be truncated if lexer previously truncated it
3117  * in the middle of the last character;
3118  * No error message is outputed by this function - in case of error,
3119  * the error message should be output by the caller.
3120  * DB_MAX_IDENTIFIER_LENGTH is the buffer size for string identifier
3121  * This includes the nul-terminator byte; the useful bytes are
3122  * (DB_MAX_IDENTIFIER_LENGTH - 1).
3123  */
3124 int
3125 intl_identifier_fix (char *name, int ident_max_size, bool error_on_case_overflow)
3126 {
3127  int truncated_size = 0, original_size = 0, char_size = 0;
3128  const unsigned char *cname = (unsigned char *) name;
3129  INTL_CODESET codeset = lang_charset ();
3130 
3131  assert (name != NULL);
3132 
3133  if (ident_max_size == -1)
3134  {
3135  ident_max_size = DB_MAX_IDENTIFIER_LENGTH - 1;
3136  }
3137 
3138  assert (ident_max_size > 0 && ident_max_size < DB_MAX_IDENTIFIER_LENGTH);
3139 
3140  original_size = strlen (name);
3141  if (INTL_CODESET_MULT (codeset) == 1)
3142  {
3143  if (original_size > ident_max_size)
3144  {
3145  name[ident_max_size] = '\0';
3146  }
3147  return NO_ERROR;
3148  }
3149 
3150  assert (INTL_CODESET_MULT (codeset) > 1);
3151 
3152  /* we do not check contents of non-ASCII if codeset is UTF-8 or EUC; valid codeset sequences are checked with
3153  * 'intl_check_string' when enabled */
3154 
3155 check_truncation:
3156  /* check if last char of identifier may have been truncated */
3157  if (original_size + INTL_CODESET_MULT (codeset) > ident_max_size)
3158  {
3159  if (ident_max_size < original_size)
3160  {
3161  original_size = ident_max_size;
3162  }
3163 
3164  /* count original size based on the size given by first byte of each char */
3165  for (truncated_size = 0; truncated_size < original_size;)
3166  {
3167  INTL_NEXT_CHAR (cname, cname, codeset, &char_size);
3168  truncated_size += char_size;
3169  }
3170  assert (truncated_size >= original_size);
3171 
3172  /* truncated_size == original_size means last character fit entirely in 'original_size'
3173  * otherwise assume the last character was truncated */
3174  if (truncated_size > original_size)
3175  {
3176  assert (truncated_size < original_size + INTL_CODESET_MULT (codeset));
3177  assert ((unsigned char) *(cname - char_size) > 0x80);
3178  /* truncate after the last full character */
3179  truncated_size -= char_size;
3180  original_size = truncated_size;
3181  }
3182  name[original_size] = '\0';
3183  }
3184 
3185  /* ensure that lower or upper versions of identifier do not exceed maximum allowed size of an identifier */
3186 #if (INTL_IDENTIFIER_CASING_SIZE_MULTIPLIER > 1)
3187  if (intl_identifier_upper_string_size (name) > ident_max_size
3188  || intl_identifier_lower_string_size (name) > ident_max_size)
3189  {
3190  if (error_on_case_overflow)
3191  {
3192  /* this is grammar context : reject the identifier string */
3193  return ER_GENERIC_ERROR;
3194  }
3195  else
3196  {
3197  /* decrease the initial allowed size and try again */
3198  ident_max_size -= INTL_CODESET_MULT (codeset);
3199  if (ident_max_size <= INTL_CODESET_MULT (codeset))
3200  {
3201  /* we make sure we have room for at least one character */
3202  return ER_GENERIC_ERROR;
3203  }
3204  goto check_truncation;
3205  }
3206  }
3207 #endif
3208 
3209  return NO_ERROR;
3210 }
3211 
3212 /*
3213  * intl_identifier_mht_1strhash - hash a identifier key (in lowercase)
3214  * return: hash value
3215  * key(in): key to hash
3216  * ht_size(in): size of hash table
3217  *
3218  * Note: Charset dependent version of 'mht_1strlowerhashTaken' function
3219  */
3220 unsigned int
3221 intl_identifier_mht_1strlowerhash (const void *key, const unsigned int ht_size)
3222 {
3223  unsigned int hash;
3224  unsigned const char *byte_p = (unsigned char *) key;
3225  unsigned int ch;
3226 
3227  assert (key != NULL);
3228 
3229  switch (lang_charset ())
3230  {
3231  case INTL_CODESET_UTF8:
3232  {
3233  const LANG_LOCALE_DATA *locale = lang_locale ();
3234  const ALPHABET_DATA *alphabet = &(locale->ident_alphabet);
3235  int key_size = strlen ((const char *) key);
3236  unsigned char *next;
3237 
3238  for (hash = 0; key_size > 0;)
3239  {
3240  ch = intl_utf8_to_cp (byte_p, key_size, &next);
3241  if (ch < (unsigned int) (alphabet->l_count))
3242  {
3243  assert (alphabet->lower_multiplier == 1);
3244  ch = alphabet->lower_cp[ch];
3245  }
3246 
3247  key_size -= CAST_STRLEN (next - byte_p);
3248  byte_p = next;
3249 
3250  hash = (hash << 5) - hash + ch;
3251  }
3252  }
3253  break;
3254  case INTL_CODESET_ISO88591:
3255  for (hash = 0; *byte_p; byte_p++)
3256  {
3257  if (char_isupper_iso8859 (*byte_p))
3258  {
3259  ch = char_tolower_iso8859 (*byte_p);
3260  }
3261  else
3262  {
3263  ch = char_tolower (*byte_p);
3264  }
3265  hash = (hash << 5) - hash + ch;
3266  }
3267  break;
3269  for (hash = 0; *byte_p; byte_p++)
3270  {
3271  ch = *byte_p;
3272  hash = (hash << 5) - hash + ch;
3273  }
3274  break;
3276  default:
3277  for (hash = 0; *byte_p; byte_p++)
3278  {
3279  ch = char_tolower (*byte_p);
3280  hash = (hash << 5) - hash + ch;
3281  }
3282  break;
3283  }
3284 
3285  return hash % ht_size;
3286 }
3287 
#if defined (ENABLE_UNUSED_FUNCTION)
/*
 * intl_strncat() - concatenates at most len characters from 'src' to 'dest'
 *   return: for UTF-8, size in bytes of 'dest' after concatenation;
 *           for other codesets, the value of 'len'
 *           NOTE(review): this differs from "number of bytes copied";
 *           left unchanged so existing callers keep seeing the same value
 *   dest(in/out):
 *   src(in);
 *   len(in): length to concatenate (in chars)
 *
 * Note : the NULL terminator is always appended to 'dest';
 *        it is assumed that 'dest' allocated size can fit appended chars
 *
 */
int
intl_strncat (unsigned char *dest, const unsigned char *src, int len)
{
  int result = 0;

  if (lang_charset () == INTL_CODESET_UTF8)
    {
      int copy_len = 0;
      unsigned char *p_dest = dest + strlen ((char *) dest);
      const unsigned char *p_char = NULL;
      int char_len;

      while (*src && copy_len < len)
        {
          if (*src < 0x80)
            {
              *p_dest++ = *src++;
            }
          else
            {
              /* multi-byte character : copy all its bytes at once */
              p_char = src;
              INTL_GET_NEXTCHAR_UTF8 (src, char_len);
              memcpy (p_dest, p_char, char_len);
              p_dest += char_len;
            }
          copy_len++;
        }

      /* the copy loop overwrote the old terminator and does not write a new one itself */
      *p_dest = '\0';

      result = p_dest - dest;
    }
  else
    {
      strncat ((char *) dest, (char *) src, len);
      result = len;
    }

  return result;
}
#endif
3338 
3339 /*
3340  * intl_put_char() - puts a character into a string buffer
3341  * return: size of character
3342  * dest(in/out): destination buffer
3343  * char_p(in): pointer to character
3344  * codeset(in): codeset of character
3345  *
3346  * Note : It is assumed that 'dest' buffer can fit the character.
3347  *
3348  */
3349 int
3350 intl_put_char (unsigned char *dest, const unsigned char *char_p, const INTL_CODESET codeset)
3351 {
3352  int char_len;
3353 
3354  assert (char_p != NULL);
3355 
3356  switch (codeset)
3357  {
3358  case INTL_CODESET_UTF8:
3359  if (*char_p < 0x80)
3360  {
3361  *dest = *char_p;
3362  return 1;
3363  }
3364  else
3365  {
3366  char_len = intl_Len_utf8_char[*char_p];
3367  memcpy (dest, char_p, char_len);
3368  return char_len;
3369  }
3370  break;
3371 
3373  (void) intl_nextchar_euc (char_p, &char_len);
3374  memcpy (dest, char_p, char_len);
3375  return char_len;
3376 
3377  case INTL_CODESET_ISO88591:
3379  default:
3380  *dest = *char_p;
3381  return 1;
3382  }
3383 
3384  return 1;
3385 }
3386 
3387 
3388 /*
3389  * intl_is_space() - checks if character is white-space
3390  * return:
3391  * str(in):
3392  * str_end(in): end of string (pointer to first character after last
3393  * character of string) or NULL if str is null terminated
3394  * codeset(in): codeset of string
3395  * space_size(out): size in bytes of 'whitespace' character
3396  *
3397  * Note : White spaces are: ASCII space, TAB character, CR and LF
3398  * If codeset is EUC also the double byte character space (A1 A1) is
3399  * considered;
3400  *
3401  */
3402 bool
3403 intl_is_space (const char *str, const char *str_end, const INTL_CODESET codeset, int *space_size)
3404 {
3405  assert (str != NULL);
3406 
3407  if (space_size != NULL)
3408  {
3409  *space_size = 1;
3410  }
3411 
3412  switch (codeset)
3413  {
3415  if (str_end == NULL)
3416  {
3417  if (*((unsigned char *) str) == 0xa1 && *((unsigned char *) (str + 1)) == 0xa1)
3418  {
3419  if (space_size != NULL)
3420  {
3421  *space_size = 2;
3422  }
3423  return true;
3424  }
3425  else if (char_isspace (*str))
3426  {
3427  return true;
3428  }
3429  }
3430  else
3431  {
3432  if (str < str_end)
3433  {
3434  if (*((const unsigned char *) str) == 0xa1 && str + 1 < str_end
3435  && *((const unsigned char *) (str + 1)) == 0xa1)
3436  {
3437  if (space_size != NULL)
3438  {
3439  *space_size = 2;
3440  }
3441  return true;
3442  }
3443  else if (char_isspace (*str))
3444  {
3445  return true;
3446  }
3447  }
3448  }
3449  break;
3450  case INTL_CODESET_UTF8:
3451  case INTL_CODESET_ISO88591:
3453  default:
3454  if (str_end == NULL)
3455  {
3456  if (char_isspace (*str))
3457  {
3458  return true;
3459  }
3460  }
3461  else
3462  {
3463  if (str < str_end && char_isspace (*str))
3464  {
3465  return true;
3466  }
3467  }
3468  break;
3469  }
3470 
3471  return false;
3472 }
3473 
3474 /*
3475  * intl_skip_spaces() - skips white spaces in string
3476  * return: begining of non-whitespace characters or end of string
3477  * str(in):
3478  * str_end(in): end of string (pointer to first character after last
3479  * character of string) or NULL if str is null terminated
3480  * codeset(in): codeset of string
3481  *
3482  * Note : White spaces are: ASCII space, TAB character, CR and LF
3483  * If codeset is EUC also the double byte character space (A1 A1) is
3484  * considered;
3485  *
3486  */
3487 const char *
3488 intl_skip_spaces (const char *str, const char *str_end, const INTL_CODESET codeset)
3489 {
3490  assert (str != NULL);
3491 
3492  switch (codeset)
3493  {
3495  if (str_end == NULL)
3496  {
3497  while (*str != '\0')
3498  {
3499  if (*((unsigned char *) str) == 0xa1 && *((unsigned char *) (str + 1)) == 0xa1)
3500  {
3501  str++;
3502  str++;
3503  }
3504  else if (char_isspace (*str))
3505  {
3506  str++;
3507  }
3508  else
3509  {
3510  break;
3511  }
3512  }
3513  }
3514  else
3515  {
3516  while (str < str_end)
3517  {
3518  if (*((const unsigned char *) str) == 0xa1 && str + 1 < str_end
3519  && *((const unsigned char *) (str + 1)) == 0xa1)
3520  {
3521  str++;
3522  str++;
3523  }
3524  else if (char_isspace (*str))
3525  {
3526  str++;
3527  }
3528  else
3529  {
3530  break;
3531  }
3532  }
3533  }
3534  break;
3535  case INTL_CODESET_UTF8:
3536  case INTL_CODESET_ISO88591:
3538  default:
3539  if (str_end == NULL)
3540  {
3541  while (char_isspace (*str))
3542  {
3543  str++;
3544  }
3545  }
3546  else
3547  {
3548  while (str < str_end && char_isspace (*str))
3549  {
3550  str++;
3551  }
3552  }
3553  break;
3554  }
3555 
3556  return str;
3557 }
3558 
3559 /*
3560  * intl_backskip_spaces() - skips trailing white spaces in end of string
3561  * return: end of non-whitespace characters or end of string
3562  * str_begin(in): start of string
3563  * str_end(in): end of string (pointer to last character)
3564  * codeset(in): codeset of string
3565  *
3566  * Note : White spaces are: ASCII space, TAB character, CR and LF
3567  * If codeset is EUC also the double byte character space (A1 A1) is
3568  * considered;
3569  *
3570  */
3571 const char *
3572 intl_backskip_spaces (const char *str_begin, const char *str_end, const INTL_CODESET codeset)
3573 {
3574  assert (str_begin != NULL);
3575  assert (str_end != NULL);
3576 
3577  switch (codeset)
3578  {
3580  while (str_end > str_begin)
3581  {
3582  if (*((const unsigned char *) str_end) == 0xa1 && str_end - 1 > str_begin
3583  && *((const unsigned char *) (str_end - 1)) == 0xa1)
3584  {
3585  str_end--;
3586  str_end--;
3587  }
3588  else if (char_isspace (*str_end))
3589  {
3590  str_end--;
3591  }
3592  else
3593  {
3594  break;
3595  }
3596  }
3597  break;
3598  case INTL_CODESET_UTF8:
3599  case INTL_CODESET_ISO88591:
3601  default:
3602  while (str_end > str_begin && char_isspace (*str_end))
3603  {
3604  str_end++;
3605  }
3606  break;
3607  }
3608 
3609  return str_end;
3610 }
3611 
/*
 * intl_cp_to_utf8() - encodes a unicode codepoint as a UTF-8 sequence
 *   return: number of bytes written (1 to 4); a codepoint above 0x10ffff
 *           is written as a single '?' byte
 *   codepoint(in) : Unicode code point (32 bit value)
 *   utf8_seq(in/out) : pre-allocated buffer for UTF-8 sequence
 *
 */
int
intl_cp_to_utf8 (const unsigned int codepoint, unsigned char *utf8_seq)
{
  unsigned int rest = codepoint;
  unsigned int lead_mark;
  int seq_len;
  int pos;

  assert (utf8_seq != NULL);

  if (codepoint < 0x80)
    {
      /* ASCII is encoded as it is */
      utf8_seq[0] = (unsigned char) codepoint;
      return 1;
    }

  /* choose the sequence length and the marker bits of the leading byte */
  if (codepoint < 0x800)
    {
      seq_len = 2;
      lead_mark = 0xc0;
    }
  else if (codepoint < 0x10000)
    {
      seq_len = 3;
      lead_mark = 0xe0;
    }
  else if (codepoint < 0x110000)
    {
      seq_len = 4;
      lead_mark = 0xf0;
    }
  else
    {
      /* outside of the Unicode range */
      assert (false);
      utf8_seq[0] = '?';
      return 1;
    }

  /* trailing bytes carry 6 bits each, filled from the last byte towards the first */
  for (pos = seq_len - 1; pos > 0; pos--)
    {
      utf8_seq[pos] = (unsigned char) (0x80 | (rest & 0x3f));
      rest >>= 6;
    }

  /* the bits left over go into the leading byte */
  utf8_seq[0] = (unsigned char) (lead_mark | rest);

  return seq_len;
}
3660 
/*
 * intl_cp_to_dbcs() - encodes a codepoint as a DBCS sequence
 *   return: number of bytes written (1 or 2); a value which cannot be
 *           encoded is written as a single '?' byte
 *   codepoint(in) : code point (16 bit value)
 *   byte_flag(in): flag array : 0: single byte char,
 *                               1: is a leading byte for DBCS,
 *                               2: byte value not used
 *   seq(in/out) : pre-allocated buffer for DBCS sequence
 *
 */
int
intl_cp_to_dbcs (const unsigned int codepoint, const unsigned char *byte_flag, unsigned char *seq)
{
  assert (seq != NULL);

  /* byte_flag is assumed to have 256 elements */
  assert (byte_flag != NULL);

  if (codepoint > 0xffff)
    {
      /* does not fit in two bytes */
      assert (false);
      seq[0] = '?';
      return 1;
    }

  if (codepoint > 0xff)
    {
      /* 2 bytes : high byte first */
      seq[0] = (unsigned char) ((codepoint >> 8) & 0xff);
      seq[1] = (unsigned char) (codepoint & 0xff);
      return 2;
    }

  /* 1 byte : only values flagged as single byte chars are valid, lead bytes and unused values are replaced */
  seq[0] = (byte_flag[codepoint] == 0) ? (unsigned char) codepoint : (unsigned char) '?';

  return 1;
}
3705 
/*
 * intl_utf8_to_cp() - decodes one UTF-8 encoded char into its unicode codepoint
 *   return: unicode code point; 0xffffffff means error
 *   utf8(in) : buffer for UTF-8 char
 *   size(in) : size of buffer
 *   next_char(in/out): pointer to next character
 *
 *  Note : the sequence length is taken from the first byte only; the payload bits of the following bytes are
 *         used without checking their upper two bits. On error (unexpected first byte, or buffer shorter
 *         than the sequence) next_char is set one byte past utf8.
 */
unsigned int
intl_utf8_to_cp (const unsigned char *utf8, const int size, unsigned char **next_char)
{
  unsigned int lead;
  unsigned int cp = 0;
  int seq_len = 0;
  int k;

  assert (utf8 != NULL);
  assert (size > 0);
  assert (next_char != NULL);

  lead = utf8[0];

  if (lead < 0x80)
    {
      /* single byte char */
      *next_char = (unsigned char *) utf8 + 1;
      return lead;
    }

  /* the first byte gives the length of the sequence and the most significant payload bits */
  if (lead < 0xc0)
    {
      seq_len = 0;
    }
  else if (lead < 0xe0)
    {
      seq_len = 2;
      cp = lead & 0x1f;
    }
  else if (lead < 0xf0)
    {
      seq_len = 3;
      cp = lead & 0x0f;
    }
  else if (lead < 0xf8)
    {
      seq_len = 4;
      cp = lead & 0x07;
    }
#if INTL_UTF8_MAX_CHAR_SIZE > 4
  else if (lead < 0xfc)
    {
      seq_len = 5;
      cp = lead & 0x03;
    }
  else if (lead < 0xfe)
    {
      seq_len = 6;
      cp = lead & 0x01;
    }
#endif
  else
    {
      seq_len = 0;
    }

  if (seq_len == 0 || size < seq_len)
    {
      *next_char = (unsigned char *) utf8 + 1;
      return 0xffffffff;
    }

  /* append 6 bits from each following byte */
  for (k = 1; k < seq_len; k++)
    {
      cp = (cp << 6) | (utf8[k] & 0x3f);
    }

  *next_char = (unsigned char *) utf8 + seq_len;
  return cp;
}
3760 
3761 /*
3762  * intl_back_utf8_to_cp() - converts a UTF-8 encoded char to unicode codepoint
3763  * but starting from the last byte of a character
3764  * return: unicode code point; 0xffffffff means error
3765  *
3766  * utf8_start(in) : start of buffer
3767  * utf8_last(in) : pointer to last byte of buffer (and last byte of last
3768  * character)
3769  * last_byte__prev_char(in/out) : pointer to last byte of previous character
3770  *
3771  */
3772 unsigned int
3773 intl_back_utf8_to_cp (const unsigned char *utf8_start, const unsigned char *utf8_last,
3774  unsigned char **last_byte__prev_char)
3775 {
3776  int char_size = 1;
3777  unsigned char *dummy;
3778 
3779  assert (utf8_start != NULL);
3780  assert (utf8_last != NULL);
3781  assert (utf8_start <= utf8_last);
3782  assert (last_byte__prev_char != NULL);
3783 
3784  if (*utf8_last < 0x80)
3785  {
3786  *last_byte__prev_char = ((unsigned char *) utf8_last) - 1;
3787  return *utf8_last;
3788  }
3789 
3790  /* multibyte character */
3791  do
3792  {
3793  if (((*utf8_last--) & 0xc0) != 0x80)
3794  {
3795  break;
3796  }
3797  if (utf8_last < utf8_start)
3798  {
3799  /* broken char, invalid CP */
3800  *last_byte__prev_char = ((unsigned char *) utf8_start) - 1;
3801  return 0xffffffff;
3802  }
3803  }
3804  while (++char_size < INTL_UTF8_MAX_CHAR_SIZE);
3805 
3806  *last_byte__prev_char = (unsigned char *) utf8_last;
3807  return intl_utf8_to_cp (utf8_last + 1, char_size, &dummy);
3808 }
3809 
/*
 * intl_dbcs_to_cp() - reads one DBCS encoded char and returns its DBCS codepoint
 *   return: DBCS code point
 *   seq(in) : buffer for DBCS char
 *   size(in) : size of buffer
 *   byte_flag(in) : array of flags for lead bytes (indexed by byte value)
 *   next_char(in/out): pointer to next character
 *
 *  Note : when the first byte is flagged as a lead byte (flag 1) and a second byte is available, the codepoint
 *         is (first << 8) | second; in any other case the first byte alone is returned.
 */
unsigned int
intl_dbcs_to_cp (const unsigned char *seq, const int size, const unsigned char *byte_flag, unsigned char **next_char)
{
  unsigned int cp;
  int consumed = 1;

  assert (seq != NULL);
  assert (size > 0);
  assert (next_char != NULL);

  assert (byte_flag != NULL);

  cp = seq[0];
  if (size >= 2 && byte_flag[seq[0]] == 1)
    {
      /* lead byte followed by its second byte */
      cp = (cp << 8) | seq[1];
      consumed = 2;
    }

  *next_char = (unsigned char *) seq + consumed;
  return cp;
}
3837 
3838 
/*
 * intl_utf8_to_cp_list() - converts a UTF-8 encoded string to a list of
 *                          unicode codepoints
 *   return: number of codepoints found in string (may exceed max_array_size)
 *   utf8(in) : buffer for UTF-8 string
 *   size(in) : size of string buffer
 *   cp_array(in/out) : preallocated array to store computed codepoints list
 *   max_array_size(in) : maximum size of computed codepoints list
 *   array_count(out) : number of elements stored in codepoints list
 *
 *  Note : the whole string is always scanned; codepoints beyond max_array_size are counted but not stored.
 */
int
intl_utf8_to_cp_list (const unsigned char *utf8, const int size, unsigned int *cp_array, const int max_array_size,
                      int *array_count)
{
  const unsigned char *cursor = utf8;
  const unsigned char *str_end = utf8 + size;
  unsigned char *after = NULL;
  int cp_total = 0;

  assert (utf8 != NULL);
  assert (size > 0);
  assert (cp_array != NULL);
  assert (max_array_size > 0);
  assert (array_count != NULL);

  *array_count = 0;

  while (cursor < str_end)
    {
      unsigned int cp = intl_utf8_to_cp (cursor, CAST_STRLEN (str_end - cursor), &after);

      if (cp_total < max_array_size)
        {
          cp_array[cp_total] = cp;
          *array_count += 1;
        }

      cursor = after;
      cp_total++;
    }

  return cp_total;
}
3881 
/* non-zero when b lies in the closed interval [r1, r2]; arguments are parenthesized so that expressions may be
 * passed (note that b may be evaluated twice) */
#define UTF8_BYTE_IN_RANGE(b, r1, r2) (!((b) < (r1) || (b) > (r2)))
3883 
/*
 * intl_check_utf8 - Checks if a string contains valid UTF-8 sequences
 *
 *   return: 0 if valid,
 *           1 if it contains an invalid byte in one char
 *           2 if last char is truncated (missing bytes)
 *   buf(in): buffer
 *   size(in): size of buffer (negative values accepted, in this case buffer
 *             is assumed to be NUL terminated)
 *   pos(out): pointer to beginning of invalid character
 *
 *  Valid ranges:
 *    - 1 byte : 00 - 7F
 *    - 2 bytes: C2 - DF , 80 - BF                       (U+80 .. U+7FF)
 *    - 3 bytes: E0      , A0 - BF , 80 - BF             (U+800 .. U+FFF)
 *               E1 - EC , 80 - BF , 80 - BF             (U+1000 .. U+CFFF)
 *               ED      , 80 - 9F , 80 - BF             (U+D000 .. U+D7FF)
 *               EE - EF , 80 - BF , 80 - BF             (U+E000 .. U+FFFF)
 *    - 4 bytes: F0      , 90 - BF , 80 - BF , 80 - BF   (U+10000 .. U+3FFFF)
 *               F1 - F3 , 80 - BF , 80 - BF , 80 - BF   (U+40000 .. U+FFFFF)
 *               F4      , 80 - 8F , 80 - BF , 80 - BF   (U+100000 .. U+10FFFF)
 *
 *  Note:
 *  This function should be used only when the UTF-8 string enters the CUBRID
 *  system.
 */
3911 intl_check_utf8 (const unsigned char *buf, int size, char **pos)
3912 {
3913 #define OUTPUT(charp_out) if (pos != NULL) *pos = (char *) charp_out
3914 
3915  const unsigned char *p = buf;
3916  const unsigned char *p_end = NULL;
3917  const unsigned char *curr_char = NULL;
3918 
3919  if (pos != NULL)
3920  {
3921  *pos = NULL;
3922  }
3923 
3924  if (size < 0)
3925  {
3926  size = strlen ((char *) buf);
3927  }
3928 
3929  p_end = buf + size;
3930 
3931  while (p < p_end)
3932  {
3933  curr_char = p;
3934 
3935  if (*p < 0x80)
3936  {
3937  p++;
3938  continue;
3939  }
3940 
3941  /* range 80 - BF is not valid UTF-8 first byte */
3942  /* range C0 - C1 overlaps 1 byte 00 - 20 (2 byte overflow) */
3943  if (*p < 0xc2)
3944  {
3945  OUTPUT (curr_char);
3946  return INTL_UTF8_INVALID;
3947  }
3948 
3949  /* check 2 bytes sequences */
3950  /* 2 bytes sequence allowed : C2 - DF , 80 - BF */
3951  if (UTF8_BYTE_IN_RANGE (*p, 0xc2, 0xdf))
3952  {
3953  p++;
3954  if (p >= p_end)
3955  {
3956  OUTPUT (curr_char);
3957  return INTL_UTF8_TRUNCATED;
3958  }
3959 
3960  if (UTF8_BYTE_IN_RANGE (*p, 0x80, 0xbf))
3961  {
3962  p++;
3963  continue;
3964  }
3965  OUTPUT (curr_char);
3966  return INTL_UTF8_INVALID;
3967  }
3968 
3969  /* check 3 bytes sequences */
3970  /* 3 bytes sequence : E0 , A0 - BF , 80 - BF */
3971  if (*p == 0xe0)
3972  {
3973  p++;
3974  if (p >= p_end)
3975  {
3976  OUTPUT (curr_char);
3977  return INTL_UTF8_TRUNCATED;
3978  }
3979 
3980  if (UTF8_BYTE_IN_RANGE (*p, 0xa0, 0xbf))
3981  {
3982  p++;
3983  if (p >= p_end)
3984  {
3985  OUTPUT (curr_char);
3986  return INTL_UTF8_TRUNCATED;
3987  }
3988 
3989  if (UTF8_BYTE_IN_RANGE (*p, 0x80, 0xbf))
3990  {
3991  p++;
3992  continue;
3993  }
3994  }
3995 
3996  OUTPUT (curr_char);
3997  return INTL_UTF8_INVALID;
3998  }
3999  /* 3 bytes sequence : E1 - EC , 80 - BF , 80 - BF */
4000  /* 3 bytes sequence : EE - EF , 80 - BF , 80 - BF */
4001  else if (UTF8_BYTE_IN_RANGE (*p, 0xe1, 0xec) || UTF8_BYTE_IN_RANGE (*p, 0xee, 0xef))
4002  {
4003  p++;
4004  if (p >= p_end)
4005  {
4006  OUTPUT (curr_char);
4007  return INTL_UTF8_TRUNCATED;
4008  }
4009 
4010  if (UTF8_BYTE_IN_RANGE (*p, 0x80, 0xbf))
4011  {
4012  p++;
4013  if (p >= p_end)
4014  {
4015  OUTPUT (curr_char);
4016  return INTL_UTF8_TRUNCATED;
4017  }
4018 
4019  if (UTF8_BYTE_IN_RANGE (*p, 0x80, 0xbf))
4020  {
4021  p++;
4022  continue;
4023  }
4024  }
4025  OUTPUT (curr_char);
4026  return INTL_UTF8_INVALID;
4027  }
4028  /* 3 bytes sequence : ED , 80 - 9F , 80 - BF */
4029  else if (*p == 0xed)
4030  {
4031  p++;
4032  if (p >= p_end)
4033  {
4034  OUTPUT (curr_char);
4035  return INTL_UTF8_TRUNCATED;
4036  }
4037 
4038  if (UTF8_BYTE_IN_RANGE (*p, 0x80, 0x9f))
4039  {
4040  p++;
4041  if (p >= p_end)
4042  {
4043  OUTPUT (curr_char);
4044  return INTL_UTF8_TRUNCATED;
4045  }
4046 
4047  if (UTF8_BYTE_IN_RANGE (*p, 0x80, 0xbf))
4048  {
4049  p++;
4050  continue;
4051  }
4052  }
4053  OUTPUT (curr_char);
4054  return INTL_UTF8_INVALID;
4055  }
4056 
4057  /* 4 bytes sequence : F0 , 90 - BF , 80 - BF , 80 - BF */
4058  if (*p == 0xf0)
4059  {
4060  p++;
4061  if (p >= p_end)
4062  {
4063  OUTPUT (curr_char);
4064  return INTL_UTF8_TRUNCATED;
4065  }
4066 
4067  if (UTF8_BYTE_IN_RANGE (*p, 0x90, 0xbf))
4068  {
4069  p++;
4070  if (p >= p_end)
4071  {
4072  OUTPUT (curr_char);
4073  return INTL_UTF8_TRUNCATED;
4074  }
4075 
4076  if (UTF8_BYTE_IN_RANGE (*p, 0x80, 0xbf))
4077  {
4078  p++;
4079  if (p >= p_end)
4080  {
4081  OUTPUT (curr_char);
4082  return INTL_UTF8_TRUNCATED;
4083  }
4084 
4085  if (UTF8_BYTE_IN_RANGE (*p, 0x80, 0xbf))
4086  {
4087  p++;
4088  continue;
4089  }
4090  }
4091  }
4092  OUTPUT (curr_char);
4093  return INTL_UTF8_INVALID;
4094  }
4095  /* 4 bytes sequence : F1 - F3 , 80 - BF , 80 - BF , 80 - BF */
4096  if (UTF8_BYTE_IN_RANGE (*p, 0xf1, 0xf3))
4097  {
4098  p++;
4099  if (p >= p_end)
4100  {
4101  OUTPUT (curr_char);
4102  return INTL_UTF8_TRUNCATED;
4103  }
4104 
4105  if (UTF8_BYTE_IN_RANGE (*p, 0x80, 0xbf))
4106  {
4107  p++;
4108  if (p >= p_end)
4109  {
4110  OUTPUT (curr_char);
4111  return INTL_UTF8_TRUNCATED;
4112  }
4113 
4114  if (UTF8_BYTE_IN_RANGE (*p, 0x80, 0xbf))
4115  {
4116  p++;
4117  if (p >= p_end)
4118  {
4119  OUTPUT (curr_char);
4120  return INTL_UTF8_TRUNCATED;
4121  }
4122 
4123  if (UTF8_BYTE_IN_RANGE (*p, 0x80, 0xbf))
4124  {
4125  p++;
4126  continue;
4127  }
4128  }
4129  }
4130  OUTPUT (curr_char);
4131  return INTL_UTF8_INVALID;
4132  }
4133  /* 4 bytes sequence : F4 , 80 - 8F , 80 - BF , 80 - BF */
4134  else if (*p == 0xf4)
4135  {
4136  p++;
4137  if (p >= p_end)
4138  {
4139  OUTPUT (curr_char);
4140  return INTL_UTF8_TRUNCATED;
4141  }
4142 
4143  if (UTF8_BYTE_IN_RANGE (*p, 0x80, 0x8f))
4144  {
4145  p++;
4146  if (p >= p_end)
4147  {
4148  OUTPUT (curr_char);
4149  return INTL_UTF8_TRUNCATED;
4150  }
4151 
4152  if (UTF8_BYTE_IN_RANGE (*p, 0x80, 0xbf))
4153  {
4154  p++;
4155  if (p >= p_end)
4156  {
4157  OUTPUT (curr_char);
4158  return INTL_UTF8_TRUNCATED;
4159  }
4160 
4161  if (UTF8_BYTE_IN_RANGE (*p, 0x80, 0xbf))
4162  {
4163  p++;
4164  continue;
4165  }
4166  }
4167  }
4168  OUTPUT (curr_char);
4169  return INTL_UTF8_INVALID;
4170  }
4171 
4172  assert (*p > 0xf4);
4173  OUTPUT (curr_char);
4174  return INTL_UTF8_INVALID;
4175  }
4176 
4177  return INTL_UTF8_VALID;
4178 
4179 #undef OUTPUT
4180 }
4181 
/*
 * intl_check_euckr - Checks if a string contains valid EUC-KR sequences
 *
 *
 *   return: 0 if valid,
 *           1 if it contains an invalid byte in one char
 *           2 if last char is truncated (missing bytes)
 *   buf(in): buffer
 *   size(in): size of buffer (negative values accepted, in this case buffer is assumed to be NUL terminated)
 *   pos(out): pointer to beginning of invalid character
 *
 *  Valid ranges:
 *    - 1 byte : 00 - 8E ; 90 - A0
 *    - 2 bytes: A1 - FE , 00 - FF
 *    - 3 bytes: 8F      , 00 - FF , 00 - FF
 */
4199 intl_check_euckr (const unsigned char *buf, int size, char **pos)
4200 {
4201 #define OUTPUT(charp_out) if (pos != NULL) *pos = (char *) charp_out
4202 
4203  const unsigned char *p = buf;
4204  const unsigned char *p_end = NULL;
4205  const unsigned char *curr_char = NULL;
4206 
4207  if (pos != NULL)
4208  {
4209  *pos = NULL;
4210  }
4211 
4212  if (size < 0)
4213  {
4214  size = strlen ((char *) buf);
4215  }
4216 
4217  p_end = buf + size;
4218 
4219  while (p < p_end)
4220  {
4221  curr_char = p;
4222 
4223  if (*p < 0x80)
4224  {
4225  p++;
4226  continue;
4227  }
4228 
4229  /* SS3 byte value starts a 3 bytes character */
4230  if (*p == SS3)
4231  {
4232  p++;
4233  p++;
4234  p++;
4235  if (p > p_end)
4236  {
4237  OUTPUT (curr_char);
4238  return INTL_UTF8_TRUNCATED;
4239  }
4240  continue;
4241  }
4242 
4243  /* check 2 bytes sequences */
4244  if (UTF8_BYTE_IN_RANGE (*p, 0xa1, 0xfe))
4245  {
4246  p++;
4247  p++;
4248  if (p > p_end)
4249  {
4250  OUTPUT (curr_char);
4251  return INTL_UTF8_TRUNCATED;
4252  }
4253  continue;
4254  }
4255 
4256  OUTPUT (curr_char);
4257  return INTL_UTF8_INVALID;
4258  }
4259 
4260  return INTL_UTF8_VALID;
4261 
4262 #undef OUTPUT
4263 }
4264 
/*
 * intl_check_string - Checks if a string contains valid sequences in current codeset
 *
 *   return: 0 - if valid, non-zero otherwise : 1 - if invalid byte in char
 *                                              2 - if last char is truncated
 *   buf(in): buffer
 *   size(in): size of buffer (negative values accepted, in this case buffer
 *             is assumed to be NUL terminated)
 *   pos(out): passed through to the codeset specific check function
 *   codeset(in): codeset assumed for buf
 */
intl_check_string (const char *buf, int size, char **pos, const INTL_CODESET codeset)
{
  /* NOTE(review): as listed, the block below is not guarded by any condition, so the function would always
   * report the string as valid. The comment inside refers to a global variable, which suggests that a guarding
   * test is missing from this listing - confirm against the original file. */
  {
    // this function is currently used either in client-modes or for loaddb. if it will be used in other server-mode
    // contexts, that can impact the result of queries, global variable should be replaced with a session parameter.
    return INTL_UTF8_VALID;
  }

  /* dispatch to the check function of the codeset */
  switch (codeset)
    {
    case INTL_CODESET_UTF8:
      return intl_check_utf8 ((const unsigned char *) buf, size, pos);

      /* NOTE(review): no case label is visible before this return, so it cannot be reached as listed;
       * presumably a label for the EUC-KR codeset belongs here - confirm. */
      return intl_check_euckr ((const unsigned char *) buf, size, pos);

    default:
      /* other codesets are not checked */
      break;
    }

  return INTL_UTF8_VALID;
}
4300 
4301 #if !defined (SERVER_MODE)
/*
 * intl_is_bom_magic - tells whether the buffer begins with the BOM magic for UTF-8 (bytes EF BB BF)
 *
 *   return: true if BOM, false otherwise
 *   buf(in): buffer
 *   size(in): size of buffer (negative means buffer is NUL terminated)
 */
bool
intl_is_bom_magic (const char *buf, const int size)
{
  const char bom_bytes[] = { (char) 0xef, (char) 0xbb, (char) 0xbf };

  if (size < 0)
    {
      /* NUL terminated buffer : bytes are compared one at a time, left to right, so that nothing is read
       * after the first mismatch (the terminator of a short string is such a mismatch) */
      return (buf[0] == bom_bytes[0] && buf[1] == bom_bytes[1] && buf[2] == bom_bytes[2]);
    }

  if (size < 3)
    {
      /* too short to hold the magic */
      return false;
    }

  return memcmp (buf, bom_bytes, 3) == 0;
}
4327 #endif /* SERVER_MODE */
4328 
4329 /* UTF-8 to console routines */
4330 
/*
 * intl_text_single_byte_to_utf8() - converts a buffer containing text with ISO
 *                                   8859-X encoding to UTF-8, using the text
 *                                   conversion returned by lang_get_txt_conv ()
 *
 *   return: error code
 *   in_buf(in): buffer
 *   in_size(in): size of input string (NUL terminator not included)
 *   out_buf(in/out) : output buffer : uses the pre-allocated buffer passed
 *                     as input or a new allocated buffer; NULL if conversion
 *                     is not required
 *   out_size(out): size of string (NUL terminator not included)
 *
 *  Note : the work is done by intl_text_single_byte_to_utf8_ext ().
 */
int
intl_text_single_byte_to_utf8 (const char *in_buf, const int in_size, char **out_buf, int *out_size)
{
  const unsigned char *src = (const unsigned char *) in_buf;
  unsigned char **dest = (unsigned char **) out_buf;

  return intl_text_single_byte_to_utf8_ext (lang_get_txt_conv (), src, in_size, dest, out_size);
}
4349 
4350 /*
4351  * intl_text_single_byte_to_utf8_ext() - converts a buffer containing text
4352  * with ISO 8859-X encoding to UTF-8
4353  *
4354  * return: error code
4355  * t(in): text conversion data
4356  * in_buf(in): buffer
4357  * in_size(in): size of input string (NUL terminator not included)
4358  * out_buf(in/out) : output buffer : uses the pre-allocated buffer passed
4359  * as input or a new allocated buffer; NULL if conversion
4360  * is not required
4361  * out_size(in/out): size of string (NUL terminator not included)
4362  */
4363 int
4364 intl_text_single_byte_to_utf8_ext (void *t, const unsigned char *in_buf, const int in_size, unsigned char **out_buf,
4365  int *out_size)
4366 {
4367 
4368  const unsigned char *p_in = NULL;
4369  unsigned char *p_out = NULL;
4370  TEXT_CONVERSION *txt_conv;
4371  bool is_ascii = true;
4372 
4373  assert (in_buf != NULL);
4374  assert (out_buf != NULL);
4375  assert (out_size != NULL);
4376  assert (t != NULL);
4377 
4378  txt_conv = (TEXT_CONVERSION *) t;
4379 
4380  p_in = in_buf;
4381  while (p_in < in_buf + in_size)
4382  {
4383  if (*p_in++ >= 0x80)
4384  {
4385  is_ascii = false;
4386  break;
4387  }
4388  }
4389 
4390  if (is_ascii)
4391  {
4392  *out_buf = NULL;
4393  return NO_ERROR;
4394  }
4395 
4396  if (*out_buf == NULL)
4397  {
4398  /* a ISO8859-X character is encoded on maximum 2 bytes in UTF-8 */
4399  *out_buf = (unsigned char *) malloc (in_size * 2 + 1);
4400  if (*out_buf == NULL)
4401  {
4402  er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_OUT_OF_VIRTUAL_MEMORY, 1, (size_t) (in_size * 2 + 1));
4403  return ER_OUT_OF_VIRTUAL_MEMORY;
4404  }
4405  }
4406  else
4407  {
4408  if (*out_size < in_size * 2 + 1)
4409  {
4411  return ER_GENERIC_ERROR;
4412  }
4413  }
4414 
4415  assert (txt_conv->text_last_cp > 0);
4416  for (p_in = in_buf, p_out = *out_buf; p_in < in_buf + in_size; p_in++)
4417  {
4418  if (*p_in >= txt_conv->text_first_cp && *p_in <= txt_conv->text_last_cp)
4419  {
4420  unsigned char *utf8_bytes = txt_conv->text_to_utf8[*p_in - txt_conv->text_first_cp].bytes;
4421  int utf8_size = txt_conv->text_to_utf8[*p_in - txt_conv->text_first_cp].size;
4422 
4423  do
4424  {
4425  *p_out++ = *utf8_bytes++;
4426  }
4427  while (--utf8_size > 0);
4428  }
4429  else
4430  {
4431  if (*p_in < 0x80)
4432  {
4433  *p_out++ = *p_in;
4434  }
4435  else
4436  {
4437  assert (false);
4438  *p_out++ = '?';
4439  }
4440  }
4441  }
4442 
4443  *(p_out) = '\0';
4444  *out_size = CAST_STRLEN (p_out - *(out_buf));
4445 
4446  return NO_ERROR;
4447 }
4448 
4449 /*
4450  * intl_text_utf8_to_single_byte() - converts a buffer containing UTF-8 text
4451  * to ISO 8859-X encoding
4452  *
4453  * return: error code
4454  * in_buf(in): buffer
4455  * in_size(in): size of input string (NUL terminator not included)
4456  * out_buf(in/out) : output buffer : uses the pre-allocated buffer passed
4457  * as input or a new allocated buffer; NULL if conversion
4458  * is not required
4459  * out_size(in/out): size of output string (NUL terminator not counted)
4460  */
4461 int
4462 intl_text_utf8_to_single_byte (const char *in_buf, const int in_size, char **out_buf, int *out_size)
4463 {
4464  const unsigned char *p_in = NULL;
4465  unsigned char *p_out = NULL;
4466  unsigned char *p_next = NULL;
4467  TEXT_CONVERSION *txt_conv = lang_get_txt_conv ();
4468  bool is_ascii = true;
4469 
4470  assert (in_buf != NULL);
4471  assert (out_buf != NULL);
4472  assert (out_size != NULL);
4473  assert (txt_conv != NULL);
4474 
4475  p_in = (const unsigned char *) in_buf;
4476  while (p_in < (const unsigned char *) in_buf + in_size)
4477  {
4478  if (*p_in++ >= 0x80)
4479  {
4480  is_ascii = false;
4481  break;
4482  }
4483  }
4484 
4485  if (is_ascii)
4486  {
4487  *out_buf = NULL;
4488  return NO_ERROR;
4489  }
4490 
4491  if (*out_buf == NULL)
4492  {
4493  *out_buf = (char *) malloc (in_size + 1);
4494  if (*out_buf == NULL)
4495  {
4496  er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_OUT_OF_VIRTUAL_MEMORY, 1, (size_t) (in_size + 1));
4497  return ER_OUT_OF_VIRTUAL_MEMORY;
4498  }
4499  }
4500  else
4501  {
4502  if (*out_size < in_size + 1)
4503  {
4505  return ER_GENERIC_ERROR;
4506  }
4507  }
4508 
4509  for (p_in = (const unsigned char *) in_buf, p_out = (unsigned char *) *out_buf;
4510  p_in < (const unsigned char *) in_buf + in_size;)
4511  {
4512  unsigned int cp = 0;
4513 
4514  if (*p_in < 0x80)
4515  {
4516  *p_out++ = *p_in++;
4517  continue;
4518  }
4519 
4520  cp = intl_utf8_to_cp (p_in, CAST_STRLEN (in_buf + in_size - (char *) p_in), &p_next);
4521  if (cp >= txt_conv->utf8_first_cp && cp <= txt_conv->utf8_last_cp)
4522  {
4523  assert (txt_conv->utf8_to_text[cp - txt_conv->utf8_first_cp].size == 1);
4524  cp = (unsigned int) *(txt_conv->utf8_to_text[cp - txt_conv->utf8_first_cp].bytes);
4525  }
4526 
4527  if (cp > 0xff)
4528  {
4529  *p_out++ = '?';
4530  }
4531  else
4532  {
4533  *p_out++ = (unsigned char) cp;
4534  }
4535  p_in = p_next;
4536  }
4537 
4538  *(p_out) = '\0';
4539  *out_size = CAST_STRLEN (p_out - (unsigned char *) *(out_buf));
4540 
4541  return NO_ERROR;
4542 }
4543 
4544 /*
4545  * intl_init_conv_iso8859_1_to_utf8() - initializes conversion map from
4546  * ISO 8859-1 (Latin 1) to UTF-8
4547  * return:
4548  */
4549 static void
4551 {
4552  unsigned int i;
4553 
4554  /* 00 - 7E : mapped to ASCII */
4555  for (i = 0; i <= 0x7e; i++)
4556  {
4557  iso8859_1_To_utf8_conv[i].size = 1;
4558  *((unsigned char *) (iso8859_1_To_utf8_conv[i].bytes)) = (unsigned char) i;
4559  }
4560 
4561  /* 7F - 9F : not mapped */
4562  for (i = 0x7f; i <= 0x9f; i++)
4563  {
4564  iso8859_1_To_utf8_conv[i].size = 1;
4565  *((unsigned char *) (iso8859_1_To_utf8_conv[i].bytes)) = (unsigned char) '?';
4566  }
4567 
4568  /* A0 - FF : mapped to Unicode codepoint with the same value */
4569  for (i = 0xa0; i <= 0xff; i++)
4570  {
4571  iso8859_1_To_utf8_conv[i].size = intl_cp_to_utf8 (i, iso8859_1_To_utf8_conv[i].bytes);
4572  }
4573 
4574  con_Iso_8859_1_conv.text_first_cp = 0;
4575  con_Iso_8859_1_conv.text_last_cp = 0xff;
4576  con_Iso_8859_1_conv.text_to_utf8 = iso8859_1_To_utf8_conv;
4577 
4578  /* no specific mapping here : Unicode codepoints in range 00-FF map directly onto ISO-8859-1 */
4579  con_Iso_8859_1_conv.utf8_first_cp = 0;
4580  con_Iso_8859_1_conv.utf8_last_cp = 0;
4581  con_Iso_8859_1_conv.utf8_to_text = NULL;
4582 }
4583 
4584 /*
4585  * intl_init_conv_iso8859_9_to_utf8() - initializes conversion map from
4586  * ISO 8859-9 (turkish) to UTF-8
4587  * return:
4588  *
4589  */
4590 static void
4592 {
4593  unsigned int i;
4594  const unsigned int iso8859_9_special_mapping[][2] = {
4595  {0xd0, 0x11e}, /* capital G with breve */
4596  {0xdd, 0x130}, /* capital I with dot above */
4597  {0xde, 0x15e}, /* capital S with cedilla */
4598  {0xf0, 0x11f}, /* small g with breve */
4599  {0xfd, 0x131}, /* small i dotless */
4600  {0xfe, 0x15f} /* small s with cedilla */
4601  };
4602 
4603  /* 00 - 7E : mapped to ASCII */
4604  for (i = 0; i <= 0x7e; i++)
4605  {
4606  iso8859_9_To_utf8_conv[i].size = 1;
4607  *((unsigned char *) (iso8859_9_To_utf8_conv[i].bytes)) = (unsigned char) i;
4608  }
4609 
4610  /* 7F - 9F : not mapped */
4611  for (i = 0x7f; i <= 0x9f; i++)
4612  {
4613  iso8859_9_To_utf8_conv[i].size = 1;
4614  *((unsigned char *) (iso8859_9_To_utf8_conv[i].bytes)) = (unsigned char) '?';
4615  }
4616 
4617  /* A0 - FF : mapped to Unicode codepoint with the same value */
4618  for (i = 0xa0; i <= 0xff; i++)
4619  {
4620  iso8859_9_To_utf8_conv[i].size = intl_cp_to_utf8 (i, iso8859_9_To_utf8_conv[i].bytes);
4621  }
4622 
4623  for (i = ISO_8859_9_FIRST_CP; i <= ISO_8859_9_LAST_CP; i++)
4624  {
4625  utf8_Cp_to_iso_8859_9_conv[i - ISO_8859_9_FIRST_CP].size = 1;
4626  *(utf8_Cp_to_iso_8859_9_conv[i - ISO_8859_9_FIRST_CP].bytes) = '?';
4627  }
4628 
4629  /* special mapping */
4630  for (i = 0; i < DIM (iso8859_9_special_mapping); i++)
4631  {
4632  unsigned int val8bit = iso8859_9_special_mapping[i][0];
4633  unsigned int cp = iso8859_9_special_mapping[i][1];
4634 
4635  iso8859_9_To_utf8_conv[val8bit].size = intl_cp_to_utf8 (cp, iso8859_9_To_utf8_conv[val8bit].bytes);
4636 
4637  *(utf8_Cp_to_iso_8859_9_conv[cp - ISO_8859_9_FIRST_CP].bytes) = val8bit;
4638 
4639  assert (utf8_Cp_to_iso_8859_9_conv[cp - ISO_8859_9_FIRST_CP].size == 1);
4640  }
4641 
4642  con_Iso_8859_9_conv.text_first_cp = 0;
4643  con_Iso_8859_9_conv.text_last_cp = 0xff;
4644  con_Iso_8859_9_conv.text_to_utf8 = iso8859_9_To_utf8_conv;
4645 
4646  con_Iso_8859_9_conv.utf8_first_cp = ISO_8859_9_FIRST_CP;
4647  con_Iso_8859_9_conv.utf8_last_cp = ISO_8859_9_LAST_CP;
4648  con_Iso_8859_9_conv.utf8_to_text = utf8_Cp_to_iso_8859_9_conv;
4649 }
4650 
/*
 * intl_text_dbcs_to_utf8() - converts a buffer containing text with DBCS
 *                            encoding to UTF-8, using the text conversion
 *                            returned by lang_get_txt_conv ()
 *
 *   return: error code
 *   in_buf(in): buffer
 *   in_size(in): size of input string (NUL terminator not included)
 *   out_buf(in/out) : output buffer : uses the pre-allocated buffer passed
 *                     as input or a new allocated buffer; NULL if conversion
 *                     is not required
 *   out_size(out): size of string (NUL terminator not included)
 *
 *  Note : the work is done by intl_text_dbcs_to_utf8_ext ().
 */
int
intl_text_dbcs_to_utf8 (const char *in_buf, const int in_size, char **out_buf, int *out_size)
{
  const unsigned char *src = (const unsigned char *) in_buf;
  unsigned char **dest = (unsigned char **) out_buf;

  return intl_text_dbcs_to_utf8_ext (lang_get_txt_conv (), src, in_size, dest, out_size);
}
4669 
4670 /*
4671  * intl_text_dbcs_to_utf8_ext() - converts a buffer containing text with DBCS
4672  * encoding to UTF-8
4673  *
4674  * return: error code
4675  * t(in): text conversion data
4676  * in_buf(in): buffer
4677  * in_size(in): size of input string (NUL terminator not included)
4678  * out_buf(in/out) : output buffer : uses the pre-allocated buffer passed
4679  * as input or a new allocated buffer; NULL if conversion
4680  * is not required
4681  * out_size(in/out): size of string (NUL terminator not included)
4682  */
4683 int
4684 intl_text_dbcs_to_utf8_ext (void *t, const unsigned char *in_buf, const int in_size, unsigned char **out_buf,
4685  int *out_size)
4686 {
4687  const unsigned char *p_in = NULL;
4688  unsigned char *p_out = NULL;
4689  TEXT_CONVERSION *txt_conv;
4690  bool is_ascii = true;
4691 
4692  assert (in_buf != NULL);
4693  assert (out_buf != NULL);
4694  assert (out_size != NULL);
4695  assert (t != NULL);
4696 
4697  txt_conv = (TEXT_CONVERSION *) t;
4698 
4699  p_in = in_buf;
4700  while (p_in < in_buf + in_size)
4701  {
4702  if (*p_in++ >= 0x80)
4703  {
4704  is_ascii = false;
4705  break;
4706  }
4707  }
4708 
4709  if (is_ascii)
4710  {
4711  *out_buf = NULL;
4712  return NO_ERROR;
4713  }
4714 
4715  if (*out_buf == NULL)
4716  {
4717  /* a DBCS text may contain ASCII characters (encoded with 1 byte) which may expand to maximum 2 bytes in UTF-8
4718  * and DBCS characters (2 bytes) which may expand to maximum 3 bytes in UTF-8; Also it may contain single byte
4719  * characters which may expand to 3 bytes characters in UTF-8 Apply a safe expansion of 3 */
4720  *out_buf = (unsigned char *) malloc (in_size * 3 + 1);
4721  if (*out_buf == NULL)
4722  {
4723  er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_OUT_OF_VIRTUAL_MEMORY, 1, (size_t) (in_size * 3 + 1));
4724  return ER_OUT_OF_VIRTUAL_MEMORY;
4725  }
4726  }
4727  else
4728  {
4729  if (*out_size < in_size * 3 + 1)
4730  {
4732  return ER_GENERIC_ERROR;
4733  }
4734  }
4735 
4736  assert (txt_conv->text_last_cp > 0);
4737  for (p_in = in_buf, p_out = *out_buf; p_in < in_buf + in_size;)
4738  {
4739  unsigned char *p_next;
4740  unsigned int text_cp =
4741  intl_dbcs_to_cp (p_in, CAST_STRLEN (in_buf + in_size - p_in), txt_conv->byte_flag, &p_next);
4742 
4743  if (text_cp >= txt_conv->text_first_cp && text_cp <= txt_conv->text_last_cp)
4744  {
4745  unsigned char *utf8_bytes = txt_conv->text_to_utf8[text_cp - txt_conv->text_first_cp].bytes;
4746  int utf8_size = txt_conv->text_to_utf8[text_cp - txt_conv->text_first_cp].size;
4747 
4748  do
4749  {
4750  *p_out++ = *utf8_bytes++;
4751  }
4752  while (--utf8_size > 0);
4753  }
4754  else
4755  {
4756  if (text_cp < 0x80)
4757  {
4758  *p_out++ = *p_in;
4759  }
4760  else
4761  {
4762  *p_out++ = '?';
4763  }
4764  }
4765 
4766  assert (p_next <= in_buf + in_size);
4767  p_in = p_next;
4768  }
4769 
4770  *(p_out) = '\0';
4771  *out_size = CAST_STRLEN (p_out - *(out_buf));
4772 
4773  return NO_ERROR;
4774 }
4775 
4776 /*
4777  * intl_text_utf8_to_dbcs() - converts a buffer containing UTF-8 text
4778  * to DBCS encoding
4779  *
4780  * return: error code
4781  * in_buf(in): buffer
4782  * in_size(in): size of input string (NUL terminator not included)
4783  * out_buf(in/out) : output buffer : uses the pre-allocated buffer passed
4784  * as input or a new allocated buffer; NULL if conversion
4785  * is not required
4786  * out_size(in/out): size of output string (NUL terminator not counted)
4787  */
4788 int
4789 intl_text_utf8_to_dbcs (const char *in_buf, const int in_size, char **out_buf, int *out_size)
4790 {
4791  const unsigned char *p_in = NULL;
4792  unsigned char *p_out = NULL;
4793  unsigned char *p_next = NULL;
4794  TEXT_CONVERSION *txt_conv = lang_get_txt_conv ();
4795  bool is_ascii = true;
4796 
4797  assert (in_buf != NULL);
4798  assert (out_buf != NULL);
4799  assert (out_size != NULL);
4800  assert (txt_conv != NULL);
4801 
4802  p_in = (const unsigned char *) in_buf;
4803  while (p_in < (const unsigned char *) in_buf + in_size)
4804  {
4805  if (*p_in++ >= 0x80)
4806  {
4807  is_ascii = false;
4808  break;
4809  }
4810  }
4811 
4812  if (is_ascii)
4813  {
4814  *out_buf = NULL;
4815  return NO_ERROR;
4816  }
4817 
4818  if (*out_buf == NULL)
4819  {
4820  *out_buf = (char *) malloc (in_size + 1);
4821  if (*out_buf == NULL)
4822  {
4823  er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_OUT_OF_VIRTUAL_MEMORY, 1, (size_t) (in_size + 1));
4824  return ER_OUT_OF_VIRTUAL_MEMORY;
4825  }
4826  }
4827  else
4828  {
4829  if (*out_size < in_size + 1)
4830  {
4832  return ER_GENERIC_ERROR;
4833  }
4834  }
4835 
4836  assert (txt_conv->utf8_last_cp > 0);
4837 
4838  for (p_in = (const unsigned char *) in_buf, p_out = (unsigned char *) *out_buf;
4839  p_in < (const unsigned char *) in_buf + in_size;)
4840  {
4841  unsigned int cp = 0;
4842 
4843  if (*p_in < 0x80)
4844  {
4845  *p_out++ = *p_in++;
4846  continue;
4847  }
4848 
4849  cp = intl_utf8_to_cp (p_in, CAST_STRLEN (in_buf + in_size - (char *) p_in), &p_next);
4850  if (cp >= txt_conv->utf8_first_cp && cp <= txt_conv->utf8_last_cp)
4851  {
4852  unsigned char *text_bytes = txt_conv->utf8_to_text[cp - txt_conv->utf8_first_cp].bytes;
4853  int text_size = txt_conv->utf8_to_text[cp - txt_conv->utf8_first_cp].size;
4854 
4855  assert (text_size >= 1);
4856  do
4857  {
4858  *p_out++ = *text_bytes++;
4859  }
4860  while (--text_size > 0);
4861  }
4862  else if (cp > 0x80)
4863  {
4864  *p_out++ = '?';
4865  }
4866  else
4867  {
4868  *p_out++ = (unsigned char) cp;
4869  }
4870  p_in = p_next;
4871  }
4872 
4873  *(p_out) = '\0';
4874  *out_size = CAST_STRLEN (p_out - (unsigned char *) *(out_buf));
4875 
4876  return NO_ERROR;
4877 }
4878 
4879 /*
4880  * intl_fast_iso88591_to_utf8() - converts a buffer containing text with ISO
4881  * 8859-1 encoding to UTF-8
4882  *
4883  * return: 0 conversion ok, 1 conversion done, but invalid characters where
4884  * found
4885  * in_buf(in): buffer
4886  * in_size(in): size of input string (NUL terminator not included)
4887  * out_buf(int/out) : output buffer : uses the pre-allocated buffer passed
4888  * as input or a new allocated buffer; NULL if conversion
4889  * is not required
4890  * out_size(out): size of string (NUL terminator not included)
4891  */
4892 int
4893 intl_fast_iso88591_to_utf8 (const unsigned char *in_buf, const int in_size, unsigned char **out_buf, int *out_size)
4894 {
4895  const unsigned char *p_in = NULL;
4896  const unsigned char *p_end;
4897  unsigned char *p_out = NULL;
4898  int status = 0;
4899 
4900  assert (in_size > 0);
4901  assert (in_buf != NULL);
4902  assert (out_buf != NULL);
4903  assert (out_size != NULL);
4904 
4905  for (p_in = in_buf, p_end = p_in + in_size, p_out = (unsigned char *) *out_buf; p_in < p_end; p_in++)
4906  {
4907  if (*p_in < 0x7f)
4908  {
4909  *p_out++ = *p_in;
4910  }
4911  else if (*p_in < 0xa0)
4912  {
4913  /* ISO 8859-1 characters in this range are not valid */
4914  *p_out++ = '?';
4915  status = 1;
4916  }
4917  else
4918  {
4919  *p_out++ = (unsigned char) (0xc0 | (*p_in >> 6));
4920  *p_out++ = (unsigned char) (0x80 | (*p_in & 0x3f));
4921  }
4922  }
4923 
4924  *out_size = CAST_STRLEN (p_out - *(out_buf));
4925 
4926  return status;
4927 }
4928 
4929 /*
4930  * intl_euckr_to_iso88591() - converts a buffer containing EUCKR text to
4931  * ISO88591
4932  *
4933  * return: 0 conversion ok, 1 conversion done, but invalid characters where
4934  * found
4935  * in_buf(in): buffer
4936  * in_size(in): size of input string (NUL terminator not included)
4937  * out_buf(int/out) : output buffer : uses the pre-allocated buffer passed
4938  * as input or a new allocated buffer;
4939  * out_size(out): size of string (NUL terminator not included)
4940  */
4941 int
4942 intl_euckr_to_iso88591 (const unsigned char *in_buf, const int in_size, unsigned char **out_buf, int *out_size)
4943 {
4944  const unsigned char *p_in = NULL;
4945  const unsigned char *p_end;
4946  unsigned char *p_out = NULL;
4947  unsigned int unicode_cp;
4948  int status = 0;
4949 
4950  assert (in_size > 0);
4951  assert (in_buf != NULL);
4952  assert (out_buf != NULL);
4953  assert (out_size != NULL);
4954 
4955  for (p_in = in_buf, p_end = p_in + in_size, p_out = (unsigned char *) *out_buf; p_in < p_end; p_in++)
4956  {
4957  if (*p_in < 0x80)
4958  {
4959  *p_out++ = *p_in;
4960  }
4961  else if (*p_in >= 0xa1 && *p_in < 0xff && p_end - p_in >= 2)
4962  {
4963  if (*(p_in + 1) >= 0xa1 && *(p_in + 1) < 0xff)
4964  {
4965  /* KSC5601 two-bytes character */
4966  unsigned char ksc_buf[2];
4967 
4968  ksc_buf[0] = *p_in - 0x80;
4969  ksc_buf[1] = *(p_in + 1) - 0x80;
4970 
4971  if (ksc5601_mbtowc (&unicode_cp, ksc_buf, 2) <= 0)
4972  {
4973  *p_out++ = '?';
4974  status = 1;
4975  }
4976  else
4977  {
4978  if ((unicode_cp <= 0x1F) || (unicode_cp > 0xFF) || ((unicode_cp >= 0x7F) && (unicode_cp <= 0x9F)))
4979  {
4980  *p_out++ = '?';
4981  status = 1;
4982  }
4983  else
4984  {
4985  *p_out++ = unicode_cp;
4986  }
4987  }
4988  }
4989  else
4990  {
4991  *p_out++ = '?';
4992  status = 1;
4993  }
4994 
4995  /* skip one additional byte */
4996  p_in++;
4997  }
4998  else if (*p_in == 0x8f && p_end - p_in >= 3)
4999  {
5000  if (*(p_in + 1) >= 0xa1 && *(p_in + 1) < 0xff && *(p_in + 2) >= 0xa1 && *(p_in + 2) < 0xff)
5001  {
5002  /* JISX0212 three bytes character */
5003  unsigned char jis_buf[2];
5004 
5005  jis_buf[0] = *(p_in + 1) - 0x80;
5006  jis_buf[1] = *(p_in + 2) - 0x80;
5007 
5008  if (jisx0212_mbtowc (&unicode_cp, jis_buf, 2) <= 0)
5009  {
5010  *p_out++ = '?';
5011  status = 1;
5012  }
5013  else
5014  {
5015  if ((unicode_cp <= 0x1F) || (unicode_cp > 0xFF) || ((unicode_cp >= 0x7F) && (unicode_cp <= 0x9F)))
5016  {
5017  *p_out++ = '?';
5018  status = 1;
5019  }
5020  else
5021  {
5022  *p_out++ = unicode_cp;
5023  }
5024  }
5025  }
5026  else
5027  {
5028  *p_out++ = '?';
5029  status = 1;
5030  }
5031 
5032  /* skip two additional bytes */
5033  p_in++;
5034  p_in++;
5035  }
5036  else
5037  {
5038  /* EUC-KR byte not valid */
5039  *p_out++ = '?';
5040  status = 1;
5041  }
5042  }
5043 
5044  *out_size = CAST_STRLEN (p_out - *(out_buf));
5045 
5046  return status;
5047 }
5048 
5049 /*
5050  * intl_euckr_to_utf8() - converts a buffer containing text with EUC-KR
5051  * + JISX0212 to UTF-8
5052  *
5053  * return: 0 conversion ok, 1 conversion done, but invalid characters where
5054  * found
5055  * in_buf(in): buffer
5056  * in_size(in): size of input string (NUL terminator not included)
5057  * out_buf(int/out) : output buffer : uses the pre-allocated buffer passed
5058  * as input or a new allocated buffer;
5059  * out_size(out): size of string (NUL terminator not included)
5060  */
5061 int
5062 intl_euckr_to_utf8 (const unsigned char *in_buf, const int in_size, unsigned char **out_buf, int *out_size)
5063 {
5064  const unsigned char *p_in = NULL;
5065  const unsigned char *p_end;
5066  unsigned char *p_out = NULL;
5067  unsigned int unicode_cp;
5068  int utf8_size;
5069  int status = 0;
5070 
5071  assert (in_size > 0);
5072  assert (in_buf != NULL);
5073  assert (out_buf != NULL);
5074  assert (out_size != NULL);
5075 
5076  for (p_in = in_buf, p_end = p_in + in_size, p_out = (unsigned char *) *out_buf; p_in < p_end; p_in++)
5077  {
5078  if (*p_in < 0x80)
5079  {
5080  *p_out++ = *p_in;
5081  }
5082  else if (*p_in >= 0xa1 && *p_in < 0xff && p_end - p_in >= 2)
5083  {
5084  if (*(p_in + 1) >= 0xa1 && *(p_in + 1) < 0xff)
5085  {
5086  /* KSC5601 two-bytes character */
5087  unsigned char ksc_buf[2];
5088 
5089  ksc_buf[0] = *p_in - 0x80;
5090  ksc_buf[1] = *(p_in + 1) - 0x80;
5091 
5092  if (ksc5601_mbtowc (&unicode_cp, ksc_buf, 2) <= 0)
5093  {
5094  *p_out++ = '?';
5095  status = 1;
5096  }
5097  else
5098  {
5099  utf8_size = intl_cp_to_utf8 (unicode_cp, p_out);
5100  p_out += utf8_size;
5101  }
5102  }
5103  else
5104  {
5105  *p_out++ = '?';
5106  status = 1;
5107  }
5108 
5109  /* skip one additional byte */
5110  p_in++;
5111  }
5112  else if (*p_in == 0x8f && p_end - p_in >= 3)
5113  {
5114  if (*(p_in + 1) >= 0xa1 && *(p_in + 1) < 0xff && *(p_in + 2) >= 0xa1 && *(p_in + 2) < 0xff)
5115  {
5116  /* JISX0212 three bytes character */
5117  unsigned char jis_buf[2];
5118 
5119  jis_buf[0] = *(p_in + 1) - 0x80;
5120  jis_buf[1] = *(p_in + 2) - 0x80;
5121 
5122  if (jisx0212_mbtowc (&unicode_cp, jis_buf, 2) <= 0)
5123  {
5124  *p_out++ = '?';
5125  status = 1;
5126  }
5127  else
5128  {
5129  utf8_size = intl_cp_to_utf8 (unicode_cp, p_out);
5130  p_out += utf8_size;
5131  }
5132  }
5133  else
5134  {
5135  *p_out++ = '?';
5136  status = 1;
5137  }
5138 
5139  /* skip two additional bytes */
5140  p_in++;
5141  p_in++;
5142  }
5143  else
5144  {
5145  /* EUC-KR byte not valid */
5146  *p_out++ = '?';
5147  status = 1;
5148  }
5149  }
5150 
5151  *out_size = CAST_STRLEN (p_out - *(out_buf));
5152 
5153  return status;
5154 }
5155 
5156 /*
5157  * intl_utf8_to_iso88591() - converts a buffer containing UTF8 text to ISO88591
5158  *
5159  * return: 0 conversion ok, 1 conversion done, but invalid characters where
5160  * found
5161  * in_buf(in): buffer
5162  * in_size(in): size of input string (NUL terminator not included)
5163  * out_buf(int/out) : output buffer : uses the pre-allocated buffer passed
5164  * as input or a new allocated buffer;
5165  * out_size(out): size of string (NUL terminator not included)
5166  */
5167 int
5168 intl_utf8_to_iso88591 (const unsigned char *in_buf, const int in_size, unsigned char **out_buf, int *out_size)
5169 {
5170  const unsigned char *p_in = NULL;
5171  const unsigned char *p_end;
5172  unsigned char *p_out = NULL;
5173  unsigned char *next_utf8;
5174  int status = 0;
5175  unsigned int unicode_cp = 0;
5176 
5177  assert (in_size > 0);
5178  assert (in_buf != NULL);
5179  assert (out_buf != NULL);
5180  assert (out_size != NULL);
5181 
5182  for (p_in = in_buf, p_end = in_buf + in_size, p_out = (unsigned char *) *out_buf; p_in < p_end;)
5183  {
5184  unicode_cp = intl_utf8_to_cp (p_in, CAST_STRLEN (p_end - p_in), &next_utf8);
5185 
5186  if ((unicode_cp > 0xFF) || ((unicode_cp >= 0x7F) && (unicode_cp <= 0x9F)))
5187  {
5188  *p_out++ = '?';
5189  status = 1;
5190  }
5191  else
5192  {
5193  *p_out++ = unicode_cp;
5194  }
5195 
5196  p_in = next_utf8;
5197  }
5198 
5199  *out_size = CAST_STRLEN (p_out - *(out_buf));
5200 
5201  return status;
5202 }
5203 
5204 /*
5205  * intl_utf8_to_euckr() - converts a buffer containing UTF8 text to EUC-KR
5206  * + JISX0212 encoding
5207  *
5208  * return: 0 conversion ok, 1 conversion done, but invalid characters where
5209  * found
5210  * in_buf(in): buffer
5211  * in_size(in): size of input string (NUL terminator not included)
5212  * out_buf(int/out) : output buffer : uses the pre-allocated buffer passed
5213  * as input or a new allocated buffer;
5214  * out_size(out): size of string (NUL terminator not included)
5215  */
5216 int
5217 intl_utf8_to_euckr (const unsigned char *in_buf, const int in_size, unsigned char **out_buf, int *out_size)
5218 {
5219  const unsigned char *p_in = NULL;
5220  const unsigned char *p_end;
5221  unsigned char *p_out = NULL;
5222  int status = 0;
5223 
5224  assert (in_size > 0);
5225  assert (in_buf != NULL);
5226  assert (out_buf != NULL);
5227  assert (out_size != NULL);
5228 
5229  for (p_in = in_buf, p_end = p_in + in_size, p_out = (unsigned char *) *out_buf; p_in < p_end;)
5230  {
5231  if (*p_in < 0x80)
5232  {
5233  *p_out++ = *p_in++;
5234  }
5235  else
5236  {
5237  unsigned char euc_buf[2];
5238  int euc_bytes;
5239  unsigned int unicode_cp;
5240  unsigned char *next_utf8;
5241 
5242  unicode_cp = intl_utf8_to_cp (p_in, CAST_STRLEN (p_end - p_in), &next_utf8);
5243  if (unicode_cp == 0xffffffff)
5244  {
5245  goto illegal_char;
5246  }
5247 
5248  /* try to convert to KSC5601 */
5249  euc_bytes = ksc5601_wctomb (euc_buf, unicode_cp, CAST_STRLEN (next_utf8 - p_in));
5250 
5251  assert (euc_bytes != 0);
5252  if (euc_bytes == 2)
5253  {
5254  *p_out = euc_buf[0] + 0x80;
5255  *(p_out + 1) = euc_buf[1] + 0x80;
5256  p_out++;
5257  p_out++;
5258  p_in = next_utf8;
5259  continue;
5260  }
5261 
5262  if (euc_bytes != RET_ILUNI)
5263  {
5264  goto illegal_char;
5265  }
5266  assert (euc_bytes == RET_ILUNI);
5267  /* not found as KSC encoding, try as JISX0212 */
5268  euc_bytes = jisx0212_wctomb (euc_buf, unicode_cp, CAST_STRLEN (next_utf8 - p_in));
5269 
5270  assert (euc_bytes != 0);
5271  if (euc_bytes == 2)
5272  {
5273  *p_out = 0x8f;
5274  *(p_out + 1) = euc_buf[0] + 0x80;
5275  *(p_out + 2) = euc_buf[1] + 0x80;
5276  p_out += 3;
5277  p_in = next_utf8;
5278  continue;
5279  }
5280 
5281  /* illegal Unicode or impossible to convert to EUC */
5282  illegal_char:
5283  p_in = next_utf8;
5284  *p_out = '?';
5285  p_out++;
5286  status = 1;
5287  }
5288  }
5289 
5290  *out_size = CAST_STRLEN (p_out - *(out_buf));
5291 
5292  return status;
5293 }
5294 
5295 /*
5296  * intl_iso88591_to_euckr() - converts a buffer containing ISO88591 text to
5297  * EUC-KR encoding
5298  *
5299  * return: 0 conversion ok, 1 conversion done, but invalid characters where
5300  * found
5301  * in_buf(in): buffer
5302  * in_size(in): size of input string (NUL terminator not included)
5303  * out_buf(int/out) : output buffer : uses the pre-allocated buffer passed
5304  * as input or a new allocated buffer;
5305  * out_size(out): size of string (NUL terminator not included)
5306  */
5307 int
5308 intl_iso88591_to_euckr (const unsigned char *in_buf, const int in_size, unsigned char **out_buf, int *out_size)
5309 {
5310  const unsigned char *p_in = NULL;
5311  const unsigned char *p_end;
5312  unsigned char *p_out = NULL;
5313  int status = 0;
5314 
5315  assert (in_size > 0);
5316  assert (in_buf != NULL);
5317  assert (out_buf != NULL);
5318  assert (out_size != NULL);
5319 
5320  for (p_in = in_buf, p_end = p_in + in_size, p_out = (unsigned char *) *out_buf; p_in < p_end; p_in++)
5321  {
5322  if (*p_in < 0x80)
5323  {
5324  *p_out++ = *p_in;
5325  }
5326  else
5327  {
5328  unsigned char euc_buf[2];
5329  int euc_bytes;
5330 
5331  if (*p_in < 0xa0)
5332  {
5333  *p_out = '?';
5334  p_out++;
5335  status = 1;
5336  continue;
5337  }
5338 
5339  /* try to convert to KSC5601 */
5340  euc_bytes = ksc5601_wctomb (euc_buf, *p_in, 2);
5341 
5342  assert (euc_bytes != 0);
5343  if (euc_bytes == 2)
5344  {
5345  *p_out = euc_buf[0] + 0x80;
5346  *(p_out + 1) = euc_buf[1] + 0x80;
5347  p_out++;
5348  p_out++;
5349  continue;
5350  }
5351 
5352  /* illegal ISO8859-1 or impossible to convert to KSC */
5353  if (euc_bytes != RET_ILUNI)
5354  {
5355  goto illegal_char;
5356  }
5357  assert (euc_bytes == RET_ILUNI);
5358 
5359  /* try to convert to JISX0212 */
5360  euc_bytes = jisx0212_wctomb (euc_buf, *p_in, 2);
5361 
5362  assert (euc_bytes != 0);
5363  if (euc_bytes == 2)
5364  {
5365  *p_out = 0x8f;
5366  *(p_out + 1) = euc_buf[0] + 0x80;
5367  *(p_out + 2) = euc_buf[1] + 0x80;
5368  p_out++;
5369  p_out++;
5370  p_out++;
5371  continue;
5372  }
5373 
5374  illegal_char:
5375  *p_out = '?';
5376  p_out++;
5377  status = 1;
5378  }
5379  }
5380 
5381  *out_size = CAST_STRLEN (p_out - *(out_buf));
5382 
5383  return status;
5384 }
5385 
/* monetary symbols */

/*
 * UTF-8 encoded money symbols; the table is indexed with the DB_CURRENCY
 * value of the currency
 */
static char moneysymbols_utf8[][4] = {
  "$",                          /* dollar */
  "\xc2\xa5",                   /* Japanese yen */
  "\xc2\xa3",                   /* British pound sterling */
  "\xe2\x82\xa9",               /* Korean won */
  "TL",                         /* Turkish lira */
  "KHR",                        /* Cambodian riel */
  "CNY",                        /* Chinese renminbi */
  "INR",                        /* Indian rupee */
  "RUB",                        /* Russian ruble */
  "AUD",                        /* Australian dollar */
  "CAD",                        /* Canadian dollar */
  "BRL",                        /* Brazilian real */
  "RON",                        /* Romanian leu */
  "EUR",                        /* euro */
  "CHF",                        /* Swiss franc */
  "DKK",                        /* Danish krone */
  "NOK",                        /* Norwegian krone */
  "BGN",                        /* Bulgarian lev */
  "VND",                        /* Vietnamese dong */
  "CZK",                        /* Czech koruna */
  "PLN",                        /* Polish zloty */
  "SEK",                        /* Swedish krona */
  "HRK",                        /* Croatian kuna */
  "RSD",                        /* Serbian dinar */
  "\xc2\xa4"                    /* generic currency symbol - keep it last */
};
5416 
/*
 * money symbols as printed on console (used when printing values in CSQL);
 * the table is indexed with the DB_CURRENCY value of the currency
 */
static char moneysymbols_console[][4] = {
  "$",                          /* dollar */
  "Y",                          /* Japanese yen */
  "&",                          /* British pound */
  "\\",                         /* Korean won */
  "TL",                         /* Turkish lira */
  "KHR",                        /* Cambodian riel */
  "CNY",                        /* Chinese renminbi */
  "INR",                        /* Indian rupee */
  "RUB",                        /* Russian ruble */
  "AUD",                        /* Australian dollar */
  "CAD",                        /* Canadian dollar */
  "BRL",                        /* Brazilian real */
  "RON",                        /* Romanian leu */
  "EUR",                        /* euro */
  "CHF",                        /* Swiss franc */
  "DKK",                        /* Danish krone */
  "NOK",                        /* Norwegian krone */
  "BGN",                        /* Bulgarian lev */
  "VND",                        /* Vietnamese dong */
  "CZK",                        /* Czech koruna */
  "PLN",                        /* Polish zloty */
  "SEK",                        /* Swedish krona */
  "HRK",                        /* Croatian kuna */
  "RSD",                        /* Serbian dinar */
  ""                            /* generic currency symbol - add new symbols before this */
};
5447 
/*
 * money symbols in the form recognized by the grammars; the table is indexed
 * with the DB_CURRENCY value of the currency
 */
static char moneysymbols_grammar[][5] = {
  "$",                          /* dollar */
  "\xa1\xef",                   /* Japanese yen (two-byte, non UTF-8 sequence) */
  "\\GBP",                      /* British pound */
  "\\KRW",                      /* Korean won */
  "\\TL",                       /* Turkish lira */
  "\\KHR",                      /* Cambodian riel */
  "\\CNY",                      /* Chinese renminbi */
  "\\INR",                      /* Indian rupee */
  "\\RUB",                      /* Russian ruble */
  "\\AUD",                      /* Australian dollar */
  "\\CAD",                      /* Canadian dollar */
  "\\BRL",                      /* Brazilian real */
  "\\RON",                      /* Romanian leu */
  "\\EUR",                      /* euro */
  "\\CHF",                      /* Swiss franc */
  "\\DKK",                      /* Danish krone */
  "\\NOK",                      /* Norwegian krone */
  "\\BGN",                      /* Bulgarian lev */
  "\\VND",                      /* Vietnamese dong */
  "\\CZK",                      /* Czech koruna */
  "\\PLN",                      /* Polish zloty */
  "\\SEK",                      /* Swedish krona */
  "\\HRK",                      /* Croatian kuna */
  "\\RSD",                      /* Serbian dinar */
  ""                            /* generic currency symbol - add new symbols before this */
};
5477 
/*
 * three-letter ISO codes of the currencies; the table is indexed with the
 * DB_CURRENCY value of the currency
 */
static char moneysymbols_iso_codes[][4] = {
  "USD",                        /* dollar */
  "JPY",                        /* Japanese yen */
  "GBP",                        /* British pound */
  "KRW",                        /* Korean won */
  "TRY",                        /* Turkish lira */
  "KHR",                        /* Cambodian riel */
  "CNY",                        /* Chinese renminbi */
  "INR",                        /* Indian rupee */
  "RUB",                        /* Russian ruble */
  "AUD",                        /* Australian dollar */
  "CAD",                        /* Canadian dollar */
  "BRL",                        /* Brazilian real */
  "RON",                        /* Romanian leu */
  "EUR",                        /* euro */
  "CHF",                        /* Swiss franc */
  "DKK",                        /* Danish krone */
  "NOK",                        /* Norwegian krone */
  "BGN",                        /* Bulgarian lev */
  "VND",                        /* Vietnamese dong */
  "CZK",                        /* Czech koruna */
  "PLN",                        /* Polish zloty */
  "SEK",                        /* Swedish krona */
  "HRK",                        /* Croatian kuna */
  "RSD",                        /* Serbian dinar */
  ""                            /* generic currency symbol - add new symbols before this */
};
5506 
/*
 * ISO codes of the currencies prefixed with a backslash; the table is
 * indexed with the DB_CURRENCY value of the currency
 */
static char moneysymbols_esc_iso_codes[][5] = {
  "\\USD",                      /* dollar */
  "\\JPY",                      /* Japanese yen */
  "\\GBP",                      /* British pound */
  "\\KRW",                      /* Korean won */
  "\\TRY",                      /* Turkish lira */
  "\\KHR",                      /* Cambodian riel */
  "\\CNY",                      /* Chinese renminbi */
  "\\INR",                      /* Indian rupee */
  "\\RUB",                      /* Russian ruble */
  "\\AUD",                      /* Australian dollar */
  "\\CAD",                      /* Canadian dollar */
  "\\BRL",                      /* Brazilian real */
  "\\RON",                      /* Romanian leu */
  "\\EUR",                      /* euro */
  "\\CHF",                      /* Swiss franc */
  "\\DKK",                      /* Danish krone */
  "\\NOK",                      /* Norwegian krone */
  "\\BGN",                      /* Bulgarian lev */
  "\\VND",                      /* Vietnamese dong */
  "\\CZK",                      /* Czech koruna */
  "\\PLN",                      /* Polish zloty */
  "\\SEK",                      /* Swedish krona */
  "\\HRK",                      /* Croatian kuna */
  "\\RSD",                      /* Serbian dinar */
  ""                            /* generic currency symbol - add new symbols before this */
};
5535 
/*
 * ISO 8859-1 encoded money symbols; the table is indexed with the
 * DB_CURRENCY value of the currency
 */
static char moneysymbols_iso88591_codes[][4] = {
  "$",                          /* dollar */
  "\xa5",                       /* Japanese yen */
  "\xa3",                       /* British pound */
  "KRW",                        /* Korean won */
  "TL",                         /* Turkish lira */
  "KHR",                        /* Cambodian riel */
  "CNY",                        /* Chinese renminbi */
  "INR",                        /* Indian rupee */
  "RUB",                        /* Russian ruble */
  "AUD",                        /* Australian dollar */
  "CAD",                        /* Canadian dollar */
  "BRL",                        /* Brazilian real */
  "RON",                        /* Romanian leu */
  "EUR",                        /* euro */
  "CHF",                        /* Swiss franc */
  "DKK",                        /* Danish krone */
  "NOK",                        /* Norwegian krone */
  "BGN",                        /* Bulgarian lev */
  "VND",                        /* Vietnamese dong */
  "CZK",                        /* Czech koruna */
  "PLN",                        /* Polish zloty */
  "SEK",                        /* Swedish krona */
  "HRK",                        /* Croatian kuna */
  "RSD",                        /* Serbian dinar */
  ""                            /* generic currency symbol - add new symbols before this */
};
5564 
5565 /*
5566  * intl_is_currency_symbol() - check if a string matches a currency
5567  * symbol (UTF-8)
5568  * return: true if a match is found
5569  * src(in): NUL terminated string
5570  * currency(out): currency found
5571  */
5572 bool
5573 intl_is_currency_symbol (const char *src, DB_CURRENCY * currency, int *symbol_size,
5574  const CURRENCY_CHECK_MODE check_mode)
5575 {
5576  int sym_currency;
5577  int src_len = strlen (src);
5578 
5579  assert (currency != NULL);
5580  assert (symbol_size != NULL);
5581 
5582  *currency = DB_CURRENCY_NULL;
5583  *symbol_size = 0;
5584 
5585  if (check_mode & CURRENCY_CHECK_MODE_ISO)
5586  {
5587  for (sym_currency = 0; src_len > 0 && sym_currency < (int) DIM (moneysymbols_iso_codes); sym_currency++)
5588  {
5589  int symbol_len = strlen (moneysymbols_iso_codes[sym_currency]);
5590  if (src_len >= symbol_len && symbol_len > 0
5591  && !memcmp (src, moneysymbols_iso_codes[sym_currency], symbol_len))
5592  {
5593  *currency = (DB_CURRENCY) sym_currency;
5594  *symbol_size = symbol_len;
5595  return (*currency == DB_CURRENCY_NULL) ? false : true;
5596  }
5597  }
5598  }
5599 
5600  if (check_mode & CURRENCY_CHECK_MODE_ESC_ISO)
5601  {
5602  for (sym_currency = 0; src_len > 0 && sym_currency < (int) DIM (moneysymbols_esc_iso_codes); sym_currency++)
5603  {
5604  int symbol_len = strlen (moneysymbols_esc_iso_codes[sym_currency]);
5605  if (src_len >= symbol_len && symbol_len > 0
5606  && !memcmp (src, moneysymbols_esc_iso_codes[sym_currency], symbol_len))
5607  {
5608  *currency = (DB_CURRENCY) sym_currency;
5609  *symbol_size = symbol_len;
5610  return (*currency == DB_CURRENCY_NULL) ? false : true;
5611  }
5612  }
5613  }
5614 
5615  if (check_mode & CURRENCY_CHECK_MODE_UTF8)
5616  {
5617  for (sym_currency = 0; src_len > 0 && sym_currency < (int) DIM (moneysymbols_utf8); sym_currency++)
5618  {
5619  int symbol_len = strlen (moneysymbols_utf8[sym_currency]);
5620  if (src_len >= symbol_len && symbol_len > 0 && !memcmp (src, moneysymbols_utf8[sym_currency], symbol_len))
5621  {
5622  *currency = (DB_CURRENCY) sym_currency;
5623  *symbol_size = symbol_len;
5624  return (*currency == DB_CURRENCY_NULL) ? false : true;
5625  }
5626  }
5627  }
5628 
5629  if (check_mode & CURRENCY_CHECK_MODE_CONSOLE)
5630  {
5631  for (sym_currency = 0; src_len > 0 && sym_currency < (int) DIM (moneysymbols_console); sym_currency++)
5632  {
5633  int symbol_len = strlen (moneysymbols_console[sym_currency]);
5634  if (src_len >= symbol_len && symbol_len > 0 && !memcmp (src, moneysymbols_console[sym_currency], symbol_len))
5635  {
5636  *currency = (DB_CURRENCY) sym_currency;
5637  *symbol_size = symbol_len;
5638  return (*currency == DB_CURRENCY_NULL) ? false : true;
5639  }
5640  }
5641  }
5642 
5643  /* search backwards : "\TL" (turkish lira) symbol may be miss-interpreted as "\" (korean won) */
5644  if (check_mode & CURRENCY_CHECK_MODE_GRAMMAR)
5645  {
5646  for (sym_currency = (int) DIM (moneysymbols_grammar) - 1; src_len > 0 && sym_currency >= 0; sym_currency--)
5647  {
5648  int symbol_len = strlen (moneysymbols_grammar[sym_currency]);
5649  if (src_len >= symbol_len && symbol_len > 0 && !memcmp (src, moneysymbols_grammar[sym_currency], symbol_len))
5650  {
5651  *currency = (DB_CURRENCY) sym_currency;
5652  *symbol_size = symbol_len;
5653  return (*currency == DB_CURRENCY_NULL) ? false : true;
5654  }
5655  }
5656  }
5657 
5658  if (check_mode & CURRENCY_CHECK_MODE_ISO88591)
5659  {
5660  for (sym_currency = 0; src_len > 0 && sym_currency < (int) DIM (moneysymbols_iso88591_codes); sym_currency++)
5661  {
5662  int symbol_len = strlen (moneysymbols_iso88591_codes[sym_currency]);
5663  if (src_len >= symbol_len && symbol_len > 0
5664  && !memcmp (src, moneysymbols_iso88591_codes[sym_currency], symbol_len))
5665  {
5666  *currency = (DB_CURRENCY) sym_currency;
5667  *symbol_size = symbol_len;
5668  return (*currency == DB_CURRENCY_NULL) ? false : true;
5669  }
5670  }
5671  }
5672 
5673  return false;
5674 }
5675 
5676 /*
5677  * intl_get_money_symbol() - returns a string representing the currency symbol
5678  * return: currency symbol
5679  * currency(int): currency code
5680  * codeset (in): required codeset
5681  */
5682 char *
5684 {
5685  switch (codeset)
5686  {
5687  case INTL_CODESET_ISO88591:
5688  return intl_get_money_ISO88591_symbol (currency);
5689  case INTL_CODESET_UTF8:
5690  return intl_get_money_UTF8_symbol (currency);
5691  default:
5692  return intl_get_money_symbol_console (currency);
5693  }
5694 }
5695 
5696 /*
5697  * intl_get_money_symbol_console() - returns a string representing the
5698  * currency symbol printable on console
5699  * return: currency symbol
5700  * currency(int): currency code
5701  */
5702 char *
5704 {
5705  if (currency >= (int) DIM (moneysymbols_console))
5706  {
5708  }
5709  return moneysymbols_console[currency];
5710 }
5711 
5712 /*
5713  * intl_get_money_symbol_grammar() - returns a string representing the
5714  * currency symbol recognizable by grammar
5715  * return: currency symbol
5716  * currency(int): currency code
5717  */
5718 char *
5720 {
5721  if (currency >= (int) DIM (moneysymbols_grammar))
5722  {
5724  }
5725  return moneysymbols_grammar[currency];
5726 }
5727 
5728 /*
5729  * intl_get_currency_symbol_position() - returns an indication of the position
5730  * of currency symbol symbol when
5731  * is printed
5732  * return: position indicator : 0 : before value, 1 : after value
5733  * currency(int): currency code
5734  *
5735  * Note : currently ony the turkish lira is printed after the value
5736  */
5737 int
5739 {
5740  if (currency == DB_CURRENCY_TL)
5741  {
5742  return 1;
5743  }
5744 
5745  return 0;
5746 }
5747 
5748 /*
5749  * intl_get_money_ISO_symbol() - returns a string representing the currency
5750  * ISO symbol, as a 3 letter string.
5751  * return: currency ISO symbol
5752  * currency(int): currency code
5753  */
5754 char *
5756 {
5757  if (currency >= (int) DIM (moneysymbols_iso_codes))
5758  {
5760  }
5761  return moneysymbols_iso_codes[currency];
5762 }
5763 
5764 /*
5765  * intl_get_money_esc_ISO_symbol() - returns a string representing the
5766  * currency with escaped ISO symbol
5767  * return: currency escaped ISO symbol
5768  * currency(int): currency code
5769  */
5770 char *
5772 {
5773  if (currency >= (int) DIM (moneysymbols_esc_iso_codes))
5774  {
5776  }
5777  return moneysymbols_esc_iso_codes[currency];
5778 }
5779 
5780 /*
5781  * intl_get_money_UTF8_symbol() - returns a string representing the currency
5782  * UTF8 symbol, as a 3 letter string.
5783  * return: currency UTF8 symbol
5784  * currency(int): currency code
5785  */
5786 char *
5788 {
5789  if (currency >= (int) DIM (moneysymbols_utf8))
5790  {
5792  }
5793  return moneysymbols_utf8[currency];
5794 }
5795 
5796 /*
5797  * intl_get_money_ISO88591_symbol() - returns a string representing the currency
5798  * ISO88591 symbol, as a 3 letter string.
5799  * return: currency ISO88591 symbol
5800  * currency(int): currency code
5801  */
5802 char *
5804 {
5805  if (currency >= (int) DIM (moneysymbols_iso88591_codes))
5806  {
5808  }
5809  return moneysymbols_iso88591_codes[currency];
5810 }
5811 
/*
 * intl_binary_to_utf8 - converts a buffer from binary to UTF-8, replacing
 *                       invalid UTF-8 sequences with '?'
 *
 *   in_buf(in): input buffer
 *   in_size(in): size of input string (NUL terminator not included)
 *   out_buf(in/out): address of the output buffer; the buffer is supplied by
 *                    the caller and is written directly (the output is never
 *                    larger than the input)
 *   out_size(out): size of the converted string (no NUL terminator is written)
 *
 *  Valid ranges:
 *    - 1 byte : 00 - 7F
 *    - 2 bytes: C2 - DF , 80 - BF                     (U+80 .. U+7FF)
 *    - 3 bytes: E0      , A0 - BF , 80 - BF           (U+800 .. U+FFF)
 *               E1 - EC , 80 - BF , 80 - BF           (U+1000 .. U+CFFF)
 *               ED      , 80 - 9F , 80 - BF           (U+D000 .. U+D7FF)
 *               EE - EF , 80 - BF , 80 - BF           (U+E000 .. U+FFFF)
 *    - 4 bytes: F0      , 90 - BF , 80 - BF , 80 - BF (U+10000 .. U+3FFFF)
 *               F1 - F3 , 80 - BF , 80 - BF , 80 - BF (U+40000 .. U+FFFFF)
 *               F4      , 80 - 8F , 80 - BF , 80 - BF (U+100000 .. U+10FFFF)
 *
 *  Note : a single '?' replaces an invalid sequence. The replaced input is
 *         the first byte alone when it cannot start a sequence (80 - C1,
 *         F5 - FF); otherwise it is the first byte, the valid trailing bytes
 *         following it and the first offending byte (or the end of the
 *         buffer, when the sequence is truncated).
 */
void
intl_binary_to_utf8 (const unsigned char *in_buf, const int in_size, unsigned char **out_buf, int *out_size)
{
  const unsigned char *p = in_buf;
  const unsigned char *p_end = in_buf + in_size;
  unsigned char *p_out = (unsigned char *) *out_buf;

  while (p < p_end)
    {
      unsigned char second_min = 0x80;  /* accepted range for the second byte */
      unsigned char second_max = 0xbf;
      bool is_truncated = false;
      int seq_size;
      int i;

      if (*p < 0x80)
        {
          *p_out++ = *p++;
          continue;
        }

      /* range 80 - BF is not valid UTF-8 first byte */
      /* range C0 - C1 encodes only 00 - 7F (2 byte overlongs) */
      /* range F5 - FF is above U+10FFFF; the byte must be consumed here,
       * otherwise the loop would never advance */
      if (*p < 0xc2 || *p > 0xf4)
        {
          *p_out++ = '?';
          p++;
          continue;
        }

      /* size of sequence and constraints on second byte, given by first byte */
      if (*p <= 0xdf)
        {
          seq_size = 2;
        }
      else if (*p <= 0xef)
        {
          seq_size = 3;
          if (*p == 0xe0)
            {
              second_min = 0xa0;        /* lower values are overlongs */
            }
          else if (*p == 0xed)
            {
              second_max = 0x9f;        /* higher values are surrogates */
            }
        }
      else
        {
          seq_size = 4;
          if (*p == 0xf0)
            {
              second_min = 0x90;        /* lower values are overlongs */
            }
          else if (*p == 0xf4)
            {
              second_max = 0x8f;        /* higher values are above U+10FFFF */
            }
        }

      /* check trailing bytes; stop at first missing or offending byte */
      for (i = 1; i < seq_size; i++)
        {
          const unsigned char range_min = (i == 1) ? second_min : 0x80;
          const unsigned char range_max = (i == 1) ? second_max : 0xbf;

          if (p_end - p <= i)
            {
              is_truncated = true;
              break;
            }

          if (!UTF8_BYTE_IN_RANGE (p[i], range_min, range_max))
            {
              break;
            }
        }

      if (i == seq_size)
        {
          /* valid sequence, copy as it is */
          for (i = 0; i < seq_size; i++)
            {
              *p_out++ = *p++;
            }
          continue;
        }

      /* invalid sequence: skip the checked bytes (and the offending byte, if
       * there is one) and always emit the replacement character */
      p += is_truncated ? i : i + 1;
      *p_out++ = '?';
    }

  *out_size = CAST_STRLEN (p_out - *(out_buf));
}
6123 
6124 /*
6125  * intl_binary_to_euckr - converts a buffer from binary to euckr, replacing
6126  * invalid euckr sequences with '?'
6127  *
6128  * in_buf(in): buffer
6129  * in_size(in): size of input string (NUL terminator not included)
6130  * out_buf(int/out) : output buffer : uses the pre-allocated buffer passed
6131  * as input or a new allocated buffer;
6132  * out_size(out): size of string (NUL terminator not included)
6133  *
6134  * Valid ranges:
6135  * - 1 byte : 00 - 8E ; 90 - A0
6136  * - 2 bytes: A1 - FE , 00 - FF
6137  * - 3 bytes: 8F , 00 - FF , 00 - FF
6138  */
6139 void
6140 intl_binary_to_euckr (const unsigned char *in_buf, const int in_size, unsigned char **out_buf, int *out_size)
6141 {
6142  const unsigned char *p = in_buf;
6143  const unsigned char *p_end = NULL;
6144  const unsigned char *curr_char = NULL;
6145  unsigned char *p_out = NULL;
6146 
6147  p_out = (unsigned char *) *out_buf;
6148  p_end = in_buf + in_size;
6149 
6150  while (p < p_end)
6151  {
6152  curr_char = p;
6153 
6154  if (*p < 0x80)
6155  {
6156  *p_out++ = *p++;
6157  continue;
6158  }
6159 
6160  /* SS3 byte value starts a 3 bytes character */
6161  if (*p == SS3)
6162  {
6163  p++;
6164  p++;
6165  p++;
6166  if (p > p_end)
6167  {
6168  *p_out++ = '?';
6169  continue;
6170  }
6171  *p_out++ = *(p - 3);
6172  *p_out++ = *(p - 2);
6173  *p_out++ = *(p - 1);
6174  continue;
6175  }
6176 
6177  /* check 2 bytes sequences */
6178  if (UTF8_BYTE_IN_RANGE (*p, 0xa1, 0xfe))
6179  {
6180  p++;
6181  p++;
6182  if (p > p_end)
6183  {
6184  *p_out++ = '?';
6185  continue;
6186  }
6187  *p_out++ = *(p - 2);
6188  *p_out++ = *(p - 1);
6189  continue;
6190  }
6191  p++;
6192  *p_out++ = '?';
6193  }
6194 
6195  *out_size = CAST_STRLEN (p_out - *(out_buf));
6196 }
int intl_identifier_ncasecmp(const char *str1, const char *str2, const int len)
static int intl_tolower_euc(const unsigned char *src, unsigned char *d, int byte_size)
Definition: intl_support.c:834
int char_isspace(int c)
Definition: chartype.c:109
unsigned char bytes[TEXT_CONV_MAX_BYTES]
int intl_lower_string(const ALPHABET_DATA *alphabet, const unsigned char *src, unsigned char *dst, int length_in_chars)
#define NO_ERROR
Definition: error_code.h:46
int intl_mbs_len(const char *mbs)
Definition: intl_support.c:183
CONV_CP_TO_BYTES * utf8_to_text
TEXT_CONVERSION con_Iso_8859_1_conv
Definition: intl_support.c:128
bool intl_String_validation
Definition: intl_support.c:87
int intl_toupper_iso8859(unsigned char *s, int length)
Definition: intl_support.c:747
char * intl_get_money_ISO88591_symbol(const DB_CURRENCY currency)
int intl_identifier_casecmp_w_size(const INTL_LANG lang_id, unsigned char *str1, unsigned char *str2, const int size_str1, const int size_str2)
int intl_pad_size(INTL_CODESET codeset)
#define RET_ILUNI
int char_tolower(int c)
Definition: chartype.c:146
#define IS_8BIT(c)
Definition: intl_support.c:46
static int intl_tolower_utf8(const ALPHABET_DATA *a, const unsigned char *s, unsigned char *d, int length_in_chars, int *d_size)
char * intl_mbs_chr(const char *mbs, wchar_t wc)
Definition: intl_support.c:149
#define INTL_NEXT_CHAR(ptr, s, codeset, current_char_size)
Definition: intl_support.h:99
#define OUTPUT(charp_out)
int intl_text_single_byte_to_utf8(const char *in_buf, const int in_size, char **out_buf, int *out_size)
static int jisx0212_wctomb(unsigned char *r, ucs4_t wc, int n)
Definition: jisx0212.h:2188
static int intl_char_toupper_utf8(const ALPHABET_DATA *a, const unsigned char *s, const int size, unsigned char *d, unsigned char **next)
const char * intl_mbs_nth(const char *mbs, size_t n)
Definition: intl_support.c:219
static int intl_count_utf8_bytes(const unsigned char *s, int length_in_chars)
ALPHABET_DATA alphabet
static int jisx0212_mbtowc(ucs4_t *pwc, const unsigned char *s, int n)
Definition: jisx0212.h:917
void intl_binary_to_euckr(const unsigned char *in_buf, const int in_size, unsigned char **out_buf, int *out_size)
int intl_identifier_upper_string_size(const char *src)
int intl_get_currency_symbol_position(const DB_CURRENCY currency)
enum intl_utf8_validity INTL_UTF8_VALIDITY
Definition: intl_support.h:170
#define ISO_8859_9_LAST_CP
Definition: intl_support.c:75
static void intl_init_conv_iso8859_1_to_utf8(void)
#define CAST_STRLEN
Definition: porting.h:470
#define ER_QSTR_BAD_SRC_CODESET
Definition: error_code.h:744
#define SS3
Definition: intl_support.c:49
static int intl_toupper_euc(const unsigned char *src, unsigned char *d, int byte_size)
Definition: intl_support.c:859
bool intl_is_space(const char *str, const char *str_end, const INTL_CODESET codeset, int *space_size)
int intl_cp_to_utf8(const unsigned int codepoint, unsigned char *utf8_seq)
const unsigned char * intl_prev_char(const unsigned char *s, const unsigned char *s_start, INTL_CODESET codeset, int *prev_char_size)
int intl_case_match_tok(const INTL_LANG lang_id, const INTL_CODESET codeset, unsigned char *tok, unsigned char *src, const int size_tok, const int size_src, int *matched_size_src)
bool intl_is_max_bound_chr(INTL_CODESET codeset, const unsigned char *chr)
const unsigned char * intl_nextchar_euc(const unsigned char *s, int *curr_char_length)
Definition: intl_support.c:777
const unsigned char * intl_prevchar_utf8(const unsigned char *s, const unsigned char *s_start, int *prev_char_length)
int intl_mbs_spn(const char *mbs, const wchar_t *chars)
Definition: intl_support.c:269
int intl_count_utf8_chars(const unsigned char *s, int length_in_bytes)
INTL_ZONE intl_zone(int category)
#define LOCALE_KOREAN
Definition: intl_support.c:56
static char moneysymbols_console[][4]
static CONV_CP_TO_BYTES iso8859_1_To_utf8_conv[256]
Definition: intl_support.c:81
static int intl_count_euc_chars(const unsigned char *s, int length_in_bytes)
Definition: intl_support.c:890
char * intl_mbs_ncpy(char *mbs1, const char *mbs2, size_t n)
Definition: intl_support.c:489
const LANG_LOCALE_DATA * lang_locale(void)
char * intl_get_money_symbol_grammar(const DB_CURRENCY currency)
#define REINTERPRET_CAST(dest_type, expr)
Definition: porting.h:1080
int intl_euckr_to_utf8(const unsigned char *in_buf, const int in_size, unsigned char **out_buf, int *out_size)
int intl_identifier_lower(const char *src, char *dst)
static char moneysymbols_iso88591_codes[][4]
int intl_put_char(unsigned char *dest, const unsigned char *char_p, const INTL_CODESET codeset)
unsigned int intl_back_utf8_to_cp(const unsigned char *utf8_start, const unsigned char *utf8_last, unsigned char **last_byte__prev_char)
bool intl_is_min_bound_chr(INTL_CODESET codeset, const unsigned char *chr)
INTL_CODESET lang_charset(void)
DB_CURRENCY
Definition: dbtype_def.h:799
int char_islower_iso8859(int c)
Definition: chartype.c:189
void er_set(int severity, const char *file_name, const int line_no, int err_id, int num_args,...)
static void intl_init_conv_iso8859_9_to_utf8(void)
CONV_CP_TO_BYTES * text_to_utf8
int intl_text_utf8_to_dbcs(const char *in_buf, const int in_size, char **out_buf, int *out_size)
ALPHABET_DATA ident_alphabet
enum currency_check_mode CURRENCY_CHECK_MODE
Definition: intl_support.h:162
int intl_text_utf8_to_single_byte(const char *in_buf, const int in_size, char **out_buf, int *out_size)
#define assert(x)
static char moneysymbols_grammar[][5]
INTL_UTF8_VALIDITY intl_check_utf8(const unsigned char *buf, int size, char **pos)
int intl_identifier_namecmp(const char *str1, const char *str2)
#define ER_GENERIC_ERROR
Definition: error_code.h:49
void intl_pad_char(const INTL_CODESET codeset, unsigned char *pad_char, int *pad_size)
static int intl_strcasecmp_utf8_one_cp(const ALPHABET_DATA *alphabet, unsigned char *str1, unsigned char *str2, const int size_str1, const int size_str2, unsigned int cp1, unsigned int cp2, int *skip_size1, int *skip_size2)
unsigned int text_first_cp
#define ER_OUT_OF_VIRTUAL_MEMORY
Definition: error_code.h:50
const unsigned char * intl_nextchar_utf8(const unsigned char *s, int *curr_char_length)
const char * intl_skip_spaces(const char *str, const char *str_end, const INTL_CODESET codeset)
int char_toupper_iso8859(int c)
Definition: chartype.c:211
bool intl_is_currency_symbol(const char *src, DB_CURRENCY *currency, int *symbol_size, const CURRENCY_CHECK_MODE check_mode)
unsigned int intl_identifier_mht_1strlowerhash(const void *key, const unsigned int ht_size)
int intl_identifier_cmp(const char *str1, const char *str2)
int intl_identifier_casecmp(const char *str1, const char *str2)
unsigned int intl_dbcs_to_cp(const unsigned char *seq, const int size, const unsigned char *byte_flag, unsigned char **next_char)
static int ksc5601_mbtowc(ucs4_t *pwc, const unsigned char *s, int n)
Definition: ksc5601.h:1191
#define DB_MAX_IDENTIFIER_LENGTH
Definition: dbtype_def.h:495
int intl_iso88591_to_euckr(const unsigned char *in_buf, const int in_size, unsigned char **out_buf, int *out_size)
static int intl_toupper_utf8(const ALPHABET_DATA *a, const unsigned char *s, unsigned char *d, int length_in_chars, int *d_size)
unsigned int utf8_first_cp
static int intl_char_tolower_utf8(const ALPHABET_DATA *a, const unsigned char *s, const int size, unsigned char *d, unsigned char **next)
static char moneysymbols_esc_iso_codes[][5]
static int ksc5601_wctomb(unsigned char *r, ucs4_t wc, int n)
Definition: ksc5601.h:3012
void intl_binary_to_utf8(const unsigned char *in_buf, const int in_size, unsigned char **out_buf, int *out_size)
bool intl_is_bom_magic(const char *buf, const int size)
#define NULL
Definition: freelistheap.h:34
static const unsigned char len_utf8_char[256]
unsigned int intl_utf8_to_cp(const unsigned char *utf8, const int size, unsigned char **next_char)
unsigned int * lower_cp
static int intl_count_euc_bytes(const unsigned char *s, int length_in_chars)
Definition: intl_support.c:919
unsigned int utf8_last_cp
int intl_char_size(const unsigned char *src, int length_in_chars, INTL_CODESET src_codeset, int *byte_count)
#define MB_LEN_MAX
Definition: intl_support.h:53
INTL_LANG lang_id(void)
enum intl_zone INTL_ZONE
Definition: intl_support.h:150
int count(int &result, const cub_regex_object &reg, const std::string &src, const int position, const INTL_CODESET codeset)
const LANG_LOCALE_DATA * lang_get_specific_locale(const INTL_LANG lang, const INTL_CODESET codeset)
int intl_text_dbcs_to_utf8_ext(void *t, const unsigned char *in_buf, const int in_size, unsigned char **out_buf, int *out_size)
const char * intl_backskip_spaces(const char *str_begin, const char *str_end, const INTL_CODESET codeset)
int intl_euckr_to_iso88591(const unsigned char *in_buf, const int in_size, unsigned char **out_buf, int *out_size)
#define INTL_GET_NEXTCHAR_UTF8(c, l)
Definition: intl_support.h:71
#define INTL_UTF8_MAX_CHAR_SIZE
unsigned char byte_flag[256]
int intl_utf8_to_cp_list(const unsigned char *utf8, const int size, unsigned int *cp_array, const int max_array_size, int *array_count)
#define ARG_FILE_LINE
Definition: error_manager.h:44
unsigned int INTL_LANG
Definition: intl_support.h:132
int intl_cp_to_dbcs(const unsigned int codepoint, const unsigned char *byte_flag, unsigned char *seq)
int intl_identifier_lower_string_size(const char *src)
static CONV_CP_TO_BYTES iso8859_9_To_utf8_conv[256]
Definition: intl_support.c:77
#define strlen(s1)
Definition: intl_support.c:43
int intl_lower_string_size(const ALPHABET_DATA *alphabet, const unsigned char *src, int src_size, int src_length)
int intl_set_min_bound_chr(INTL_CODESET codeset, char *chr)
int intl_mbs_ncasecmp(const char *mbs1, const char *mbs2, size_t n)
Definition: intl_support.c:441
char * intl_get_money_symbol_console(const DB_CURRENCY currency)
int char_tolower_iso8859(int c)
Definition: chartype.c:200
int char_toupper(int c)
Definition: chartype.c:157
char * intl_get_money_symbol(const DB_CURRENCY currency, INTL_CODESET codeset)
enum intl_codeset INTL_CODESET
Definition: intl_support.h:190
int intl_char_count(const unsigned char *src, int length_in_bytes, INTL_CODESET src_codeset, int *char_count)
Definition: intl_support.c:983
int intl_utf8_to_euckr(const unsigned char *in_buf, const int in_size, unsigned char **out_buf, int *out_size)
bool prm_get_bool_value(PARAM_ID prm_id)
bool intl_Mbs_support
Definition: intl_support.c:86
static CONV_CP_TO_BYTES utf8_Cp_to_iso_8859_9_conv[ISO_8859_9_LAST_CP-ISO_8859_9_FIRST_CP+1]
Definition: intl_support.c:78
int intl_mbs_casecmp(const char *mbs1, const char *mbs2)
Definition: intl_support.c:358
char * intl_get_money_ISO_symbol(const DB_CURRENCY currency)
int intl_cmp_char(const unsigned char *s1, const unsigned char *s2, INTL_CODESET codeset, int *char_size)
int i
Definition: dynamic_load.c:954
int char_isupper_iso8859(int c)
Definition: chartype.c:177
int intl_identifier_fix(char *name, int ident_max_size, bool error_on_case_overflow)
#define CHAR_BYTE_TO_UPPER(c)
Definition: intl_support.c:71
TEXT_CONVERSION con_Iso_8859_9_conv
Definition: intl_support.c:116
INTL_UTF8_VALIDITY intl_check_euckr(const unsigned char *buf, int size, char **pos)
int intl_upper_string(const ALPHABET_DATA *alphabet, const unsigned char *src, unsigned char *dst, int length_in_chars)
unsigned char size
int intl_text_dbcs_to_utf8(const char *in_buf, const int in_size, char **out_buf, int *out_size)
static char moneysymbols_utf8[][4]
int intl_convert_charset(const unsigned char *src, int length_in_chars, INTL_CODESET src_codeset, unsigned char *dest, INTL_CODESET dest_codeset, int *unconverted)
Definition: intl_support.c:953
TEXT_CONVERSION * lang_get_txt_conv(void)
#define INTL_CASING_EXPANSION_MULTIPLIER
int intl_reverse_string(const unsigned char *src, unsigned char *dst, int length_in_chars, int size_in_bytes, INTL_CODESET codeset)
unsigned int text_last_cp
INTL_UTF8_VALIDITY intl_check_string(const char *buf, int size, char **pos, const INTL_CODESET codeset)
const unsigned char * intl_next_char(const unsigned char *s, INTL_CODESET codeset, int *current_char_size)
#define CHAR_BYTE_TO_LOWER(c)
Definition: intl_support.c:69
int intl_tolower_iso8859(unsigned char *s, int length)
Definition: intl_support.c:721
#define INTL_CODESET_MULT(codeset)
Definition: intl_support.h:77
int intl_set_max_bound_chr(INTL_CODESET codeset, char *chr)
int intl_fast_iso88591_to_utf8(const unsigned char *in_buf, const int in_size, unsigned char **out_buf, int *out_size)
char * intl_get_money_UTF8_symbol(const DB_CURRENCY currency)
int intl_utf8_to_iso88591(const unsigned char *in_buf, const int in_size, unsigned char **out_buf, int *out_size)
const unsigned char * intl_prevchar_euc(const unsigned char *s, const unsigned char *s_start, int *prev_char_length)
Definition: intl_support.c:806
int intl_upper_string_size(const ALPHABET_DATA *alphabet, const unsigned char *src, int src_size, int src_length)
const char ** p
Definition: dynamic_load.c:945
int intl_identifier_upper(const char *src, char *dst)
static char moneysymbols_iso_codes[][4]
#define ISO_8859_9_FIRST_CP
Definition: intl_support.c:74
char * intl_get_money_esc_ISO_symbol(const DB_CURRENCY currency)
#define UTF8_BYTE_IN_RANGE(b, r1, r2)
int intl_text_single_byte_to_utf8_ext(void *t, const unsigned char *in_buf, const int in_size, unsigned char **out_buf, int *out_size)
const unsigned char *const intl_Len_utf8_char
unsigned int * upper_cp