CUBRID Engine  latest
double_byte_support.c
Go to the documentation of this file.
1 /*
2  * Copyright 2008 Search Solution Corporation
3  * Copyright 2016 CUBRID Corporation
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  */
18 
19 /*
20  * double_byte_support.c - parser supporting functions for double byte character set
21  */
22 
23 #ident "$Id$"
24 
25 #include "config.h"
26 
27 #include <stdio.h>
28 #include "parser.h"
29 #include "language_support.h"
30 
/* Number of characters the unget (look-ahead) buffer can hold */
#define MAX_UNGET_SIZE 16
/* Double-byte white space code; dbcs_convert_w_char () maps it to 0x20 */
#define WSPACE_CHAR 0xa1a1

/* Push X onto the unget buffer, then return Y from the enclosing function */
#define DBCS_UNGET_RETURN(X, Y) \
  do { *dbcs_Unget_buf++ = (X); return (Y); } while (0)

/* Set the scanning status to X, then return Y from the enclosing function */
#define DBCS_STATUS_RETURN(X, Y) \
  do { dbcs_Input_status = (X); return (Y); } while (0)

/*
 * Set the scanning status to X, push Y onto the unget buffer, then return Z
 * from the enclosing function
 */
#define DBCS_STATUS_UNGET_RETURN(X, Y, Z) \
  do { dbcs_Input_status = (X); \
       *dbcs_Unget_buf++ = (Y); return (Z); } while (0)

/* Fetch the next byte through the next_byte callback of parser context p */
#define DBCS_NEXT_CHAR(p) (((p)->next_byte) (p))
45 
/* Scanning status of the double-byte input filter */
typedef enum
{
  CSQL_,                        /* In CSQL language */
  CSQL_BEGIN_,                  /* Just before CSQL */
  C_COMMENT_,                   /* Within C comment */
  C_COMMENT_BEGIN_,             /* Just before C comment */
  SQL_COMMENT_,                 /* -- sql comment */
  SQL_COMMENT_BEGIN_,           /* just before sql comment */
  CPP_COMMENT_,                 /* C++ comment */
  CPP_COMMENT_BEGIN_,           /* just before C++ comment */
  DQS_,                         /* Double quoted string */
  DQS_OCTAL_,                   /* Octal escape in DQS */
  DQS_HEXA_,                    /* Hexadecimal escape in DQS */
  /*
   * NOTE(review): the next two constants are used by the status dispatcher
   * of this file; their relative order is assumed - confirm against the
   * upstream source.
   */
  DQS_HEXA_BEGIN_,              /* 'x' of a \0x__ escape is scanned next */
  DQS_HEXA_BEGIN_2,             /* '0' of a \0x__ escape is scanned next */
  DQS_DECIMAL_,                 /* Decimal escape in DQS */
  DQS_TRANSPARENT_,             /* DQS, but no check */
  DQS_TRANSPARENT_2,            /* has to skip two char */
  SQS_,                         /* Single quoted string */
  SQS_TRANSPARENT_              /* SQS, but no check */
} DBCS_INPUT_STATUS;
66 
68 static int *dbcs_Unget_buf;
69 static unsigned int dbcs_Latter_byte;
70 static int dbcs_Latter_flag;
72 
74 static int dbcs_convert_w_char (int input_char);
76 static int dbcs_process_csql (PARSER_CONTEXT * parser, int converted_char);
77 static int dbcs_process_double_quote_string (PARSER_CONTEXT * parser, int input_char, int converted_char);
78 static int dbcs_process_double_quote_string_octal (PARSER_CONTEXT * parser, int input_char, int converted_char);
79 static int dbcs_process_double_quote_string_hexa (PARSER_CONTEXT * parser, int input_char, int converted_char);
80 static int dbcs_process_double_quote_string_decimal (PARSER_CONTEXT * parser, int input_char, int converted_char);
81 static int dbcs_process_single_quote_string (PARSER_CONTEXT * parser, int input_char, int converted_char);
82 static int dbcs_process_c_comment (PARSER_CONTEXT * parser, int input_char, int converted_char);
83 
84 /*
85  * dbcs_start_input () -
86  * return: none
87  */
88 void
90 {
91  dbcs_Unget_buf = DBCS_UNGET_BUF; /* Lead ahead buffer */
92  dbcs_Input_status = CSQL_; /* Scanning status */
93  dbcs_Latter_flag = 0; /* Two byte code */
94 }
95 
96 /*
97  * dbcs_get_next () - Read one byte
98  * return:
99  * parser(in):
100  */
101 int
103 {
104  int input_char;
105 
106  if (dbcs_Latter_flag)
107  {
108  dbcs_Latter_flag = 0;
109  input_char = dbcs_Latter_byte;
110  }
111  else
112  {
113  input_char = dbcs_get_next_token_wchar (parser);
114  if (input_char != EOF)
115  {
116  if (input_char & 0xff00)
117  {
118  dbcs_Latter_flag = 1;
119  dbcs_Latter_byte = input_char & 0x00ff;
120  input_char = ((input_char & 0xff00) >> 8);
121  }
122  else
123  {
124  input_char = (input_char & 0x00ff);
125  }
126  }
127  }
128 
129  return input_char;
130 }
131 
132 
133 /*
134  * dbcs_get_next_token_wchar () -
135  * return:
136  * parser(in):
137  */
138 static int
140 {
141  int converted_char;
142  int input_char;
143 
144  if ((input_char = dbcs_get_next_w_char (parser)) == EOF)
145  {
146  return (EOF);
147  }
148 
149  converted_char = dbcs_convert_w_char (input_char);
150 
151 
152  switch (dbcs_Input_status)
153  {
154  case CSQL_:
155  /* Scanning over CSQL part, not in comments or character string */
156  return (dbcs_process_csql (parser, converted_char));
157 
158  case CSQL_BEGIN_:
159  /* Typically, the last character of C-style comment */
160  DBCS_STATUS_RETURN (CSQL_, converted_char);
161 
162  case C_COMMENT_BEGIN_:
163  /* Scanning latter part of the beginning of C-style comment */
164  DBCS_STATUS_RETURN (C_COMMENT_, converted_char);
165 
166  case SQL_COMMENT_BEGIN_:
167  /* Scanning latter part of the beginning of SQL comment */
168  DBCS_STATUS_RETURN (SQL_COMMENT_, converted_char);
169 
170  case CPP_COMMENT_BEGIN_:
171  /* Scanning latter part of the C++ style comment */
172  DBCS_STATUS_RETURN (CPP_COMMENT_, converted_char);
173 
174  case SQL_COMMENT_:
175  case CPP_COMMENT_:
176  /* Scanning C++ style or SQL comment. Because termination condition is the same, we have common source lines */
177  if (converted_char == '\n')
178  {
179  DBCS_STATUS_RETURN (CSQL_, converted_char);
180  }
181  else
182  {
183  return (input_char);
184  }
185 
186  case C_COMMENT_:
187  /* Scanning C comments. */
188  return (dbcs_process_c_comment (parser, input_char, converted_char));
189 
190  case SQS_:
191  /* Scanning single quote character strings */
192  return (dbcs_process_single_quote_string (parser, input_char, converted_char));
193 
194  case SQS_TRANSPARENT_:
195  /* This happenes in the latter character of contiguous back slash */
196  DBCS_STATUS_RETURN (SQS_, input_char);
197 
198  case DQS_:
199  /* Scanning double quote character strings */
200  return (dbcs_process_double_quote_string (parser, input_char, converted_char));
201 
202  case DQS_TRANSPARENT_:
203  case DQS_TRANSPARENT_2:
204  /* This happenes in the latter characters of escape sequences in double quote string */
205  DBCS_STATUS_RETURN (DQS_, input_char);
206 
207  case DQS_OCTAL_:
208  /* Scanning octal escape sequence in double quote string */
209  return (dbcs_process_double_quote_string_octal (parser, input_char, converted_char));
210 
211  case DQS_HEXA_:
212  /* Scanning hexadecimal escape sequence in double quote string */
213  return (dbcs_process_double_quote_string_hexa (parser, input_char, converted_char));
214 
215  case DQS_HEXA_BEGIN_:
216  /* Scanning third character of hexadecimal escape sequence in double quote string, that is, x (\0x__) */
217  DBCS_STATUS_RETURN (DQS_HEXA_, converted_char);
218 
219  case DQS_HEXA_BEGIN_2:
220  /* Scanning second character of hexadecimal escape sequence in double quote string, that is, 0 (\0x__) */
221  DBCS_STATUS_RETURN (DQS_HEXA_BEGIN_, converted_char);
222 
223  case DQS_DECIMAL_:
224  /* Scanning decimal escape sequence in double quote string */
225  return (dbcs_process_double_quote_string_decimal (parser, input_char, converted_char));
226  }
227 
228  return converted_char;
229 }
230 
231 
232 /*
233  * dbcs_process_double_quote_string_hexa () - Scanning hexadecimal
234  * representation of escape sequence in double-quote string
235  * return:
236  * parser(in):
237  * input_char(in):
238  * converted_char(in):
239  */
240 static int
241 dbcs_process_double_quote_string_hexa (PARSER_CONTEXT * parser, int input_char, int converted_char)
242 {
243  if ((converted_char >= '0' && converted_char <= '9') || (converted_char >= 'A' && converted_char <= 'F')
244  || (converted_char >= 'a' && converted_char <= 'f'))
245  {
246  return (converted_char); /* Within the sequence */
247  }
248  else
249  {
250  /* Sequence terminated. Back to double quote string */
252  return (dbcs_process_double_quote_string (parser, input_char, converted_char));
253  }
254 }
255 
256 
257 /*
258  * dbcs_process_double_quote_string_decimal () - Scanning decomal
259  * representation of escape sequence in double-quote string
260  * return:
261  * parser(in):
262  * input_char(in):
263  * converted_char(in):
264  */
265 static int
266 dbcs_process_double_quote_string_decimal (PARSER_CONTEXT * parser, int input_char, int converted_char)
267 {
268  if (converted_char >= '0' && converted_char <= '9')
269  {
270  return (converted_char); /* Within the sequence */
271  }
272  else
273  {
274  /* Sequence terminated. Back to double quote string */
276  return (dbcs_process_double_quote_string (parser, input_char, converted_char));
277  }
278 }
279 
280 
281 /*
282  * dbcs_process_double_quote_string_octal () - Scanning octal representation
283  * of escape sequence in double-quote string
284  * return:
285  * parser(in):
286  * input_char(in):
287  * converted_char(in):
288  */
289 static int
290 dbcs_process_double_quote_string_octal (PARSER_CONTEXT * parser, int input_char, int converted_char)
291 {
292  if (converted_char >= '0' && converted_char <= '7')
293  {
294  return (converted_char); /* Within the sequence */
295  }
296  else
297  {
298  /* Sequence terminated. Back to double quote string */
300  return (dbcs_process_double_quote_string (parser, input_char, converted_char));
301  }
302 }
303 
304 
305 /*
306  * dbcs_process_double_quote_string () - Scan Double-quote string
307  * return:
308  * parser(in):
309  * input_char(in):
310  * converted_char(in):
311  *
312  * Note :
313  * When double quote is found in double-quote string,
314  * there are three possibilities:
315  * 1) Termination of the string,
316  * 2) Escape sequence for double quote itself.
317  * 3) Error token.
318  */
319 static int
320 dbcs_process_double_quote_string (PARSER_CONTEXT * parser, int input_char, int converted_char)
321 {
322  switch (converted_char)
323  {
324  case '"':
325  {
326  int c1, c1_c;
327 
328  if ((c1 = dbcs_get_next_w_char (parser)) == EOF)
329  {
330  DBCS_STATUS_UNGET_RETURN (CSQL_, c1, converted_char);
331  }
332 
333  switch (c1_c = dbcs_convert_w_char (c1))
334  {
335  case '"':
336  /*
337  * Contiguous double quote. Then we may scann double quote
338  * string later still.
339  */
340  if (input_char == c1)
341  {
342  if (input_char == '"')
343  {
344  /*
345  * Single byte double quote. Then, latter half character
346  * has to be scanned next time.
347  */
348  DBCS_STATUS_UNGET_RETURN (DQS_TRANSPARENT_, c1, input_char);
349  }
350  else
351  {
352  /*
353  * Double byte double quote. Main scanner does not require
354  * escape sequence to accept this.
355  */
356  return (input_char);
357  }
358  }
359  else
360  {
361  /* Contiguous double quote */
362  DBCS_STATUS_UNGET_RETURN (CSQL_, c1, converted_char);
363  }
364  default:
365  /*
366  * Double quote did not appear after the double quote. Then
367  * terminate double quote string status and go back to CSQL
368  * statement status.
369  */
370  DBCS_STATUS_UNGET_RETURN (CSQL_, c1, converted_char);
371  }
372  }
373  case '\\':
374  { /* back slash escapement */
375  int c1, c1_c;
376 
377  if ((c1 = dbcs_get_next_w_char (parser)) == EOF)
378  {
379  DBCS_UNGET_RETURN (c1, converted_char);
380  }
381 
382  switch (c1_c = dbcs_convert_w_char (c1))
383  {
384  case '\n':
385  case 'a':
386  case 'b':
387  case 'f':
388  case 'n':
389  case 'r':
390  case 't':
391  case 'v':
392  /* standard escapes to represent control characters */
393  DBCS_STATUS_UNGET_RETURN (DQS_TRANSPARENT_, c1_c, converted_char);
394 
395  case '?':
396  if (c1 == '?')
397  {
398  DBCS_STATUS_UNGET_RETURN (DQS_TRANSPARENT_, c1, converted_char);
399  }
400  else
401  {
402  return (c1);
403  }
404 
405  case '\'':
406  if (c1 == '\'')
407  {
408  DBCS_STATUS_UNGET_RETURN (DQS_TRANSPARENT_, c1_c, converted_char);
409  }
410  else
411  {
412  return (c1);
413  }
414 
415  case '"':
416  if (c1 == '"')
417  {
418  DBCS_STATUS_UNGET_RETURN (DQS_TRANSPARENT_, c1_c, converted_char);
419  }
420  else
421  {
422  return (c1);
423  }
424 
425  case '0':
426  { /* Hexadecimal or Octal escape */
427  int c2, c2_c;
428 
429  if ((c2 = dbcs_get_next_w_char (parser)) == EOF)
430  {
431  /* Unexpected end of sequence */
432  *dbcs_Unget_buf++ = c2;
433  DBCS_STATUS_UNGET_RETURN (DQS_OCTAL_, c1, converted_char);
434  }
435 
436  switch (c2_c = dbcs_convert_w_char (c2))
437  {
438  case 'x':
439  /* Begen hexadecimal representation */
440  *dbcs_Unget_buf++ = c2;
441  DBCS_STATUS_UNGET_RETURN (DQS_HEXA_BEGIN_2, c1, converted_char);
442  default:
443  /* Begin Octal representation */
444  *dbcs_Unget_buf++ = c2;
445  DBCS_STATUS_UNGET_RETURN (DQS_OCTAL_, c1, converted_char);
446  }
447  }
448 
449  default:
450  /* Decimal escape or Self escape to ASCII character */
451  if (c1_c >= 1 && c1_c <= 9)
452  {
453  DBCS_STATUS_UNGET_RETURN (DQS_DECIMAL_, c1, converted_char);
454  }
455 
456  if ((c1 & 0xff00) != 0)
457  {
458  return (c1);
459  }
460  else
461  {
462  DBCS_STATUS_UNGET_RETURN (DQS_TRANSPARENT_, c1, converted_char);
463  }
464  }
465  default:
466  return (input_char);
467  }
468  }
469 }
470 
471 
472 
473 /*
474  * dbcs_process_single_quote_string () - Scanning single quote character string
475  * return:
476  * parser(in):
477  * input_char(in):
478  * converted_char(in):
479  */
480 static int
481 dbcs_process_single_quote_string (PARSER_CONTEXT * parser, int input_char, int converted_char)
482 {
483  switch (converted_char)
484  {
485  case '\'':
486  { /* detect some escape sequences */
487  int c1, c1_c;
488 
489  if ((c1 = dbcs_get_next_w_char (parser)) == EOF)
490  {
491  DBCS_STATUS_UNGET_RETURN (CSQL_, c1, converted_char);
492  }
493 
494  switch (c1_c = dbcs_convert_w_char (c1))
495  {
496  case '\'':
497  /* Escape for ASCII-coded single quote or Termination of quote */
498  if (input_char == c1)
499  {
500  if (input_char == '\'')
501  {
502  DBCS_STATUS_UNGET_RETURN (SQS_TRANSPARENT_, c1, input_char);
503  }
504  else
505  {
506  return (input_char);
507  }
508  }
509  else
510  {
511  DBCS_STATUS_UNGET_RETURN (CSQL_, c1, converted_char);
512  }
513 
514  default:
515  /* Terminate single quote string */
516  DBCS_STATUS_UNGET_RETURN (CSQL_, c1, converted_char);
517  }
518  }
519 
520  case '\\':
521  {
522  /* In single quote string, backslash escape is used only to delimit lengthy string with new line. */
523  int c1;
524 
525  if ((c1 = dbcs_get_next_w_char (parser)) == EOF)
526  {
527  DBCS_UNGET_RETURN (c1, converted_char);
528  }
529 
530  if (c1 == '\n')
531  {
532  DBCS_UNGET_RETURN (c1, converted_char);
533  }
534  else
535  {
536  DBCS_UNGET_RETURN (c1, input_char);
537  }
538  }
539 
540  default:
541  /* Other character is simple character in string */
542  return (input_char);
543  }
544 }
545 
546 
547 /*
548  * dbcs_process_c_comment () - Scanning C-style comment
549  * return:
550  * parser(in):
551  * input_char(in):
552  * converted_char(in):
553  */
554 static int
555 dbcs_process_c_comment (PARSER_CONTEXT * parser, int input_char, int converted_char)
556 {
557  switch (converted_char)
558  {
559  case '*':
560  {
561  int c1, c1_c;
562 
563  if ((c1 = dbcs_get_next_w_char (parser)) == EOF)
564  {
565  DBCS_UNGET_RETURN (c1, converted_char);
566  }
567 
568  if ((c1_c = dbcs_convert_w_char (c1)) == '/')
569  {
570  /*
571  * Because this is the end of the C-comment, converted value is
572  * returned so that this is recognized by the parser correctly.
573  */
574  *dbcs_Unget_buf++ = c1_c;
575  DBCS_STATUS_RETURN (CSQL_BEGIN_, converted_char);
576  }
577 
578  /*
579  * Because this is a part of comment, input character is returned
580  * without conversion.
581  */
582  DBCS_UNGET_RETURN (c1, input_char);
583  }
584 
585  default:
586  return (input_char);
587  }
588 }
589 
590 
591 /*
592  * dbcs_process_csql () - Scanning CSQL language body
593  * return:
594  * parser(in):
595  * converted_char(in):
596  */
597 static int
598 dbcs_process_csql (PARSER_CONTEXT * parser, int converted_char)
599 {
600  switch (converted_char)
601  {
602  case '"': /* Start Double quoted string " ... " */
604  return (converted_char);
605 
606  case '\'': /* Start Single quoted string ' ... ' */
608  return (converted_char);
609 
610  case '-': /* Maybe start of SQL comment '-- ... ' */
611  {
612  int c1, c1_c;
613 
614  if ((c1 = dbcs_get_next_w_char (parser)) == EOF)
615  {
616  DBCS_UNGET_RETURN (c1, converted_char);
617  }
618 
619  if ((c1_c = dbcs_convert_w_char (c1)) == '-')
620  {
622  }
623  DBCS_UNGET_RETURN (c1, converted_char);
624  }
625 
626  case '/': /* Maybe C++ comment or C comment */
627  {
628  int c1, c1_c;
629 
630  if ((c1 = dbcs_get_next_w_char (parser)) == EOF)
631  {
632  DBCS_UNGET_RETURN (c1, converted_char);
633  }
634 
635  switch (c1_c = dbcs_convert_w_char (c1))
636  {
637  case '/': /* C++ comment */
639  DBCS_UNGET_RETURN (c1, converted_char);
640 
641  case '*': /* C comment */
643  DBCS_UNGET_RETURN (c1, converted_char);
644 
645  default:
646  DBCS_UNGET_RETURN (c1, converted_char);
647  }
648  }
649 
650  default:
651  return (converted_char);
652  }
653 }
654 
655 
656 /*
657  * dbcs_get_next_w_char () - Read one character (not byte)
658  * return:
659  * parser(in):
660  */
661 static int
663 {
664  int input_char;
665  int return_char;
666 
668  {
669  return_char = *--dbcs_Unget_buf;
670  }
671  else if ((input_char = DBCS_NEXT_CHAR (parser)) == EOF)
672  {
673  return_char = EOF;
674  }
675  else
676  {
677  if ((input_char & 0x80) == 0)
678  {
679  return_char = input_char;
680  }
681  else
682  {
683  int c1;
684 
685  if ((c1 = DBCS_NEXT_CHAR (parser)) == EOF)
686  {
687  return_char = EOF;
688  }
689  return_char = ((input_char & 0xff) << 8) | c1;
690  }
691  }
692 
693  return (return_char);
694 }
695 
696 
697 /*
698  * dbcs_convert_w_char () - convert wide character into ASCII
699  * return:
700  * input_char(in):
701  */
702 static int
703 dbcs_convert_w_char (int input_char)
704 {
707  {
708  return (input_char);
709  }
710  else
711  {
712  if (input_char == WSPACE_CHAR)
713  {
714  *dbcs_Unget_buf++ = 0x20;
715  return (0x20);
716  }
717 
718  return (input_char);
719  }
720 }
#define WSPACE_CHAR
static int dbcs_process_double_quote_string_decimal(PARSER_CONTEXT *parser, int input_char, int converted_char)
#define DBCS_STATUS_RETURN(X, Y)
DBCS_INPUT_STATUS
static DBCS_INPUT_STATUS dbcs_Input_status
#define MAX_UNGET_SIZE
#define DBCS_UNGET_RETURN(X, Y)
static int DBCS_UNGET_BUF[MAX_UNGET_SIZE]
static int dbcs_process_double_quote_string_octal(PARSER_CONTEXT *parser, int input_char, int converted_char)
#define DBCS_NEXT_CHAR(p)
SP_PARSER_CTX * parser
#define DBCS_STATUS_UNGET_RETURN(X, Y, Z)
void dbcs_start_input(void)
static int * dbcs_Unget_buf
static unsigned int dbcs_Latter_byte
static int dbcs_process_c_comment(PARSER_CONTEXT *parser, int input_char, int converted_char)
static int dbcs_process_double_quote_string(PARSER_CONTEXT *parser, int input_char, int converted_char)
static int dbcs_process_csql(PARSER_CONTEXT *parser, int converted_char)
static int dbcs_Latter_flag
static int dbcs_process_single_quote_string(PARSER_CONTEXT *parser, int input_char, int converted_char)
static int dbcs_get_next_w_char(PARSER_CONTEXT *parser)
int dbcs_get_next(PARSER_CONTEXT *parser)
static int dbcs_convert_w_char(int input_char)
static int dbcs_process_double_quote_string_hexa(PARSER_CONTEXT *parser, int input_char, int converted_char)
static int dbcs_get_next_token_wchar(PARSER_CONTEXT *parser)