File double_byte_support.c
File List > cubrid > src > parser > double_byte_support.c
Go to the documentation of this file
/*
* Copyright 2008 Search Solution Corporation
* Copyright 2016 CUBRID Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
/*
* double_byte_support.c - parser supporting functions for double byte character set
*/
#ident "$Id$"
#include "config.h"
#include <stdio.h>
#include "parser.h"
#include "language_support.h"
/* Number of entries available in the look-ahead (unget) stack. */
#define MAX_UNGET_SIZE 16

/* Two-byte character code that dbcs_convert_w_char () maps to 0x20. */
#define WSPACE_CHAR 0xa1a1

/*
 * The three *_RETURN helpers below expand to a block that ends with a
 * `return' statement, so they can only be used inside a function returning
 * int and nothing placed after them in the same path is executed.
 * Every macro parameter is parenthesized so that an expression argument
 * keeps its meaning after expansion.
 */

/* Push X on the unget stack, then return Y. */
#define DBCS_UNGET_RETURN(X, Y) \
  do { *dbcs_Unget_buf++ = (X); return (Y); } while (0)

/* Switch the scanner to state X, then return Y. */
#define DBCS_STATUS_RETURN(X, Y) \
  do { dbcs_Input_status = (X); return (Y); } while (0)

/* Switch the scanner to state X, push Y on the unget stack, then return Z. */
#define DBCS_STATUS_UNGET_RETURN(X, Y, Z) \
  do { dbcs_Input_status = (X); \
       *dbcs_Unget_buf++ = (Y); return (Z); } while (0)

/* Fetch the next byte through the parser's next_byte callback (p is evaluated twice). */
#define DBCS_NEXT_CHAR(p) (((p)->next_byte) (p))
/*
 * DBCS_INPUT_STATUS - scanner state kept in dbcs_Input_status.
 *
 * The *_BEGIN_ states are one-shot: dbcs_get_next_token_wchar () returns the
 * pending character and moves to the state named in the comment.
 */
typedef enum
{
  /* Plain CSQL text, outside any comment or string */
  CSQL_ = 0,
  /* One character left before going back to CSQL_ */
  CSQL_BEGIN_,
  /* Inside a C comment */
  C_COMMENT_,
  /* One character left before entering C_COMMENT_ */
  C_COMMENT_BEGIN_,
  /* Inside a "--" SQL comment */
  SQL_COMMENT_,
  /* One character left before entering SQL_COMMENT_ */
  SQL_COMMENT_BEGIN_,
  /* Inside a C++ comment */
  CPP_COMMENT_,
  /* One character left before entering CPP_COMMENT_ */
  CPP_COMMENT_BEGIN_,
  /* Inside a double quoted string */
  DQS_,
  /* Inside an octal escape of a double quoted string */
  DQS_OCTAL_,
  /* Inside a hexadecimal escape of a double quoted string */
  DQS_HEXA_,
  /* The 'x' of a "\0x" escape is pending; next state is DQS_HEXA_ */
  DQS_HEXA_BEGIN_,
  /* The '0' of a "\0x" escape is pending; next state is DQS_HEXA_BEGIN_ */
  DQS_HEXA_BEGIN_2,
  /* Inside a decimal escape of a double quoted string */
  DQS_DECIMAL_,
  /* Double quoted string: next character is passed through unchecked */
  DQS_TRANSPARENT_,
  /* Originally documented as "has to skip two char"; the scanner treats it like DQS_TRANSPARENT_ */
  DQS_TRANSPARENT_2,
  /* Inside a single quoted string */
  SQS_,
  /* Single quoted string: next character is passed through unchecked */
  SQS_TRANSPARENT_
} DBCS_INPUT_STATUS;
/* Backing storage of the look-ahead stack (characters pushed back for re-reading). */
static int DBCS_UNGET_BUF[MAX_UNGET_SIZE];
/*
 * Next free slot of DBCS_UNGET_BUF. Pushes use `*dbcs_Unget_buf++ = c',
 * pops use `*--dbcs_Unget_buf'; the stack is empty when this equals
 * DBCS_UNGET_BUF. No bounds check is performed on push.
 */
static int *dbcs_Unget_buf;
/* Low byte of a two-byte character, to be handed out by the next dbcs_get_next () call. */
static unsigned int dbcs_Latter_byte;
/* Non-zero while dbcs_Latter_byte holds a byte that has not been returned yet. */
static int dbcs_Latter_flag;
/* Current scanner state; reset to CSQL_ by dbcs_start_input (). */
static DBCS_INPUT_STATUS dbcs_Input_status;
/*
 * Forward declarations of the file-local helpers.
 * In the dbcs_process_* functions, input_char is the character as read and
 * converted_char is the value returned for it by dbcs_convert_w_char ().
 */
static int dbcs_get_next_w_char (PARSER_CONTEXT * parser);
static int dbcs_convert_w_char (int input_char);
static int dbcs_get_next_token_wchar (PARSER_CONTEXT * parser);
static int dbcs_process_csql (PARSER_CONTEXT * parser, int converted_char);
static int dbcs_process_double_quote_string (PARSER_CONTEXT * parser, int input_char, int converted_char);
static int dbcs_process_double_quote_string_octal (PARSER_CONTEXT * parser, int input_char, int converted_char);
static int dbcs_process_double_quote_string_hexa (PARSER_CONTEXT * parser, int input_char, int converted_char);
static int dbcs_process_double_quote_string_decimal (PARSER_CONTEXT * parser, int input_char, int converted_char);
static int dbcs_process_single_quote_string (PARSER_CONTEXT * parser, int input_char, int converted_char);
static int dbcs_process_c_comment (PARSER_CONTEXT * parser, int input_char, int converted_char);
/*
 * dbcs_start_input () - reset the scanner before reading a new input
 *   return: none
 *
 * Note: clears the pending second byte, puts the scanner in the CSQL_
 *       state and empties the look-ahead stack.
 */
void
dbcs_start_input (void)
{
  /* No second byte of a two-byte character is waiting to be returned. */
  dbcs_Latter_flag = 0;

  /* Scanning starts in plain CSQL text. */
  dbcs_Input_status = CSQL_;

  /* Empty look-ahead stack: the pointer sits on the first slot. */
  dbcs_Unget_buf = DBCS_UNGET_BUF;
}
/*
 * dbcs_get_next () - Read one byte
 *   return: next byte of the scanned input, or EOF
 *   parser(in): parser context handed to the lower-level readers
 *
 * Note: characters come from dbcs_get_next_token_wchar (). When one has any
 *       of the bits 0xff00 set, its high byte is returned now and its low
 *       byte is kept for the following call.
 */
int
dbcs_get_next (PARSER_CONTEXT * parser)
{
  int wide_char;

  /* Second half of the previous two-byte character is still pending. */
  if (dbcs_Latter_flag)
    {
      dbcs_Latter_flag = 0;
      return (int) dbcs_Latter_byte;
    }

  wide_char = dbcs_get_next_token_wchar (parser);
  if (wide_char == EOF)
    {
      return wide_char;
    }

  /* Single-byte character: hand it out as is. */
  if ((wide_char & 0xff00) == 0)
    {
      return wide_char & 0x00ff;
    }

  /* Two-byte character: remember the low byte, return the high byte. */
  dbcs_Latter_flag = 1;
  dbcs_Latter_byte = wide_char & 0x00ff;
  return (wide_char & 0xff00) >> 8;
}
/*
 * dbcs_get_next_token_wchar () - read one character and run it through the
 *                                state machine held in dbcs_Input_status
 *   return: character to hand to the caller, or EOF
 *   parser(in): parser context handed to the lower-level readers
 *
 * Note: the character is converted with dbcs_convert_w_char () before the
 *       state is examined; depending on the state either the converted or
 *       the original value is returned.
 */
static int
dbcs_get_next_token_wchar (PARSER_CONTEXT * parser)
{
  int raw_char;
  int ascii_char;

  raw_char = dbcs_get_next_w_char (parser);
  if (raw_char == EOF)
    {
      return EOF;
    }
  ascii_char = dbcs_convert_w_char (raw_char);

  switch (dbcs_Input_status)
    {
      /* States handled by a dedicated routine. */
    case CSQL_:
      /* CSQL text, neither comment nor character string */
      return dbcs_process_csql (parser, ascii_char);
    case C_COMMENT_:
      return dbcs_process_c_comment (parser, raw_char, ascii_char);
    case SQS_:
      return dbcs_process_single_quote_string (parser, raw_char, ascii_char);
    case DQS_:
      return dbcs_process_double_quote_string (parser, raw_char, ascii_char);
    case DQS_OCTAL_:
      return dbcs_process_double_quote_string_octal (parser, raw_char, ascii_char);
    case DQS_HEXA_:
      return dbcs_process_double_quote_string_hexa (parser, raw_char, ascii_char);
    case DQS_DECIMAL_:
      return dbcs_process_double_quote_string_decimal (parser, raw_char, ascii_char);

      /* One-shot states: return the converted character and move on. */
    case CSQL_BEGIN_:
      /* typically the last character of a C-style comment */
      dbcs_Input_status = CSQL_;
      return ascii_char;
    case C_COMMENT_BEGIN_:
      /* latter part of the opener of a C-style comment */
      dbcs_Input_status = C_COMMENT_;
      return ascii_char;
    case SQL_COMMENT_BEGIN_:
      /* latter part of the opener of an SQL comment */
      dbcs_Input_status = SQL_COMMENT_;
      return ascii_char;
    case CPP_COMMENT_BEGIN_:
      /* latter part of the opener of a C++ style comment */
      dbcs_Input_status = CPP_COMMENT_;
      return ascii_char;
    case DQS_HEXA_BEGIN_2:
      /* second character of a hexadecimal escape, that is, 0 (\0x__) */
      dbcs_Input_status = DQS_HEXA_BEGIN_;
      return ascii_char;
    case DQS_HEXA_BEGIN_:
      /* third character of a hexadecimal escape, that is, x (\0x__) */
      dbcs_Input_status = DQS_HEXA_;
      return ascii_char;

      /* One-shot states: return the original character and move on. */
    case SQS_TRANSPARENT_:
      /* latter character of a doubled delimiter in a single quoted string */
      dbcs_Input_status = SQS_;
      return raw_char;
    case DQS_TRANSPARENT_:
    case DQS_TRANSPARENT_2:
      /* latter characters of an escape sequence in a double quoted string */
      dbcs_Input_status = DQS_;
      return raw_char;

      /* Line comments: both kinds end at the same place, a new line. */
    case SQL_COMMENT_:
    case CPP_COMMENT_:
      if (ascii_char != '\n')
	{
	  return raw_char;
	}
      dbcs_Input_status = CSQL_;
      return ascii_char;
    }

  return ascii_char;
}
/*
 * dbcs_process_double_quote_string_hexa () - scan the digits of a
 *    hexadecimal escape sequence inside a double quoted string
 *   return: character to hand to the caller
 *   parser(in): parser context
 *   input_char(in): character as read
 *   converted_char(in): character after dbcs_convert_w_char ()
 */
static int
dbcs_process_double_quote_string_hexa (PARSER_CONTEXT * parser, int input_char, int converted_char)
{
  const int is_decimal_digit = (converted_char >= '0' && converted_char <= '9');
  const int is_upper_digit = (converted_char >= 'A' && converted_char <= 'F');
  const int is_lower_digit = (converted_char >= 'a' && converted_char <= 'f');

  if (!is_decimal_digit && !is_upper_digit && !is_lower_digit)
    {
      /* The escape is over: treat this character as ordinary string content. */
      dbcs_Input_status = DQS_;
      return dbcs_process_double_quote_string (parser, input_char, converted_char);
    }

  /* Still inside the escape sequence. */
  return converted_char;
}
/*
 * dbcs_process_double_quote_string_decimal () - scan the digits of a
 *    decimal escape sequence inside a double quoted string
 *   return: character to hand to the caller
 *   parser(in): parser context
 *   input_char(in): character as read
 *   converted_char(in): character after dbcs_convert_w_char ()
 */
static int
dbcs_process_double_quote_string_decimal (PARSER_CONTEXT * parser, int input_char, int converted_char)
{
  if (converted_char < '0' || converted_char > '9')
    {
      /* The escape is over: treat this character as ordinary string content. */
      dbcs_Input_status = DQS_;
      return dbcs_process_double_quote_string (parser, input_char, converted_char);
    }

  /* Still inside the escape sequence. */
  return converted_char;
}
/*
 * dbcs_process_double_quote_string_octal () - scan the digits of an octal
 *    escape sequence inside a double quoted string
 *   return: character to hand to the caller
 *   parser(in): parser context
 *   input_char(in): character as read
 *   converted_char(in): character after dbcs_convert_w_char ()
 */
static int
dbcs_process_double_quote_string_octal (PARSER_CONTEXT * parser, int input_char, int converted_char)
{
  if (converted_char < '0' || converted_char > '7')
    {
      /* The escape is over: treat this character as ordinary string content. */
      dbcs_Input_status = DQS_;
      return dbcs_process_double_quote_string (parser, input_char, converted_char);
    }

  /* Still inside the escape sequence. */
  return converted_char;
}
/*
 * dbcs_process_double_quote_string () - Scan Double-quote string
 *   return: character to hand to the caller
 *   parser(in): parser context, used to read look-ahead characters
 *   input_char(in): character as read
 *   converted_char(in): character after dbcs_convert_w_char ()
 *
 * Note :
 *   When double quote is found in double-quote string,
 *   there are three possibilities:
 *       1) Termination of the string,
 *       2) Escape sequence for double quote itself.
 *       3) Error token.
 *
 *   Look-ahead characters are pushed back on the unget stack, so they are
 *   read again by the following calls under the state selected here.
 */
static int
dbcs_process_double_quote_string (PARSER_CONTEXT * parser, int input_char, int converted_char)
{
  switch (converted_char)
    {
    case '"':
      {
	int c1, c1_c;

	if ((c1 = dbcs_get_next_w_char (parser)) == EOF)
	  {
	    /* Input ended right after the quote: the string is closed. */
	    DBCS_STATUS_UNGET_RETURN (CSQL_, c1, converted_char);
	  }
	switch (c1_c = dbcs_convert_w_char (c1))
	  {
	  case '"':
	    /*
	     * Contiguous double quote. Then we may scan double quote
	     * string later still.
	     */
	    if (input_char == c1)
	      {
		if (input_char == '"')
		  {
		    /*
		     * Single byte double quote. Then, latter half character
		     * has to be scanned next time.
		     */
		    DBCS_STATUS_UNGET_RETURN (DQS_TRANSPARENT_, c1, input_char);
		  }
		else
		  {
		    /*
		     * Double byte double quote. Main scanner does not require
		     * escape sequence to accept this.
		     */
		    return (input_char);
		  }
	      }
	    else
	      {
		/* Contiguous double quote */
		DBCS_STATUS_UNGET_RETURN (CSQL_, c1, converted_char);
	      }
	  default:
	    /*
	     * Double quote did not appear after the double quote. Then
	     * terminate double quote string status and go back to CSQL
	     * statement status.
	     */
	    DBCS_STATUS_UNGET_RETURN (CSQL_, c1, converted_char);
	  }
      }
    case '\\':
      {				/* back slash escapement */
	int c1, c1_c;

	if ((c1 = dbcs_get_next_w_char (parser)) == EOF)
	  {
	    DBCS_UNGET_RETURN (c1, converted_char);
	  }
	switch (c1_c = dbcs_convert_w_char (c1))
	  {
	  case '\n':
	  case 'a':
	  case 'b':
	  case 'f':
	  case 'n':
	  case 'r':
	  case 't':
	  case 'v':
	    /* standard escapes to represent control characters */
	    DBCS_STATUS_UNGET_RETURN (DQS_TRANSPARENT_, c1_c, converted_char);
	  case '?':
	    if (c1 == '?')
	      {
		DBCS_STATUS_UNGET_RETURN (DQS_TRANSPARENT_, c1, converted_char);
	      }
	    else
	      {
		return (c1);
	      }
	  case '\'':
	    if (c1 == '\'')
	      {
		DBCS_STATUS_UNGET_RETURN (DQS_TRANSPARENT_, c1_c, converted_char);
	      }
	    else
	      {
		return (c1);
	      }
	  case '"':
	    if (c1 == '"')
	      {
		DBCS_STATUS_UNGET_RETURN (DQS_TRANSPARENT_, c1_c, converted_char);
	      }
	    else
	      {
		return (c1);
	      }
	  case '0':
	    {			/* Hexadecimal or Octal escape */
	      int c2, c2_c;

	      /*
	       * c2 is pushed before c1 in every path below, so that c1 is
	       * popped first on the following reads.
	       */
	      if ((c2 = dbcs_get_next_w_char (parser)) == EOF)
		{
		  /* Unexpected end of sequence */
		  *dbcs_Unget_buf++ = c2;
		  DBCS_STATUS_UNGET_RETURN (DQS_OCTAL_, c1, converted_char);
		}
	      switch (c2_c = dbcs_convert_w_char (c2))
		{
		case 'x':
		  /* Begin hexadecimal representation */
		  *dbcs_Unget_buf++ = c2;
		  DBCS_STATUS_UNGET_RETURN (DQS_HEXA_BEGIN_2, c1, converted_char);
		default:
		  /* Begin Octal representation */
		  *dbcs_Unget_buf++ = c2;
		  DBCS_STATUS_UNGET_RETURN (DQS_OCTAL_, c1, converted_char);
		}
	    }
	  default:
	    /*
	     * Decimal escape or Self escape to ASCII character.
	     * The digit test uses the character codes '1'..'9'; comparing
	     * against the integers 1..9 would match control characters
	     * instead of digits and never start a decimal escape.
	     */
	    if (c1_c >= '1' && c1_c <= '9')
	      {
		DBCS_STATUS_UNGET_RETURN (DQS_DECIMAL_, c1, converted_char);
	      }
	    if ((c1 & 0xff00) != 0)
	      {
		/* Two-byte character after the back slash: return it directly. */
		return (c1);
	      }
	    else
	      {
		DBCS_STATUS_UNGET_RETURN (DQS_TRANSPARENT_, c1, converted_char);
	      }
	  }
      }
    default:
      /* Any other character is plain string content. */
      return (input_char);
    }
}
/*
 * dbcs_process_single_quote_string () - scan the content of a single quoted
 *                                       character string
 *   return: character to hand to the caller
 *   parser(in): parser context, used to read one look-ahead character
 *   input_char(in): character as read
 *   converted_char(in): character after dbcs_convert_w_char ()
 *
 * Note: the look-ahead character is pushed back on the unget stack in every
 *       path except the one where a non-ASCII quote equals its look-ahead.
 */
static int
dbcs_process_single_quote_string (PARSER_CONTEXT * parser, int input_char, int converted_char)
{
  int lookahead;
  int lookahead_conv;

  if (converted_char == '\'')
    {
      lookahead = dbcs_get_next_w_char (parser);
      if (lookahead != EOF)
	{
	  lookahead_conv = dbcs_convert_w_char (lookahead);
	  if (lookahead_conv == '\'' && input_char == lookahead)
	    {
	      /* Two identical quotes in a row. */
	      if (input_char == '\'')
		{
		  /* ASCII quote pair: the second quote is read next time, unchecked. */
		  dbcs_Input_status = SQS_TRANSPARENT_;
		  *dbcs_Unget_buf++ = lookahead;
		}
	      return input_char;
	    }
	}

      /* End of input or anything else after the quote: the string is closed. */
      dbcs_Input_status = CSQL_;
      *dbcs_Unget_buf++ = lookahead;
      return converted_char;
    }

  if (converted_char == '\\')
    {
      /*
       * In single quote string, backslash escape is used only to delimit
       * lengthy string with new line.
       */
      lookahead = dbcs_get_next_w_char (parser);
      *dbcs_Unget_buf++ = lookahead;
      if (lookahead == EOF || lookahead == '\n')
	{
	  return converted_char;
	}
      return input_char;
    }

  /* Other character is simple character in string */
  return input_char;
}
/*
 * dbcs_process_c_comment () - scan the body of a C-style comment
 *   return: character to hand to the caller
 *   parser(in): parser context, used to read one look-ahead character
 *   input_char(in): character as read
 *   converted_char(in): character after dbcs_convert_w_char ()
 */
static int
dbcs_process_c_comment (PARSER_CONTEXT * parser, int input_char, int converted_char)
{
  int lookahead;
  int lookahead_conv;

  /* Only an asterisk can start the end of the comment. */
  if (converted_char != '*')
    {
      return input_char;
    }

  lookahead = dbcs_get_next_w_char (parser);
  if (lookahead == EOF)
    {
      *dbcs_Unget_buf++ = lookahead;
      return converted_char;
    }

  lookahead_conv = dbcs_convert_w_char (lookahead);
  if (lookahead_conv != '/')
    {
      /*
       * Still inside the comment: the input character is returned without
       * conversion and the look-ahead is read again next time.
       */
      *dbcs_Unget_buf++ = lookahead;
      return input_char;
    }

  /*
   * End of the C comment: converted values are returned so that the parser
   * recognizes the terminator correctly.
   */
  *dbcs_Unget_buf++ = lookahead_conv;
  dbcs_Input_status = CSQL_BEGIN_;
  return converted_char;
}
/*
 * dbcs_process_csql () - scan the CSQL language body and detect the start
 *                        of strings and comments
 *   return: converted_char, in every case
 *   parser(in): parser context, used to read one look-ahead character
 *   converted_char(in): character after dbcs_convert_w_char ()
 *
 * Note: for '-' and '/' one more character is read to recognize "--", "//"
 *       and the C comment opener; that character is always pushed back on
 *       the unget stack.
 */
static int
dbcs_process_csql (PARSER_CONTEXT * parser, int converted_char)
{
  int lookahead;
  int lookahead_conv;

  switch (converted_char)
    {
    case '"':
      /* Start Double quoted string " ... " */
      dbcs_Input_status = DQS_;
      break;

    case '\'':
      /* Start Single quoted string ' ... ' */
      dbcs_Input_status = SQS_;
      break;

    case '-':
    case '/':
      lookahead = dbcs_get_next_w_char (parser);
      if (lookahead != EOF)
	{
	  lookahead_conv = dbcs_convert_w_char (lookahead);
	  if (converted_char == '-')
	    {
	      if (lookahead_conv == '-')
		{
		  /* SQL comment '-- ... ' */
		  dbcs_Input_status = SQL_COMMENT_BEGIN_;
		}
	    }
	  else if (lookahead_conv == '/')
	    {
	      /* C++ comment */
	      dbcs_Input_status = CPP_COMMENT_BEGIN_;
	    }
	  else if (lookahead_conv == '*')
	    {
	      /* C comment */
	      dbcs_Input_status = C_COMMENT_BEGIN_;
	    }
	}
      /* The look-ahead (EOF included) is read again by the next call. */
      *dbcs_Unget_buf++ = lookahead;
      break;

    default:
      break;
    }

  return converted_char;
}
/*
 * dbcs_get_next_w_char () - Read one character (not byte)
 *   return: the character, or EOF
 *   parser(in): parser context whose next_byte callback supplies the bytes
 *
 * Note: characters waiting on the unget stack are returned first, last
 *       pushed first. Otherwise one byte is read; when its 0x80 bit is set
 *       a second byte is read and the pair is returned as
 *       (first << 8) | second. EOF on either byte yields EOF.
 */
static int
dbcs_get_next_w_char (PARSER_CONTEXT * parser)
{
  int input_char;
  int c1;

  /* Pushed-back characters take precedence over fresh input. */
  if (dbcs_Unget_buf != DBCS_UNGET_BUF)
    {
      return *--dbcs_Unget_buf;
    }

  input_char = DBCS_NEXT_CHAR (parser);
  if (input_char == EOF)
    {
      return EOF;
    }

  /* Single-byte character. */
  if ((input_char & 0x80) == 0)
    {
      return input_char;
    }

  /* Lead byte of a two-byte character: fetch the trailing byte. */
  c1 = DBCS_NEXT_CHAR (parser);
  if (c1 == EOF)
    {
      /*
       * Input ended in the middle of a character. Return EOF explicitly
       * instead of relying on the combine expression below evaluating to
       * EOF, which only holds when EOF is -1.
       */
      return EOF;
    }

  return ((input_char & 0xff) << 8) | c1;
}
/*
 * dbcs_convert_w_char () - convert wide character into ASCII
 *   return: 0x20 for WSPACE_CHAR outside of quoted strings, otherwise
 *           input_char unchanged
 *   input_char(in): character to convert
 *
 * Note: when WSPACE_CHAR is converted, an additional 0x20 is pushed on the
 *       unget stack, so the caller's stream receives a second space after
 *       the returned one.
 */
static int
dbcs_convert_w_char (int input_char)
{
  switch (dbcs_Input_status)
    {
    case DQS_:
    case DQS_TRANSPARENT_:
    case DQS_TRANSPARENT_2:
    case SQS_:
    case SQS_TRANSPARENT_:
      /* String content is never converted. */
      return input_char;
    default:
      break;
    }

  if (input_char != WSPACE_CHAR)
    {
      return input_char;
    }

  *dbcs_Unget_buf++ = 0x20;
  return 0x20;
}