Skip to content

File string_regex_std.cpp

File List > cubrid > src > query > string_regex_std.cpp

Go to the documentation of this file

/*
 *
 * Copyright 2016 CUBRID Corporation
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 */

//
// string_regex_std - regular expression engine using C++ <regex> standard library
//
#include "string_regex.hpp"

#include "error_manager.h"
#include "locale_helper.hpp"
// XXX: SHOULD BE THE LAST INCLUDE HEADER
#include "memory_wrapper.hpp"

namespace cubregex
{
  /*
   * std_parse_regex_exception () - describe the error code carried by a std::regex_error
   *
   * return: message of the form "regex_error(<code name>): <explanation>";
   *         "regex_error(error_unknown)" for a code that has no dedicated entry
   * e(in): exception raised by the <regex> library
   */
  std::string
  std_parse_regex_exception (std::regex_error &e)
  {
    namespace rc = std::regex_constants;

    switch (e.code ())
      {
      case rc::error_collate:
        return "regex_error(error_collate): the expression contains an invalid collating element name";
      case rc::error_ctype:
        return "regex_error(error_ctype): the expression contains an invalid character class name";
      case rc::error_escape:
        return "regex_error(error_escape): the expression contains an invalid escaped character or a trailing escape";
      case rc::error_backref:
        return "regex_error(error_backref): the expression contains an invalid back reference";
      case rc::error_brack:
        return "regex_error(error_brack): the expression contains mismatched square brackets ('[' and ']')";
      case rc::error_paren:
        return "regex_error(error_paren): the expression contains mismatched parentheses ('(' and ')')";
      case rc::error_brace:
        return "regex_error(error_brace): the expression contains mismatched curly braces ('{' and '}')";
      case rc::error_badbrace:
        return "regex_error(error_badbrace): the expression contains an invalid range in a {} expression";
      case rc::error_range:
        return "regex_error(error_range): the expression contains an invalid character range (e.g. [b-a])";
      case rc::error_space:
        return "regex_error(error_space): there was not enough memory to convert the expression into a finite state machine";
      case rc::error_badrepeat:
        return "regex_error(error_badrepeat): one of *?+{ was not preceded by a valid regular expression";
      case rc::error_complexity:
        return "regex_error(error_complexity): the complexity of an attempted match exceeded a predefined level";
      case rc::error_stack:
        return "regex_error(error_stack): there was not enough memory to perform a match";
      default:
        return "regex_error(error_unknown)";
      }
  }

  /*
   * std_parse_match_type () - derive the <regex> syntax flags from the option flags
   *
   * return: ECMAScript grammar, combined with icase only when OPT_ICASE is set in opt_type
   * opt_type(in): option flags
   */
  std::regex_constants::syntax_option_type
  std_parse_match_type (const opt_flag_type &opt_type)
  {
    namespace rc = std::regex_constants;

    // the grammar is always ECMAScript; case-insensitivity is added on demand
    rc::syntax_option_type syntax = rc::ECMAScript;
    if ((opt_type & OPT_ICASE) != 0)
      {
        syntax |= rc::icase;
      }
    return syntax;
  }

  /*
   * std_compile () - compile cr->pattern with the C++ <regex> engine
   *
   * return: NO_ERROR on success,
   *         ER_QSTR_BAD_SRC_CODESET when the pattern cannot be converted to a wide string,
   *         ER_REGEX_COMPILE_ERROR (also reported through er_set) when <regex> rejects the pattern
   * cr(in/out): compiled regex holder; its pattern and flags are read and a newly allocated
   *             cub_std_regex is stored in cr->compiled->std_obj
   * collation(in): collation whose language name selects the locale imbued into the regex
   *
   * NOTE(review): the cub_std_regex stays assigned to cr->compiled->std_obj even when assign ()
   *               throws; presumably the owner of cr releases it - confirm.
   */
  int std_compile (REFPTR (compiled_regex, cr), const LANG_COLLATION *collation)
  {
    int error_status = NO_ERROR;

    std::wstring pattern_wstring;
    if (cublocale::convert_utf8_to_wstring (pattern_wstring, cr->pattern) == false)
      {
        error_status = ER_QSTR_BAD_SRC_CODESET;
        return error_status;
      }

    try
      {
#if defined(WINDOWS)
        /* HACK: collating element features don't work well on Windows,
        *  and lookup_collatename is not invoked when the regex pattern has a collating element.
        *  So look for the opening of a collating element in the pattern and raise the error here.
        */
        // a wide string literal is an array of const wchar_t: keep the pointer const and compare
        // the result of find () against npos directly instead of squeezing it into an int
        const wchar_t *collate_elem_pattern = L"[[.";
        if (pattern_wstring.find (collate_elem_pattern) != std::wstring::npos)
          {
            throw std::regex_error (std::regex_constants::error_collate);
          }
#endif
        REFPTR (cub_std_regex, reg) = cr->compiled->std_obj = new cub_std_regex ();
        std::locale loc = cublocale::get_locale (std::string ("utf-8"), cublocale::get_lang_name (collation));
        // the locale has to be imbued before the pattern is assigned
        reg->imbue (loc);
        reg->assign (pattern_wstring, std_parse_match_type (cr->flags));
      }
    catch (std::regex_error &e)
      {
        // regex compilation exception
        error_status = ER_REGEX_COMPILE_ERROR;
        std::string error_message = std_parse_regex_exception (e);
        er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, error_status, 1, error_message.c_str ());
      }

    return error_status;
  }

  /*
   * std_search () - tell whether the compiled pattern matches anywhere in src
   *
   * return: NO_ERROR, ER_QSTR_BAD_SRC_CODESET when src cannot be converted to a wide string,
   *         or ER_REGEX_EXEC_ERROR (also reported through er_set) when the search throws
   * result(out): V_TRUE when a match is found, V_FALSE otherwise (including every error path)
   * reg(in): compiled regex holding the <regex> object
   * src(in): UTF-8 encoded text to search
   */
  int std_search (int &result, const compiled_regex &reg, const std::string &src)
  {
    std::wstring wide_src;
    if (!cublocale::convert_utf8_to_wstring (wide_src, src))
      {
        result = V_FALSE;
        return ER_QSTR_BAD_SRC_CODESET;
      }

    int error_status = NO_ERROR;
    bool found = false;
    try
      {
        cub_std_regex &std_reg = GET_STD_OBJ (reg);

#if defined(WINDOWS)
        /* HACK: case insensitive matching doesn't work well on Windows.
        *  Lowercase the (local) wide copy of the source before searching it.
        */
        if (std_reg.flags () & std::regex_constants::icase)
          {
            std::transform (wide_src.begin (), wide_src.end (), wide_src.begin (), ::towlower);
          }
#endif
        found = std::regex_search (wide_src, std_reg);
      }
    catch (std::regex_error &e)
      {
        // regex execution exception
        found = false;
        error_status = ER_REGEX_EXEC_ERROR;
        std::string reason = std_parse_regex_exception (e);
        er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, error_status, 1, reason.c_str ());
      }

    result = found ? V_TRUE : V_FALSE;
    return error_status;
  }

  /*
   * std_count () - count the matches of the compiled pattern in src, starting at position
   *
   * return: NO_ERROR, ER_QSTR_BAD_SRC_CODESET when src cannot be converted to a wide string,
   *         or ER_REGEX_EXEC_ERROR (also reported through er_set) when matching throws
   * result(out): number of matches enumerated by the regex iterator; 0 on every error path and
   *              when position lies beyond the end of the converted source
   * reg(in): compiled regex holding the <regex> object
   * src(in): UTF-8 encoded text to search
   * position(in): 0-based start offset, counted in elements of the converted wide string
   */
  int std_count (int &result, const compiled_regex &reg, const std::string &src, const int position)
  {
    assert (position >= 0);

    int error_status = NO_ERROR;

    // give the output a defined value on every return path
    result = 0;

    std::wstring src_wstring;
    if (cublocale::convert_utf8_to_wstring (src_wstring, src) == false)
      {
        error_status = ER_QSTR_BAD_SRC_CODESET;
        return error_status;
      }

    // substr () throws std::out_of_range for an offset past the end, and nothing here would catch it;
    // there is nothing to search in that case (a negative position in a release build lands here too)
    if (static_cast<size_t> (position) > src_wstring.size ())
      {
        return error_status;
      }

    /* split source string by position value */
    std::wstring target (src_wstring.substr (position));

    cub_std_regex &std_reg = GET_STD_OBJ (reg);

#if defined(WINDOWS)
    /* HACK: case insensitive matching doesn't work well on Windows.
    *  Only the number of matches is needed, so the target is lowercased in place.
    */
    if (std_reg.flags () & std::regex_constants::icase)
      {
        std::transform (target.begin (), target.end (), target.begin (), ::towlower);
      }
#endif

    try
      {
        auto reg_iter = cub_std_regex_iterator (target.begin (), target.end (), std_reg);
        auto reg_end = cub_std_regex_iterator ();
        result = static_cast<int> (std::distance (reg_iter, reg_end));
      }
    catch (std::regex_error &e)
      {
        // regex execution exception, error_complexity or error_stack
        error_status = ER_REGEX_EXEC_ERROR;
        result = 0;
        std::string error_message = cubregex::std_parse_regex_exception (e);
        er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, error_status, 1, error_message.c_str ());
      }
    return error_status;
  }

  /*
   * std_instr () - locate the occurrence-th match of the compiled pattern in src
   *
   * return: NO_ERROR, ER_QSTR_BAD_SRC_CODESET when src cannot be converted to a wide string,
   *         or ER_REGEX_EXEC_ERROR (also reported through er_set) when matching throws
   * result(out): 1-based index, relative to the beginning of the converted source, of the start of
   *              the match, or of the element right after the match when return_opt is 1;
   *              0 when there is no such occurrence, on every error path, and when position lies
   *              beyond the end of the converted source
   * reg(in): compiled regex holding the <regex> object
   * src(in): UTF-8 encoded text to search
   * position(in): 0-based start offset, counted in elements of the converted wide string
   * occurrence(in): which match to report, starting at 1
   * return_opt(in): 1 to report the end of the match, any other value to report its start
   */
  int std_instr (int &result, const compiled_regex &reg, const std::string &src,
                 const int position, const int occurrence, const int return_opt)
  {
    assert (position >= 0);
    assert (occurrence >= 1);

    int error_status = NO_ERROR;

    // give the output a defined value on every return path
    result = 0;

    std::wstring src_wstring;
    if (cublocale::convert_utf8_to_wstring (src_wstring, src) == false)
      {
        error_status = ER_QSTR_BAD_SRC_CODESET;
        return error_status;
      }

    // substr () throws std::out_of_range for an offset past the end, and nothing here would catch it;
    // report "no match" instead (a negative position in a release build lands here too)
    if (static_cast<size_t> (position) > src_wstring.size ())
      {
        return error_status;
      }

    /* split source string by position value */
    std::wstring target (src_wstring.substr (position));

    cub_std_regex &std_reg = GET_STD_OBJ (reg);
#if defined(WINDOWS)
    /* HACK: case insensitive matching doesn't work well on Windows.
    *  Only offsets and lengths are read from the matches, so the target is lowercased in place.
    */
    if (std_reg.flags () & std::regex_constants::icase)
      {
        std::transform (target.begin (), target.end (), target.begin (), ::towlower);
      }
#endif

    try
      {
        auto reg_iter = cub_std_regex_iterator (target.begin (), target.end (), std_reg);
        auto reg_end = cub_std_regex_iterator ();

        for (int n = 1; reg_iter != reg_end; ++reg_iter, ++n)
          {
            if (n != occurrence)
              {
                continue;
              }

            /* match */
            int match_idx = static_cast<int> (reg_iter->position ());
            if (return_opt == 1)
              {
                match_idx += static_cast<int> (reg_iter->length ());
              }
            // offsets are relative to target; shift back to the whole source and make it 1-based
            result = position + match_idx + 1;
            break;
          }
      }
    catch (std::regex_error &e)
      {
        // regex execution exception, error_complexity or error_stack
        error_status = ER_REGEX_EXEC_ERROR;
        result = 0;
        std::string error_message = cubregex::std_parse_regex_exception (e);
        er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, error_status, 1, error_message.c_str ());
      }

    return error_status;
  }

#if defined(WINDOWS)
  /* HACK: case-insensitive matching doesn't work well on Windows.
  *  This variant transforms the source string into lowercase
  *  and then searches it with the regular expression pattern.
  */
  /*
   * std_replace () - replace matches of the compiled pattern in src with repl (Windows variant)
   *
   * return: NO_ERROR, ER_QSTR_BAD_SRC_CODESET when a string conversion fails,
   *         or ER_REGEX_EXEC_ERROR (also reported through er_set) when matching throws
   * result(out): UTF-8 text after replacement; cleared when matching throws; a copy of src when
   *              position lies beyond the end of the converted source
   * reg(in): compiled regex holding the <regex> object
   * src(in): UTF-8 encoded text to search
   * repl(in): UTF-8 encoded replacement, expanded with match_results::format ()
   * position(in): 0-based start offset, counted in elements of the converted wide string;
   *               everything before it is copied to the result untouched
   * occurrence(in): 0 to replace every match, otherwise the 1-based match to replace
   *
   * Matching runs on a lowercased copy of the text when the regex has the icase flag; literal text
   * copied to the result is taken from the original-case target by offset.
   * NOTE(review): format () reads its sub-matches from the lowercased copy, so back-references in
   *               repl expand to lowercased text when icase is set.
   */
  int std_replace (std::string &result, const compiled_regex &reg, const std::string &src,
                   const std::string &repl, const int position,
                   const int occurrence)
  {
    assert (position >= 0);
    assert (occurrence >= 0);

    int error_status = NO_ERROR;

    std::wstring src_wstring;
    if (cublocale::convert_utf8_to_wstring (src_wstring, src) == false)
      {
        error_status = ER_QSTR_BAD_SRC_CODESET;
        return error_status;
      }

    std::wstring repl_wstring;
    if (cublocale::convert_utf8_to_wstring (repl_wstring, repl) == false)
      {
        error_status = ER_QSTR_BAD_SRC_CODESET;
        return error_status;
      }

    // substr () throws std::out_of_range for an offset past the end, and nothing here would catch it;
    // nothing can match there, so hand the source back unchanged
    if (static_cast<size_t> (position) > src_wstring.size ())
      {
        result = src;
        return error_status;
      }

    /* split source string by position value */
    std::wstring result_wstring (src_wstring.substr (0, position));
    std::wstring target (src_wstring.substr (position));

    cub_std_regex &std_reg = GET_STD_OBJ (reg);
    std::wstring target_lowercase (target);
    if (std_reg.flags () & std::regex_constants::icase)
      {
        std::transform (target_lowercase.begin (), target_lowercase.end (), target_lowercase.begin (), ::towlower);
      }

    try
      {
        auto reg_iter = cub_std_regex_iterator (target_lowercase.begin (), target_lowercase.end (), std_reg);
        auto reg_end = cub_std_regex_iterator ();

        int match_pos = -1;
        int last_pos = 0;
        int n = 1;
        auto out = std::back_inserter (result_wstring);

        cub_std_regex_results match_result;
        while (reg_iter != reg_end)
          {
            match_result = *reg_iter;

            /* prefix: original-case text between the previous match and this one */
            match_pos = static_cast<int> (match_result.position ());
            size_t match_length = match_result.length ();
            std::wstring match_prefix = target.substr (last_pos, match_pos - last_pos);
            out = std::copy (match_prefix.begin (), match_prefix.end (), out);

            /* match */
            if (n == occurrence || occurrence == 0)
              {
                out = match_result.format (out, repl_wstring);
              }
            else
              {
                // not the requested occurrence: keep the matched text in its original case
                std::wstring match_str = target.substr (match_pos, match_length);
                out = std::copy (match_str.begin (), match_str.end (), out);
              }

            ++reg_iter;

            /* suffix */
            last_pos = static_cast<int> (match_pos + match_length);
            if (((occurrence != 0) && (n == occurrence)) || reg_iter == reg_end)
              {
                std::wstring match_suffix = target.substr (match_pos + match_length, std::wstring::npos);
                out = std::copy (match_suffix.begin (), match_suffix.end (), out);
                if (occurrence != 0 && n == occurrence)
                  {
                    break;
                  }
              }
            ++n;
          }

        /* nothing matched */
        if (match_pos == -1 && reg_iter == reg_end)
          {
            out = std::copy (target.begin (), target.end (), out);
          }

        if (cublocale::convert_wstring_to_utf8 (result, result_wstring) == false)
          {
            error_status = ER_QSTR_BAD_SRC_CODESET;
            return error_status;
          }
      }
    catch (std::regex_error &e)
      {
        // regex execution exception, error_complexity or error_stack
        error_status = ER_REGEX_EXEC_ERROR;
        result.clear ();
        std::string error_message = cubregex::std_parse_regex_exception (e);
        er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, error_status, 1, error_message.c_str ());
      }

    return error_status;
  }
#else
  /*
   * std_replace () - replace matches of the compiled pattern in src with repl
   *
   * return: NO_ERROR, ER_QSTR_BAD_SRC_CODESET when a string conversion fails,
   *         or ER_REGEX_EXEC_ERROR (also reported through er_set) when matching throws
   * result(out): UTF-8 text after replacement; cleared when matching throws; a copy of src when
   *              position lies beyond the end of the converted source
   * reg(in): compiled regex holding the <regex> object
   * src(in): UTF-8 encoded text to search
   * repl(in): UTF-8 encoded replacement, expanded by the <regex> library
   * position(in): 0-based start offset, counted in elements of the converted wide string;
   *               everything before it is copied to the result untouched
   * occurrence(in): 0 to replace every match, otherwise the 1-based match to replace
   */
  int std_replace (std::string &result, const compiled_regex &reg, const std::string &src,
                   const std::string &repl, const int position,
                   const int occurrence)
  {
    assert (position >= 0);
    assert (occurrence >= 0);

    int error_status = NO_ERROR;

    std::wstring src_wstring;
    if (cublocale::convert_utf8_to_wstring (src_wstring, src) == false)
      {
        error_status = ER_QSTR_BAD_SRC_CODESET;
        return error_status;
      }

    std::wstring repl_wstring;
    if (cublocale::convert_utf8_to_wstring (repl_wstring, repl) == false)
      {
        error_status = ER_QSTR_BAD_SRC_CODESET;
        return error_status;
      }

    // substr () throws std::out_of_range for an offset past the end, and nothing here would catch it;
    // nothing can match there, so hand the source back unchanged
    if (static_cast<size_t> (position) > src_wstring.size ())
      {
        result = src;
        return error_status;
      }

    /* split source string by position value */
    std::wstring result_wstring (src_wstring.substr (0, position));
    std::wstring target (src_wstring.substr (position));

    try
      {
        cub_std_regex &std_reg = GET_STD_OBJ (reg);
        if (occurrence == 0)
          {
            /* every match is replaced: the library does the whole job */
            result_wstring.append (std::regex_replace (target, std_reg, repl_wstring));
          }
        else
          {
            auto reg_iter = cub_std_regex_iterator (target.begin (), target.end (), std_reg);
            auto reg_end = cub_std_regex_iterator ();
            auto out = std::back_inserter (result_wstring);

            bool any_match = false;
            for (int n = 1; reg_iter != reg_end; ++n)
              {
                const cub_std_regex_results match_result = *reg_iter;
                any_match = true;

                /* prefix: text between the previous match (or the start) and this match */
                const std::wstring match_prefix = match_result.prefix ().str ();
                out = std::copy (match_prefix.begin (), match_prefix.end (), out);

                /* match */
                if (n == occurrence)
                  {
                    out = match_result.format (out, repl_wstring);
                  }
                else
                  {
                    const std::wstring match_str = match_result.str ();
                    out = std::copy (match_str.begin (), match_str.end (), out);
                  }

                // advance first: the suffix is emitted only after the last match that gets visited
                ++reg_iter;

                /* suffix */
                if (n == occurrence || reg_iter == reg_end)
                  {
                    /* occurrence option specified or end of matching */
                    const std::wstring match_suffix = match_result.suffix ().str ();
                    out = std::copy (match_suffix.begin (), match_suffix.end (), out);
                    break;
                  }
              }

            /* nothing matched */
            if (!any_match)
              {
                out = std::copy (target.begin (), target.end (), out);
              }
          }

        if (cublocale::convert_wstring_to_utf8 (result, result_wstring) == false)
          {
            error_status = ER_QSTR_BAD_SRC_CODESET;
            return error_status;
          }
      }
    catch (std::regex_error &e)
      {
        // regex execution exception, error_complexity or error_stack
        error_status = ER_REGEX_EXEC_ERROR;
        result.clear ();
        std::string error_message = cubregex::std_parse_regex_exception (e);
        er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, error_status, 1, error_message.c_str ());
      }

    return error_status;
  }
#endif

  /*
   * std_substr () - extract the occurrence-th match of the compiled pattern from src
   *
   * return: NO_ERROR, ER_QSTR_BAD_SRC_CODESET when a string conversion fails,
   *         or ER_REGEX_EXEC_ERROR (also reported through er_set) when matching throws
   * result(out): UTF-8 text of the requested match in its original case; cleared when matching
   *              throws or when position lies beyond the end of the converted source
   * is_matched(out): true only when the requested occurrence was found and converted back to UTF-8
   * reg(in): compiled regex holding the <regex> object
   * src(in): UTF-8 encoded text to search
   * position(in): 0-based start offset, counted in elements of the converted wide string
   * occurrence(in): which match to extract, starting at 1
   */
  int std_substr (std::string &result, bool &is_matched, const compiled_regex &reg, const std::string &src,
                  const int position, const int occurrence)
  {
    assert (position >= 0);
    assert (occurrence >= 1);

    int error_status = NO_ERROR;
    is_matched = false;

    std::wstring src_wstring;
    if (cublocale::convert_utf8_to_wstring (src_wstring, src) == false)
      {
        error_status = ER_QSTR_BAD_SRC_CODESET;
        return error_status;
      }

    // substr () throws std::out_of_range for an offset past the end, and nothing here would catch it;
    // report "no match" instead (a negative position in a release build lands here too)
    if (static_cast<size_t> (position) > src_wstring.size ())
      {
        result.clear ();
        return error_status;
      }

    /* split source string by position value */
    std::wstring result_wstring;
    std::wstring target (src_wstring.substr (position));

    cub_std_regex &std_reg = GET_STD_OBJ (reg);

#if defined(WINDOWS)
    /* HACK: case insensitive matching doesn't work well on Windows.
    *  Match against a lowercased copy; the matched text itself is still cut
    *  from the original-case target by offset.
    */
    std::wstring target_lower (target);
    if (std_reg.flags () & std::regex_constants::icase)
      {
        std::transform (target_lower.begin (), target_lower.end (), target_lower.begin (), ::towlower);
      }
#endif

    try
      {
#if defined(WINDOWS)
        auto reg_iter = cub_std_regex_iterator (target_lower.begin (), target_lower.end (), std_reg);
#else
        auto reg_iter = cub_std_regex_iterator (target.begin (), target.end (), std_reg);
#endif
        auto reg_end = cub_std_regex_iterator ();

        bool found = false;
        for (int n = 1; reg_iter != reg_end; ++reg_iter, ++n)
          {
            if (n == occurrence)
              {
                /* match */
                result_wstring = target.substr (reg_iter->position (), reg_iter->length ());
                found = true;
                break;
              }
          }

        if (cublocale::convert_wstring_to_utf8 (result, result_wstring) == false)
          {
            error_status = ER_QSTR_BAD_SRC_CODESET;
            return error_status;
          }

        // publish the flag only once the text has been handed back successfully
        is_matched = found;
      }
    catch (std::regex_error &e)
      {
        // regex execution exception, error_complexity or error_stack
        error_status = ER_REGEX_EXEC_ERROR;
        result.clear ();
        std::string error_message = cubregex::std_parse_regex_exception (e);
        er_set (ER_ERROR_SEVERITY, ARG_FILE_LINE, error_status, 1, error_message.c_str ());
      }

    return error_status;
  }
}