Here are the diffs for the current version of regexp support. It's probably stable enough for real use, so we can start getting some feedback on it.
Do you want to merge this? Regards, Elias
Index: configure.ac =================================================================== --- configure.ac (revision 1011) +++ configure.ac (working copy) @@ -162,6 +162,8 @@ fi fi +m4_include([m4/ax_path_lib_pcre.m4]) AX_PATH_LIB_PCRE([]) + # check if rdtsc (read CPU cycle counter is available. # This is expected only on Intel CPUs AC_MSG_CHECKING([whether CPU has rdtsc (read CPU cycle counter) opcode]) Index: m4/ax_path_lib_pcre.m4 =================================================================== --- m4/ax_path_lib_pcre.m4 (nonexistent) +++ m4/ax_path_lib_pcre.m4 (working copy) @@ -0,0 +1,90 @@ +# =========================================================================== +# https://www.gnu.org/software/autoconf-archive/ax_path_lib_pcre.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_PATH_LIB_PCRE [(A/NA)] +# +# DESCRIPTION +# +# check for pcre lib and set PCRE_LIBS and PCRE_CFLAGS accordingly. +# +# also provide --with-pcre option that may point to the $prefix of the +# pcre installation - the macro will check $pcre/include and $pcre/lib to +# contain the necessary files. +# +# the usual two ACTION-IF-FOUND / ACTION-IF-NOT-FOUND are supported and +# they can take advantage of the LIBS/CFLAGS additions. +# +# LICENSE +# +# Copyright (c) 2008 Guido U. Draheim <gui...@gmx.de> +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 3 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program. 
If not, see <https://www.gnu.org/licenses/>.
+#
+# As a special exception, the respective Autoconf Macro's copyright owner
+# gives unlimited permission to copy, distribute and modify the configure
+# scripts that are the output of Autoconf when processing the Macro. You
+# need not follow the terms of the GNU General Public License when using
+# or distributing such scripts, even though portions of the text of the
+# Macro appear in them. The GNU General Public License (GPL) does govern
+# all other use of the material that constitutes the Autoconf Macro.
+#
+# This special exception to the GPL applies to versions of the Autoconf
+# Macro released by the Autoconf Archive. When you make and distribute a
+# modified version of the Autoconf Macro, you may extend this special
+# exception to the GPL to apply to your modified version as well.
+
+#serial 8
+
+AC_DEFUN([AX_PATH_LIB_PCRE],[dnl
+AC_MSG_CHECKING([lib pcre])
+AC_ARG_WITH(pcre,
+[ --with-pcre[[=prefix]] compile xmlpcre part (via libpcre check)],,
+ with_pcre="yes")
+if test ".$with_pcre" = ".no" ; then
+  AC_MSG_RESULT([disabled])
+  m4_ifval($2,$2)
+else
+  AC_MSG_RESULT([(testing)])
+  AC_CHECK_LIB(pcre2-32, pcre2_compile_32)
+dnl The library check stores its verdict in a cache variable whose name is
+dnl made shell-safe, so the hyphen of pcre2-32 shows up as an underscore.
+dnl Writing the name with a hyphen expands a different (unset) variable
+dnl followed by literal text, which can never compare equal to "yes".
+  if test "$ac_cv_lib_pcre2_32_pcre2_compile_32" = "yes" ; then
+    PCRE_LIBS="-lpcre2-32"
+    AC_MSG_CHECKING([lib pcre])
+    AC_MSG_RESULT([$PCRE_LIBS])
+    m4_ifval($1,$1)
+  else
+    OLDLDFLAGS="$LDFLAGS" ; LDFLAGS="$LDFLAGS -L$with_pcre/lib"
+    OLDCPPFLAGS="$CPPFLAGS" ; CPPFLAGS="$CPPFLAGS -I$with_pcre/include"
+dnl A different symbol is probed here so that the cached result of the
+dnl first check is not reused for the retry with the extra search path.
+    AC_CHECK_LIB(pcre2-32, pcre2_match_data_create_from_pattern_32)
+    CPPFLAGS="$OLDCPPFLAGS"
+    LDFLAGS="$OLDLDFLAGS"
+    if test "$ac_cv_lib_pcre2_32_pcre2_match_data_create_from_pattern_32" = "yes" ; then
+      AC_MSG_RESULT(.setting PCRE_LIBS -L$with_pcre/lib -lpcre2-32)
+      PCRE_LIBS="-L$with_pcre/lib -lpcre2-32"
+      test -d "$with_pcre/include" && PCRE_CFLAGS="-I$with_pcre/include"
+      AC_MSG_CHECKING([lib pcre])
+      AC_MSG_RESULT([$PCRE_LIBS])
+      m4_ifval($1,$1)
+    else
+      AC_MSG_CHECKING([lib pcre])
+      AC_MSG_RESULT([no (WARNING)])
+      m4_ifval($2,$2)
+    fi
+  fi
+fi
+AC_SUBST([PCRE_LIBS])
+AC_SUBST([PCRE_CFLAGS])
+])
Index: src/Id.cc
===================================================================
--- src/Id.cc (revision 1011)
+++ src/Id.cc (working copy)
@@ -37,6 +37,7 @@
 #include "QuadFunction.hh"
 #include "Quad_DLX.hh"
 #include "Quad_FX.hh"
+#include "Quad_RE.hh"
 #include "Quad_SQL.hh"
 #include "Quad_SVx.hh"
 #include "Quad_TF.hh"
Index: src/Id.def
===================================================================
--- src/Id.def (revision 1011)
+++ src/Id.def (working copy)
@@ -201,6 +201,7 @@
 qf( SVS , "⎕SVS" , )
 qv( SYL , "⎕SYL" , )
 pp( USER_SYMBOL , --- , )
+qf( RE , "⎕RE" , )
 pp( STOP_LINE , --- , )
 qf( STOP , "⎕STOP" , )
 qf( SQL , "⎕SQL" , )
Index: src/Makefile.am
===================================================================
--- src/Makefile.am (revision 1011)
+++ src/Makefile.am (working copy)
@@ -86,10 +86,12 @@
 Quad_DLX.cc Quad_DLX.hh \
 Quad_FIO.cc Quad_FIO.hh \
 Quad_FX.cc Quad_FX.hh \
+Quad_RE.cc Quad_RE.hh \
 Quad_RL.cc Quad_RL.hh \
 Quad_SQL.cc Quad_SQL.hh \
 Quad_SVx.cc Quad_SVx.hh \
 Quad_TF.cc Quad_TF.hh \
+Regexp.cc Regexp.hh \
 Parallel.cc Parallel.hh \
 Performance.cc Performance.def Performance.hh \
 RealCell.cc RealCell.hh \
Index: src/QuadFunction.cc
===================================================================
--- src/QuadFunction.cc (revision 1011)
+++ src/QuadFunction.cc (working copy)
@@ -36,6 +36,7 @@
 #include "PrintOperator.hh"
 #include "QuadFunction.hh"
 #include "Quad_FX.hh"
+#include "Quad_RE.hh"
 #include "Quad_SQL.hh"
 #include "Quad_TF.hh"
 #include "Tokenizer.hh"
Index: src/Quad_RE.cc
===================================================================
--- src/Quad_RE.cc (nonexistent)
+++ src/Quad_RE.cc (working copy)
@@ -0,0 +1,187 @@
+#include "Quad_RE.hh"
+#include "Workspace.hh"
+#include "PointerCell.hh"
+
+#include "Regexp.hh"
+
+class Flags
+{
+public:
+    Flags(const UCS_string &flags_in);
+    int get_compflags() const { 
return flags; }
+    bool get_error_on_no_match() const { return error_on_no_match; }
+    bool get_result_bitmap() const { return result_bitmap; }
+
+private:
+    int flags;
+    bool error_on_no_match;
+    bool result_bitmap;
+};
+
+Flags::Flags(const UCS_string &flags_string) : flags(0), error_on_no_match(false), result_bitmap(false)
+{
+    int result = 0;
+    UCS_string::iterator i = flags_string.begin();
+    while(i.more()) {
+        Unicode ch = i.next();
+        switch(static_cast<int>(ch)) {
+        case 'i':
+            result |= PCRE2_CASELESS;
+            break;
+        case 's':
+            result |= PCRE2_DOTALL;
+            break;
+        case 'x':
+            result |= PCRE2_EXTENDED;
+            break;
+        case 'm':
+            result |= PCRE2_MULTILINE;
+            break;
+        case 'E':
+            error_on_no_match = true;
+            break;
+        case 'B':
+            result_bitmap = true;
+            break;
+        default:
+            MORE_ERROR() << "Unknown regexp flag: " << ch;
+            VALUE_ERROR;
+        }
+    }
+    flags = result;
+}
+
+
+Quad_RE Quad_RE::_fun;
+Quad_RE *Quad_RE::fun = &Quad_RE::_fun;
+
+Quad_RE::Quad_RE() : QuadFunction(TOK_Quad_RE)
+{
+}
+
+Token Quad_RE::eval_AB(Value_P A, Value_P B)
+{
+    return eval_AXB(A, Str0(LOC), B);
+}
+
+/// Apply \b regexp to the string \b matched and build the APL result.
+///
+/// With the bitmap flag ('B') the result is an integer vector with one item
+/// per character of \b matched: 0 where no match covers the character and k
+/// where the k-th non-empty match covers it.
+///
+/// Without the bitmap flag a single match is attempted at offset 0. If it
+/// fails, DOMAIN_ERROR is raised when the 'E' flag is set and Idx0(LOC) is
+/// returned otherwise. If it succeeds and num_matches() is 1, the matched
+/// text is returned; otherwise a vector holding the strings returned by
+/// matched_string_list() is returned.
+///
+/// \b result is not used by this function; it is kept so that the existing
+/// call sites remain valid.
+static Value_P fill_regex_results(Value_P &result, const Regexp &regexp, const Flags &flags, const UCS_string &matched)
+{
+    if(flags.get_result_bitmap()) {
+        vector<pair<ShapeItem, ShapeItem>> results;
+        ShapeItem pos = 0;
+        bool end = false;
+        while(!end && pos < matched.size()) {
+            unique_ptr<RegexpMatch> match(regexp.match(matched, static_cast<PCRE2_SIZE>(pos)));
+            if(match->is_match()) {
+                const PCRE2_SIZE *ovector = match->get_ovector();
+                const ShapeItem match_begin = static_cast<ShapeItem>(ovector[0]);
+                const ShapeItem match_end = static_cast<ShapeItem>(ovector[1]);
+                // A zero-width match (e.g. pattern "a*" on "bbb") covers no
+                // character, so it contributes nothing to the bitmap and is
+                // not recorded.
+                if(match_end > match_begin) {
+                    results.push_back(pair<ShapeItem, ShapeItem>(match_begin, match_end));
+                }
+                // Always move forward. Restarting at an unchanged offset would
+                // find the same zero-width match again and never terminate.
+                // NOTE(review): stepping one character past an empty match can
+                // skip a non-empty alternative at the same offset; handling
+                // that needs match options that RegexpMatch does not expose.
+                pos = (match_end > pos) ? match_end : pos + 1;
+            }
+            else {
+                end = true;
+            }
+        }
+
+        Shape shape(matched.size());
+        Value_P result_value(shape, LOC);
+        ShapeItem w = 0;
+        int match_id = 1;
+        for(vector<pair<ShapeItem, ShapeItem>>::iterator i = results.begin() ; i != results.end() ; ++i) {
+            // Characters between the previous match and this one.
+            while(w < i->first) {
+                new (result_value->next_ravel()) IntCell(0);
+                w++;
+            }
+            // Characters covered by this match.
+            while(w < i->second) {
+                new (result_value->next_ravel()) IntCell(match_id);
+                w++;
+            }
+            match_id++;
+        }
+        // Characters after the last match.
+        while(w < matched.size()) {
+            new (result_value->next_ravel()) IntCell(0);
+            w++;
+        }
+        result_value->check_value(LOC);
+        return result_value;
+    }
+    else {
+        unique_ptr<RegexpMatch> match(regexp.match(matched, 0));
+        if(!match->is_match()) {
+            if(flags.get_error_on_no_match()) {
+                MORE_ERROR() << "No match";
+                DOMAIN_ERROR;
+            }
+            else {
+                return Idx0(LOC);
+            }
+        }
+        else {
+            if(match->num_matches() == 1) {
+                Value_P res = Value_P(match->matched_string(), LOC);
+                return res;
+            }
+            else {
+                vector<UCS_string> strings = match->matched_string_list();
+                Shape shape(strings.size());
+                Value_P result_value(shape, LOC);
+                for(vector<UCS_string>::iterator i = strings.begin() ; i != strings.end() ; ++i) {
+                    Value_P field_value(*i, LOC);
+                    field_value->check_value(LOC);
+                    new (result_value->next_ravel()) PointerCell(field_value, result_value.getref());
+                }
+                result_value->check_value(LOC);
+                return result_value;
+            }
+        }
+    }
+}
+
+Token
+Quad_RE::eval_AXB(const Value_P A, const Value_P X, const Value_P B)
+{
+    if(!A->is_char_string()) {
+        MORE_ERROR() << "Regexp argument must be a string value";
+        VALUE_ERROR;
+    }
+
+    Flags flags(X->get_UCS_ravel());
+    Regexp regexp(A->get_UCS_ravel(), flags.get_compflags());
+
+    const Shape &shape = B->get_shape();
+    if(shape.get_rank() == 0) {
+        return Token(TOK_APL_VALUE1, Idx0(LOC));
+    }
+    else if(B->is_char_string()) {
+        Value_P result = fill_regex_results(result, regexp, flags, B->get_UCS_ravel());
+        return Token(TOK_APL_VALUE1, result);
+    }
+    else {
+        const Shape &shape = B->get_shape();
+        Value_P result(shape, LOC);
+        for(ShapeItem i = 0 ; i < shape.get_volume() ; i++) {
+            const Cell &cell = B->get_ravel(i);
+            Value_P value = cell.to_value(LOC);
+            if(!value->is_char_string()) {
+                MORE_ERROR() << "Cell does not contain a string";
+                DOMAIN_ERROR;
+            }
+
+            Value_P result_value = fill_regex_results(result, regexp, flags, value->get_UCS_ravel());
+            new 
(result->next_ravel()) PointerCell(result_value, result.getref()); + } + result->check_value(LOC); + return Token(TOK_APL_VALUE1, result); + } +} + +Token +Quad_RE::eval_B(Value_P B) +{ + VALENCE_ERROR; +} + +Token +Quad_RE::eval_XB(Value_P X, Value_P B) +{ + VALENCE_ERROR; +} Index: src/Quad_RE.hh =================================================================== --- src/Quad_RE.hh (nonexistent) +++ src/Quad_RE.hh (working copy) @@ -0,0 +1,54 @@ +/* + This file is part of GNU APL, a free implementation of the + ISO/IEC Standard 13751, "Programming Language APL, Extended" + + Copyright (C) 2008-2016 Dr. Jürgen Sauermann + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +#ifndef __Quad_RE_DEFINED__ +#define __Quad_RE_DEFINED__ + +#include "QuadFunction.hh" +#include "Value.hh" +#include "Simple_string.hh" + +class Quad_RE : public QuadFunction +{ +public: + /// Constructor. + Quad_RE(); + + static Quad_RE * fun; ///< Built-in function. + static Quad_RE _fun; ///< Built-in function. + +protected: + /// overloaded Function::eval_AB(). + Token eval_AB(const Value_P A, const Value_P B); + + /// overloaded Function::eval_AXB(). + Token eval_AXB(const Value_P A, const Value_P X, const Value_P B); + + /// overloaded Function::eval_B(). + Token eval_B(Value_P B); + + /// overloaded Function::eval_XB(). 
+ Token eval_XB(Value_P X, Value_P B); + +// virtual Token eval_AB(Value_P A, Value_P B); + +}; + +#endif Index: src/Regexp.cc =================================================================== --- src/Regexp.cc (nonexistent) +++ src/Regexp.cc (working copy) @@ -0,0 +1,115 @@ +#include "Workspace.hh" +#include "Regexp.hh" + +static const PCRE2_UCHAR32 *ucs_to_codepoints(const UCS_string &string) +{ + int size = string.size(); + PCRE2_UCHAR32 *buf = new PCRE2_UCHAR32[size]; + PCRE2_UCHAR32 *p = buf; + UCS_string::iterator i = string.begin(); + while(i.more()) { + *p++ = i.next(); + } + return buf; +} + +static UCS_string make_ucs_string(PCRE2_UCHAR32 *buf) +{ + UCS_string result; + PCRE2_UCHAR32 *p = buf; + while(*p != 0) { + result.append(static_cast<Unicode>(*p++)); + } + return result; +} + +RegexpMatch::RegexpMatch(pcre2_code *code, const UCS_string &matched, PCRE2_SIZE start) +{ + matched_ucs = ucs_to_codepoints(matched); + match_data = pcre2_match_data_create_from_pattern_32(code, NULL); + match_result = pcre2_match_32(code, matched_ucs, matched.size(), start, 0, match_data, NULL); + if(match_result == 0) { + MORE_ERROR() << "Match buffer too small"; + FIXME; + } + else if(match_result > 0) { + ovector = pcre2_get_ovector_pointer_32(match_data); + } + else { + ovector = NULL; + } +} + +RegexpMatch::~RegexpMatch() +{ + delete[] matched_ucs; + pcre2_match_data_free(match_data); +} + +bool RegexpMatch::is_match() const +{ + return match_result > 0; +} + +int RegexpMatch::num_matches() const +{ + if(match_result < 0) { + MORE_ERROR() << "Attempt to call num_matches without matches"; + FIXME; + } + return match_result; +} + +UCS_string RegexpMatch::matched_string() const +{ + const PCRE2_SIZE *ovector = get_ovector(); + UCS_string result(reinterpret_cast<const Unicode *>(matched_ucs + ovector[0]), ovector[1] - ovector[0]); + return result; +} + +vector<UCS_string> RegexpMatch::matched_string_list() const +{ + const PCRE2_SIZE *ovector = get_ovector(); + 
vector<UCS_string> result;
+    for(int i = 1 ; i < match_result ; i++) {
+        PCRE2_SIZE start = ovector[i * 2];
+        PCRE2_SIZE end = ovector[i * 2 + 1];
+        result.push_back(UCS_string(reinterpret_cast<const Unicode *>(matched_ucs + start), end - start));
+    }
+    return result;
+}
+
+/// Compile \b pattern with the PCRE2 option bits in \b flags (combined with
+/// PCRE2_NO_UTF_CHECK). If compilation fails, the PCRE2 error text and the
+/// error offset are reported via MORE_ERROR() and VALUE_ERROR is raised.
+Regexp::Regexp(const UCS_string &pattern, int flags)
+{
+    const PCRE2_UCHAR32 *pattern_ucs = ucs_to_codepoints(pattern);
+
+    int error_code;
+    PCRE2_SIZE error_offset;
+
+    code = pcre2_compile_32(pattern_ucs, pattern.size(), PCRE2_NO_UTF_CHECK | flags, &error_code, &error_offset, NULL);
+    // pcre2_compile has finished reading the pattern text; release the copy.
+    delete[] pattern_ucs;
+    if(code == NULL) {
+        // Zero-initialised so that make_ucs_string() finds a terminator even
+        // if no message is written into the buffer.
+        PCRE2_UCHAR32 buf[256] = { 0 };
+        // The length argument is a count of 32-bit code units, not of bytes;
+        // sizeof(buf) alone would overstate the capacity four-fold.
+        pcre2_get_error_message_32(error_code, buf, sizeof(buf) / sizeof(buf[0]));
+        UCS_string error_message = make_ucs_string(buf);
+        MORE_ERROR() << "Error compiling regex at offset: " << error_offset << ": " << error_message;
+        VALUE_ERROR;
+    }
+}
+
+Regexp::~Regexp()
+{
+    pcre2_code_free(code);
+}
+
+RegexpMatch *Regexp::match(const UCS_string &match, PCRE2_SIZE size) const
+{
+    return new RegexpMatch(code, match, size);
+}
+
+int Regexp::expression_count() const
+{
+    uint32_t result;
+    pcre2_pattern_info(code, PCRE2_INFO_CAPTURECOUNT, &result);
+    return result;
+}
Index: src/Regexp.hh
===================================================================
--- src/Regexp.hh (nonexistent)
+++ src/Regexp.hh (working copy)
@@ -0,0 +1,40 @@
+#ifndef __Regexp__DEFINED__
+#define __Regexp__DEFINED__
+
+#include "UCS_string.hh"
+#include <vector>
+
+#define PCRE2_CODE_UNIT_WIDTH 32
+#include <pcre2.h>
+
+class RegexpMatch
+{
+public:
+    RegexpMatch(pcre2_code *code, const UCS_string &, PCRE2_SIZE start);
+    virtual ~RegexpMatch();
+    bool is_match() const;
+    int num_matches() const;
+    UCS_string matched_string() const;
+    const PCRE2_SIZE *get_ovector() const { return ovector; }
+    vector<UCS_string> matched_string_list() const;
+
+private:
+    PCRE2_SIZE *ovector;
+    const PCRE2_UCHAR32 *matched_ucs;
+    pcre2_match_data *match_data;
+    int match_result;
+};
+
+class Regexp
+{
+public: + Regexp(const UCS_string &pattern, int flags); + virtual ~Regexp(); + RegexpMatch *match(const UCS_string &match, PCRE2_SIZE size) const; + int expression_count() const; + +private: + pcre2_code *code; +}; + +#endif Index: src/SystemVariable.def =================================================================== --- src/SystemVariable.def (revision 1011) +++ src/SystemVariable.def (working copy) @@ -73,6 +73,7 @@ sf_def(Quad_NA, "NA", "Name Association" ) sf_def(Quad_NC, "NC", "Name Class" ) sf_def(Quad_NL, "NL", "Name List" ) + sf_def(Quad_RE, "RE", "Regular expression" ) sf_def(Quad_SI, "SI", "State Indicator" ) sf_def(Quad_SQL, "SQL", "SQL functions" ) sf_def(Quad_SVC, "SVC", "Shared Variable Control" ) @@ -86,6 +87,3 @@ sf_def(Quad_UCS, "UCS", "Universal Char Set (Unicode)" ) # undef sf_def #endif - - - Index: src/Token.def =================================================================== --- src/Token.def (revision 1011) +++ src/Token.def (working copy) @@ -116,6 +116,7 @@ TD(TOK_Quad_EC , TC_FUN1 , TV_FUN , ID::Quad_EC ) TD(TOK_Quad_ENV , TC_FUN1 , TV_FUN , ID::Quad_ENV ) TD(TOK_Quad_EX , TC_FUN1 , TV_FUN , ID::Quad_EX ) +TD(TOK_Quad_RE , TC_FUN2 , TV_FUN , ID::Quad_RE ) TD(TOK_Quad_SQL , TC_FUN2 , TV_FUN , ID::Quad_SQL ) TD(TOK_Quad_SVQ , TC_FUN1 , TV_FUN , ID::Quad_SVQ ) TD(TOK_Quad_SVR , TC_FUN1 , TV_FUN , ID::Quad_SVR ) Index: src/Workspace.hh =================================================================== --- src/Workspace.hh (revision 1011) +++ src/Workspace.hh (working copy) @@ -28,6 +28,7 @@ #include "Quad_CR.hh" #include "Quad_DLX.hh" #include "Quad_FIO.hh" +#include "Quad_RE.hh" #include "Quad_RL.hh" #include "Quad_SVx.hh" #include "ScalarFunction.hh"