Regex matching has an optimization where only one match is tried for a
regex anchored to the beginning of the buffer. While other anchors are
resolved with the fastmap, this one allows further optimization and is
special cased. However, because of a bug in create_cd_newstate, ^$
would be mistakenly treated as a non-anchoring match, and
re_search_internal would try matching it at every position.
In fact, the bug is (almost) fixed by this hunk:
@@ -1682,8 +1680,6 @@ create_cd_newstate (const re_dfa_t *dfa,
newstate->halt = 1;
else if (type == OP_BACK_REF)
newstate->has_backref = 1;
- else if (type == ANCHOR)
- constraint = node->opr.ctx_type;
if (constraint)
{
However, some complications in building the NFA prevent this from fixing
the problem. Therefore, this patch cleans up the handling of anchors so
that tests on type == ANCHOR are not necessary anymore. When creating
the NFA (calc_first), I move the opr.ctx_type to the constraint field of
re_token_t, and then I always look at it unconditionally, without
special-casing ANCHORs. This also allows some simplification of
duplicate_node_closure.
I attach a patch for glibc (against CVS), and one for gnulib, asking for
gnulib approval, and glibc approval+commit. I tested both using the GNU
sed testsuite.
Thanks,
Paolo
2008-04-11 Paolo Bonzini <[EMAIL PROTECTED]>
* posix/regcomp.c (optimize_utf8): Add a note on why we test
opr.ctx_type.
(calc_first): Initialize constraint field.
(duplicate_node_closure): Use it instead of special casing ANCHORS.
Use search_duplicated_node to avoid loops. Fix grammar.
(duplicate_node): Merge constraint field for all node types.
(calc_eclosure_iter): Look at constraint field for all node types.
* posix/regex_internal.c (create_cd_newstate)): Don't look at
create_cd_newstate.
--- regcomp.c.pb 2008-04-11 11:47:32.000000000 +0200
+++ regcomp.c 2008-04-11 12:03:42.000000000 +0200
@@ -1038,7 +1038,9 @@ optimize_utf8 (re_dfa_t *dfa)
case BUF_LAST:
break;
default:
- /* Word anchors etc. cannot be handled. */
+ /* Word anchors etc. cannot be handled. It's okay to test
+ opr.ctx_type since constraints (for all DFA nodes) are
+ created by ORing one or more opr.ctx_type values. */
return;
}
break;
@@ -1318,6 +1320,8 @@ calc_first (void *extra, bin_tree_t *nod
node->node_idx = re_dfa_add_node (dfa, node->token);
if (BE (node->node_idx == -1, 0))
return REG_ESPACE;
+ if (node->token.type == ANCHOR)
+ dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
}
return REG_NOERROR;
}
@@ -1446,22 +1450,17 @@ duplicate_node_closure (re_dfa_t *dfa, i
destination. */
org_dest = dfa->edests[org_node].elems[0];
re_node_set_empty (dfa->edests + clone_node);
- if (dfa->nodes[org_node].type == ANCHOR)
+ /* If the node is root_node itself, it means the epsilon clsoure
+ has a loop. Then tie it to the destination of the root_node. */
+ if (org_node == root_node && clone_node != org_node)
{
- /* In case of the node has another constraint, append it. */
- if (org_node == root_node && clone_node != org_node)
- {
- /* ...but if the node is root_node itself, it means the
- epsilon closure have a loop, then tie it to the
- destination of the root_node. */
- ret = re_node_set_insert (dfa->edests + clone_node,
- org_dest);
- if (BE (ret < 0, 0))
- return REG_ESPACE;
- break;
- }
- constraint |= dfa->nodes[org_node].opr.ctx_type;
+ ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
+ if (BE (ret < 0, 0))
+ return REG_ESPACE;
+ break;
}
+ /* In case of the node has another constraint, add it. */
+ constraint |= dfa->nodes[org_node].constraint;
clone_dest = duplicate_node (dfa, org_dest, constraint);
if (BE (clone_dest == -1, 0))
return REG_ESPACE;
@@ -1479,7 +1478,7 @@ duplicate_node_closure (re_dfa_t *dfa, i
clone_dest = search_duplicated_node (dfa, org_dest, constraint);
if (clone_dest == -1)
{
- /* There are no such a duplicated node, create a new one. */
+ /* There is no such duplicated node, create a new one. */
reg_errcode_t err;
clone_dest = duplicate_node (dfa, org_dest, constraint);
if (BE (clone_dest == -1, 0))
@@ -1494,7 +1493,7 @@ duplicate_node_closure (re_dfa_t *dfa, i
}
else
{
- /* There are a duplicated node which satisfy the constraint,
+ /* There is a duplicated node which satisfies the constraint,
use it to avoid infinite loop. */
ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
if (BE (ret < 0, 0))
@@ -1543,8 +1542,7 @@ duplicate_node (re_dfa_t *dfa, int org_i
if (BE (dup_idx != -1, 1))
{
dfa->nodes[dup_idx].constraint = constraint;
- if (dfa->nodes[org_idx].type == ANCHOR)
- dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
+ dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
dfa->nodes[dup_idx].duplicated = 1;
/* Store the index of the original node. */
@@ -1624,7 +1622,6 @@ static reg_errcode_t
calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
{
reg_errcode_t err;
- unsigned int constraint;
int i, incomplete;
re_node_set eclosure;
incomplete = 0;
@@ -1636,15 +1633,14 @@ calc_eclosure_iter (re_node_set *new_set
We reference this value to avoid infinite loop. */
dfa->eclosures[node].nelem = -1;
- constraint = ((dfa->nodes[node].type == ANCHOR)
- ? dfa->nodes[node].opr.ctx_type : 0);
- /* If the current node has constraints, duplicate all nodes.
- Since they must inherit the constraints. */
- if (constraint
+ /* If the current node has constraints, duplicate all nodes
+ since they must inherit the constraints. */
+ if (dfa->nodes[node].constraint
&& dfa->edests[node].nelem
&& !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
{
- err = duplicate_node_closure (dfa, node, node, node, constraint);
+ err = duplicate_node_closure (dfa, node, node, node,
+ dfa->nodes[node].constraint);
if (BE (err != REG_NOERROR, 0))
return err;
}
--- regex_internal.c.pb 2008-04-11 11:47:41.000000000 +0200
+++ regex_internal.c 2008-04-11 11:52:20.000000000 +0200
@@ -1665,11 +1665,9 @@ create_cd_newstate (const re_dfa_t *dfa,
for (i = 0 ; i < nodes->nelem ; i++)
{
- unsigned int constraint = 0;
re_token_t *node = dfa->nodes + nodes->elems[i];
re_token_type_t type = node->type;
- if (node->constraint)
- constraint = node->constraint;
+ unsigned int constraint = node->constraint;
if (type == CHARACTER && !constraint)
continue;
@@ -1682,8 +1680,6 @@ create_cd_newstate (const re_dfa_t *dfa,
newstate->halt = 1;
else if (type == OP_BACK_REF)
newstate->has_backref = 1;
- else if (type == ANCHOR)
- constraint = node->opr.ctx_type;
if (constraint)
{
2008-04-11 Paolo Bonzini <[EMAIL PROTECTED]>
* lib/regcomp.c (optimize_utf8): Add a note on why we test opr.ctx_type.
(calc_first): Initialize constraint field.
(duplicate_node_closure): Use it instead of special casing ANCHORS.
Use search_duplicated_node to avoid loops. Fix grammar.
(duplicate_node): Merge constraint field for all node types.
(calc_eclosure_iter): Look at constraint field for all node types.
* lib/regex_internal.c (create_cd_newstate)): Don't look at
create_cd_newstate.
diff --git a/lib/regcomp.c b/lib/regcomp.c
index a02418d..dc15666 100644
--- a/lib/regcomp.c
+++ b/lib/regcomp.c
@@ -1057,7 +1057,9 @@ optimize_utf8 (re_dfa_t *dfa)
case BUF_LAST:
break;
default:
- /* Word anchors etc. cannot be handled. */
+ /* Word anchors etc. cannot be handled. It's okay to test
+ opr.ctx_type since constraints (for all DFA nodes) are
+ created by ORing one or more opr.ctx_type values. */
return;
}
break;
@@ -1344,6 +1346,8 @@ calc_first (void *extra, bin_tree_t *node)
node->node_idx = re_dfa_add_node (dfa, node->token);
if (BE (node->node_idx == REG_MISSING, 0))
return REG_ESPACE;
+ if (node->token.type == ANCHOR)
+ dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
}
return REG_NOERROR;
}
@@ -1473,21 +1477,18 @@ duplicate_node_closure (re_dfa_t *dfa, Idx
top_org_node, Idx top_clone_node,
destination. */
org_dest = dfa->edests[org_node].elems[0];
re_node_set_empty (dfa->edests + clone_node);
- if (dfa->nodes[org_node].type == ANCHOR)
+ clone_dest = search_duplicated_node (dfa, org_dest, constraint);
+ /* If the node is root_node itself, it means the epsilon closure
+ has a loop. Then tie it to the destination of the root_node. */
+ if (org_node == root_node && clone_node != org_node)
{
- /* In case of the node has another constraint, append it. */
- if (org_node == root_node && clone_node != org_node)
- {
- /* ...but if the node is root_node itself, it means the
- epsilon closure have a loop, then tie it to the
- destination of the root_node. */
- ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
- if (BE (! ok, 0))
- return REG_ESPACE;
- break;
- }
- constraint |= dfa->nodes[org_node].opr.ctx_type;
+ ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
+ if (BE (! ok, 0))
+ return REG_ESPACE;
+ break;
}
+ /* In case the node has another constraint, append it. */
+ constraint |= dfa->nodes[org_node].constraint;
clone_dest = duplicate_node (dfa, org_dest, constraint);
if (BE (clone_dest == REG_MISSING, 0))
return REG_ESPACE;
@@ -1505,7 +1506,7 @@ duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node,
Idx top_clone_node,
clone_dest = search_duplicated_node (dfa, org_dest, constraint);
if (clone_dest == REG_MISSING)
{
- /* There are no such a duplicated node, create a new one. */
+ /* There is no such duplicated node, create a new one. */
reg_errcode_t err;
clone_dest = duplicate_node (dfa, org_dest, constraint);
if (BE (clone_dest == REG_MISSING, 0))
@@ -1520,7 +1521,7 @@ duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node,
Idx top_clone_node,
}
else
{
- /* There are a duplicated node which satisfy the constraint,
+ /* There is a duplicated node which satisfy the constraint,
use it to avoid infinite loop. */
ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
if (BE (! ok, 0))
@@ -1569,8 +1570,7 @@ duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int
constraint)
if (BE (dup_idx != REG_MISSING, 1))
{
dfa->nodes[dup_idx].constraint = constraint;
- if (dfa->nodes[org_idx].type == ANCHOR)
- dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
+ dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
dfa->nodes[dup_idx].duplicated = 1;
/* Store the index of the original node. */
@@ -1652,7 +1652,6 @@ static reg_errcode_t
calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
{
reg_errcode_t err;
- unsigned int constraint;
Idx i;
bool incomplete;
bool ok;
@@ -1666,15 +1665,14 @@ calc_eclosure_iter (re_node_set *new_set, re_dfa_t
*dfa, Idx node, bool root)
We reference this value to avoid infinite loop. */
dfa->eclosures[node].nelem = REG_MISSING;
- constraint = ((dfa->nodes[node].type == ANCHOR)
- ? dfa->nodes[node].opr.ctx_type : 0);
- /* If the current node has constraints, duplicate all nodes.
- Since they must inherit the constraints. */
- if (constraint
+ /* If the current node has constraints, duplicate all nodes
+ since they must inherit the constraints. */
+ if (dfa->nodes[node].constraint
&& dfa->edests[node].nelem
&& !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
{
- err = duplicate_node_closure (dfa, node, node, node, constraint);
+ err = duplicate_node_closure (dfa, node, node, node,
+ dfa->nodes[node].constraint);
if (BE (err != REG_NOERROR, 0))
return err;
}
diff --git a/lib/regex_internal.c b/lib/regex_internal.c
index 2129888..7f9a3ae 100644
--- a/lib/regex_internal.c
+++ b/lib/regex_internal.c
@@ -1689,11 +1689,9 @@ create_cd_newstate (const re_dfa_t *dfa, const
re_node_set *nodes,
for (i = 0 ; i < nodes->nelem ; i++)
{
- unsigned int constraint = 0;
re_token_t *node = dfa->nodes + nodes->elems[i];
re_token_type_t type = node->type;
- if (node->constraint)
- constraint = node->constraint;
+ unsigned int constraint = node->constraint;
if (type == CHARACTER && !constraint)
continue;
@@ -1706,8 +1704,6 @@ create_cd_newstate (const re_dfa_t *dfa, const
re_node_set *nodes,
newstate->halt = 1;
else if (type == OP_BACK_REF)
newstate->has_backref = 1;
- else if (type == ANCHOR)
- constraint = node->opr.ctx_type;
if (constraint)
{