Add comments for last regex commit. Thanks!
-- Regards, Tim Shen
commit b4aa16a08992866ca6fd60f40e422f551d0d6b2a Author: tim <timshe...@gmail.com> Date: Sun Oct 27 15:50:44 2013 -0400 2013- diff --git a/libstdc++-v3/include/bits/regex_executor.tcc b/libstdc++-v3/include/bits/regex_executor.tcc index d3b9a04..2f677de 100644 --- a/libstdc++-v3/include/bits/regex_executor.tcc +++ b/libstdc++-v3/include/bits/regex_executor.tcc @@ -53,6 +53,52 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION return false; } + // This function serves in different modes, DFS mode or BFS mode, indicated + // by template variable __dfs_mode. See _M_main for details. + // + // ------------------------------------------------------------ + // + // DFS mode: + // + // It applys a Depth-First-Search (aka backtracking) on given NFA and input + // string. + // At the very beginning the executor stands in the start state, then it try + // every possible state transition in current state recursively. Some state + // transitions consume input string, say, a single-char-matcher or a + // back-reference matcher; some don't, like assertion or other anchor nodes. + // When the input is exhausted and/or the current state is an accepting state, + // the whole executor returns true. + // + // TODO: This approach is exponentially slow for certain input. + // Try to compile the NFA to a DFA. + // + // Time complexity: o(match_length), O(2^(_M_nfa.size())) + // Space complexity: \theta(match_results.size() + match_length) + // + // ------------------------------------------------------------ + // + // BFS mode: + // + // Russ Cox's article (http://swtch.com/~rsc/regexp/regexp1.html) + // explained this algorithm clearly. + // + // It first computes epsilon clousure for every state that's still matching, + // using the same DFS algorithm, but doesn't reenter states (set true in + // _M_visited), nor follow _S_opcode_match. + // + // Then apply DFS to every _S_opcode_match one by one (in _M_match_queue). + // + // The order of which states needs to be recursively applied DFS matters, + // depend on which greedy mode we use. See _S_opcode_alternative branch in + // _M_dfs. + // + // It significantly reduces potential duplicate states, so have a better + // upper bound; but it deserves more overhead. + // + // Time complexity: o(match_length * match_results.size()) + // O(match_length * _M_nfa.size() * match_results.size()) + // Space complexity: o(_M_nfa.size() + match_results.size()) + // O(_M_nfa.size() * match_results.size()) template<typename _BiIter, typename _Alloc, typename _TraitsT, bool __dfs_mode> template<bool __match_mode> @@ -68,18 +114,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION } else { - // Like the DFS approach, it try every possible state transition; - // Unlike DFS, it uses a queue instead of a stack to store matching - // states. It's a BFS approach. - // - // Russ Cox's article(http://swtch.com/~rsc/regexp/regexp1.html) - // explained this algorithm clearly. - // - // Time complexity: o(match_length * match_results.size()) - // O(match_length * _M_nfa.size() - // * match_results.size()) - // Space complexity: o(_M_nfa.size() + match_results.size()) - // O(_M_nfa.size() * match_results.size()) _M_match_queue->push(make_pair(_M_start_state, _M_results)); bool __ret = false; while (1) @@ -132,20 +166,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION return false; } - // A _DFSExecutor perform a DFS on given NFA and input string. At the very - // beginning the executor stands in the start state, then it try every - // possible state transition in current state recursively. Some state - // transitions consume input string, say, a single-char-matcher or a - // back-reference matcher; some not, like assertion or other anchor nodes. - // When the input is exhausted and the current state is an accepting state, - // the whole executor return true. - // - // TODO: This approach is exponentially slow for certain input. - // Try to compile the NFA to a DFA. - // - // Time complexity: o(match_length), O(2^(_M_nfa.size())) - // Space complexity: \theta(match_results.size() + match_length) - // template<typename _BiIter, typename _Alloc, typename _TraitsT, bool __dfs_mode> template<bool __match_mode> @@ -164,25 +184,39 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { case _S_opcode_alternative: // Greedy or not, this is a question ;) + // + // _M_alt branch is "match once more", while _M_next is "get me out + // of this quantifier". + + // Greedy. if (!__state._M_neg) { + // Once more. _M_dfs<__match_mode>(__state._M_alt); + // If it's DFS executor and already accepted, we're done. if (!__dfs_mode || !_M_has_sol) _M_dfs<__match_mode>(__state._M_next); } - else + else // Ungreedy mode { if (__dfs_mode) { + // vice-versa. _M_dfs<__match_mode>(__state._M_next); if (!_M_has_sol) _M_dfs<__match_mode>(__state._M_alt); } else { + // DON't attempt anything, because there's already someone + // with higher priority accepted. This state cannot be better + // by attempting its next node. if (!_M_has_sol) { _M_dfs<__match_mode>(__state._M_next); + // DON'T attempt anything if it's already accepted. An + // accepted state *must* be better than a solution that + // has one more time ungreedy quantifier loop. if (!_M_has_sol) _M_dfs<__match_mode>(__state._M_alt); } @@ -190,9 +224,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION } break; case _S_opcode_subexpr_begin: - // Here's the critical part: if there's nothing changed since last - // visit, do NOT continue. This prevents the executor from get into - // infinite loop when use "()*" to match "". + // If there's nothing changed since last visit, do NOT continue. + // This prevents the executor from get into infinite loop when use + // "()*" to match "". // // Every change on _M_cur_results will be roll back after the // recursion step finished. @@ -232,8 +266,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION if (_M_word_boundry(__state) == !__state._M_neg) _M_dfs<__match_mode>(__state._M_next); break; - // Here __state._M_alt offers a single start node for a sub-NFA. - // We recursivly invoke our algorithm to match the sub-NFA. + // Here __state._M_alt offers a single start node for a sub-NFA. + // We recursively invoke our algorithm to match the sub-NFA. case _S_opcode_subexpr_lookahead: if (_M_lookahead(__state) == !__state._M_neg) _M_dfs<__match_mode>(__state._M_next);