Serhiy Storchaka added the comment:
Thank you, Matthew, for your suggestions. Here is the updated patch.
----------
Added file: http://bugs.python.org/file36852/re_error_attrs3.patch
_______________________________________
Python tracker <rep...@bugs.python.org>
<http://bugs.python.org/issue22578>
_______________________________________
diff -r 8b1ac1a3d007 Doc/library/re.rst
--- a/Doc/library/re.rst Wed Oct 08 22:32:50 2014 +0300
+++ b/Doc/library/re.rst Thu Oct 09 16:29:25 2014 +0300
@@ -726,13 +726,36 @@ form.
Clear the regular expression cache.
-.. exception:: error
+.. exception:: error(msg, pattern=None, pos=None)
Exception raised when a string passed to one of the functions here is not a
valid regular expression (for example, it might contain unmatched
parentheses)
or when some other error occurs during compilation or matching. It is
never an
- error if a string contains no match for a pattern.
+ error if a string contains no match for a pattern. The error instance has
+ the following additional attributes:
+ .. attribute:: msg
+
+ The unformatted error message.
+
+ .. attribute:: pattern
+
+ The regular expression pattern.
+
+ .. attribute:: pos
+
+ The index of *pattern* where compilation failed.
+
+ .. attribute:: lineno
+
+ The line corresponding to *pos*.
+
+ .. attribute:: colno
+
+ The column corresponding to *pos*.
+
+ .. versionchanged:: 3.5
+ Added additional attributes.
.. _re-objects:
diff -r 8b1ac1a3d007 Lib/sre_constants.py
--- a/Lib/sre_constants.py Wed Oct 08 22:32:50 2014 +0300
+++ b/Lib/sre_constants.py Thu Oct 09 16:29:25 2014 +0300
@@ -21,8 +21,35 @@ from _sre import MAXREPEAT, MAXGROUPS
# should this really be here?
class error(Exception):
- pass
+ def __init__(self, msg, pattern=None, pos=None):
+ self.msg = msg
+ self.pattern = pattern
+ self.pos = pos
+ if pattern is not None and pos is not None:
+ msg = '%s at position %d' % (msg, pos)
+ if isinstance(pattern, str):
+ newline = '\n'
+ else:
+ newline = b'\n'
+ self.lineno = pattern.count(newline, 0, pos) + 1
+ self.colno = pos - pattern.rfind(newline, 0, pos)
+ if newline in pattern:
+ msg = '%s (line %d, column %d)' % (msg, self.lineno,
self.colno)
+ else:
+ self.lineno = self.colno = None
+ super().__init__(msg)
+def linecol(doc, pos):  # return the 1-based (lineno, colno) of index pos in doc
+ if isinstance(doc, str):  # was `pattern`, an undefined name here (NameError)
+ newline = '\n'
+ else:
+ newline = b'\n'  # non-str input is searched with a bytes separator
+ lineno = doc.count(newline, 0, pos) + 1  # newlines before pos, counted from 1
+ if lineno == 1:
+ colno = pos + 1  # no newline before pos: column is the offset from the start
+ else:
+ colno = pos - doc.rindex(newline, 0, pos)  # distance from the last newline before pos
+ return lineno, colno
# operators
FAILURE = "failure"
diff -r 8b1ac1a3d007 Lib/sre_parse.py
--- a/Lib/sre_parse.py Wed Oct 08 22:32:50 2014 +0300
+++ b/Lib/sre_parse.py Thu Oct 09 16:29:25 2014 +0300
@@ -207,7 +207,8 @@ class Tokenizer:
try:
c = self.string[self.index + 1]
except IndexError:
- raise error("bogus escape (end of line)")
+ self.next = None
+ raise self.error("bogus escape (end of line)", 0)
if not self.istext:
c = chr(c)
char = char + c
@@ -233,9 +234,13 @@ class Tokenizer:
self.__next()
return result
def tell(self):
- return self.index, self.next
+ return self.index - len(self.next or '')
def seek(self, index):
- self.index, self.next = index
+ self.index = index
+ self.__next()
+
+ def error(self, msg, offset):
+ return error(msg, self.string, self.tell() - offset)
# The following three functions are not used in this module anymore, but we
keep
# them here (with DeprecationWarnings) for backwards compatibility.
@@ -299,8 +304,8 @@ def _class_escape(source, escape):
escape += source.getwhile(2, OCTDIGITS)
c = int(escape[1:], 8)
if c > 0o377:
- raise error('octal escape value %r outside of '
- 'range 0-0o377' % escape)
+ raise source.error('octal escape value %r outside of '
+ 'range 0-0o377' % escape, len(escape))
return LITERAL, c
elif c in DIGITS:
raise ValueError
@@ -308,7 +313,7 @@ def _class_escape(source, escape):
return LITERAL, ord(escape[1])
except ValueError:
pass
- raise error("bogus escape: %s" % repr(escape))
+ raise source.error("bogus escape: %s" % repr(escape), len(escape))
def _escape(source, escape, state):
# handle escape code in expression
@@ -354,21 +359,23 @@ def _escape(source, escape, state):
escape = escape + source.get()
c = int(escape[1:], 8)
if c > 0o377:
- raise error('octal escape value %r outside of '
- 'range 0-0o377' % escape)
+ raise source.error('octal escape value %r outside of '
+ 'range 0-0o377' % escape,
+ len(escape))
return LITERAL, c
# not an octal escape, so this is a group reference
group = int(escape[1:])
if group < state.groups:
if not state.checkgroup(group):
- raise error("cannot refer to open group")
+ raise source.error("cannot refer to open group",
+ len(escape))
return GROUPREF, group
raise ValueError
if len(escape) == 2:
return LITERAL, ord(escape[1])
except ValueError:
pass
- raise error("bogus escape: %s" % repr(escape))
+ raise source.error("bogus escape: %s" % repr(escape), len(escape))
def _parse_sub(source, state, nested=1):
# parse an alternation: a|b|c
@@ -385,7 +392,7 @@ def _parse_sub(source, state, nested=1):
if not source.next or sourcematch(")", 0):
break
else:
- raise error("pattern not properly closed")
+ raise source.error("pattern not properly closed", 0)
if len(items) == 1:
return items[0]
@@ -434,11 +441,12 @@ def _parse_sub_cond(source, state, condg
if source.match("|"):
item_no = _parse(source, state)
if source.match("|"):
- raise error("conditional backref with more than two branches")
+ raise source.error("conditional backref with more than two
branches",
+ 1)
else:
item_no = None
if source.next and not source.match(")", 0):
- raise error("pattern not properly closed")
+ raise source.error("pattern not properly closed", 0)
subpattern = SubPattern(state)
subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
return subpattern
@@ -503,7 +511,7 @@ def _parse(source, state):
elif this:
code1 = LITERAL, ord(this)
else:
- raise error("unexpected end of regular expression")
+ raise source.error("unexpected end of regular expression",
0)
if sourcematch("-"):
# potential range
this = sourceget()
@@ -519,14 +527,14 @@ def _parse(source, state):
else:
code2 = LITERAL, ord(this)
if code1[0] != LITERAL or code2[0] != LITERAL:
- raise error("bad character range")
+ raise source.error("bad character range",
len(this))
lo = code1[1]
hi = code2[1]
if hi < lo:
- raise error("bad character range")
+ raise source.error("bad character range",
len(this))
setappend((RANGE, (lo, hi)))
else:
- raise error("unexpected end of regular expression")
+ raise source.error("unexpected end of regular
expression", 0)
else:
if code1[0] is IN:
code1 = code1[1][0]
@@ -543,6 +551,7 @@ def _parse(source, state):
elif this and this[0] in REPEAT_CHARS:
# repeat previous item
+ here = source.tell()
if this == "?":
min, max = 0, 1
elif this == "*":
@@ -554,7 +563,6 @@ def _parse(source, state):
if source.next == "}":
subpatternappend((LITERAL, ord(this)))
continue
- here = source.tell()
min, max = 0, MAXREPEAT
lo = hi = ""
while source.next in DIGITS:
@@ -577,18 +585,21 @@ def _parse(source, state):
if max >= MAXREPEAT:
raise OverflowError("the repetition number is too
large")
if max < min:
- raise error("bad repeat interval")
+ raise source.error("bad repeat interval",
+ source.tell() - here)
else:
- raise error("not supported")
+ raise source.error("not supported", len(this))
# figure out which item to repeat
if subpattern:
item = subpattern[-1:]
else:
item = None
if not item or (_len(item) == 1 and item[0][0] == AT):
- raise error("nothing to repeat")
+ raise source.error("nothing to repeat",
+ source.tell() - here + len(this))
if item[0][0] in REPEATCODES:
- raise error("multiple repeat")
+ raise source.error("multiple repeat",
+ source.tell() - here + len(this))
if sourcematch("?"):
subpattern[-1] = (MIN_REPEAT, (min, max, item))
else:
@@ -612,41 +623,45 @@ def _parse(source, state):
while 1:
char = sourceget()
if char is None:
- raise error("unterminated name")
+ raise source.error("unterminated name", 0)
if char == ">":
break
name = name + char
group = 1
if not name:
- raise error("missing group name")
+ raise source.error("missing group name", 1)
if not name.isidentifier():
- raise error("bad character in group name %r" %
name)
+ raise source.error("bad character in group name "
+ "%r" % name,
+ len(name) + 1)
elif sourcematch("="):
# named backreference
name = ""
while 1:
char = sourceget()
if char is None:
- raise error("unterminated name")
+ raise source.error("unterminated name", 0)
if char == ")":
break
name = name + char
if not name:
- raise error("missing group name")
+ raise source.error("missing group name", 1)
if not name.isidentifier():
- raise error("bad character in backref group name "
- "%r" % name)
+ raise source.error("bad character in backref "
+ "group name %r" % name,
+ len(name) + 1)
gid = state.groupdict.get(name)
if gid is None:
msg = "unknown group name: {0!r}".format(name)
- raise error(msg)
+ raise source.error(msg, len(name) + 1)
subpatternappend((GROUPREF, gid))
continue
else:
char = sourceget()
if char is None:
- raise error("unexpected end of pattern")
- raise error("unknown specifier: ?P%s" % char)
+ raise source.error("unexpected end of pattern", 0)
+ raise source.error("unknown specifier: ?P%s" % char,
+ len(char))
elif sourcematch(":"):
# non-capturing group
group = 2
@@ -657,7 +672,7 @@ def _parse(source, state):
break
sourceget()
if not sourcematch(")"):
- raise error("unbalanced parenthesis")
+ raise source.error("unbalanced parenthesis", 0)
continue
elif source.next in ASSERTCHARS:
# lookahead assertions
@@ -665,12 +680,12 @@ def _parse(source, state):
dir = 1
if char == "<":
if source.next not in LOOKBEHINDASSERTCHARS:
- raise error("syntax error")
+ raise source.error("syntax error", 0)
dir = -1 # lookbehind
char = sourceget()
p = _parse_sub(source, state)
if not sourcematch(")"):
- raise error("unbalanced parenthesis")
+ raise source.error("unbalanced parenthesis", 0)
if char == "=":
subpatternappend((ASSERT, (dir, p)))
else:
@@ -682,33 +697,36 @@ def _parse(source, state):
while 1:
char = sourceget()
if char is None:
- raise error("unterminated name")
+ raise source.error("unterminated name", 0)
if char == ")":
break
condname = condname + char
group = 2
if not condname:
- raise error("missing group name")
+ raise source.error("missing group name", 1)
if condname.isidentifier():
condgroup = state.groupdict.get(condname)
if condgroup is None:
msg = "unknown group name: {0!r}".format(condname)
- raise error(msg)
+ raise source.error(msg, len(condname) + 1)
else:
try:
condgroup = int(condname)
if condgroup < 0:
raise ValueError
except ValueError:
- raise error("bad character in group name")
+ raise source.error("bad character in group name",
+ len(condname) + 1)
if not condgroup:
- raise error("bad group number")
+ raise source.error("bad group number",
+ len(condname) + 1)
if condgroup >= MAXGROUPS:
- raise error("the group number is too large")
+ raise source.error("the group number is too large",
+ len(condname) + 1)
else:
# flags
if not source.next in FLAGS:
- raise error("unexpected end of pattern")
+ raise source.error("unexpected end of pattern", 0)
while source.next in FLAGS:
state.flags = state.flags | FLAGS[sourceget()]
if group:
@@ -717,13 +735,16 @@ def _parse(source, state):
# anonymous group
group = None
else:
- group = state.opengroup(name)
+ try:
+ group = state.opengroup(name)
+ except error as err:
+ raise source.error(err.msg, len(name) + 1)
if condgroup:
p = _parse_sub_cond(source, state, condgroup)
else:
p = _parse_sub(source, state)
if not sourcematch(")"):
- raise error("unbalanced parenthesis")
+ raise source.error("unbalanced parenthesis", 0)
if group is not None:
state.closegroup(group)
subpatternappend((SUBPATTERN, (group, p)))
@@ -731,10 +752,10 @@ def _parse(source, state):
while 1:
char = sourceget()
if char is None:
- raise error("unexpected end of pattern")
+ raise source.error("unexpected end of pattern", 0)
if char == ")":
break
- raise error("unknown extension")
+ raise source.error("unknown extension", len(char))
elif this == "^":
subpatternappend((AT, AT_BEGINNING))
@@ -747,7 +768,7 @@ def _parse(source, state):
subpatternappend(code)
else:
- raise error("parser error")
+ raise source.error("parser error", len(this))
return subpattern
@@ -778,9 +799,10 @@ def parse(str, flags=0, pattern=None):
tail = source.get()
if tail == ")":
- raise error("unbalanced parenthesis")
+ raise source.error("unbalanced parenthesis", 1)
elif tail:
- raise error("bogus characters at end of regular expression")
+ raise source.error("bogus characters at end of regular expression",
+ len(tail))
if flags & SRE_FLAG_DEBUG:
p.dump()
@@ -820,21 +842,23 @@ def parse_template(source, pattern):
while True:
char = sget()
if char is None:
- raise error("unterminated group name")
+ raise s.error("unterminated group name", 0)
if char == ">":
break
name += char
if not name:
- raise error("missing group name")
+ raise s.error("missing group name", 1)
try:
index = int(name)
if index < 0:
- raise error("negative group number")
+ raise s.error("negative group number", len(name) + 1)
if index >= MAXGROUPS:
- raise error("the group number is too large")
+ raise s.error("the group number is too large",
+ len(name) + 1)
except ValueError:
if not name.isidentifier():
- raise error("bad character in group name")
+ raise s.error("bad character in group name",
+ len(name) + 1)
try:
index = pattern.groupindex[name]
except KeyError:
@@ -857,8 +881,8 @@ def parse_template(source, pattern):
isoctal = True
c = int(this[1:], 8)
if c > 0o377:
- raise error('octal escape value %r outside of '
- 'range 0-0o377' % this)
+ raise s.error('octal escape value %r outside of '
+ 'range 0-0o377' % this, len(this))
lappend(chr(c))
if not isoctal:
addgroup(int(this[1:]))
diff -r 8b1ac1a3d007 Lib/test/test_re.py
--- a/Lib/test/test_re.py Wed Oct 08 22:32:50 2014 +0300
+++ b/Lib/test/test_re.py Thu Oct 09 16:29:25 2014 +0300
@@ -1270,6 +1270,40 @@ subpattern None
# with ignore case.
self.assertEqual(re.fullmatch('[a-c]+', 'ABC', re.I).span(), (0, 3))
+ def test_error(self):
+ with self.assertRaises(re.error) as cm:
+ re.compile('(\u20ac))')
+ err = cm.exception
+ self.assertIsInstance(err.pattern, str)
+ self.assertEqual(err.pattern, '(\u20ac))')
+ self.assertEqual(err.pos, 3)
+ self.assertEqual(err.lineno, 1)
+ self.assertEqual(err.colno, 4)
+ self.assertIn(err.msg, str(err))
+ self.assertIn(' at position 3', str(err))
+ self.assertNotIn(' at position 3', err.msg)
+ with self.assertRaises(re.error) as cm:
+ re.compile(b'(\xa4))')
+ err = cm.exception
+ self.assertIsInstance(err.pattern, bytes)
+ self.assertEqual(err.pattern, b'(\xa4))')
+ self.assertEqual(err.pos, 3)
+ with self.assertRaises(re.error) as cm:
+ re.compile("""
+ (
+ abc
+ )
+ )
+ (
+ """, re.VERBOSE)
+ err = cm.exception
+ self.assertEqual(err.pos, 77)
+ self.assertEqual(err.lineno, 5)
+ self.assertEqual(err.colno, 17)
+ self.assertIn(err.msg, str(err))
+ self.assertIn(' at position 77', str(err))
+ self.assertIn('(line 5, column 17)', str(err))
+
class PatternReprTests(unittest.TestCase):
def check(self, pattern, expected):
_______________________________________________
Python-bugs-list mailing list
Unsubscribe:
https://mail.python.org/mailman/options/python-bugs-list/archive%40mail-archive.com