New submission from Alexander Grigoriev <alegrigor...@gmail.com>:

Certain patterns and input strings cause re.match to take exponentially longer 
time. This can be expected because of recursive nature of matching some 
patterns, but it can take surprisingly long time with some combination of a 
pattern and input data of moderate length. For example:

import re
import time

t1 = time.monotonic()
re.match(rb'((?: |\t)*)((?:.*?(?:\\(?: |\t)+)?)*)(\s*)$', b'#define EVENT_ 
{\t\\\r\n\tif {\t\\\r\n\t\tmnt_disp;\t\\\r\n\t} }\r\n')
t2 = time.monotonic()
print(r"re.match(rb'((?: |\t)*)((?:.*?(?:\\(?: |\t)+)?)*)(\s*)$', b'#define 
EVENT_ {\t\\\r\n\tif {\t\\\r\n\t\tmnt_disp;\t\\\r\n\t} }\r\n'): %s s" % (t2-t1))
t1 = time.monotonic()
re.match(rb'((?: |\t)*)((?:.*?(?:\\(?: |\t)+)?)*)(\s*)$', b'#define EVENT_D 
{\t\\\r\n\tif {\t\\\r\n\t\tmnt_disp;\t\\\r\n\t} }\r\n')
t2 = time.monotonic()
print(r"re.match(rb'((?: |\t)*)((?:.*?(?:\\(?: |\t)+)?)*)(\s*)$', b'#define 
EVENT_D {\t\\\r\n\tif {\t\\\r\n\t\tmnt_disp;\t\\\r\n\t} }\r\n'): %s s" % 
(t2-t1))
t1 = time.monotonic()
re.match(rb'((?: |\t)*)((?:.*?(?:\\(?: |\t)+)?)*)(\s*)$', b'#define EVENT_DI 
{\t\\\r\n\tif {\t\\\r\n\t\tmnt_disp;\t\\\r\n\t} }\r\n')
t2 = time.monotonic()
print(r"re.match(rb'((?: |\t)*)((?:.*?(?:\\(?: |\t)+)?)*)(\s*)$', b'#define 
EVENT_DI {\t\\\r\n\tif {\t\\\r\n\t\tmnt_disp;\t\\\r\n\t} }\r\n'): %s s" % 
(t2-t1))
t1 = time.monotonic()
re.match(rb'((?: |\t)*)((?:.*?(?:\\(?: |\t)+)?)*)(\s*)$', b'#define EVENT_DIS 
{\t\\\r\n\tif {\t\\\r\n\t\tmnt_disp;\t\\\r\n\t} }\r\n')
t2 = time.monotonic()
print(r"re.match(rb'((?: |\t)*)((?:.*?(?:\\(?: |\t)+)?)*)(\s*)$', b'#define 
EVENT_DIS {\t\\\r\n\tif {\t\\\r\n\t\tmnt_disp;\t\\\r\n\t} }\r\n'): %s s" % 
(t2-t1))
t1 = time.monotonic()
re.match(rb'((?: |\t)*)((?:.*?(?:\\(?: |\t)+)?)*)(\s*)$', b'#define EVENT_DISP 
{\t\\\r\n\tif {\t\\\r\n\t\tmnt_disp;\t\\\r\n\t} }\r\n')
t2 = time.monotonic()
print(r"re.match(rb'((?: |\t)*)((?:.*?(?:\\(?: |\t)+)?)*)(\s*)$', b'#define 
EVENT_DISP {\t\\\r\n\tif {\t\\\r\n\t\tmnt_disp;\t\\\r\n\t} }\r\n'): %s s" % 
(t2-t1))
t1 = time.monotonic()
re.match(rb'((?: |\t)*)((?:.*?(?:\\(?: |\t)+)?)*)(\s*)$', b'#define EVENT_DISPL 
{\t\\\r\n\tif {\t\\\r\n\t\tmnt_disp;\t\\\r\n\t} }\r\n')
t2 = time.monotonic()
print(r"re.match(rb'((?: |\t)*)((?:.*?(?:\\(?: |\t)+)?)*)(\s*)$', b'#define 
EVENT_DISPL {\t\\\r\n\tif {\t\\\r\n\t\tmnt_disp;\t\\\r\n\t} }\r\n'): %s s" % 
(t2-t1))
t1 = time.monotonic()
re.match(rb'((?: |\t)*)((?:.*?(?:\\(?: |\t)+)?)*)(\s*)$', b'#define 
EVENT_DISPLA {\t\\\r\n\tif {\t\\\r\n\t\tmnt_disp;\t\\\r\n\t} }\r\n')
t2 = time.monotonic()
print(r"re.match(rb'((?: |\t)*)((?:.*?(?:\\(?: |\t)+)?)*)(\s*)$', b'#define 
EVENT_DISPLA {\t\\\r\n\tif {\t\\\r\n\t\tmnt_disp;\t\\\r\n\t} }\r\n'): %s s" % 
(t2-t1))
t1 = time.monotonic()
re.match(rb'((?: |\t)*)((?:.*?(?:\\(?: |\t)+)?)*)(\s*)$', b'#define 
EVENT_DISPLAY {\t\\\r\n\tif {\t\\\r\n\t\tmnt_disp;\t\\\r\n\t} }\r\n')
t2 = time.monotonic()
print(r"re.match(rb'((?: |\t)*)((?:.*?(?:\\(?: |\t)+)?)*)(\s*)$', b'#define 
EVENT_DISPLAY {\t\\\r\n\tif {\t\\\r\n\t\tmnt_disp;\t\\\r\n\t} }\r\n'): %s s" % 
(t2-t1))


Does:

re.match(rb'((?: |\t)*)((?:.*?(?:\\(?: |\t)+)?)*)(\s*)$', b'#define EVENT_ 
{\t\\\r\n\tif {\t\\\r\n\t\tmnt_disp;\t\\\r\n\t} }\r\n'): 0.2190000000409782 s
re.match(rb'((?: |\t)*)((?:.*?(?:\\(?: |\t)+)?)*)(\s*)$', b'#define EVENT_D 
{\t\\\r\n\tif {\t\\\r\n\t\tmnt_disp;\t\\\r\n\t} }\r\n'): 0.4529999999795109 s
re.match(rb'((?: |\t)*)((?:.*?(?:\\(?: |\t)+)?)*)(\s*)$', b'#define EVENT_DI 
{\t\\\r\n\tif {\t\\\r\n\t\tmnt_disp;\t\\\r\n\t} }\r\n'): 0.9529999999795109 s
re.match(rb'((?: |\t)*)((?:.*?(?:\\(?: |\t)+)?)*)(\s*)$', b'#define EVENT_DIS 
{\t\\\r\n\tif {\t\\\r\n\t\tmnt_disp;\t\\\r\n\t} }\r\n'): 1.797000000020489 s
re.match(rb'((?: |\t)*)((?:.*?(?:\\(?: |\t)+)?)*)(\s*)$', b'#define EVENT_DISP 
{\t\\\r\n\tif {\t\\\r\n\t\tmnt_disp;\t\\\r\n\t} }\r\n'): 3.6090000001713634 s
re.match(rb'((?: |\t)*)((?:.*?(?:\\(?: |\t)+)?)*)(\s*)$', b'#define EVENT_DISPL 
{\t\\\r\n\tif {\t\\\r\n\t\tmnt_disp;\t\\\r\n\t} }\r\n'): 7.125 s
re.match(rb'((?: |\t)*)((?:.*?(?:\\(?: |\t)+)?)*)(\s*)$', b'#define 
EVENT_DISPLA {\t\\\r\n\tif {\t\\\r\n\t\tmnt_disp;\t\\\r\n\t} }\r\n'): 
14.890999999828637 s
re.match(rb'((?: |\t)*)((?:.*?(?:\\(?: |\t)+)?)*)(\s*)$', b'#define 
EVENT_DISPLAY {\t\\\r\n\tif {\t\\\r\n\t\tmnt_disp;\t\\\r\n\t} }\r\n'): 
29.688000000081956 s


Perhaps re.match needs to be optimized and/or rewritten in low level language 
(C)

----------
components: Library (Lib)
messages: 389949
nosy: alegrigoriev
priority: normal
severity: normal
status: open
title: re.match appears to hang with certain combinations of pattern and string
type: performance
versions: Python 3.9

_______________________________________
Python tracker <rep...@bugs.python.org>
<https://bugs.python.org/issue43686>
_______________________________________
_______________________________________________
Python-bugs-list mailing list
Unsubscribe: 
https://mail.python.org/mailman/options/python-bugs-list/archive%40mail-archive.com

Reply via email to