Steven D'Aprano: > What you're trying to do is the equivalent of substring searching. There > are simple algorithms that do this, and there are fast algorithms, but > the simple ones aren't fast and the fast ones aren't simple. > However, for your use case, the simple algorithm is probably fast enough: > > def is_sublist(b, a): > """Return True if b is a sub-list of a, otherwise False""" > # Get the simple cases out of the way first, for speed. > if len(a) < len(b): return False > elif a == b: return True > elif not b: return False # Or maybe True, if you prefer. > lb = len(b) > for i in xrange(len(a)-lb+1): > if a[i:i+lb] == b: > return True > return False
I think a compromise can be found, this seem to work, and fast enough (Psyco helps a lot of this kind of code): def issubseq(sub, items): """issubseq(sub, items): return true if the sequence 'sub' is a contiguous subsequence of the 'items' sequence. >>> issubseq() Traceback (most recent call last): ... TypeError: issubseq() takes exactly 2 arguments (0 given) >>> issubseq("abc") Traceback (most recent call last): ... TypeError: issubseq() takes exactly 2 arguments (1 given) >>> issubseq(1, [1, 2, 3]) Traceback (most recent call last): ... TypeError: object of type 'int' has no len() >>> isi = lambda s,i: int(issubseq(s,i)) >>> isi([], []) 1 >>> isi("a", "") 0 >>> isi("", "a") 1 >>> isi("", "aaa") 1 >>> isi("a", "a") 1 >>> isi("ab", "bab") 1 >>> isi("ab", "bab") 1 >>> isi("ba", "bbb") 0 >>> isi("bab", "ab") 0 >>> isi(("a", "b"), ("a","a","b")) 1 >>> isi(("a", "b"), ("a","a","c")) 0 >>> isi([1,2,1], [3,5, 1,2,4, 1,2,1, 6]) 1 >>> isi([1,2,1], [3,5, 1,2,4, 1,2,3, 6]) 0 >>> l = [1] * 100 + [1,2,3] + [4] * 100 >>> isi([1,2,3], l) 1 >>> l = [1] * 100 + [1,2,4] + [5] * 100 >>> isi([1,2,3], l) 0 """ len_sub = len(sub) len_items = len(items) if len_sub == 0: return True if len_sub > len_items: return False if len_items == 0: return False if sub == items: return True table = [0] * (len_sub + 1) # building prefix-function m = 0 for i in xrange(1, len_sub): while m > 0 and sub[m] != sub[i]: m = table[m - 1] if sub[m] == sub[i]: m += 1 table[i] = m # searching m, i = 0, 0 for x in items: while m > 0 and sub[m] != x: m = table[m - 1] if sub[m] == x: m += 1 if m == len_sub: return True i += 1 return False def _is_sublist(sub, items): """Return True if sub is a sub-list of items, otherwise False. >>> _is_sublist() Traceback (most recent call last): ... TypeError: _is_sublist() takes exactly 2 arguments (0 given) >>> _is_sublist("abc") Traceback (most recent call last): ... TypeError: _is_sublist() takes exactly 2 arguments (1 given) >>> _is_sublist(1, [1, 2, 3]) Traceback (most recent call last): ... TypeError: object of type 'int' has no len() >>> isi = lambda s,i: int(_is_sublist(s,i)) >>> isi([], []) 1 >>> isi("a", "") 0 >>> isi("", "a") 1 >>> isi("", "aaa") 1 >>> isi("a", "a") 1 >>> isi("ab", "bab") 1 >>> isi("ab", "bab") 1 >>> isi("ba", "bbb") 0 >>> isi("bab", "ab") 0 >>> isi(("a", "b"), ("a","a","b")) 1 >>> isi(("a", "b"), ("a","a","c")) 0 >>> isi([1,2,1], [3,5, 1,2,4, 1,2,1, 6]) 1 >>> isi([1,2,1], [3,5, 1,2,4, 1,2,3, 6]) 0 >>> l = [1] * 100 + [1,2,3] + [4] * 100 >>> isi([1,2,3], l) 1 >>> l = [1] * 100 + [1,2,4] + [5] * 100 >>> isi([1,2,3], l) 0 """ # By Steven D'Aprano, modified len_sub = len(sub) len_items = len(items) if len_sub == 0: return True if len_sub > len_items: return False if len_items == 0: return False if sub == items: return True for i in xrange(len_items - len_sub + 1): if items[i : i+len_sub] == sub: return True return False if __name__ == "__main__": import doctest doctest.testmod() print "Doctests finished.\n" def subsequence_benchmark(with_psyco=False): from random import randrange from time import clock N = 100000 NBIG = 1000 small_sub = [1, 2, 3] big_sub = [randrange(10) for _ in xrange(NBIG)] small_present = [1] * N + small_sub + [N] * N small_absent1 = [1] * N + small_sub[:-1] + [N] * N small_absent2 = small_sub[:-1] * N big_present = [1] * N + big_sub + [N] * N big_absent = [1] * N + big_sub[:-1] + [N] * N test_cases = [(small_sub, small_present), (small_sub, small_absent1), (small_sub, small_absent2), (big_sub, big_present), (big_sub, big_absent)] algos = [issubseq, _is_sublist] if with_psyco: import psyco for algo in algos: psyco.bind(algo) print "With psyco" else: print "Without psyco" print "N, NBIG =", N, NBIG for sub, seq in test_cases: print "len(sub),len(seq)= %d %d" % (len(sub), len(seq)) result = [] for finder in algos: t0 = clock() r = finder(sub, seq) t1 = clock() result.append(r) print " %s: %s (time: %.02f)" %(finder.func_name, r, round(t1-t0,2)) assert(all(r==True for r in result) or all(r==False for r in result)) #subsequence_benchmark(with_psyco=True) There are ways to use less memory or to go faster, but they may be overkill: http://www.icu-project.org/docs/papers/efficient_text_searching_in_java.html Bye, bearophile -- http://mail.python.org/mailman/listinfo/python-list