Alexzive a écrit :
Hello there :) ,
I am a python newbie and need to run following code for a task in an
external simulation programm called "Abaqus" which makes use of python
to access the mesh (ensamble of nodes with xy coordinates) of a
certain geometrical model.
[IN is the starting input containing the nodes to be check, there are
some double nodes with the same x and y coordinates which need to be
removed. SN is the output containing such double nodes]
Code: Select all
for i in range(len(IN)): #scan all elements of the list IN
for j in range(len(IN)):
if i <> j:
if IN[i].coordinates[0] == IN[j].coordinates[0]:
if IN[i].coordinates[1] == IN[j].coordinates[1]:
SN.append(IN[i].label)
Unfortunately my len(IN) is about 100.000 and the running time about
15h !!!! :(
Any idea to improve it?
A couple ones have been submitted. Harald gets a point about the
redundant tests (even if his solution seems to be broken, cf below) -
your inner loop should have looked like :
for j in xrange(i+1, len(IN))
Now the obvious winner is pruebono - even unoptimized, using sets seems
to be *way* faster than even the most optimized corrected version of
your algorithm.
Here's a quick bench - please everyone doublecheck it to make sure it's ok:
from timeit import Timer
import random
class Node(object):
def __init__(self, x, y):
self.coordinates = (x, y)
def __repr__(self):
return repr(self.coordinates)
# how to build src:
# src = [(random.randrange(10),random.randrange(10)) for x in xrange(100)]
src = [
(4, 9), (5, 0), (6, 6), (7, 2), (3, 6), (9, 6), (0, 1), (1, 6),
(0, 5), (1, 2), (8, 9), (5, 4), (1, 6), (7, 6), (9, 1), (7, 6),
(0, 1), (7, 4), (7, 4), (8, 4), (8, 4), (3, 5), (9, 6), (6, 1),
(3, 4), (4, 5), (0, 5), (6, 3), (2, 4), (1, 6), (9, 5), (1, 2),
(5, 8), (8, 5), (3, 1), (9, 4), (9, 4), (3, 3), (4, 8), (9, 7),
(8, 4), (6, 2), (1, 5), (5, 8), (8, 6), (0, 8), (5, 2), (3, 4),
(0, 5), (4, 4), (2, 9), (7, 7), (1, 0), (4, 2), (5, 7), (0, 4),
(2, 5), (0, 8), (7, 3), (9, 1), (0, 4), (5, 0), (4, 9), (0, 6),
(3, 0), (3, 0), (3, 9), (8, 3), (7, 9), (8, 5), (7, 6), (1, 5),
(0, 6), (5, 9), (6, 8), (0, 0), (4, 1), (3, 3), (5, 4), (5, 3),
(6, 1), (5, 4), (4, 5), (5, 8), (4, 1), (3, 6), (1, 9), (0, 5),
(6, 5), (5, 5), (6, 0), (0, 9), (2, 6), (0, 7), (5, 9), (7, 3),
(7, 9), (5, 4), (4, 9), (2, 9)
]
IN = [Node(x, y) for x, y in src]
def doubles0():
SN = []
for i in range(len(IN)): #scan all elements of the list IN
for j in range(len(IN)):
#print i, j
if i <> j:
if IN[i].coordinates[0] == IN[j].coordinates[0]:
if IN[i].coordinates[1] == IN[j].coordinates[1]:
SN.append(IN[i])
return SN
def doubles1():
"first solve an algoritmic problem"
SN = []
for i in range(len(IN)):
for j in range(i+1, len(IN)):
#print i, j
#if i != j:
if IN[i].coordinates[0] == IN[j].coordinates[0]:
if IN[i].coordinates[1] == IN[j].coordinates[1]:
SN.append(IN[i])
return SN
def doubles2():
"then avoid buildin useless lists"
SN = []
for i in xrange(len(IN)):
for j in xrange(i+1, len(IN)):
if IN[i].coordinates[0] == IN[j].coordinates[0]:
if IN[i].coordinates[1] == IN[j].coordinates[1]:
SN.append(IN[i])
return SN
def doubles3():
"then avoid uselessly calling a constant operation"
SN = []
in_len = len(IN)
for i in xrange(in_len):
for j in xrange(i+1, in_len):
if IN[i].coordinates[0] == IN[j].coordinates[0]:
if IN[i].coordinates[1] == IN[j].coordinates[1]:
SN.append(IN[i])
return SN
def doubles4():
"then alias commonly used methods"
SN = []
sn_append = SN.append
in_len = len(IN)
for i in xrange(in_len):
for j in xrange(i+1, in_len):
if IN[i].coordinates[0] == IN[j].coordinates[0]:
if IN[i].coordinates[1] == IN[j].coordinates[1]:
sn_append(IN[i])
return SN
def doubles5():
"then alias a couple other things"
SN = []
sn_append = SN.append
in_len = len(IN)
for i in xrange(in_len):
node_i = IN[i]
coords_i = node_i.coordinates
for j in xrange(i+1, in_len):
if coords_i[0] == IN[j].coordinates[0]:
if coords_i[1] == IN[j].coordinates[1]:
sn_append(node_i)
return SN
def doubles6():
"then simplify tests"
SN = []
sn_append = SN.append
in_len = len(IN)
for i in xrange(in_len):
node_i = IN[i]
coords_i = node_i.coordinates
for j in xrange(i+1, in_len):
if coords_i == IN[j].coordinates:
sn_append(node_i)
return SN
# Harald : uncomment this and run test_results. As far as I can tell, it
# doesn't yields the expected results
## IN7 = IN[:]
## def sortk7(n):
## return n.coordinates[0]
## def doubles7():
## "is ordering better ? - Nope Sir, it's broken..."
## IN7.sort(key=sortk)
## SN = []
## sn_append = SN.append
## in_len = len(IN)
## for i in xrange(in_len):
## node_i = IN[i]
## coords_i = node_i.coordinates
## for j in xrange(i+1, in_len):
## if coords_i[0] == IN[j].coordinates[0]:
## if coords_i[1] == IN[j].coordinates[1]:
## sn_append(node_i)
## else:
## break
## return SN
def doubles8():
"Using a set ?"
dup = set()
SN = []
for item in IN:
c = item.coordinates
if c in dup:
SN.append(item)
else:
dup.add(c)
return SN
def doubles9():
"Using a set is way faster - but can still be improved a bit"
dup = set()
dup_add = dup.add
SN = []
sn_append = SN.append
for item in IN:
c = item.coordinates
if c in dup:
sn_append(item)
else:
dup_add(c)
return SN
# need this for tests:
names_funcs = sorted(
(n, f) for n, f in locals().iteritems()
if n.startswith('doubles')
)
def test_results():
"""
make sure all solutions give correct results,
assuming the OP solution is at least correct !-)
"""
results = [
(n, set(o.coordinates for o in f()))
for n, f in names_funcs
]
_, correct = results[0]
ok = True
for n, r in results[1:]:
if r != correct:
print "error on %s ?\n expected %s\n, got %s" \
% (n, correct, r)
ok = False
return ok
def test_times():
" And the winner is..."
results = [
(n, Timer(
'%s()' % n,
'from __main__ import %s' % n
).timeit(100)
)
for n, _ in names_funcs
]
print "\n".join("%s : %s" % r for r in results)
Results here (py2.5, gentoo linux, athlonxp1800, 512 ram):
>>> test_results()
True
>>> test_times()
doubles0 : 1.55667901039
doubles1 : 0.719144105911
doubles2 : 0.703393936157
doubles3 : 0.700654983521
doubles4 : 0.706257104874
doubles5 : 0.528184890747
doubles6 : 0.461633205414
doubles8 : 0.0134379863739
doubles9 : 0.0108540058136
>>>
Not surprisingly, half less iterations makes for half less time.
Aliasing, as often, proves to be a good optimization too. But obviously,
using the correct data structure / algorithm combo is the key : simpler
code, and 115 times faster (143 times with aliasing). If pruebono
solution's is correct (and it as AFAICT), your 15 hours computation
should by now take less than 10 minutes...
--
http://mail.python.org/mailman/listinfo/python-list