Alexzive a écrit :
Hello there :) ,

I am a python newbie and need to run following code for a task in an
external simulation programm called "Abaqus" which makes use of python
to access the mesh (ensamble of nodes with xy coordinates) of a
certain geometrical model.

[IN is the starting input containing the nodes to be check, there are
some double nodes with the same x and y coordinates which need to be
removed. SN is the output containing such double nodes]

Code: Select all
    for i in range(len(IN)): #scan all elements of the list IN
      for j in range(len(IN)):
        if i <> j:
         if IN[i].coordinates[0] == IN[j].coordinates[0]:
           if IN[i].coordinates[1] == IN[j].coordinates[1]:
              SN.append(IN[i].label)



Unfortunately my len(IN) is about 100.000 and the running time about
15h !!!! :(

Any idea to improve it?

A couple ones have been submitted. Harald gets a point about the redundant tests (even if his solution seems to be broken, cf below) - your inner loop should have looked like :

  for j in xrange(i+1, len(IN))

Now the obvious winner is pruebono - even unoptimized, using sets seems to be *way* faster than even the most optimized corrected version of your algorithm.

Here's a quick bench - please everyone doublecheck it to make sure it's ok:

from timeit import Timer
import random

class Node(object):
    def __init__(self, x, y):
        self.coordinates = (x, y)
    def __repr__(self):
        return repr(self.coordinates)


# how to build src:
# src = [(random.randrange(10),random.randrange(10)) for x in xrange(100)]
src = [
    (4, 9), (5, 0), (6, 6), (7, 2), (3, 6), (9, 6), (0, 1), (1, 6),
    (0, 5), (1, 2), (8, 9), (5, 4), (1, 6), (7, 6), (9, 1), (7, 6),
    (0, 1), (7, 4), (7, 4), (8, 4), (8, 4), (3, 5), (9, 6), (6, 1),
    (3, 4), (4, 5), (0, 5), (6, 3), (2, 4), (1, 6), (9, 5), (1, 2),
    (5, 8), (8, 5), (3, 1), (9, 4), (9, 4), (3, 3), (4, 8), (9, 7),
    (8, 4), (6, 2), (1, 5), (5, 8), (8, 6), (0, 8), (5, 2), (3, 4),
    (0, 5), (4, 4), (2, 9), (7, 7), (1, 0), (4, 2), (5, 7), (0, 4),
    (2, 5), (0, 8), (7, 3), (9, 1), (0, 4), (5, 0), (4, 9), (0, 6),
    (3, 0), (3, 0), (3, 9), (8, 3), (7, 9), (8, 5), (7, 6), (1, 5),
    (0, 6), (5, 9), (6, 8), (0, 0), (4, 1), (3, 3), (5, 4), (5, 3),
    (6, 1), (5, 4), (4, 5), (5, 8), (4, 1), (3, 6), (1, 9), (0, 5),
    (6, 5), (5, 5), (6, 0), (0, 9), (2, 6), (0, 7), (5, 9), (7, 3),
    (7, 9), (5, 4), (4, 9), (2, 9)
    ]
IN = [Node(x, y) for x, y in src]


def doubles0():
    SN = []
    for i in range(len(IN)): #scan all elements of the list IN
        for j in range(len(IN)):
            #print i, j
            if i <> j:
                if IN[i].coordinates[0] == IN[j].coordinates[0]:
                    if IN[i].coordinates[1] == IN[j].coordinates[1]:
                        SN.append(IN[i])
    return SN

def doubles1():
    "first solve an algoritmic problem"
    SN = []
    for i in range(len(IN)):
        for j in range(i+1, len(IN)):
            #print i, j
            #if i != j:
            if IN[i].coordinates[0] == IN[j].coordinates[0]:
                if IN[i].coordinates[1] == IN[j].coordinates[1]:
                    SN.append(IN[i])
    return SN

def doubles2():
    "then avoid buildin useless lists"
    SN = []
    for i in xrange(len(IN)):
        for j in xrange(i+1, len(IN)):
            if IN[i].coordinates[0] == IN[j].coordinates[0]:
                if IN[i].coordinates[1] == IN[j].coordinates[1]:
                    SN.append(IN[i])
    return SN


def doubles3():
    "then avoid uselessly calling a constant operation"
    SN = []
    in_len = len(IN)
    for i in xrange(in_len):
        for j in xrange(i+1, in_len):
            if IN[i].coordinates[0] == IN[j].coordinates[0]:
                if IN[i].coordinates[1] == IN[j].coordinates[1]:
                    SN.append(IN[i])
    return SN

def doubles4():
    "then alias commonly used methods"
    SN = []
    sn_append = SN.append
    in_len = len(IN)
    for i in xrange(in_len):
        for j in xrange(i+1, in_len):
            if IN[i].coordinates[0] == IN[j].coordinates[0]:
                if IN[i].coordinates[1] == IN[j].coordinates[1]:
                    sn_append(IN[i])
    return SN

def doubles5():
    "then alias a couple other things"
    SN = []
    sn_append = SN.append
    in_len = len(IN)
    for i in xrange(in_len):
        node_i = IN[i]
        coords_i = node_i.coordinates
        for j in xrange(i+1, in_len):
            if coords_i[0] == IN[j].coordinates[0]:
                if coords_i[1] == IN[j].coordinates[1]:
                    sn_append(node_i)
    return SN

def doubles6():
    "then simplify tests"
    SN = []
    sn_append = SN.append
    in_len = len(IN)
    for i in xrange(in_len):
        node_i = IN[i]
        coords_i = node_i.coordinates
        for j in xrange(i+1, in_len):
            if coords_i == IN[j].coordinates:
                sn_append(node_i)
    return SN

# Harald : uncomment this and run test_results. As far as I can tell, it
# doesn't yields the expected results

## IN7 = IN[:]
## def sortk7(n):
##     return n.coordinates[0]

## def doubles7():
##     "is ordering better ? - Nope Sir, it's broken..."
##     IN7.sort(key=sortk)
##     SN = []
##     sn_append = SN.append
##     in_len = len(IN)
##     for i in xrange(in_len):
##         node_i = IN[i]
##         coords_i = node_i.coordinates
##         for j in xrange(i+1, in_len):
##             if coords_i[0] == IN[j].coordinates[0]:
##                 if coords_i[1] == IN[j].coordinates[1]:
##                     sn_append(node_i)
##             else:
##                 break

##     return SN


def doubles8():
    "Using a set ?"
    dup = set()
    SN = []
    for item in IN:
        c = item.coordinates
        if c in dup:
            SN.append(item)
        else:
            dup.add(c)
    return SN


def doubles9():
    "Using a set is way faster - but can still be improved a bit"
    dup = set()
    dup_add = dup.add
    SN = []
    sn_append = SN.append

    for item in IN:
        c = item.coordinates
        if c in dup:
            sn_append(item)
        else:
            dup_add(c)
    return SN


# need this for tests:
names_funcs = sorted(
    (n, f) for n, f in locals().iteritems()
    if n.startswith('doubles')
    )

def test_results():
    """
    make sure all solutions give correct results,
    assuming the OP solution is at least correct !-)
    """
    results = [
       (n, set(o.coordinates for o in f()))
       for n, f in names_funcs
       ]
    _, correct = results[0]
    ok = True
    for n, r in results[1:]:
        if r != correct:
            print "error on %s ?\n expected %s\n, got %s" \
                % (n, correct, r)
            ok = False
    return ok


def test_times():
    " And the winner is..."
    results = [
        (n, Timer(
             '%s()' % n,
             'from __main__ import %s' % n
            ).timeit(100)
        )
        for n, _ in names_funcs
        ]
    print "\n".join("%s : %s" % r for r in results)



Results here (py2.5, gentoo linux, athlonxp1800, 512 ram):

>>> test_results()
True
>>> test_times()
doubles0 : 1.55667901039
doubles1 : 0.719144105911
doubles2 : 0.703393936157
doubles3 : 0.700654983521
doubles4 : 0.706257104874
doubles5 : 0.528184890747
doubles6 : 0.461633205414
doubles8 : 0.0134379863739
doubles9 : 0.0108540058136
>>>

Not surprisingly, half less iterations makes for half less time. Aliasing, as often, proves to be a good optimization too. But obviously, using the correct data structure / algorithm combo is the key : simpler code, and 115 times faster (143 times with aliasing). If pruebono solution's is correct (and it as AFAICT), your 15 hours computation should by now take less than 10 minutes...



--
http://mail.python.org/mailman/listinfo/python-list

Reply via email to