Title: [90419] trunk/Tools
Revision: 90419
Author: dpra...@chromium.org
Date: 2011-07-05 17:12:06 -0700 (Tue, 05 Jul 2011)

Log Message

2011-07-05  Dirk Pranke  <dpra...@chromium.org>

        nrwt: allow for multiple http shards
        https://bugs.webkit.org/show_bug.cgi?id=63116

        Reviewed by Tony Chang.

        This modifies the sharding logic to support multiple http
        shards, but for now we clamp to one shard until we can test
        perf impact and flakiness impact.

        * Scripts/webkitpy/layout_tests/layout_package/manager.py:
        * Scripts/webkitpy/layout_tests/layout_package/manager_unittest.py:

Modified Paths

    trunk/Tools/ChangeLog
    trunk/Tools/Scripts/webkitpy/layout_tests/layout_package/manager.py
    trunk/Tools/Scripts/webkitpy/layout_tests/layout_package/manager_unittest.py

Diff

Modified: trunk/Tools/ChangeLog (90418 => 90419)


--- trunk/Tools/ChangeLog	2011-07-06 00:08:36 UTC (rev 90418)
+++ trunk/Tools/ChangeLog	2011-07-06 00:12:06 UTC (rev 90419)
@@ -7,6 +7,20 @@
 
 2011-07-05  Dirk Pranke  <dpra...@chromium.org>
 
+        nrwt: allow for multiple http shards
+        https://bugs.webkit.org/show_bug.cgi?id=63116
+
+        Reviewed by Tony Chang.
+
+        This modifies the sharding logic to support multiple http
+        shards, but for now we clamp to one shard until we can test
+        perf impact and flakiness impact.
+
+        * Scripts/webkitpy/layout_tests/layout_package/manager.py:
+        * Scripts/webkitpy/layout_tests/layout_package/manager_unittest.py:
+
+2011-07-05  Dirk Pranke  <dpra...@chromium.org>
+
         Re-land nrwt: make sharding tests needing locks less hard-coded
         https://bugs.webkit.org/show_bug.cgi?id=63112
 

Modified: trunk/Tools/Scripts/webkitpy/layout_tests/layout_package/manager.py (90418 => 90419)


--- trunk/Tools/Scripts/webkitpy/layout_tests/layout_package/manager.py	2011-07-06 00:08:36 UTC (rev 90418)
+++ trunk/Tools/Scripts/webkitpy/layout_tests/layout_package/manager.py	2011-07-06 00:12:06 UTC (rev 90419)
@@ -237,6 +237,21 @@
     pass
 
 
+class TestShard(object):
+    """A test shard is a named list of TestInputs."""
+
+    # FIXME: Make this class visible to, and used by, the workers as well.
+    def __init__(self, name, test_inputs):
+        self.name = name
+        self.test_inputs = test_inputs
+
+    def __repr__(self):
+        return "TestShard(name='%s', test_inputs=%s'" % (self.name, self.test_inputs)
+
+    def __eq__(self, other):
+        return self.name == other.name and self.test_inputs == other.test_inputs
+
+
 class Manager(object):
     """A class for managing running a series of tests on a series of layout
     test files."""
@@ -535,68 +550,145 @@
         return self._expectations.has_modifier(test_file,
                                                test_expectations.SLOW)
 
-    def _shard_tests(self, test_files, use_real_shards):
+    def _shard_tests(self, test_files, num_workers, fully_parallel):
         """Groups tests into batches.
         This helps ensure that tests that depend on each other (aka bad tests!)
         continue to run together as most cross-test dependencies tend to
-        occur within the same directory. If use_real_shards is False, we
-        put each (non-HTTP/websocket) test into its own shard for maximum
-        concurrency instead of trying to do any sort of real sharding.
-
+        occur within the same directory.
         Return:
-            Two lists of lists of TestInput objects. The first list should
-            only be run under the server lock, the second can be run whenever.
+            Two lists of TestShards. The first contains tests that must only be
+            run under the server lock; the second can be run whenever.
         """
-        # FIXME: We still need to support multiple locked shards.
+
+        # FIXME: Move all of the sharding logic out of manager into its
+        # own class or module. Consider grouping it with the chunking logic
+        # in prepare_lists as well.
+        if num_workers == 1:
+            return self._shard_in_two(test_files)
+        elif fully_parallel:
+            return self._shard_every_file(test_files)
+        return self._shard_by_directory(test_files, num_workers)
+
+    def _shard_in_two(self, test_files):
+        """Returns two lists of shards, one with all the tests requiring a lock and one with the rest.
+
+        This is used when there's only one worker, to minimize the per-shard overhead."""
+        locked_inputs = []
+        unlocked_inputs = []
+        for test_file in test_files:
+            test_input = self._get_test_input_for_file(test_file)
+            if self._test_requires_lock(test_file):
+                locked_inputs.append(test_input)
+            else:
+                unlocked_inputs.append(test_input)
+        return [TestShard('locked_tests', locked_inputs)], [TestShard('unlocked_tests', unlocked_inputs)]
+
+    def _shard_every_file(self, test_files):
+        """Returns two lists of shards, each shard containing a single test file.
+
+        This mode gets maximal parallelism at the cost of much higher flakiness."""
         locked_shards = []
         unlocked_shards = []
-        tests_to_http_lock = []
-        if not use_real_shards:
-            for test_file in test_files:
-                test_input = self._get_test_input_for_file(test_file)
-                if self._test_requires_lock(test_file):
-                    tests_to_http_lock.append(test_input)
-                else:
-                    unlocked_shards.append((".", [test_input]))
-        else:
-            tests_by_dir = {}
-            for test_file in test_files:
-                directory = self._get_dir_for_test_file(test_file)
-                test_input = self._get_test_input_for_file(test_file)
-                if self._test_requires_lock(test_file):
-                    tests_to_http_lock.append(test_input)
-                else:
-                    tests_by_dir.setdefault(directory, [])
-                    tests_by_dir[directory].append(test_input)
-            for directory in tests_by_dir:
-                test_list = tests_by_dir[directory]
-                test_list_tuple = (directory, test_list)
-                unlocked_shards.append(test_list_tuple)
+        for test_file in test_files:
+            test_input = self._get_test_input_for_file(test_file)
 
-            # Sort the shards by directory name.
-            unlocked_shards.sort(lambda a, b: cmp(a[0], b[0]))
+            # Note that we use a '.' for the shard name; the name doesn't really
+            # matter, and the only other meaningful value would be the filename,
+            # which would be really redundant.
+            if self._test_requires_lock(test_file):
+                locked_shards.append(TestShard('.', [test_input]))
+            else:
+                unlocked_shards.append(TestShard('.', [test_input]))
 
-        if tests_to_http_lock:
-            locked_shards = [("tests_to_http_lock", tests_to_http_lock)]
+        return locked_shards, unlocked_shards
 
-        return (locked_shards, unlocked_shards)
+    def _shard_by_directory(self, test_files, num_workers):
+        """Returns two lists of shards, each shard containing all the files in a directory.
 
+        This is the default mode, and gets as much parallelism as we can while
+        minimizing flakiness caused by inter-test dependencies."""
+        locked_shards = []
+        unlocked_shards = []
+        tests_by_dir = {}
+        # FIXME: Given that the tests are already sorted by directory,
+        # we can probably rewrite this to be clearer and faster.
+        for test_file in test_files:
+            directory = self._get_dir_for_test_file(test_file)
+            test_input = self._get_test_input_for_file(test_file)
+            tests_by_dir.setdefault(directory, [])
+            tests_by_dir[directory].append(test_input)
+
+        for directory, test_inputs in tests_by_dir.iteritems():
+            shard = TestShard(directory, test_inputs)
+            if self._test_requires_lock(directory):
+                locked_shards.append(shard)
+            else:
+                unlocked_shards.append(shard)
+
+        # Sort the shards by directory name.
+        locked_shards.sort(key=lambda shard: shard.name)
+        unlocked_shards.sort(key=lambda shard: shard.name)
+
+        return (self._resize_shards(locked_shards, self._max_locked_shards(num_workers),
+                                    'locked_shard'),
+                unlocked_shards)
+
+    def _max_locked_shards(self, num_workers):
+        # Put a ceiling on the number of locked shards, so that we
+        # don't hammer the servers too badly.
+
+        # FIXME: For now, limit to one shard. After testing to make sure we
+        # can handle multiple shards, we should probably do something like
+        # limit this to no more than a quarter of all workers, e.g.:
+        # return max(math.ceil(num_workers / 4.0), 1)
+        return 1
+
+    def _resize_shards(self, old_shards, max_new_shards, shard_name_prefix):
+        """Takes a list of shards and redistributes the tests into no more
+        than |max_new_shards| new shards."""
+
+        # This implementation assumes that each input shard only contains tests from a
+        # single directory, and that tests in each shard must remain together; as a
+        # result, a given input shard is never split between output shards.
+        #
+        # Each output shard contains the tests from one or more input shards and
+        # hence may contain tests from multiple directories.
+
+        def divide_and_round_up(numerator, divisor):
+            return int(math.ceil(float(numerator) / divisor))
+
+        def extract_and_flatten(shards):
+            test_inputs = []
+            for shard in shards:
+                test_inputs.extend(shard.test_inputs)
+            return test_inputs
+
+        def split_at(seq, index):
+            return (seq[:index], seq[index:])
+
+        num_old_per_new = divide_and_round_up(len(old_shards), max_new_shards)
+        new_shards = []
+        remaining_shards = old_shards
+        while remaining_shards:
+            some_shards, remaining_shards = split_at(remaining_shards, num_old_per_new)
+            new_shards.append(TestShard('%s_%d' % (shard_name_prefix, len(new_shards) + 1),
+                                        extract_and_flatten(some_shards)))
+        return new_shards
+
     def _contains_tests(self, subdir):
         for test_file in self._test_files:
             if test_file.find(subdir) >= 0:
                 return True
         return False
 
-    def _num_workers(self, num_shards):
-        num_workers = min(int(self._options.child_processes), num_shards)
+    def _log_num_workers(self, num_workers, num_shards, num_locked_shards):
         driver_name = self._port.driver_name()
         if num_workers == 1:
             self._printer.print_config("Running 1 %s over %s" %
                 (driver_name, grammar.pluralize('shard', num_shards)))
         else:
-            self._printer.print_config("Running %d %ss in parallel over %d shards" %
-                (num_workers, driver_name, num_shards))
-        return num_workers
+            self._printer.print_config("Running %d %ss in parallel over %d shards (%d locked)" %
+                (num_workers, driver_name, num_shards, num_locked_shards))
 
     def _run_tests(self, file_list, result_summary):
         """Runs the tests in the file_list.
@@ -625,7 +717,7 @@
 
         self._printer.print_update('Sharding tests ...')
         locked_shards, unlocked_shards = self._shard_tests(file_list,
-            int(self._options.child_processes) > 1 and not self._options.experimental_fully_parallel)
+            int(self._options.child_processes), self._options.experimental_fully_parallel)
 
         # FIXME: We don't have a good way to coordinate the workers so that
         # they don't try to run the shards that need a lock if we don't actually
@@ -640,7 +732,9 @@
         if locked_shards:
             self.start_servers_with_lock()
 
-        num_workers = self._num_workers(len(all_shards))
+        num_workers = min(int(self._options.child_processes), len(all_shards))
+        self._log_num_workers(num_workers, len(all_shards), len(locked_shards))
+
         manager_connection = manager_worker_broker.get(self._port, self._options,
                                                        self, worker.Worker)
 
@@ -663,7 +757,8 @@
 
         self._printer.print_update("Starting testing ...")
         for shard in all_shards:
-            manager_connection.post_message('test_list', shard[0], shard[1])
+            # FIXME: Change 'test_list' to 'shard', make sharding public.
+            manager_connection.post_message('test_list', shard.name, shard.test_inputs)
 
         # We post one 'stop' message for each worker. Because the stop message
         # are sent after all of the tests, and because each worker will stop
@@ -1346,7 +1441,7 @@
 
         def find(name, test_lists):
             for i in range(len(test_lists)):
-                if test_lists[i][0] == name:
+                if test_lists[i].name == name:
                     return i
             return -1
 

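An editorial aside on _max_locked_shards above: the committed code always returns 1, but the FIXME sketches a future cap of roughly a quarter of all workers. A minimal sketch of that suggested (not yet enabled) policy:

    import math

    def suggested_max_locked_shards(num_workers):
        # The formula from the FIXME: no more than a quarter of all
        # workers, but never fewer than one locked shard.
        return max(int(math.ceil(num_workers / 4.0)), 1)

    # 1-4 workers -> 1 locked shard; 8 workers -> 2; 16 workers -> 4.
    assert [suggested_max_locked_shards(n) for n in (1, 2, 4, 8, 16)] == [1, 1, 1, 2, 4]
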
Modified: trunk/Tools/Scripts/webkitpy/layout_tests/layout_package/manager_unittest.py (90418 => 90419)


--- trunk/Tools/Scripts/webkitpy/layout_tests/layout_package/manager_unittest.py	2011-07-06 00:08:36 UTC (rev 90418)
+++ trunk/Tools/Scripts/webkitpy/layout_tests/layout_package/manager_unittest.py	2011-07-06 00:12:06 UTC (rev 90419)
@@ -39,7 +39,7 @@
 
 from webkitpy import layout_tests
 from webkitpy.layout_tests import run_webkit_tests
-from webkitpy.layout_tests.layout_package.manager import Manager, natural_sort_key, path_key, TestRunInterruptedException
+from webkitpy.layout_tests.layout_package.manager import Manager, natural_sort_key, path_key, TestRunInterruptedException, TestShard
 from webkitpy.layout_tests.layout_package import printing
 from webkitpy.layout_tests.layout_package.result_summary import ResultSummary
 from webkitpy.tool.mocktool import MockOptions
@@ -50,40 +50,79 @@
         return test_file
 
 
-class ManagerTest(unittest.TestCase):
-    def test_shard_tests(self):
-        # Test that _shard_tests in test_runner.TestRunner really
-        # put the http tests first in the queue.
+class ShardingTests(unittest.TestCase):
+    # FIXME: Remove "LayoutTests" from this if we can ever convert the generic
+    # code from filenames to test names.
+    test_list = [
+        "LayoutTests/http/tests/websocket/tests/unicode.htm",
+        "LayoutTests/animations/keyframes.html",
+        "LayoutTests/http/tests/security/view-source-no-refresh.html",
+        "LayoutTests/http/tests/websocket/tests/websocket-protocol-ignored.html",
+        "LayoutTests/fast/css/display-none-inline-style-change-crash.html",
+        "LayoutTests/http/tests/xmlhttprequest/supported-xml-content-types.html",
+        "LayoutTests/dom/html/level2/html/HTMLAnchorElement03.html",
+        "LayoutTests/ietestcenter/_javascript_/11.1.5_4-4-c-1.html",
+        "LayoutTests/dom/html/level2/html/HTMLAnchorElement06.html",
+    ]
+
+    def get_shards(self, num_workers, fully_parallel):
         port = Mock()
         port._filesystem = filesystem_mock.MockFileSystem()
-        manager = ManagerWrapper(port=port, options=Mock(), printer=Mock())
+        self.manager = ManagerWrapper(port=port, options=Mock(), printer=Mock())
+        return self.manager._shard_tests(self.test_list, num_workers, fully_parallel)
 
-        test_list = [
-          "LayoutTests/websocket/tests/unicode.htm",
-          "LayoutTests/animations/keyframes.html",
-          "LayoutTests/http/tests/security/view-source-no-refresh.html",
-          "LayoutTests/websocket/tests/websocket-protocol-ignored.html",
-          "LayoutTests/fast/css/display-none-inline-style-change-crash.html",
-          "LayoutTests/http/tests/xmlhttprequest/supported-xml-content-types.html",
-          "LayoutTests/dom/html/level2/html/HTMLAnchorElement03.html",
-          "LayoutTests/ietestcenter/_javascript_/11.1.5_4-4-c-1.html",
-          "LayoutTests/dom/html/level2/html/HTMLAnchorElement06.html",
-        ]
+    def test_shard_by_dir(self):
+        locked, unlocked = self.get_shards(num_workers=2, fully_parallel=False)
 
-        expected_tests_to_http_lock = set([
-          'LayoutTests/websocket/tests/unicode.htm',
-          'LayoutTests/http/tests/security/view-source-no-refresh.html',
-          'LayoutTests/websocket/tests/websocket-protocol-ignored.html',
-          'LayoutTests/http/tests/xmlhttprequest/supported-xml-content-types.html',
-        ])
+        # Note that although there are tests in multiple dirs that need locks,
+        # they are crammed into a single shard in order to reduce the # of
+        # workers hitting the server at once.
+        self.assertEquals(locked,
+            [TestShard('locked_shard_1',
+              ['LayoutTests/http/tests/security/view-source-no-refresh.html',
+               'LayoutTests/http/tests/websocket/tests/unicode.htm',
+               'LayoutTests/http/tests/websocket/tests/websocket-protocol-ignored.html',
+               'LayoutTests/http/tests/xmlhttprequest/supported-xml-content-types.html'])])
+        self.assertEquals(unlocked,
+            [TestShard('animations',
+                       ['LayoutTests/animations/keyframes.html']),
+             TestShard('dom/html/level2/html',
+                       ['LayoutTests/dom/html/level2/html/HTMLAnchorElement03.html',
+                        'LayoutTests/dom/html/level2/html/HTMLAnchorElement06.html']),
+             TestShard('fast/css',
+                       ['LayoutTests/fast/css/display-none-inline-style-change-crash.html']),
+             TestShard('ietestcenter/Javascript',
+                       ['LayoutTests/ietestcenter/Javascript/11.1.5_4-4-c-1.html'])])
 
-        single_locked, single_unlocked = manager._shard_tests(test_list, False)
-        multi_locked, multi_unlocked = manager._shard_tests(test_list, True)
+    def test_shard_every_file(self):
+        locked, unlocked = self.get_shards(num_workers=2, fully_parallel=True)
+        self.assertEquals(locked,
+            [TestShard('.', ['LayoutTests/http/tests/websocket/tests/unicode.htm']),
+             TestShard('.', ['LayoutTests/http/tests/security/view-source-no-refresh.html']),
+             TestShard('.', ['LayoutTests/http/tests/websocket/tests/websocket-protocol-ignored.html']),
+             TestShard('.', ['LayoutTests/http/tests/xmlhttprequest/supported-xml-content-types.html'])])
+        self.assertEquals(unlocked,
+            [TestShard('.', ['LayoutTests/animations/keyframes.html']),
+             TestShard('.', ['LayoutTests/fast/css/display-none-inline-style-change-crash.html']),
+             TestShard('.', ['LayoutTests/dom/html/level2/html/HTMLAnchorElement03.html']),
+             TestShard('.', ['LayoutTests/ietestcenter/Javascript/11.1.5_4-4-c-1.html']),
+             TestShard('.', ['LayoutTests/dom/html/level2/html/HTMLAnchorElement06.html'])])
 
-        self.assertEqual("tests_to_http_lock", single_locked[0][0])
-        self.assertEqual(expected_tests_to_http_lock, set(single_locked[0][1]))
-        self.assertEqual("tests_to_http_lock", multi_locked[0][0])
-        self.assertEqual(expected_tests_to_http_lock, set(multi_locked[0][1]))
+    def test_shard_in_two(self):
+        locked, unlocked = self.get_shards(num_workers=1, fully_parallel=False)
+        self.assertEquals(locked,
+            [TestShard('locked_tests',
+                       ['LayoutTests/http/tests/websocket/tests/unicode.htm',
+                        'LayoutTests/http/tests/security/view-source-no-refresh.html',
+                        'LayoutTests/http/tests/websocket/tests/websocket-protocol-ignored.html',
+                        'LayoutTests/http/tests/xmlhttprequest/supported-xml-content-types.html'])])
+        self.assertEquals(unlocked,
+            [TestShard('unlocked_tests',
+                       ['LayoutTests/animations/keyframes.html',
+                        'LayoutTests/fast/css/display-none-inline-style-change-crash.html',
+                        'LayoutTests/dom/html/level2/html/HTMLAnchorElement03.html',
+                        'LayoutTests/ietestcenter/Javascript/11.1.5_4-4-c-1.html',
+                        'LayoutTests/dom/html/level2/html/HTMLAnchorElement06.html'])])
 
     def test_http_locking(tester):
         class LockCheckingManager(Manager):
@@ -93,7 +132,7 @@
 
             def handle_finished_list(self, source, list_name, num_tests, elapsed_time):
                 if not self._finished_list_called:
-                    tester.assertEquals(list_name, 'tests_to_http_lock')
+                    tester.assertEquals(list_name, 'locked_tests')
                     tester.assertTrue(self._remaining_locked_shards)
                     tester.assertTrue(self._has_http_lock)
 