Index: job.c
===================================================================
RCS file: /sources/make/make/job.c,v
retrieving revision 1.211
diff -u -r1.211 job.c
--- job.c	6 Nov 2010 21:56:24 -0000	1.211
+++ job.c	19 Apr 2011 15:04:07 -0000
@@ -243,6 +243,12 @@
 /* Number of jobserver tokens this instance is currently using.  */
 
 unsigned int jobserver_tokens = 0;
+
+#ifdef POSIX /* PARALLEL-SYNC */
+/* File descriptor locked with fcntl() to serialize output in -j mode with the .PARALLELSYNC target; -1 until chosen. */
+
+int sync_handle = -1;
+#endif /* POSIX - PARALLEL-SYNC */
 
 #ifdef WINDOWS32
 /*
@@ -514,6 +520,93 @@
   */
 }
 
+#ifdef POSIX /* PARALLEL-SYNC */
+/* Copy the whole of FROM_FD (from offset 0) to TO_FD, then close FROM_FD. */
+static void
+pump_from_tmp_fd(int from_fd, int to_fd)
+{
+  ssize_t nleft, nwrite;
+  char buffer[8192], *p;
+
+  if (lseek(from_fd, 0, SEEK_SET) == -1)
+    perror("lseek()");
+
+  while (1)
+    {
+      EINTRLOOP(nleft, read(from_fd, buffer, sizeof(buffer)));
+      if (nleft < 0)
+	perror("read()");
+      if (nleft <= 0)
+	break; /* EOF or read error: nothing more to copy */
+
+      /* write() may be partial; resume after the bytes already sent. */
+      for (p = buffer; nleft > 0; p += nwrite, nleft -= nwrite)
+	{
+	  EINTRLOOP(nwrite, write(to_fd, p, nleft));
+	  if (nwrite < 0)
+	    {
+	      perror("write()");
+	      close(from_fd); /* don't leak the temp file on error */
+	      return;
+	    }
+	}
+    }
+  close(from_fd);
+}
+
+/* Block until the write lock on sync_handle is ours; NULL on failure. */
+static void *
+acquire_semaphore(void)
+{
+  static struct flock fl;
+
+  fl.l_type = F_WRLCK;
+  fl.l_whence = SEEK_SET;
+  fl.l_start = 0; /* fixed offset: all makes must contend for one byte */
+  fl.l_len = 1;
+  if (fcntl(sync_handle, F_SETLKW, &fl) != -1)
+    return &fl;
+  perror("fcntl()");
+  return NULL;
+}
+
+static void
+release_semaphore(void *sem)
+{
+  struct flock *lock = sem; /* SEM is really a struct flock */
+  lock->l_type = F_UNLCK;   /* reuse its byte range as an unlock request */
+  if (fcntl(sync_handle, F_SETLKW, lock) == -1)
+    perror("fcntl()");
+}
+
+/* Synchronize the output of jobs in -j mode to keep the results of
+   each job together. This is done by holding the results in temp files,
+   one for stdout and potentially another for stderr, and only releasing
+   them to "real" stdout/stderr when a semaphore can be obtained.
+   If the semaphore cannot be obtained the output is still released,
+   unsynchronized, rather than being discarded. */
+static void
+sync_output(struct child *c)
+{
+  /* NULL means locking failed; we then write without the lock. */
+  void *sem = acquire_semaphore();
+
+  /*
+   * We've entered the "critical section" during which a lock is held.
+   * We want to keep it as short as possible.
+   */
+  if (c->outfd >= 0)
+    pump_from_tmp_fd(c->outfd, fileno(stdout));
+  if (c->errfd >= 0)
+    pump_from_tmp_fd(c->errfd, fileno(stderr));
+
+  /* Exit the critical section */
+  if (sem)
+    release_semaphore(sem);
+  c->outfd = c->errfd = -1;
+}
+#endif /* POSIX - PARALLEL-SYNC */
+
 extern int shell_function_pid, shell_function_completed;
 
 /* Reap all dead children, storing the returned status and the new command
@@ -790,6 +883,12 @@
         c->sh_batch_file = NULL;
       }
 
+#ifdef POSIX /* PARALLEL-SYNC */
+      /* Synchronize parallel output if requested */
+      if (parallel_sync)
+	sync_output(c);
+#endif /* POSIX - PARALLEL-SYNC */
+
       /* If this child had the good stdin, say it is now free.  */
       if (c->good_stdin)
         good_stdin_used = 0;
@@ -1053,6 +1152,7 @@
 #else
   char **argv;
 #endif
+  static int combined_output;
 
   /* If we have a completely empty commandset, stop now.  */
   if (!child->command_ptr)
@@ -1356,6 +1456,102 @@
 
 #else  /* !__EMX__ */
 
+#ifdef POSIX /* PARALLEL-SYNC */
+      if (parallel_sync)
+	{
+	  /* If .PARALLELSYNC is turned on, find a resource to
+	      synchronize on. This block is traversed only once. */
+	  if (sync_handle == -1)
+	    {
+	      struct stat stbuf_o, stbuf_e;
+
+	      if (ftell(stdout) != -1)
+		{
+		  sync_handle = fileno(stdout);
+		  combined_output =
+		    fstat(fileno(stdout), &stbuf_o) == 0 &&
+		    fstat(fileno(stderr), &stbuf_e) == 0 &&
+		    stbuf_o.st_dev == stbuf_e.st_dev &&
+		    stbuf_o.st_ino == stbuf_e.st_ino;
+		}
+	      else if (ftell(stderr) != -1)
+		sync_handle = fileno(stderr);
+	      else
+		{
+		  perror("PARALLELSYNC suppressed: stderr");
+		  parallel_sync = 0;
+		}
+	    }
+
+	  /* If it still looks like we can synchronize, create
+	      temp files to hold stdout and stderr (if separate). */
+	  if (parallel_sync)
+	    {
+	      FILE *outstrm = NULL, *errstrm = NULL;
+	      int o_ok, e_ok;
+
+	      /* Check stdout and stderr before hooking up temp files. */
+	      o_ok = ftell(stdout) != -1;
+	      e_ok = ftell(stderr) != -1;
+	      child->outfd = child->errfd = -1;
+	      if (o_ok && e_ok && !combined_output)
+		{
+		  if (!(outstrm = tmpfile()) || !(errstrm = tmpfile()))
+		    {
+		      perror("PARALLELSYNC suppressed: tmpfile()");
+		      if (outstrm)
+			(void)fclose(outstrm);
+		      outstrm = errstrm = NULL;
+		      parallel_sync = 0;
+		    }
+		}
+	      else if (o_ok)
+		{
+		  if (!(outstrm = tmpfile()))
+		    {
+		      perror("PARALLELSYNC suppressed: tmpfile()");
+		      parallel_sync = 0;
+		    }
+		}
+	      else if (e_ok)
+		{
+		  if (!(errstrm = tmpfile()))
+		    {
+		      perror("PARALLELSYNC suppressed: tmpfile()");
+		      parallel_sync = 0;
+		    }
+		}
+	      else
+		{
+		  perror("PARALLELSYNC suppressed: stdout");
+		  parallel_sync = 0;
+		}
+
+	    if (outstrm)
+	      {
+		if ((child->outfd = dup(fileno(outstrm))) == -1)
+		{
+		  perror("PARALLELSYNC suppressed: dup()");
+		  goto error;
+		}
+		fclose(outstrm);
+		CLOSE_ON_EXEC(child->outfd);
+	      }
+
+	    if (errstrm)
+	      {
+		if ((child->errfd = dup(fileno(errstrm))) == -1)
+		{
+		  perror("PARALLELSYNC suppressed: dup()");
+		  goto error;
+		}
+		fclose(errstrm);
+		CLOSE_ON_EXEC(child->errfd);
+	      }
+	    }
+	}
+#endif /* POSIX - PARALLEL-SYNC */
+
       child->pid = vfork ();
       environ = parent_environ;	/* Restore value child may have clobbered.  */
       if (child->pid == 0)
@@ -1379,6 +1575,24 @@
             setrlimit (RLIMIT_STACK, &stack_limit);
 #endif
 
+#ifdef POSIX /* PARALLEL-SYNC */
+	  /* Divert output into tempfiles if .PARALLELSYNC in use. */
+	  if (parallel_sync)
+	    {
+	      int outfd = fileno(stdout);
+	      int errfd = fileno(stderr);
+
+	      if ((child->outfd >= 0 &&
+		  (close(outfd) == -1 || dup2(child->outfd, outfd) == -1))
+		|| (child->errfd >= 0 &&
+		  (close(errfd) == -1 || dup2(child->errfd, errfd) == -1)))
+		{
+		  perror("PARALLELSYNC suppressed: dup2()");
+		  parallel_sync = 0; /* depends on vfork */
+		}
+	    }
+#endif /* POSIX - PARALLEL-SYNC */
+
 	  child_execute_job (child->good_stdin ? 0 : bad_stdin, 1,
                              argv, child->environment);
 	}
Index: job.h
===================================================================
RCS file: /sources/make/make/job.h,v
retrieving revision 1.27
diff -u -r1.27 job.h
--- job.h	13 Jul 2010 01:20:41 -0000	1.27
+++ job.h	19 Apr 2011 15:04:07 -0000
@@ -64,6 +64,10 @@
     unsigned int good_stdin:1;	/* Nonzero if this child has a good stdin.  */
     unsigned int deleted:1;	/* Nonzero if targets have been deleted.  */
     unsigned int dontcare:1;    /* Saved dontcare flag.  */
+#ifdef POSIX /* PARALLEL-SYNC */
+    int outfd;			/* Optional file descriptor for saving stdout */
+    int errfd;			/* Optional file descriptor for saving stderr */
+#endif /* POSIX - PARALLEL-SYNC */
   };
 
 extern struct child *children;
Index: main.c
===================================================================
RCS file: /sources/make/make/main.c,v
retrieving revision 1.246
diff -u -r1.246 main.c
--- main.c	29 Aug 2010 23:05:27 -0000	1.246
+++ main.c	19 Apr 2011 15:04:07 -0000
@@ -503,6 +503,13 @@
 
 int one_shell;
 
+/* Nonzero if we have seen the '.PARALLELSYNC' target.
+   This attempts to synchronize the output of parallel
+   jobs such that the results of each job stay together.
+   It works best in combination with .ONESHELL.   */
+
+int parallel_sync;
+
 /* Nonzero if we have seen the `.NOTPARALLEL' target.
    This turns off parallel builds for this invocation of make.  */
 
Index: make.h
===================================================================
RCS file: /sources/make/make/make.h,v
retrieving revision 1.147
diff -u -r1.147 make.h
--- make.h	21 Feb 2011 07:30:11 -0000	1.147
+++ make.h	19 Apr 2011 15:04:07 -0000
@@ -522,7 +522,7 @@
 extern int print_version_flag, print_directory_flag, check_symlink_flag;
 extern int warn_undefined_variables_flag, trace_flag, posix_pedantic;
 extern int not_parallel, second_expansion, clock_skew_detected;
-extern int rebuilding_makefiles, one_shell;
+extern int rebuilding_makefiles, one_shell, parallel_sync;
 
 /* can we run commands via 'sh -c xxx' or must we use batch files? */
 extern int batch_mode_shell;
Index: read.c
===================================================================
RCS file: /sources/make/make/read.c,v
retrieving revision 1.197
diff -u -r1.197 read.c
--- read.c	18 Apr 2011 01:25:20 -0000	1.197
+++ read.c	19 Apr 2011 15:04:07 -0000
@@ -1961,6 +1961,8 @@
       else if (streq (name, ".ONESHELL"))
         one_shell = 1;
 #endif
+      else if (job_slots != 1 && streq (name, ".PARALLELSYNC"))
+        parallel_sync = 1;
 
       /* If this is a static pattern rule:
          `targets: target%pattern: prereq%pattern; recipe',
Index: doc/make.texi
===================================================================
RCS file: /sources/make/make/doc/make.texi,v
retrieving revision 1.71
diff -u -r1.71 make.texi
--- doc/make.texi	18 Apr 2011 01:25:21 -0000	1.71
+++ doc/make.texi	19 Apr 2011 15:04:08 -0000
@@ -2896,6 +2896,15 @@
 the shell rather than each line being invoked separately
 (@pxref{Execution, ,Recipe Execution}).
 
+@findex .PARALLELSYNC
+@item .PARALLELSYNC
+@cindex parallel recipe execution, output
+
+If @code{.PARALLELSYNC} is mentioned as a target, then when jobs
+are running in parallel under @code{--jobs}, the output of each
+job is held until that job is complete, thus ensuring that the
+output of each recipe is grouped together.
+
 @findex .POSIX
 @item .POSIX
 @cindex POSIX-conforming mode, setting
@@ -3983,9 +3992,13 @@
 there is no limit on the number of job slots.  The default number of job
 slots is one, which means serial execution (one thing at a time).
 
-One unpleasant consequence of running several recipes simultaneously is
-that output generated by the recipes appears whenever each recipe
-sends it, so messages from different recipes may be interspersed.
+One consequence of running several recipes simultaneously is that by
+default, output from each recipe appears as soon as it is generated,
+with the result that messages from different recipes may be interspersed.
+This may create problems in interpreting output. If the special target
+@code{.PARALLELSYNC} is mentioned, however, recipes will save their
+output until completion and then take turns writing it, with a more
+coherent result.
 
 Another problem is that two processes cannot both take input from the
 same device; so to make sure that only one recipe tries to take input
