Hi,

Here is my patch. Please excuse the long delay.

The original code unrolls its loops by hand. I did not do that in my changes
because I assume that the compiler can do this much more efficiently.

BTW: Has anyone tried to use Intel's compilers? This would give a boost in 
execution speed.

Michael


On Thursday 19 February 2004 18.35, you wrote:
>  >This is what I did. The luminance channel of the matte is used in a
>  > convex combination of the input sources thus using the [16,235]/219
>  > range.
>
> Well, post your patch, already.  You fixed a bug!  :)
>
> -matt m.

-- 
/*
 *  matteblend - reads three frame-interlaced YUV4MPEG streams from stdin
 *               and blends the second over the first using the third's
 *               luminance channel as a matte
 *
 *  Copyright (C) 2001, pHilipp Zabel <[EMAIL PROTECTED]>
 *
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU General Public License
 *  as published by the Free Software Foundation; either version 2
 *  of the License, or (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#include "yuv4mpeg.h"

/*
 * usage - print a short help text to stderr.
 *
 * The program takes no command line parameters, so the text only names the
 * filter and notes that options are still to be implemented.
 */
static void usage (void)
{
   static const char help_text[] =
      "usage:  matteblend.flt\n"
      "no params at the moment - color saturation falloff or such has to be implemented\n";

   fputs (help_text, stderr);
}

/*
 * blend_sample - convex combination of two 8-bit samples.
 *
 *   m  matte value; expected to lie in [16,235] (the caller clamps it)
 *   a  sample that is returned unchanged when m == 16
 *   b  sample that is returned unchanged when m == 235
 *
 * The weights (235 - m) and (m - 16) always sum to 219, so the result stays
 * between a and b for an in-range m.  The division truncates.
 */
static unsigned char blend_sample (int m, int a, int b)
{
   enum { MATTE_LO = 16, MATTE_HI = 235 };

   return (unsigned char) (((MATTE_HI - m) * a + (m - MATTE_LO) * b)
                           / (MATTE_HI - MATTE_LO));
}

/*
 * blend - mix src1 over src0, using the luma plane of matte as weight.
 *
 *   src0, src1  input frames; index 0 is the luma plane (width * height
 *               samples), indices 1 and 2 are the chroma planes, which are
 *               accessed as (width/2) * (height/2) samples each
 *   matte       only matte[0] (luma) is read; values are expected to be
 *               within [16,235] already
 *   width       luma plane width in samples
 *   height      luma plane height in samples
 *   dst         output frame, same plane layout as the inputs
 *
 * The luma plane is processed one sample at a time, so a plane whose size is
 * not a multiple of four is handled without touching memory past its end.
 * Loop unrolling is left to the compiler.
 */
static void blend (unsigned char *src0[3], unsigned char *src1[3], unsigned char *matte[3],
                   unsigned int width,     unsigned int height,
                   unsigned char *dst[3])
{
   const unsigned int len = width * height;
   const unsigned int cwidth = width / 2;
   const unsigned int cheight = height / 2;
   unsigned int i, x, y;

   for (i = 0; i < len; i++)
      dst[0][i] = blend_sample (matte[0][i], src0[0][i], src1[0][i]);

   /*
    * Each chroma sample covers a 2x2 block of luma samples, so the matte is
    * downscaled by averaging the corresponding 2x2 block of matte luma.
    */
   i = 0;
   for (y = 0; y < cheight; y++) {
      const unsigned char *top = matte[0] + 2 * y * width;
      const unsigned char *bottom = top + width;

      for (x = 0; x < cwidth; x++, i++) {
         int m = (top[2 * x] + top[2 * x + 1] + bottom[2 * x] + bottom[2 * x + 1]) >> 2;

         dst[1][i] = blend_sample (m, src0[1][i], src1[1][i]);
         dst[2][i] = blend_sample (m, src0[2][i], src1[2][i]);
      }
   }
}

int main (int argc, char *argv[])
{
   int in_fd  = 0;         /* stdin */
   int out_fd = 1;         /* stdout */
   unsigned char *yuv0[3]; /* input 0 */
   unsigned char *yuv1[3]; /* input 1 */
   unsigned char *yuv2[3]; /* input 2 */
   unsigned char *yuv[3];  /* output */
   y4m_stream_info_t streaminfo;
   y4m_frame_info_t frameinfo;
   int i;
   int w, h;

   if (argc > 1) {
      usage ();
      exit (1);
   }

   y4m_init_stream_info (&streaminfo);
   y4m_init_frame_info (&frameinfo);

   i = y4m_read_stream_header (in_fd, &streaminfo);
   if (i != Y4M_OK) {
      fprintf (stderr, "%s: input stream error - %s\n", 
	       argv[0], y4m_strerr(i));
      exit (1);
   }
   w = y4m_si_get_width(&streaminfo);
   h = y4m_si_get_height(&streaminfo);

   yuv[0] =  malloc (w * h);
   yuv0[0] = malloc (w * h);
   yuv1[0] = malloc (w * h);
   yuv2[0] = malloc (w * h);
   yuv[1] =  malloc (w * h / 4);
   yuv0[1] = malloc (w * h / 4);
   yuv1[1] = malloc (w * h / 4);
   yuv2[1] = malloc (w * h / 4);
   yuv[2] =  malloc (w * h / 4);
   yuv0[2] = malloc (w * h / 4);
   yuv1[2] = malloc (w * h / 4);
   yuv2[2] = malloc (w * h / 4);

   y4m_write_stream_header (out_fd, &streaminfo);

   while (1) {
      i = y4m_read_frame(in_fd, &streaminfo, &frameinfo, yuv0);
      if (i == Y4M_ERR_EOF)
	exit (0);
      else if (i != Y4M_OK)
         exit (1);
      i = y4m_read_frame(in_fd, &streaminfo, &frameinfo, yuv1);
      if (i != Y4M_OK)
         exit (1);
      i = y4m_read_frame(in_fd, &streaminfo, &frameinfo, yuv2);
      if (i != Y4M_OK)
         exit (1);
      /* constrain matte luma */
      for (i = 0; i < w*h; i++) {
	  if (yuv2[0][i] < 16) yuv2[0][i] = 16;
	  else
	      if (yuv2[0][i] > 235) yuv2[0][i] = 235;
      }

      blend (yuv0, yuv1, yuv2, w, h, yuv);

      y4m_write_frame (out_fd, &streaminfo, &frameinfo, yuv);

   }

}

Reply via email to