Module Name:    xsrc
Committed By:   macallan
Date:           Fri Dec 10 18:25:44 UTC 2021

Modified Files:
        xsrc/external/mit/xf86-video-suncg14/dist/src: cg14_accel.c

Log Message:
add another Copy8() variant:
- supports unaligned source and destination
- uses 32bit accesses throughout
- supports copies up to 124 pixels wide so an entire line fits into registers
  and we can ignore x direction
... mostly an exercise in learning how to use the funnel shifter (see
the plain-C sketch below)
TODO:
- skip the funnel shifter if source and destination are aligned
- skip fb reads where possible, like straight GXcopy
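
For context, a plain-C sketch of the funnel-shift idea: the SX hardware
does this in registers via SX_FUNNEL_I, so the code below is only a
model of the realignment step, not driver code, and it assumes the
big-endian (SPARC) word layout.

#include <stdint.h>

/*
 * combine two adjacent 32bit source words so that a source stream
 * offset by "dist" bits lines up with the destination words
 */
static inline uint32_t
funnel(uint32_t hi, uint32_t lo, int dist)
{
	/* a shift by 32 is undefined in C, so handle the aligned case */
	if (dist == 0)
		return hi;
	return (hi << dist) | (lo >> (32 - dist));
}

/* realign wrds words; src needs wrds + 1 readable words */
static void
realign(uint32_t *dst, const uint32_t *src, int wrds, int dist)
{
	int i;

	for (i = 0; i < wrds; i++)
		dst[i] = funnel(src[i], src[i + 1], dist);
}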


To generate a diff of this commit:
cvs rdiff -u -r1.21 -r1.22 \
    xsrc/external/mit/xf86-video-suncg14/dist/src/cg14_accel.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: xsrc/external/mit/xf86-video-suncg14/dist/src/cg14_accel.c
diff -u xsrc/external/mit/xf86-video-suncg14/dist/src/cg14_accel.c:1.21 xsrc/external/mit/xf86-video-suncg14/dist/src/cg14_accel.c:1.22
--- xsrc/external/mit/xf86-video-suncg14/dist/src/cg14_accel.c:1.21	Thu Dec  9 17:29:14 2021
+++ xsrc/external/mit/xf86-video-suncg14/dist/src/cg14_accel.c	Fri Dec 10 18:25:43 2021
@@ -1,4 +1,4 @@
-/* $NetBSD: cg14_accel.c,v 1.21 2021/12/09 17:29:14 christos Exp $ */
+/* $NetBSD: cg14_accel.c,v 1.22 2021/12/10 18:25:43 macallan Exp $ */
 /*
  * Copyright (c) 2013 Michael Lorenz
  * All rights reserved.
@@ -405,6 +405,114 @@ next:
 	}
 }
 
+/* up to 124 pixels so direction doesn't matter, unaligned, ROP */
+static void
+CG14Copy8_short_rop(Cg14Ptr p, int srcstart, int dststart, int w, int h, int srcpitch, int dstpitch)
+{
+	int saddr, daddr, pre, dist, wrds, swrds, spre, sreg, restaddr, post;
+#ifdef DEBUG
+	int taddr = 4 + dstpitch * 50;
+#endif
+	uint32_t lmask, rmask;
+	ENTER;
+	
+	pre = dststart & 3;
+	lmask = 0xffffffff >> pre;
+	spre = srcstart & 3;
+	/*
+	 * make sure we count all the words needed to cover the destination
+	 * line, including potential partials on both ends
+	 */
+	wrds = (w + pre + 3) >> 2;
+	swrds = (w + spre + 3) >> 2;
+
+	if (spre < pre) {
+		dist = 32 - (pre - spre) * 8;
+		sreg = 9;
+	} else {
+		dist = (spre - pre) * 8;
+		sreg = 8;
+	}
+
+	/*
+	 * mask out trailing pixels to avoid partial writes
+	 */
+	post = (dststart + w) & 3;
+	rmask = ~(0xffffffff >> (post * 8));
+	write_sx_reg(p, SX_QUEUED(7), rmask);	
+	write_sx_reg(p, SX_QUEUED(6), ~rmask);	
+	
+	DPRINTF(X_ERROR, "%s %d %d, %d %d %08x %d %d %d %d %08x\n", __func__,
+	    w, h, spre, pre, lmask, dist, sreg, wrds, post, rmask);
+
+	/* mask out the leading pixels in dst by using a mask and ROP */
+	write_sx_reg(p, SX_ROP_CONTROL, (p->last_rop & 0xf0) | 0xa);
+	write_sx_reg(p, SX_QUEUED(R_MASK), 0xffffffff);	
+
+	saddr = srcstart & ~3;
+	daddr = dststart & ~3;
+	
+	/* TODO:
+	 * - special case dist == 0 where we can skip the funnel shifter
+	 *   and only need to deal with leading / trailing garbage
+	 * - skip reading the fb where we can get away with it, for example
+	 *   GXcopy, where we only need to read the destination for partials,
+	 *   everything in between is straight copy
+	 */
+	while (h > 0) {
+		write_sx_io(p, daddr & ~7, SX_LD(80, wrds - 1, daddr & 7));
+		write_sx_io(p, saddr & ~7, SX_LD(sreg, swrds - 1, saddr & 7));
+		if (wrds > 15) {
+			write_sx_reg(p, SX_INSTRUCTIONS, SX_FUNNEL_I(8, dist, 40, 15));
+			write_sx_reg(p, SX_INSTRUCTIONS, SX_FUNNEL_I(24, dist, 56, wrds - 16));
+			/* shifted source pixels are now at register 40+ */
+			if (pre != 0) {
+				/* mask out leading junk */
+				write_sx_reg(p, SX_QUEUED(R_MASK), lmask);
+				write_sx_reg(p, SX_INSTRUCTIONS, SX_ROPB(40, 80, 8, 0));
+				write_sx_reg(p, SX_QUEUED(R_MASK), 0xffffffff);
+				write_sx_reg(p, SX_INSTRUCTIONS, SX_ROPB(41, 81, 9, 14));	
+			} else {
+				write_sx_reg(p, SX_INSTRUCTIONS, SX_ROPB(40, 80, 8, 15));
+			}
+			write_sx_reg(p, SX_INSTRUCTIONS, SX_ROPB(56, 96, 24, wrds - 16));
+		} else {
+			write_sx_reg(p, SX_INSTRUCTIONS, SX_FUNNEL_I(8, dist, 40, wrds));
+
+			if (pre != 0) {
+				/* mask out leading junk */
+				write_sx_reg(p, SX_QUEUED(R_MASK), lmask);
+				write_sx_reg(p, SX_INSTRUCTIONS, SX_ROPB(40, 80, 8, 0));
+				write_sx_reg(p, SX_QUEUED(R_MASK), 0xffffffff);
+				write_sx_reg(p, SX_INSTRUCTIONS, SX_ROPB(41, 81, 9, wrds));
+			} else {
+				write_sx_reg(p, SX_INSTRUCTIONS, SX_ROPB(40, 80, 8, wrds));
+			}
+		}
+		if (post != 0) {
+			/*
+			 * if the last word to be written out is a partial,
+			 * mask out the leftovers and replace them with the
+			 * previous destination pixels.
+			 * we could pull the same ROP + mask trick as we do
+			 * on the left end, but it's less annoying this way
+			 * and the instruction count is the same
+			 */
+			write_sx_reg(p, SX_INSTRUCTIONS, SX_ANDS(7 + wrds, 7, 5, 0));
+			write_sx_reg(p, SX_INSTRUCTIONS, SX_ANDS(79 + wrds, 6, 4, 0));
+			write_sx_reg(p, SX_INSTRUCTIONS, SX_ORS(5, 4, 7 + wrds, 0));
+		}
+#ifdef DEBUG
+		write_sx_io(p, taddr & ~7, SX_ST(40, wrds - 1, taddr & 7));
+		taddr += dstpitch;
+#endif
+		write_sx_io(p, daddr & ~7, SX_ST(8, wrds - 1, daddr & 7));
+		saddr += srcpitch;
+		daddr += dstpitch;
+		h--;
+	}
+}
+
 static void
 CG14Copy8(PixmapPtr pDstPixmap,
          int srcX, int srcY, int dstX, int dstY, int w, int h)
@@ -427,13 +535,6 @@ CG14Copy8(PixmapPtr pDstPixmap,
 	srcstart = srcX + (srcpitch * srcY) + srcoff;
 	dststart = dstX + (dstpitch * dstY) + dstoff;
 
-	if ((p->xdir < 0) && (srcoff == dstoff) && (srcY == dstY)) {
-		srcstart += (w - 32);
-		dststart += (w - 32);
-		xinc = -32;
-	} else
-		xinc = 32;
-
 	if (p->ydir < 0) {
 		srcstart += (h - 1) * srcpitch;
 		dststart += (h - 1) * dstpitch;
@@ -443,6 +544,32 @@ CG14Copy8(PixmapPtr pDstPixmap,
 		srcinc = srcpitch;
 		dstinc = dstpitch;
 	}
+
+	/*
+	 * this copies up to 124 pixels wide in one go, so horizontal
+	 * direction / overlap don't matter; uses 32bit accesses
+	 * throughout and the funnel shifter for unaligned copies
+	 */
+	if ((w < 125) && (w > 8)) {
+		CG14Copy8_short_rop(p, srcstart, dststart, w, h, srcinc, dstinc);
+		return;
+	}
+
+	/*
+	 * only invert x direction if absolutely necessary; it's a pain to
+	 * go backwards on SX so we avoid it as much as possible
+	 */
+	if ((p->xdir < 0) && (srcoff == dstoff) && (srcY == dstY)) {
+		srcstart += (w - 32);
+		dststart += (w - 32);
+		xinc = -32;
+	} else
+		xinc = 32;
+
+	/*
+	 * for aligned copies we can go all 32bit and avoid VRAM reads in the
+	 * most common case
+	 */
 	if (((srcstart & 3) == (dststart & 3)) && (xinc > 0)) {
 		switch (p->last_rop) {
 			case 0xcc:
@@ -453,6 +580,7 @@ CG14Copy8(PixmapPtr pDstPixmap,
 		}
 		return;
 	}
+
 	if (p->last_rop == 0xcc) {
 		/* plain old copy */
 		if ( xinc > 0) {

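As an aside: the trailing-word handling in CG14Copy8_short_rop() (the
ANDS/ANDS/ORS trio on registers 4-7) boils down to merging the shifted
source with the existing framebuffer word under a byte mask. Below is a
plain-C model of that merge, again assuming big-endian byte order; the
function name is illustrative, not driver API.

#include <stdint.h>

/*
 * merge the last, partial destination word: take "post" leading bytes
 * (1..3) from the shifted source, keep the rest of the old destination
 * word (big-endian: byte 0 sits in the top bits)
 */
static uint32_t
merge_last_word(uint32_t src, uint32_t dst, int post)
{
	uint32_t rmask = ~(0xffffffffU >> (post * 8));

	return (src & rmask) | (dst & ~rmask);
}

With post == 0 the last word is written whole and no merge is needed,
which is why the driver only emits the three extra instructions when
post != 0.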