Module Name: src Committed By: isaki Date: Sun Sep 25 11:28:40 UTC 2022
Modified Files: src/sys/arch/luna68k/dev: lunafb.c omrasops.c omrasopsvar.h Log Message: lunafb: Improve drawing performance using VRAM ROP features. - Drawing a character on 4bpp normally needs four writes, but by actively using the VRAM ROP, it can be reduced to a single write. The same goes for copyrows. If the whole row consists of only two colors (one foreground and one background), it can be copied by reading once and writing once, regardless of the number of planes. Only if the row contains more than two colors is it copied plane by plane. - On an 8bpp board, it acts as 4bpp (16 colors). - On a 4bpp board on the real LUNA-I(68030/20MHz), monochrome scroll is about 4 times faster even without asm. Using asm improves it by an additional 5% (asm is enabled by default). - According to tsutsui@-san's report, even color scroll is about 2 times faster on his 8bpp board on the real LUNA-II(68040). This was first developed by Y.Sugahara back in late 2019, and was modified a lot by me in 2022. http://mail-index.netbsd.org/port-luna68k/2022/09/23/msg000072.html To generate a diff of this commit: cvs rdiff -u -r1.46 -r1.47 src/sys/arch/luna68k/dev/lunafb.c cvs rdiff -u -r1.22 -r1.23 src/sys/arch/luna68k/dev/omrasops.c cvs rdiff -u -r1.5 -r1.6 src/sys/arch/luna68k/dev/omrasopsvar.h Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
Modified files: Index: src/sys/arch/luna68k/dev/lunafb.c diff -u src/sys/arch/luna68k/dev/lunafb.c:1.46 src/sys/arch/luna68k/dev/lunafb.c:1.47 --- src/sys/arch/luna68k/dev/lunafb.c:1.46 Thu Jul 14 20:13:21 2022 +++ src/sys/arch/luna68k/dev/lunafb.c Sun Sep 25 11:28:40 2022 @@ -1,4 +1,4 @@ -/* $NetBSD: lunafb.c,v 1.46 2022/07/14 20:13:21 tsutsui Exp $ */ +/* $NetBSD: lunafb.c,v 1.47 2022/09/25 11:28:40 isaki Exp $ */ /*- * Copyright (c) 2000 The NetBSD Foundation, Inc. @@ -31,7 +31,7 @@ #include <sys/cdefs.h> /* RCS ID & Copyright macro defns */ -__KERNEL_RCSID(0, "$NetBSD: lunafb.c,v 1.46 2022/07/14 20:13:21 tsutsui Exp $"); +__KERNEL_RCSID(0, "$NetBSD: lunafb.c,v 1.47 2022/09/25 11:28:40 isaki Exp $"); #include <sys/param.h> #include <sys/systm.h> @@ -76,6 +76,8 @@ struct bt458 { #define OMFB_RFCNT BMAP_RFCNT /* video h-origin/v-origin */ #define OMFB_RAMDAC BMAP_PALLET2 /* Bt454/Bt458 RAMDAC */ +#define OMFB_FB_WADDR (BMAP_BMP + 8) /* common bitmap plane */ +#define OMFB_FB_RADDR (BMAP_BMAP0 + 8)/* bitmap plane #0 */ #define OMFB_SIZE (BMAP_FN0 - BMAP_BMP + PAGE_SIZE) @@ -173,6 +175,8 @@ CFATTACH_DECL_NEW(fb, sizeof(struct omfb extern int hwplanemask; /* hardware planemask; retrieved at boot */ +int hwplanecount; /* for omrasops */ + static int omfb_console; int omfb_cnattach(void); @@ -456,7 +460,7 @@ omfb_resetcmap(struct om_hwdevconfig *dc static void omfb_getdevconfig(paddr_t paddr, struct om_hwdevconfig *dc) { - int bpp, i; + int i; struct rasops_info *ri; union { struct { short h, v; } p; @@ -465,21 +469,21 @@ omfb_getdevconfig(paddr_t paddr, struct switch (hwplanemask) { case 0xff: - bpp = 8; /* XXX check monochrome bit in DIPSW */ + hwplanecount = 8; /* XXX check monochrome bit in DIPSW */ break; default: case 0x0f: - bpp = 4; /* XXX check monochrome bit in DIPSW */ + hwplanecount = 4; /* XXX check monochrome bit in DIPSW */ break; case 1: - bpp = 1; + hwplanecount = 1; break; } dc->dc_wid = 1280; dc->dc_ht = 1024; - dc->dc_depth = bpp; + dc->dc_depth = 
hwplanecount; dc->dc_rowbytes = 2048 / 8; - dc->dc_cmsize = (bpp == 1) ? 0 : 1 << bpp; + dc->dc_cmsize = (hwplanecount == 1) ? 0 : 1 << hwplanecount; dc->dc_videobase = paddr; omfb_resetcmap(dc); @@ -509,7 +513,7 @@ omfb_getdevconfig(paddr_t paddr, struct ri->ri_flg |= RI_NO_AUTO; ri->ri_hw = dc; - if (bpp == 4 || bpp == 8) + if (hwplanecount == 4 || hwplanecount == 8) omrasops4_init(ri, 34, 80); else omrasops1_init(ri, 34, 80); Index: src/sys/arch/luna68k/dev/omrasops.c diff -u src/sys/arch/luna68k/dev/omrasops.c:1.22 src/sys/arch/luna68k/dev/omrasops.c:1.23 --- src/sys/arch/luna68k/dev/omrasops.c:1.22 Sun Sep 25 11:22:36 2022 +++ src/sys/arch/luna68k/dev/omrasops.c Sun Sep 25 11:28:40 2022 @@ -1,4 +1,4 @@ -/* $NetBSD: omrasops.c,v 1.22 2022/09/25 11:22:36 isaki Exp $ */ +/* $NetBSD: omrasops.c,v 1.23 2022/09/25 11:28:40 isaki Exp $ */ /*- * Copyright (c) 2000 The NetBSD Foundation, Inc. @@ -31,7 +31,7 @@ #include <sys/cdefs.h> /* RCS ID & Copyright macro defns */ -__KERNEL_RCSID(0, "$NetBSD: omrasops.c,v 1.22 2022/09/25 11:22:36 isaki Exp $"); +__KERNEL_RCSID(0, "$NetBSD: omrasops.c,v 1.23 2022/09/25 11:28:40 isaki Exp $"); /* * Designed speficically for 'm68k bitorder'; @@ -41,6 +41,15 @@ __KERNEL_RCSID(0, "$NetBSD: omrasops.c,v * - first column is at 32bit aligned address, * - font glyphs are stored in 32bit padded. */ +/* + * BMSEL affects both of + * 1) which plane a write to the common bitmap plane is reflected on and + * 2) which plane's ROP a write to the common ROP is reflected on. + * + * The common ROP is not a ROP applied to write to the common bitmap plane. + * It's equivalent to set ROPs of the plane selected in the plane mask one + * by one. 
+ */ #include <sys/param.h> #include <sys/systm.h> @@ -52,32 +61,84 @@ __KERNEL_RCSID(0, "$NetBSD: omrasops.c,v #include <arch/luna68k/dev/omrasopsvar.h> +#ifdef luna68k +#define USE_M68K_ASM 1 +#endif + +/* To provide optimization conditions to compilers */ +#if defined(__GNUC__) +#define ASSUME(cond) if (!(cond)) __unreachable() +#elif defined(__clang__) && __has_builtin(__builtin_assume) +#define ASSUME(cond) __builtin_assume(cond) +#else +#define ASSUME(cond) (void)(cond) +#endif + +/* XXX it should be redesigned, including making the attributes support 8bpp */ +typedef struct { + union { + int32_t all; + struct { + int8_t ismulti; /* is multi color used */ + uint8_t fg; + uint8_t bg; + uint8_t reserved; + }; + }; +} rowattr_t; + /* wscons emulator operations */ -static void om1_cursor(void *, int, int, int); -static void om4_cursor(void *, int, int, int); -static int om_mapchar(void *, int, unsigned int *); -static void om1_putchar(void *, int, int, u_int, long); -static void om4_putchar(void *, int, int, u_int, long); +static void om_cursor(void *, int, int, int); +static int om_mapchar(void *, int, u_int *); +static void om_putchar(void *, int, int, u_int, long); static void om1_copycols(void *, int, int, int, int); static void om4_copycols(void *, int, int, int, int); static void om1_copyrows(void *, int, int, int num); static void om4_copyrows(void *, int, int, int num); -static void om1_erasecols(void *, int, int, int, long); -static void om4_erasecols(void *, int, int, int, long); -static void om1_eraserows(void *, int, int, long); -static void om4_eraserows(void *, int, int, long); -static int om1_allocattr(void *, int, int, int, long *); -static int om4_allocattr(void *, int, int, int, long *); -static void om4_unpack_attr(long, int *, int *, int *); +static void om_erasecols(void *, int, int, int, long); +static void om_eraserows(void *, int, int, long); +static int om_allocattr(void *, int, int, int, long *); + +static void om_fill(int, int, uint8_t 
*, int, int, uint32_t, int, int); +static void om_fill_color(int, uint8_t *, int, int, int, int); +static void om_rascopy_single(uint8_t *, uint8_t *, int16_t, int16_t, + uint8_t[]); +static void om4_rascopy_multi(uint8_t *, uint8_t *, int16_t, int16_t); +static void om_unpack_attr(long, uint8_t *, uint8_t *, int *); static int omrasops_init(struct rasops_info *, int, int); +/* + * XXX should be fixed... + * This number of elements is derived from howmany(1024, fontheight = 24). + * But it is currently initialized with row = 34, so it is used only up to 34. + */ +static rowattr_t rowattr[43]; + #define ALL1BITS (~0U) #define ALL0BITS (0U) #define BLITWIDTH (32) #define ALIGNMASK (0x1f) #define BYTESDONE (4) +#if 0 /* XXX not used yet */ +/* + * internal attributes. see om_allocattr(). + */ +#define OMFB_ATTR_MULTICOLOR (1U << 31) +#define OMFB_ATTR_UNDERLINE (1U << 17) +#define OMFB_ATTR_BOLD (1U << 16) +#endif + +/* + * XXX deprecated. + * This way cannot be extended to 8bpp, so don't use it in new code. + */ +#define P0(addr) ((uint32_t *)((uint8_t *)(addr) + OMFB_PLANEOFFS * 1)) +#define P1(addr) ((uint32_t *)((uint8_t *)(addr) + OMFB_PLANEOFFS * 2)) +#define P2(addr) ((uint32_t *)((uint8_t *)(addr) + OMFB_PLANEOFFS * 3)) +#define P3(addr) ((uint32_t *)((uint8_t *)(addr) + OMFB_PLANEOFFS * 4)) + /* * macros to handle unaligned bit copy ops. * See src/sys/dev/rasops/rasops_masks.h for MI version. @@ -103,405 +164,934 @@ static int omrasops_init(struct rasops_i #define PUTBITS(src, x, w, pdst) FASTPUTBITS(src, x, w, pdst) /* - * Blit a character at the specified co-ordinates. + * Clear lower w bits from x. + * x must be filled with 1 at least lower w bits. 
*/ -static void -om1_putchar(void *cookie, int row, int startcol, u_int uc, long attr) +#if USE_M68K_ASM +#define CLEAR_LOWER_BITS(x, w) \ + asm volatile( \ + " bclr %[width],%[data] ;\n" \ + " addq.l #1,%[data] ;\n" \ + : [data] "+&d" (x) \ + : [width] "d" (w) \ + : \ + ) +#else +#define CLEAR_LOWER_BITS(x, w) x = ((x) & ~(1U << (w))) + 1 +#endif + +/* Set planemask for the common plane and the common ROP */ +static inline void +om_set_planemask(int planemask) { - struct rasops_info *ri = cookie; - uint8_t *p; - int scanspan, startx, height, width, align, y; - uint32_t lmask, rmask, glyph, inverse; - int i; - uint8_t *fb; - scanspan = ri->ri_stride; - y = ri->ri_font->fontheight * row; - startx = ri->ri_font->fontwidth * startcol; - height = ri->ri_font->fontheight; - fb = (uint8_t *)ri->ri_font->data + - (uc - ri->ri_font->firstchar) * ri->ri_fontscale; - inverse = ((attr & 0x00000001) != 0) ? ALL1BITS : ALL0BITS; + *(volatile uint32_t *)OMFB_PLANEMASK = planemask; +} - p = (uint8_t *)ri->ri_bits + y * scanspan + ((startx / 32) * 4); - align = startx & ALIGNMASK; - width = ri->ri_font->fontwidth + align; - lmask = ALL1BITS >> align; - rmask = ALL1BITS << (-width & ALIGNMASK); - if (width <= BLITWIDTH) { - lmask &= rmask; - /* set lmask as ROP mask value, with THROUGH mode */ - ((volatile uint32_t *)OMFB_ROPFUNC)[ROP_THROUGH] = lmask; +/* Get a ROP address */ +static inline volatile uint32_t * +om_rop_addr(int plane, int rop) +{ - while (height > 0) { - glyph = 0; - for (i = ri->ri_font->stride; i != 0; i--) - glyph = (glyph << 8) | *fb++; - glyph <<= (4 - ri->ri_font->stride) * NBBY; - glyph = (glyph >> align) ^ inverse; + return (volatile uint32_t *) + (OMFB_ROP_P0 + OMFB_PLANEOFFS * plane + rop * 4); +} - *W(p) = glyph; +/* Set ROP and ROP's mask for individual plane */ +static inline void +om_set_rop(int plane, int rop, uint32_t mask) +{ - p += scanspan; - height--; - } - /* reset mask value */ - ((volatile uint32_t *)OMFB_ROPFUNC)[ROP_THROUGH] = ALL1BITS; - } 
else { - uint8_t *q = p; - uint32_t lhalf, rhalf; + *om_rop_addr(plane, rop) = mask; +} - while (height > 0) { - glyph = 0; - for (i = ri->ri_font->stride; i != 0; i--) - glyph = (glyph << 8) | *fb++; - glyph <<= (4 - ri->ri_font->stride) * NBBY; - lhalf = (glyph >> align) ^ inverse; - /* set lmask as ROP mask value, with THROUGH mode */ - ((volatile uint32_t *)OMFB_ROPFUNC)[ROP_THROUGH] = - lmask; - - *W(p) = lhalf; +/* Set ROP and ROP's mask for current setplanemask-ed plane(s) */ +static inline void +om_set_rop_curplane(int rop, uint32_t mask) +{ - p += BYTESDONE; + ((volatile uint32_t *)(OMFB_ROP_COMMON))[rop] = mask; +} - rhalf = (glyph << (BLITWIDTH - align)) ^ inverse; - /* set rmask as ROP mask value, with THROUGH mode */ - ((volatile uint32_t *)OMFB_ROPFUNC)[ROP_THROUGH] = - rmask; +/* Reset planemask and ROP */ +static inline void +om_reset_planemask_and_rop(void) +{ - *W(p) = rhalf; + om_set_planemask(hwplanemask); + om_set_rop_curplane(ROP_THROUGH, ~0U); +} - p = (q += scanspan); - height--; +static inline void +om_set_rowattr(int row, uint8_t fg, uint8_t bg) +{ + + if (rowattr[row].fg == fg && rowattr[row].bg == bg) + return; + if (rowattr[row].ismulti) + return; + + if (rowattr[row].fg == rowattr[row].bg) { + /* From the initial (erased) state, */ + if (rowattr[row].fg != fg && rowattr[row].bg != bg) { + /* if both are changed at once, it's multi color */ + rowattr[row].ismulti = true; + } else { + /* otherwise, it's single color */ + rowattr[row].fg = fg; + rowattr[row].bg = bg; } - /* reset mask value */ - ((volatile uint32_t *)OMFB_ROPFUNC)[ROP_THROUGH] = ALL1BITS; + } else { + rowattr[row].ismulti = true; } } +static inline void +om_reset_rowattr(int row, uint8_t bg) +{ + + /* Setting fg equal to bg means 'reset' or 'erased'. */ + rowattr[row].ismulti = false; + rowattr[row].bg = bg; + rowattr[row].fg = bg; +} + +/* + * Fill rectangle. 
+ * val is assumed only ALL0BITS or ALL1BITS, because all bits are used as is + * regardless of bit offset of the destination. + */ static void -om4_putchar(void *cookie, int row, int startcol, u_int uc, long attr) +om_fill(int planemask, int rop, uint8_t *dstptr, int dstbitoffs, int dstspan, + uint32_t val, int width, int height) { - struct rasops_info *ri = cookie; - uint8_t *p; - int scanspan, startx, height, width, align, y; - uint32_t lmask, rmask, glyph, glyphbg, fgpat, bgpat; - uint32_t fgmask0, fgmask1, fgmask2, fgmask3; - uint32_t bgmask0, bgmask1, bgmask2, bgmask3; - int i, fg, bg; - uint8_t *fb; + uint32_t mask; + uint32_t prev_mask; + int32_t height_m1; + int dw; /* 1 pass width bits */ + + ASSUME(width > 0); + ASSUME(height > 0); + ASSUME(0 <= dstbitoffs && dstbitoffs < 32); + + om_set_planemask(planemask); + + height_m1 = height - 1; + mask = ALL1BITS >> dstbitoffs; + prev_mask = ~mask; + dw = 32 - dstbitoffs; + + /* do-while loop seems slightly faster than a for loop */ + do { + uint8_t *d; + int32_t h; + + width -= dw; + if (width < 0) { + CLEAR_LOWER_BITS(mask, -width); + /* To exit this loop. 
*/ + width = 0; + } + + if (prev_mask != mask) { + om_set_rop_curplane(rop, mask); + prev_mask = mask; + } + + d = dstptr; + dstptr += 4; + h = height_m1; + +#if USE_M68K_ASM + asm volatile("\n" + "om_fill_loop_h:\n" + " move.l %[val],(%[d]) ;\n" + " add.l %[dstspan],%[d] ;\n" + " dbra %[h],om_fill_loop_h ;\n" + : /* output */ + [d] "+&a" (d), + [h] "+&d" (h) + : /* input */ + [val] "d" (val), + [dstspan] "r" (dstspan) + : /* clobbers */ + "memory" + ); +#else + do { + *(uint32_t *)d = val; + d += dstspan; + } while (--h >= 0); +#endif + mask = ALL1BITS; + dw = 32; + } while (width > 0); +} - scanspan = ri->ri_stride; - y = ri->ri_font->fontheight * row; - startx = ri->ri_font->fontwidth * startcol; - height = ri->ri_font->fontheight; - fb = (uint8_t *)ri->ri_font->data + - (uc - ri->ri_font->firstchar) * ri->ri_fontscale; - om4_unpack_attr(attr, &fg, &bg, NULL); - fgmask0 = (fg & 0x01) ? ALL1BITS : ALL0BITS; - fgmask1 = (fg & 0x02) ? ALL1BITS : ALL0BITS; - fgmask2 = (fg & 0x04) ? ALL1BITS : ALL0BITS; - fgmask3 = (fg & 0x08) ? ALL1BITS : ALL0BITS; - bgmask0 = (bg & 0x01) ? ALL1BITS : ALL0BITS; - bgmask1 = (bg & 0x02) ? ALL1BITS : ALL0BITS; - bgmask2 = (bg & 0x04) ? ALL1BITS : ALL0BITS; - bgmask3 = (bg & 0x08) ? 
ALL1BITS : ALL0BITS; - - p = (uint8_t *)ri->ri_bits + y * scanspan + ((startx / 32) * 4); - align = startx & ALIGNMASK; - width = ri->ri_font->fontwidth + align; - lmask = ALL1BITS >> align; - rmask = ALL1BITS << (-width & ALIGNMASK); +static void +om_fill_color(int color, uint8_t *dstptr, int dstbitoffs, int dstspan, + int width, int height) +{ + uint32_t mask; + uint32_t prev_mask; + int32_t height_m1; + int dw; /* 1 pass width bits */ + + ASSUME(width > 0); + ASSUME(height > 0); + ASSUME(omfb_planecount > 0); + + /* select all planes */ + om_set_planemask(hwplanemask); + + mask = ALL1BITS >> dstbitoffs; + prev_mask = ~mask; + dw = 32 - dstbitoffs; + height_m1 = height - 1; + + do { + uint8_t *d; + int32_t plane; + int32_t h; + int16_t rop; + + width -= dw; + if (width < 0) { + CLEAR_LOWER_BITS(mask, -width); + /* To exit this loop. */ + width = 0; + } + + if (prev_mask != mask) { + for (plane = 0; plane < omfb_planecount; plane++) { + if ((color & (1U << plane)) != 0) + rop = ROP_ONE; + else + rop = ROP_ZERO; + om_set_rop(plane, rop, mask); + } + prev_mask = mask; + } - /* select all planes for later ROP function target */ - *(volatile uint32_t *)OMFB_PLANEMASK = 0xff; + d = dstptr; + dstptr += 4; + h = height_m1; + +#if USE_M68K_ASM + asm volatile("\n" + "om_fill_color_loop_h:\n" + " clr.l (%[d]) ;\n" + " add.l %[dstspan],%[d] ;\n" + " dbra %[h],om_fill_color_loop_h ;\n" + : /* output */ + [d] "+&a" (d), + [h] "+&d" (h) + : /* input */ + [dstspan] "r" (dstspan) + : /* clobbers */ + "memory" + ); +#else + do { + /* + * ROP is either ONE or ZERO, + * so don't care what you write to *d. + */ + *(uint32_t *)d = 0; + d += dstspan; + } while (--h >= 0); +#endif + mask = ALL1BITS; + dw = 32; + } while (width > 0); +} - if (width <= BLITWIDTH) { - lmask &= rmask; - /* set lmask as ROP mask value, with THROUGH mode */ - ((volatile uint32_t *)OMFB_ROPFUNC)[ROP_THROUGH] = lmask; +/* + * Calculate ROP depending on fg/bg color combination as follows. 
+ * This is called per individual plane while shifting fg and bg. + * So the LSB of fg and bg points to this plane. + * + * All ROP values we want to use here happens to be a multiple of 5. + * + * bg fg rop result + * -- -- ---------------- ------ + * 0 0 ROP_ZERO = 0 0 + * 0 1 ROP_THROUGH = 5 D + * 1 0 ROP_INV1 = 10 ~D + * 1 1 ROP_ONE = 15 1 + * + * This allows characters to be drawn in the specified fg/bg colors with + * a single write to the common plane. + */ +static inline int +om_fgbg2rop(uint8_t fg, uint8_t bg) +{ + int t; - while (height > 0) { - glyph = 0; - for (i = ri->ri_font->stride; i != 0; i--) - glyph = (glyph << 8) | *fb++; - glyph <<= (4 - ri->ri_font->stride) * NBBY; - glyph = (glyph >> align); - glyphbg = glyph ^ ALL1BITS; - - fgpat = glyph & fgmask0; - bgpat = glyphbg & bgmask0; - *P0(p) = (fgpat | bgpat); - fgpat = glyph & fgmask1; - bgpat = glyphbg & bgmask1; - *P1(p) = (fgpat | bgpat); - fgpat = glyph & fgmask2; - bgpat = glyphbg & bgmask2; - *P2(p) = (fgpat | bgpat); - fgpat = glyph & fgmask3; - bgpat = glyphbg & bgmask3; - *P3(p) = (fgpat | bgpat); + t = (bg & 1) * 2 + (fg & 1); + return t * 5; +} - p += scanspan; - height--; - } - /* reset mask value */ - ((volatile uint32_t *)OMFB_ROPFUNC)[ROP_THROUGH] = ALL1BITS; - } else { - uint8_t *q = p; - uint32_t lhalf, rhalf; - uint32_t lhalfbg, rhalfbg; +/* + * Blit a character at the specified co-ordinates. + * This function modifies(breaks) the planemask and ROPs. 
+ */ +static void +om_putchar(void *cookie, int row, int startcol, u_int uc, long attr) +{ + struct rasops_info *ri = cookie; + uint8_t *fontptr; + uint8_t *dstcmn; + uint32_t mask; + int width; + int height; + int x, y; + int fontstride; + int fontx; + int plane; + int dw; /* 1 pass width bits */ + int xh, xl; + uint8_t fg, bg; + /* ROP address cache */ + static volatile uint32_t *ropaddr[OMFB_MAX_PLANECOUNT]; + static uint8_t last_fg, last_bg; - while (height > 0) { - glyph = 0; - for (i = ri->ri_font->stride; i != 0; i--) - glyph = (glyph << 8) | *fb++; - glyph <<= (4 - ri->ri_font->stride) * NBBY; - lhalf = (glyph >> align); - lhalfbg = lhalf ^ ALL1BITS; - /* set lmask as ROP mask value, with THROUGH mode */ - ((volatile uint32_t *)OMFB_ROPFUNC)[ROP_THROUGH] = - lmask; - - fgpat = lhalf & fgmask0; - bgpat = lhalfbg & bgmask0; - *P0(p) = (fgpat | bgpat); - fgpat = lhalf & fgmask1; - bgpat = lhalfbg & bgmask1; - *P1(p) = (fgpat | bgpat); - fgpat = lhalf & fgmask2; - bgpat = lhalfbg & bgmask2; - *P2(p) = (fgpat | bgpat); - fgpat = lhalf & fgmask3; - bgpat = lhalfbg & bgmask3; - *P3(p) = (fgpat | bgpat); + if (uc >= 0x80) + return; - p += BYTESDONE; + width = ri->ri_font->fontwidth; + height = ri->ri_font->fontheight; + fontstride = ri->ri_font->stride; + y = height * row; + x = width * startcol; + fontptr = (uint8_t *)ri->ri_font->data + + (uc - ri->ri_font->firstchar) * ri->ri_fontscale; - rhalf = (glyph << (BLITWIDTH - align)); - rhalfbg = rhalf ^ ALL1BITS; - /* set rmask as ROP mask value, with THROUGH mode */ - ((volatile uint32_t *)OMFB_ROPFUNC)[ROP_THROUGH] = - rmask; - - fgpat = rhalf & fgmask0; - bgpat = rhalfbg & bgmask0; - *P0(p) = (fgpat | bgpat); - fgpat = rhalf & fgmask1; - bgpat = rhalfbg & bgmask1; - *P1(p) = (fgpat | bgpat); - fgpat = rhalf & fgmask2; - bgpat = rhalfbg & bgmask2; - *P2(p) = (fgpat | bgpat); - fgpat = rhalf & fgmask3; - bgpat = rhalfbg & bgmask3; - *P3(p) = (fgpat | bgpat); + om_unpack_attr(attr, &fg, &bg, NULL); + 
om_set_rowattr(row, fg, bg); - p = (q += scanspan); - height--; - } - /* reset mask value */ - ((volatile uint32_t *)OMFB_ROPFUNC)[ROP_THROUGH] = ALL1BITS; - } - /* select plane #0 only; XXX need this ? */ - *(volatile uint32_t *)OMFB_PLANEMASK = 0x01; + if (last_fg != fg || last_bg != bg) { + last_fg = fg; + last_bg = bg; + /* calculate ROP */ + for (plane = 0; plane < omfb_planecount; plane++) { + int t = om_fgbg2rop(fg, bg); + ropaddr[plane] = om_rop_addr(plane, t); + fg >>= 1; + bg >>= 1; + } + } + + /* divide x into the lower 5 bits and the rest. */ + xh = x >> 5; + xl = x & 0x1f; + + /* write to common plane */ + dstcmn = (uint8_t *)ri->ri_bits + xh * 4 + y * OMFB_STRIDE; + + /* select all plane */ + om_set_planemask(hwplanemask); + + fontx = 0; + mask = ALL1BITS >> xl; + dw = 32 - xl; + + ASSUME(omfb_planecount == 1 || + omfb_planecount == 4 || + omfb_planecount == 8); + + do { + uint8_t *d; + uint8_t *f; + int32_t h; + + width -= dw; + if (width < 0) { + CLEAR_LOWER_BITS(mask, -width); + /* To exit this loop. 
*/ + width = 0; + } + + switch (omfb_planecount) { + case 8: + *(ropaddr[7]) = mask; + *(ropaddr[6]) = mask; + *(ropaddr[5]) = mask; + *(ropaddr[4]) = mask; + /* FALLTHROUGH */ + case 4: + *(ropaddr[3]) = mask; + *(ropaddr[2]) = mask; + *(ropaddr[1]) = mask; + /* FALLTHROUGH */ + case 1: + *(ropaddr[0]) = mask; + break; + } + + d = dstcmn; + f = fontptr; + h = height - 1; + do { + uint32_t v; + GETBITS(f, fontx, dw, v); + /* no need to shift v because it's masked by ROP */ + *(uint32_t *)d = v; + d += OMFB_STRIDE; + f += fontstride; + } while (--h >= 0); + + dstcmn += 4; + fontx += dw; + mask = ALL1BITS; + dw = 32; + } while (width > 0); + + om_reset_planemask_and_rop(); } static void -om1_erasecols(void *cookie, int row, int startcol, int ncols, long attr) +om_erasecols(void *cookie, int row, int startcol, int ncols, long attr) { struct rasops_info *ri = cookie; + int startx; + int width; + int height; + int sh, sl; + int y; + int scanspan; uint8_t *p; - int scanspan, startx, height, width, align, w, y; - uint32_t lmask, rmask, fill; + uint8_t fg, bg; scanspan = ri->ri_stride; y = ri->ri_font->fontheight * row; startx = ri->ri_font->fontwidth * startcol; + width = ri->ri_font->fontwidth * ncols; height = ri->ri_font->fontheight; - w = ri->ri_font->fontwidth * ncols; - fill = ((attr & 0x00000001) != 0) ? 
ALL1BITS : ALL0BITS; - - p = (uint8_t *)ri->ri_bits + y * scanspan + ((startx / 32) * 4); - align = startx & ALIGNMASK; - width = w + align; - lmask = ALL1BITS >> align; - rmask = ALL1BITS << (-width & ALIGNMASK); - if (width <= BLITWIDTH) { - lmask &= rmask; - fill &= lmask; - while (height > 0) { - *P0(p) = (*P0(p) & ~lmask) | fill; - p += scanspan; - height--; - } + om_unpack_attr(attr, &fg, &bg, NULL); + sh = startx >> 5; + sl = startx & 0x1f; + p = (uint8_t *)ri->ri_bits + y * scanspan + sh * 4; + + /* I'm not sure */ + om_set_rowattr(row, fg, bg); + + if (bg == 0) { + /* om_fill seems slightly efficient */ + om_fill(hwplanemask, ROP_ZERO, + p, sl, scanspan, 0, width, height); } else { - uint8_t *q = p; - while (height > 0) { - *P0(p) = (*P0(p) & ~lmask) | (fill & lmask); - width -= 2 * BLITWIDTH; - while (width > 0) { - p += BYTESDONE; - *P0(p) = fill; - width -= BLITWIDTH; - } - p += BYTESDONE; - *P0(p) = (fill & rmask) | (*P0(p) & ~rmask); - - p = (q += scanspan); - width = w + align; - height--; - } + om_fill_color(bg, p, sl, scanspan, width, height); } + + /* reset mask value */ + om_reset_planemask_and_rop(); } static void -om4_erasecols(void *cookie, int row, int startcol, int ncols, long attr) +om_eraserows(void *cookie, int startrow, int nrows, long attr) { struct rasops_info *ri = cookie; + int startx; + int width; + int height; + int sh, sl; + int y; + int scanspan; + int row; uint8_t *p; - int scanspan, startx, height, width, align, w, y, fg, bg; - uint32_t lmask, rmask, fill0, fill1, fill2, fill3; + uint8_t fg, bg; scanspan = ri->ri_stride; - y = ri->ri_font->fontheight * row; - startx = ri->ri_font->fontwidth * startcol; - height = ri->ri_font->fontheight; - w = ri->ri_font->fontwidth * ncols; - om4_unpack_attr(attr, &fg, &bg, NULL); - fill0 = ((bg & 0x01) != 0) ? ALL1BITS : ALL0BITS; - fill1 = ((bg & 0x02) != 0) ? ALL1BITS : ALL0BITS; - fill2 = ((bg & 0x04) != 0) ? ALL1BITS : ALL0BITS; - fill3 = ((bg & 0x08) != 0) ? 
ALL1BITS : ALL0BITS; - - p = (uint8_t *)ri->ri_bits + y * scanspan + ((startx / 32) * 4); - align = startx & ALIGNMASK; - width = w + align; - lmask = ALL1BITS >> align; - rmask = ALL1BITS << (-width & ALIGNMASK); - if (width <= BLITWIDTH) { - lmask &= rmask; - fill0 &= lmask; - fill1 &= lmask; - fill2 &= lmask; - fill3 &= lmask; - while (height > 0) { - *P0(p) = (*P0(p) & ~lmask) | fill0; - *P1(p) = (*P1(p) & ~lmask) | fill1; - *P2(p) = (*P2(p) & ~lmask) | fill2; - *P3(p) = (*P3(p) & ~lmask) | fill3; - p += scanspan; - height--; - } - } else { - uint8_t *q = p; - while (height > 0) { - *P0(p) = (*P0(p) & ~lmask) | (fill0 & lmask); - *P1(p) = (*P1(p) & ~lmask) | (fill1 & lmask); - *P2(p) = (*P2(p) & ~lmask) | (fill2 & lmask); - *P3(p) = (*P3(p) & ~lmask) | (fill3 & lmask); - width -= 2 * BLITWIDTH; - while (width > 0) { - p += BYTESDONE; - *P0(p) = fill0; - *P1(p) = fill1; - *P2(p) = fill2; - *P3(p) = fill3; - width -= BLITWIDTH; - } - p += BYTESDONE; - *P0(p) = (fill0 & rmask) | (*P0(p) & ~rmask); - *P1(p) = (fill1 & rmask) | (*P1(p) & ~rmask); - *P2(p) = (fill2 & rmask) | (*P2(p) & ~rmask); - *P3(p) = (fill3 & rmask) | (*P3(p) & ~rmask); + y = ri->ri_font->fontheight * startrow; + startx = 0; + width = ri->ri_emuwidth; + height = ri->ri_font->fontheight * nrows; + om_unpack_attr(attr, &fg, &bg, NULL); + sh = startx >> 5; + sl = startx & 0x1f; + p = (uint8_t *)ri->ri_bits + y * scanspan + sh * 4; + + for (row = startrow; row < startrow + nrows; row++) { + om_reset_rowattr(row, bg); + } - p = (q += scanspan); - width = w + align; - height--; - } + if (bg == 0) { + /* om_fill seems slightly efficient */ + om_fill(hwplanemask, ROP_ZERO, + p, sl, scanspan, 0, width, height); + } else { + om_fill_color(bg, p, sl, scanspan, width, height); } + /* reset mask value */ + om_reset_planemask_and_rop(); } +/* + * Single plane raster copy. + * dst: destination plane pointer. + * src: source plane pointer. + * if y-forward, src > dst, point to left-top. 
+ * if y-backward, src < dst, point to left-bottom. + * width: pixel width (must > 0) + * height: pixel height (> 0 if forward, < 0 if backward) + * rop: ROP array with omfb_planecount elements. + * + * This function modifies(breaks) the planemask and ROPs. + */ static void -om1_eraserows(void *cookie, int startrow, int nrows, long attr) +om_rascopy_single(uint8_t *dst, uint8_t *src, int16_t width, int16_t height, + uint8_t rop[]) { - struct rasops_info *ri = cookie; - uint8_t *p, *q; - int scanspan, starty, height, width, w; - uint32_t rmask, fill; + uint32_t mask; + int wh; + int wl; + int step; + int plane; + int16_t height_m1; + int16_t w, h; + + step = OMFB_STRIDE; + + /* + * X direction is always forward (or ascend order) to use (An)+ + * addressing mode in asm. + */ + + /* Reverse order Y if backward copy */ + if (height < 0) { + /* The sign is managed by step, height is always positive */ + step = -step; + height = -height; + } + height_m1 = height - 1; + + /* + * On single, it's not necessary to process two longwords at a time, + * but we do so for symmetry and speedup. 
+ */ + + /* First, transfer a rectangle consist of two longwords */ + wh = (width >> 6); + if (wh > 0) { + int step8 = step - wh * 8; + +#if USE_M68K_ASM + wh--; /* for dbra */ + h = height_m1; + asm volatile("\n" + "om_rascopy_single_LL:\n" + " move.w %[wh],%[w] ;\n" + "1:\n" + " move.l (%[src])+,(%[dst])+ ;\n" + " move.l (%[src])+,(%[dst])+ ;\n" + " dbra %[w],1b ;\n" + + " adda.l %[step8],%[src] ;\n" + " adda.l %[step8],%[dst] ;\n" + " dbra %[h],om_rascopy_single_LL ;\n" + : /* output */ + [src] "+&a" (src), + [dst] "+&a" (dst), + [h] "+&d" (h), + [w] "=&d" (w) + : /* input */ + [wh] "r" (wh), + [step8] "r" (step8) + : /* clobbers */ + "memory" + ); +#else + wh--; /* to match to asm side */ + for (h = height_m1; h >= 0; h--) { + uint32_t *s32 = (uint32_t *)src; + uint32_t *d32 = (uint32_t *)dst; + for (w = wh; w >= 0; w--) { + *d32++ = *s32++; + *d32++ = *s32++; + } + src = (uint8_t *)s32 + step8; + dst = (uint8_t *)d32 + step8; + } +#endif - scanspan = ri->ri_stride; - starty = ri->ri_font->fontheight * startrow; - height = ri->ri_font->fontheight * nrows; - w = ri->ri_emuwidth; - fill = ((attr & 0x00000001) != 0) ? 
ALL1BITS : ALL0BITS; + if ((width & 0x3f) == 0) { + /* transfer completed */ + return; + } - p = (uint8_t *)ri->ri_bits + starty * scanspan; - width = w; - rmask = ALL1BITS << (-width & ALIGNMASK); - q = p; - while (height > 0) { - *P0(p) = fill; /* always aligned */ - width -= 2 * BLITWIDTH; - while (width > 0) { - p += BYTESDONE; - *P0(p) = fill; - width -= BLITWIDTH; + /* rewind y for the next transfer */ + src -= height * step; + dst -= height * step; + } + + if ((width & 32) != 0) { + /* Transfer one longword since an odd longword */ +#if USE_M68K_ASM + h = height_m1; + asm volatile("\n" + "om_rascopy_single_L:\n" + " move.l (%[src]),(%[dst]) ;\n" + " adda.l %[step],%[src] ;\n" + " adda.l %[step],%[dst] ;\n" + " dbra %[h],om_rascopy_single_L ;\n" + : /* output */ + [src] "+&a" (src), + [dst] "+&a" (dst), + [h] "+&d" (h) + : /* input */ + [step] "r" (step) + : /* clobbers */ + "memory" + ); +#else + for (h = height_m1; h >= 0; h--) { + *(uint32_t *)dst = *(uint32_t *)src; + dst += step; + src += step; } - p += BYTESDONE; - *P0(p) = (fill & rmask) | (*P0(p) & ~rmask); - p = (q += scanspan); - width = w; - height--; +#endif + + if ((width & 0x1f) == 0) { + /* transfer completed */ + return; + } + + /* rewind y for the next transfer */ + src += 4 - height * step; + dst += 4 - height * step; + } + + wl = width & 0x1f; + /* wl > 0 at this point */ + + /* Then, transfer residual bits */ + + mask = ALL1BITS << (32 - wl); + /* + * The common ROP cannot be used here. Because the hardware doesn't + * allow you to set the mask while keeping the ROP states. 
+ */ + for (plane = 0; plane < omfb_planecount; plane++) { + om_set_rop(plane, rop[plane], mask); + } + +#if USE_M68K_ASM + h = height_m1; + asm volatile("\n" + "om_rascopy_single_bit:\n" + " move.l (%[src]),(%[dst]) ;\n" + " adda.l %[step],%[src] ;\n" + " adda.l %[step],%[dst] ;\n" + " dbra %[h],om_rascopy_single_bit ;\n" + : /* output */ + [src] "+&a" (src), + [dst] "+&a" (dst), + [h] "+&d" (h) + : /* input */ + [step] "r" (step) + : /* clobbers */ + "memory" + ); +#else + for (h = height_m1; h >= 0; h--) { + *(uint32_t *)dst = *(uint32_t *)src; + dst += step; + src += step; + } +#endif + + for (plane = 0; plane < omfb_planecount; plane++) { + om_set_rop(plane, rop[plane], ALL1BITS); } } +/* + * Multiple plane raster copy. + * dst0: destination pointer in Plane0. + * src0: source pointer in Plane0. + * if y-forward, src0 > dst0, point to left-top. + * if y-backward, src0 < dst0, point to left-bottom. + * width: pixel width (must > 0) + * height: pixel height (> 0 if forward, < 0 if backward) + * + * This function modifies(breaks) the planemask and ROPs. + */ static void -om4_eraserows(void *cookie, int startrow, int nrows, long attr) +om4_rascopy_multi(uint8_t *dst0, uint8_t *src0, int16_t width, int16_t height) { - struct rasops_info *ri = cookie; - uint8_t *p, *q; - int scanspan, starty, height, width, w, fg, bg; - uint32_t rmask, fill0, fill1, fill2, fill3; + uint8_t *dst1, *dst2, *dst3; + int wh; + int wl; + int rewind; + int step; + uint32_t mask; + int16_t height_m1; + int16_t w, h; + + step = OMFB_STRIDE; + + /* + * X direction is always forward (or ascend order) to use (An)+ + * addressing mode in asm. 
+ */ + + /* Reverse order Y if backward copy */ + if (height < 0) { + /* The sign is managed by step, height is always positive */ + step = -step; + height = -height; + } + height_m1 = height - 1; + + dst1 = dst0 + OMFB_PLANEOFFS; + dst2 = dst1 + OMFB_PLANEOFFS; + dst3 = dst2 + OMFB_PLANEOFFS; + + /* First, transfer a rectangle consist of two longwords */ + wh = width >> 6; + if (wh > 0) { + int step8 = step - wh * 8; + +#if USE_M68K_ASM + wh--; /* for dbra */ + h = height_m1; + asm volatile("\n" + "om4_rascopy_multi_LL:\n" + " move.w %[wh],%[w] ;\n" + "1:\n" + /* + * Optimized for 68030. + * + * On LUNA, the following is faster than any of + * "MOVE.L (An)+,(An)+", "MOVE.L (An,Dn),(An,Dn)", or + * "MOVEM.L", due to the relationship of instruction + * overlaps and access waits. + * + * The head time of (An)+ as source operand is 0 and + * the head time of ADDA instruction is 2. If the + * previous instruction has some write wait cycles, + * i.e., tail cycles, (An)+ as source operand cannot + * overlap it but ADDA instruction can. 
+ */ + " move.l (%[src0]),(%[dst0])+ ;\n" /* P0 */ + " adda.l %[PLANEOFFS],%[src0] ;\n" + " move.l (%[src0]),(%[dst1])+ ;\n" /* P1 */ + " adda.l %[PLANEOFFS],%[src0] ;\n" + " move.l (%[src0]),(%[dst2])+ ;\n" /* P2 */ + " adda.l %[PLANEOFFS],%[src0] ;\n" + " move.l (%[src0]),(%[dst3])+ ;\n" /* P3 */ + /* Expect an overlap, so don't use (An)+ */ + " addq.l #4,%[src0] ;\n" + + " move.l (%[src0]),(%[dst3])+ ;\n" /* P3 */ + " suba.l %[PLANEOFFS],%[src0] ;\n" + " move.l (%[src0]),(%[dst2])+ ;\n" /* P2 */ + " suba.l %[PLANEOFFS],%[src0] ;\n" + " move.l (%[src0]),(%[dst1])+ ;\n" /* P1 */ + " suba.l %[PLANEOFFS],%[src0] ;\n" + " move.l (%[src0])+,(%[dst0])+ ;\n" /* P0 */ + " dbra %[w],1b ;\n" + + " adda.l %[step8],%[src0] ;\n" + " adda.l %[step8],%[dst0] ;\n" + " adda.l %[step8],%[dst1] ;\n" + " adda.l %[step8],%[dst2] ;\n" + " adda.l %[step8],%[dst3] ;\n" + " dbra %[h],om4_rascopy_multi_LL ;\n" + : /* output */ + [src0] "+&a" (src0), + [dst0] "+&a" (dst0), + [dst1] "+&a" (dst1), + [dst2] "+&a" (dst2), + [dst3] "+&a" (dst3), + [h] "+&d" (h), + [w] "=&d" (w) + : /* input */ + [wh] "r" (wh), + [PLANEOFFS] "r" (OMFB_PLANEOFFS), + [step8] "r" (step8) + : /* clobbers */ + "memory" + ); +#else + wh--; /* to match to asm side */ + for (h = height_m1; h >= 0; h--) { + for (w = wh; w >= 0; w--) { + *(uint32_t *)dst0 = *(uint32_t *)src0; + dst0 += 4; + src0 += OMFB_PLANEOFFS; + *(uint32_t *)dst1 = *(uint32_t *)src0; + dst1 += 4; + src0 += OMFB_PLANEOFFS; + *(uint32_t *)dst2 = *(uint32_t *)src0; + dst2 += 4; + src0 += OMFB_PLANEOFFS; + *(uint32_t *)dst3 = *(uint32_t *)src0; + dst3 += 4; + src0 += 4; + + *(uint32_t *)dst3 = *(uint32_t *)src0; + dst3 += 4; + src0 -= OMFB_PLANEOFFS; + *(uint32_t *)dst2 = *(uint32_t *)src0; + dst2 += 4; + src0 -= OMFB_PLANEOFFS; + *(uint32_t *)dst1 = *(uint32_t *)src0; + dst1 += 4; + src0 -= OMFB_PLANEOFFS; + *(uint32_t *)dst0 = *(uint32_t *)src0; + dst0 += 4; + src0 += 4; + } + src0 += step8; + dst0 += step8; + dst1 += step8; + dst2 += step8; + dst3 += 
step8; + } +#endif - scanspan = ri->ri_stride; - starty = ri->ri_font->fontheight * startrow; - height = ri->ri_font->fontheight * nrows; - w = ri->ri_emuwidth; - om4_unpack_attr(attr, &fg, &bg, NULL); - fill0 = ((bg & 0x01) != 0) ? ALL1BITS : ALL0BITS; - fill1 = ((bg & 0x02) != 0) ? ALL1BITS : ALL0BITS; - fill2 = ((bg & 0x04) != 0) ? ALL1BITS : ALL0BITS; - fill3 = ((bg & 0x08) != 0) ? ALL1BITS : ALL0BITS; + if ((width & 0x3f) == 0) { + /* transfer completed */ + return; + } - p = (uint8_t *)ri->ri_bits + starty * scanspan; - width = w; - rmask = ALL1BITS << (-width & ALIGNMASK); - q = p; - while (height > 0) { - *P0(p) = fill0; /* always aligned */ - *P1(p) = fill1; - *P2(p) = fill2; - *P3(p) = fill3; - width -= 2 * BLITWIDTH; - while (width > 0) { - p += BYTESDONE; - *P0(p) = fill0; - *P1(p) = fill1; - *P2(p) = fill2; - *P3(p) = fill3; - width -= BLITWIDTH; + /* rewind y for the next transfer */ + src0 -= height * step; + dst0 -= height * step; + dst1 -= height * step; + dst2 -= height * step; + dst3 -= height * step; + } + + /* This rewind rewinds the plane, so Y order is irrelevant */ + rewind = OMFB_STRIDE - OMFB_PLANEOFFS * 3; + + if ((width & 32) != 0) { + /* Transfer one longword since an odd longword */ +#if USE_M68K_ASM + h = height_m1; + asm volatile("\n" + "om4_rascopy_multi_L:\n" + " move.l (%[src0]),(%[dst0]) ;\n" + " adda.l %[PLANEOFFS],%[src0] ;\n" + " move.l (%[src0]),(%[dst1]) ;\n" + " adda.l %[PLANEOFFS],%[src0] ;\n" + " move.l (%[src0]),(%[dst2]) ;\n" + " adda.l %[PLANEOFFS],%[src0] ;\n" + " move.l (%[src0]),(%[dst3]) ;\n" + " adda.l %[rewind],%[src0] ;\n" + + " adda.l %[step],%[dst0] ;\n" + " adda.l %[step],%[dst1] ;\n" + " adda.l %[step],%[dst2] ;\n" + " adda.l %[step],%[dst3] ;\n" + " dbra %[h],om4_rascopy_multi_L ;\n" + : /* output */ + [src0] "+&a" (src0), + [dst0] "+&a" (dst0), + [dst1] "+&a" (dst1), + [dst2] "+&a" (dst2), + [dst3] "+&a" (dst3), + [h] "+&d" (h) + : /* input */ + [PLANEOFFS] "r" (OMFB_PLANEOFFS), + [rewind] "r" (rewind), + 
[step] "r" (step) + : /* clobbers */ + "memory" + ); +#else + for (h = height_m1; h >= 0; h--) { + *(uint32_t *)dst0 = *(uint32_t *)src0; + src0 += OMFB_PLANEOFFS; + *(uint32_t *)dst1 = *(uint32_t *)src0; + src0 += OMFB_PLANEOFFS; + *(uint32_t *)dst2 = *(uint32_t *)src0; + src0 += OMFB_PLANEOFFS; + *(uint32_t *)dst3 = *(uint32_t *)src0; + src0 += rewind; + + dst0 += step; + dst1 += step; + dst2 += step; + dst3 += step; } - p += BYTESDONE; - *P0(p) = (fill0 & rmask) | (*P0(p) & ~rmask); - *P1(p) = (fill1 & rmask) | (*P1(p) & ~rmask); - *P2(p) = (fill2 & rmask) | (*P2(p) & ~rmask); - *P3(p) = (fill3 & rmask) | (*P3(p) & ~rmask); - p = (q += scanspan); - width = w; - height--; +#endif + + if ((width & 0x1f) == 0) { + /* transfer completed */ + return; + } + + /* rewind y for the next transfer */ + src0 += 4 - height * step; + dst0 += 4 - height * step; + dst1 += 4 - height * step; + dst2 += 4 - height * step; + dst3 += 4 - height * step; + } + + wl = width & 0x1f; + /* wl > 0 at this point */ + + /* Then, transfer residual bits */ + + mask = ALL1BITS << (32 - wl); + om_set_planemask(hwplanemask); + om_set_rop_curplane(ROP_THROUGH, mask); + +#if USE_M68K_ASM + h = height_m1; + asm volatile("\n" + "om4_rascopy_multi_bit:\n" + " move.l (%[src0]),(%[dst0]) ;\n" + " adda.l %[PLANEOFFS],%[src0] ;\n" + " move.l (%[src0]),(%[dst1]) ;\n" + " adda.l %[PLANEOFFS],%[src0] ;\n" + " move.l (%[src0]),(%[dst2]) ;\n" + " adda.l %[PLANEOFFS],%[src0] ;\n" + " move.l (%[src0]),(%[dst3]) ;\n" + " adda.l %[rewind],%[src0] ;\n" + + " adda.l %[step],%[dst0] ;\n" + " adda.l %[step],%[dst1] ;\n" + " adda.l %[step],%[dst2] ;\n" + " adda.l %[step],%[dst3] ;\n" + " dbra %[h],om4_rascopy_multi_bit ;\n" + : /* output */ + [src0] "+&a" (src0), + [dst0] "+&a" (dst0), + [dst1] "+&a" (dst1), + [dst2] "+&a" (dst2), + [dst3] "+&a" (dst3), + [h] "+&d" (h) + : /* input */ + [PLANEOFFS] "r" (OMFB_PLANEOFFS), + [rewind] "r" (rewind), + [step] "r" (step) + : /* clobbers */ + "memory" + ); +#else + for (h = 
height_m1; h >= 0; h--) { + *(uint32_t *)dst0 = *(uint32_t *)src0; + src0 += OMFB_PLANEOFFS; + *(uint32_t *)dst1 = *(uint32_t *)src0; + src0 += OMFB_PLANEOFFS; + *(uint32_t *)dst2 = *(uint32_t *)src0; + src0 += OMFB_PLANEOFFS; + *(uint32_t *)dst3 = *(uint32_t *)src0; + src0 += rewind; + + dst0 += step; + dst1 += step; + dst2 += step; + dst3 += step; } +#endif + om_reset_planemask_and_rop(); } static void @@ -547,50 +1137,137 @@ static void om4_copyrows(void *cookie, int srcrow, int dstrow, int nrows) { struct rasops_info *ri = cookie; - uint8_t *p, *q; - int scanspan, offset, srcy, height, width, w; - uint32_t rmask; + uint8_t *src, *dst; + int width, rowheight; + int ptrstep, rowstep; + int srcplane; + int i; + int r; + uint8_t rop[OMFB_MAX_PLANECOUNT]; - scanspan = ri->ri_stride; - height = ri->ri_font->fontheight * nrows; - offset = (dstrow - srcrow) * scanspan * ri->ri_font->fontheight; - srcy = ri->ri_font->fontheight * srcrow; - if (srcrow < dstrow && srcrow + nrows > dstrow) { - scanspan = -scanspan; - srcy = srcy + height - 1; + width = ri->ri_emuwidth; + rowheight = ri->ri_font->fontheight; + src = (uint8_t *)ri->ri_bits + srcrow * rowheight * ri->ri_stride; + dst = (uint8_t *)ri->ri_bits + dstrow * rowheight * ri->ri_stride; + + if (nrows <= 0 || srcrow == dstrow) { + return; + } else if (srcrow < dstrow) { + /* y-backward */ + + /* select the bottom raster of the bottom row */ + srcrow += nrows - 1; + dstrow += nrows - 1; + src += nrows * rowheight * ri->ri_stride - ri->ri_stride; + dst += nrows * rowheight * ri->ri_stride - ri->ri_stride; + rowstep = -1; + rowheight = -rowheight; + } else { + /* y-forward */ + rowstep = 1; } + ptrstep = ri->ri_stride * rowheight; - p = (uint8_t *)ri->ri_bits + srcy * ri->ri_stride; - w = ri->ri_emuwidth; - width = w; - rmask = ALL1BITS << (-width & ALIGNMASK); - q = p; - while (height > 0) { - *P0(p + offset) = *P0(p); /* always aligned */ - *P1(p + offset) = *P1(p); - *P2(p + offset) = *P2(p); - *P3(p + offset) = 
*P3(p); - width -= 2 * BLITWIDTH; - while (width > 0) { - p += BYTESDONE; - *P0(p + offset) = *P0(p); - *P1(p + offset) = *P1(p); - *P2(p + offset) = *P2(p); - *P3(p + offset) = *P3(p); - width -= BLITWIDTH; + om_set_planemask(hwplanemask); + + srcplane = 0; + while (nrows > 0) { + r = 1; + if (rowattr[srcrow].ismulti == false && + rowattr[srcrow].fg == rowattr[srcrow].bg && + rowattr[srcrow].all == rowattr[dstrow].all) { + goto skip; + } + + /* count the number of rows with the same attributes */ + for (; r < nrows; r++) { + if (rowattr[srcrow + r * rowstep].all != + rowattr[srcrow].all) { + break; + } } - p += BYTESDONE; - *P0(p + offset) = (*P0(p) & rmask) | (*P0(p + offset) & ~rmask); - *P1(p + offset) = (*P1(p) & rmask) | (*P1(p + offset) & ~rmask); - *P2(p + offset) = (*P2(p) & rmask) | (*P2(p + offset) & ~rmask); - *P3(p + offset) = (*P3(p) & rmask) | (*P3(p + offset) & ~rmask); + /* r is the number of rows including srcrow itself */ - p = (q += scanspan); - width = w; - height--; + if (rowattr[srcrow].ismulti) { + /* + * src,dst point to the common plane. src0,dst0 will + * point to the same offset in plane0 because plane0 + * is placed just after the common plane. + */ + uint8_t *src0 = src + OMFB_PLANEOFFS; + uint8_t *dst0 = dst + OMFB_PLANEOFFS; + om_set_rop_curplane(ROP_THROUGH, ALL1BITS); + om4_rascopy_multi(dst0, src0, width, rowheight * r); + } else { + uint8_t *srcp; + uint8_t fg; + uint8_t bg; + uint8_t set; + + fg = rowattr[srcrow].fg; + bg = rowattr[srcrow].bg; + set = fg ^ bg; + if (set == 0) { + /* use fg since both can be acceptable */ + set = fg; + } else if ((set & fg) != 0) { + /* + * set is the set of bits that set in fg and + * cleared in bg. + */ + set &= fg; + } else { + /* + * otherwise, set is the set of bits that + * (probably) set in bg and cleared in fg. 
+ */ + uint8_t tmp; + + set &= bg; + /* and swap fg and bg */ + tmp = fg; + fg = bg; + bg = tmp; + } + + for (i = 0; i < omfb_planecount; i++) { + int t = om_fgbg2rop(fg, bg); + rop[i] = t; + om_set_rop(i, rop[i], ALL1BITS); + fg >>= 1; + bg >>= 1; + } + + /* + * If any bit in 'set' is set, any of them can be used. + * If all bits in 'set' are cleared, use plane 0. + * srcplane is the plane that fg is set and bg is + * cleared. + */ + srcplane = (set != 0) ? (31 - __builtin_clz(set)) : 0; + + srcp = src + OMFB_PLANEOFFS + srcplane * OMFB_PLANEOFFS; + om_rascopy_single(dst, srcp, width, rowheight * r, rop); + } + +skip: + for (i = 0; i < r; i++) { + rowattr[dstrow] = rowattr[srcrow]; + + srcrow += rowstep; + dstrow += rowstep; + src += ptrstep; + dst += ptrstep; + nrows--; + } } } +/* + * XXX om{1,4}_copycols can be merged, but these are not frequently executed + * and have low execution costs. So I'm putting it off for now. + */ + static void om1_copycols(void *cookie, int startrow, int srccol, int dstcol, int ncols) { @@ -612,6 +1289,8 @@ om1_copycols(void *cookie, int startrow, sb = srcx & ALIGNMASK; db = dstx & ALIGNMASK; + om_reset_planemask_and_rop(); + if (db + w <= BLITWIDTH) { /* Destination is contained within a single word */ sp = basep + (srcx / 32) * 4; @@ -649,8 +1328,9 @@ om1_copycols(void *cookie, int startrow, sboff = sb + lnum; if (sboff >= 32) sboff -= 32; - } else + } else { sboff = sb; + } sq = sp; dq = dp; @@ -742,6 +1422,8 @@ om4_copycols(void *cookie, int startrow, sb = srcx & ALIGNMASK; db = dstx & ALIGNMASK; + om_reset_planemask_and_rop(); + if (db + w <= BLITWIDTH) { /* Destination is contained within a single word */ sp = basep + (srcx / 32) * 4; @@ -785,8 +1467,9 @@ om4_copycols(void *cookie, int startrow, sboff = sb + lnum; if (sboff >= 32) sboff -= 32; - } else + } else { sboff = sb; + } sq = sp; dq = dp; @@ -923,12 +1606,16 @@ om_mapchar(void *cookie, int c, u_int *c * Position|{enable|disable} the cursor at the specified location. 
*/ static void -om1_cursor(void *cookie, int on, int row, int col) +om_cursor(void *cookie, int on, int row, int col) { struct rasops_info *ri = cookie; + int startx; + int width; + int height; + int sh, sl; + int y; + int scanspan; uint8_t *p; - int scanspan, startx, height, width, align, y; - uint32_t lmask, rmask; if (!on) { /* make sure it's on */ @@ -946,169 +1633,120 @@ om1_cursor(void *cookie, int on, int row scanspan = ri->ri_stride; y = ri->ri_font->fontheight * row; startx = ri->ri_font->fontwidth * col; + width = ri->ri_font->fontwidth; height = ri->ri_font->fontheight; + sh = startx >> 5; + sl = startx & 0x1f; + p = (uint8_t *)ri->ri_bits + y * scanspan + sh * 4; - p = (uint8_t *)ri->ri_bits + y * scanspan + ((startx / 32) * 4); - align = startx & ALIGNMASK; - width = ri->ri_font->fontwidth + align; - lmask = ALL1BITS >> align; - rmask = ALL1BITS << (-width & ALIGNMASK); - if (width <= BLITWIDTH) { - lmask &= rmask; - /* set lmask as ROP mask value, with INV2 mode */ - ((volatile uint32_t *)OMFB_ROPFUNC)[ROP_INV2] = lmask; + /* ROP_INV2 ignores data from MPU and inverts the current VRAM data */ + om_fill(hwplanemask, ROP_INV2, p, sl, scanspan, 0, width, height); - while (height > 0) { - *P0(p) = ALL1BITS; - p += scanspan; - height--; - } - /* reset mask value */ - ((volatile uint32_t *)OMFB_ROPFUNC)[ROP_THROUGH] = ALL1BITS; - } else { - uint8_t *q = p; - - while (height > 0) { - /* set lmask as ROP mask value, with INV2 mode */ - ((volatile uint32_t *)OMFB_ROPFUNC)[ROP_INV2] = lmask; - *W(p) = ALL1BITS; - - p += BYTESDONE; - - /* set lmask as ROP mask value, with INV2 mode */ - ((volatile uint32_t *)OMFB_ROPFUNC)[ROP_INV2] = rmask; - *W(p) = ALL1BITS; - - p = (q += scanspan); - height--; - } - /* reset mask value */ - ((volatile uint32_t *)OMFB_ROPFUNC)[ROP_THROUGH] = ALL1BITS; - } ri->ri_flg ^= RI_CURSOR; -} - -static void -om4_cursor(void *cookie, int on, int row, int col) -{ - struct rasops_info *ri = cookie; - uint8_t *p; - int scanspan, startx, 
height, width, align, y; - uint32_t lmask, rmask; - - if (!on) { - /* make sure it's on */ - if ((ri->ri_flg & RI_CURSOR) == 0) - return; - - row = ri->ri_crow; - col = ri->ri_ccol; - } else { - /* unpaint the old copy. */ - ri->ri_crow = row; - ri->ri_ccol = col; - } - - scanspan = ri->ri_stride; - y = ri->ri_font->fontheight * row; - startx = ri->ri_font->fontwidth * col; - height = ri->ri_font->fontheight; - - p = (uint8_t *)ri->ri_bits + y * scanspan + ((startx / 32) * 4); - align = startx & ALIGNMASK; - width = ri->ri_font->fontwidth + align; - lmask = ALL1BITS >> align; - rmask = ALL1BITS << (-width & ALIGNMASK); - - /* select all planes for later ROP function target */ - *(volatile uint32_t *)OMFB_PLANEMASK = 0xff; - if (width <= BLITWIDTH) { - lmask &= rmask; - /* set lmask as ROP mask value, with INV2 mode */ - ((volatile uint32_t *)OMFB_ROPFUNC)[ROP_INV2] = lmask; - - while (height > 0) { - *W(p) = ALL1BITS; - p += scanspan; - height--; - } - /* reset mask value */ - ((volatile uint32_t *)OMFB_ROPFUNC)[ROP_THROUGH] = ALL1BITS; - } else { - uint8_t *q = p; - - while (height > 0) { - /* set lmask as ROP mask value, with INV2 mode */ - ((volatile uint32_t *)OMFB_ROPFUNC)[ROP_INV2] = lmask; - *W(p) = ALL1BITS; - - p += BYTESDONE; - - /* set rmask as ROP mask value, with INV2 mode */ - ((volatile uint32_t *)OMFB_ROPFUNC)[ROP_INV2] = rmask; - *W(p) = ALL1BITS; - - p = (q += scanspan); - height--; - } - /* reset mask value */ - ((volatile uint32_t *)OMFB_ROPFUNC)[ROP_THROUGH] = ALL1BITS; - } - /* select plane #0 only; XXX need this ? */ - *(volatile uint32_t *)OMFB_PLANEMASK = 0x01; - - ri->ri_flg ^= RI_CURSOR; + /* reset mask value */ + om_reset_planemask_and_rop(); } /* * Allocate attribute. We just pack these into an integer. 
+ * + * Attribute bitmap: + * b31: Multi color (used by copyrows) + * b30-18: 0 (reserved) + * b17: Underline (not supported yet) + * b16: Bold (or HILIT if 1bpp; not supported yet) + * b15-8: fg color code + * b7-0: bg color code + */ +#if 0 +/* + * Future plan: + * Place fg and bg side by side in advance to reduce the computation cost + * at the time of ROP setting. + * + * bit: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 + * f7 b7 f6 b6 f5 b5 f4 b4 f3 b3 f2 b2 f1 b1 f0 b0 + * + * In this form, use bit1..0 if 1bpp, use bit7..0 if 4bpp. */ +#endif static int -om1_allocattr(void *id, int fg, int bg, int flags, long *attrp) +om_allocattr(void *id, int fg, int bg, int flags, long *attrp) { + uint32_t a; + uint16_t c; - if ((flags & (WSATTR_HILIT | WSATTR_BLINK | - WSATTR_UNDERLINE | WSATTR_WSCOLORS)) != 0) - return EINVAL; - if ((flags & WSATTR_REVERSE) != 0) - *attrp = 1; - else - *attrp = 0; - return 0; -} - -static int -om4_allocattr(void *id, int fg, int bg, int flags, long *attrp) -{ + a = 0; + c = 0; - if ((flags & (WSATTR_BLINK | WSATTR_UNDERLINE)) != 0) + if ((flags & WSATTR_BLINK) != 0) return EINVAL; + if ((flags & WSATTR_WSCOLORS) == 0) { - fg = WSCOL_WHITE; - bg = WSCOL_BLACK; + fg = WSCOL_WHITE; /* maybe 7 or 1 */ + bg = WSCOL_BLACK; /* maybe 0 */ } if ((flags & WSATTR_REVERSE) != 0) { - int swap; - swap = fg; + int tmp; + tmp = fg; fg = bg; - bg = swap; + bg = tmp; } - if ((flags & WSATTR_HILIT) != 0) - fg += 8; + if ((flags & WSATTR_HILIT) != 0) { + if (omfb_planecount == 1) { +#if 0 + a |= OMFB_ATTR_BOLD; +#else + return EINVAL; +#endif + } else if (fg < 8) { + fg += 8; + } + } + + if ((flags & WSATTR_UNDERLINE) != 0) { +#if 0 + a |= OMFB_ATTR_UNDERLINE; +#else + return EINVAL; +#endif + } - *attrp = (fg << 24) | (bg << 16); + fg &= hwplanemask; + bg &= hwplanemask; + +#if 0 + int i; + for (i = 0; i < omfb_planecount; i++) { + c += c; + c += ((fg & 1) << 1) | (bg & 1); + fg >>= 1; + bg >>= 1; + } +#else + c = (fg << 8) | bg; +#endif + a |= c; + + *attrp = 
a; return 0; } static void -om4_unpack_attr(long attr, int *fg, int *bg, int *underline) +om_unpack_attr(long attr, uint8_t *fg, uint8_t *bg, int *underline) { + uint8_t f, b; + + f = (attr >> 8) & hwplanemask; + b = attr & hwplanemask; - *fg = ((u_int)attr >> 24) & 0xf; - *bg = ((u_int)attr >> 16) & 0xf; + if (fg) + *fg = f; + if (bg) + *bg = b; } /* @@ -1121,14 +1759,14 @@ omrasops1_init(struct rasops_info *ri, i omrasops_init(ri, wantrows, wantcols); /* fill our own emulops */ - ri->ri_ops.cursor = om1_cursor; + ri->ri_ops.cursor = om_cursor; ri->ri_ops.mapchar = om_mapchar; - ri->ri_ops.putchar = om1_putchar; + ri->ri_ops.putchar = om_putchar; ri->ri_ops.copycols = om1_copycols; - ri->ri_ops.erasecols = om1_erasecols; + ri->ri_ops.erasecols = om_erasecols; ri->ri_ops.copyrows = om1_copyrows; - ri->ri_ops.eraserows = om1_eraserows; - ri->ri_ops.allocattr = om1_allocattr; + ri->ri_ops.eraserows = om_eraserows; + ri->ri_ops.allocattr = om_allocattr; ri->ri_caps = WSSCREEN_REVERSE; ri->ri_flg |= RI_CFGDONE; @@ -1143,14 +1781,14 @@ omrasops4_init(struct rasops_info *ri, i omrasops_init(ri, wantrows, wantcols); /* fill our own emulops */ - ri->ri_ops.cursor = om4_cursor; + ri->ri_ops.cursor = om_cursor; ri->ri_ops.mapchar = om_mapchar; - ri->ri_ops.putchar = om4_putchar; + ri->ri_ops.putchar = om_putchar; ri->ri_ops.copycols = om4_copycols; - ri->ri_ops.erasecols = om4_erasecols; + ri->ri_ops.erasecols = om_erasecols; ri->ri_ops.copyrows = om4_copyrows; - ri->ri_ops.eraserows = om4_eraserows; - ri->ri_ops.allocattr = om4_allocattr; + ri->ri_ops.eraserows = om_eraserows; + ri->ri_ops.allocattr = om_allocattr; ri->ri_caps = WSSCREEN_HILIT | WSSCREEN_WSCOLORS | WSSCREEN_REVERSE; ri->ri_flg |= RI_CFGDONE; Index: src/sys/arch/luna68k/dev/omrasopsvar.h diff -u src/sys/arch/luna68k/dev/omrasopsvar.h:1.5 src/sys/arch/luna68k/dev/omrasopsvar.h:1.6 --- src/sys/arch/luna68k/dev/omrasopsvar.h:1.5 Sun Sep 22 05:49:16 2019 +++ src/sys/arch/luna68k/dev/omrasopsvar.h Sun Sep 25 
11:28:40 2022 @@ -1,4 +1,4 @@ -/* $NetBSD: omrasopsvar.h,v 1.5 2019/09/22 05:49:16 rin Exp $ */ +/* $NetBSD: omrasopsvar.h,v 1.6 2022/09/25 11:28:40 isaki Exp $ */ /* * Copyright (c) 2013 Kenji Aoyama * @@ -15,27 +15,28 @@ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ +#include <machine/board.h> + /* * Base addresses of LUNA's frame buffer * XXX: We consider only 1bpp and 4bpp for now */ -#include <machine/board.h> - -#define OMFB_PLANEMASK BMAP_BMSEL /* BMSEL register */ -#define OMFB_FB_WADDR (BMAP_BMP + 8) /* common plane */ -#define OMFB_FB_RADDR (BMAP_BMAP0 + 8)/* plane #0 */ -#define OMFB_ROPFUNC BMAP_FN /* common ROP function */ - -/* - * Helper macros - */ -#define W(addr) ((uint32_t *)(addr)) -#define R(addr) ((uint32_t *)((uint8_t *)(addr) + 0x40000)) -#define P0(addr) ((uint32_t *)((uint8_t *)(addr) + 0x40000)) -#define P1(addr) ((uint32_t *)((uint8_t *)(addr) + 0x80000)) -#define P2(addr) ((uint32_t *)((uint8_t *)(addr) + 0xC0000)) -#define P3(addr) ((uint32_t *)((uint8_t *)(addr) + 0x100000)) +#define OMFB_PLANEMASK BMAP_BMSEL /* BMSEL register */ +#define OMFB_ROP_COMMON BMAP_FN /* common ROP */ +#define OMFB_ROP_P0 BMAP_FN0 + +/* will be merged in near future */ +#define OMFB_ROPFUNC BMAP_FN /* common ROP function */ + +#define OMFB_MAX_PLANECOUNT (8) +#define OMFB_PLANEOFFS (0x40000) /* plane offset */ +#define OMFB_STRIDE (2048/8) /* stride [byte] */ + +/* TODO: should be improved... */ +#define omfb_planecount hwplanecount +extern int hwplanemask; +extern int hwplanecount; /* * ROP function