On 29-12-2021 00:00, Bart via lazarus wrote:
On Tue, Dec 28, 2021 at 11:35 PM Martin Frb via lazarus
<lazarus@lists.lazarus-ide.org> wrote:
I have a core I7-8600
The diff between the old code and popcnt is less significant.
old: 715
pop: 695
But there is a 3rd way, that is faster.
add: 610
Not surprising that you should come up with a faster solution.
IIRC you won both speed contests I had on the forum ;-)
Feel free to implement it in LazUtf8.
New unit test, with Martin's integrated. If I play with godbolt, Ryzen
zen3 (ryzen 5x00X) is nearly twice as fast in cycles as my Ivy Bridge,
so I would like to see some benchmarks from various processors. Also
from very old ones (P4 and Clawhammers) to test instruction sets.
I use unaligned loads which afaik on older ( pre Core 1st or 2nd
generation) CPUs are costly. (because it loads two caches lines per time)
//
// (C) 2021 Martin Friebe and Marco van de Voort.
// attempt to accelerate utf8lengthfast which is a length(s) in utf8 codepoints
without integrity checking
//
// 4 versions.
// - Original,
// - with popcount and
// - the "add" variant that accumulates 127 iterations of ptrints and only adds
// the intermeidates outside that loop
// - a SSE2 version loosely inspired by the add variant combined with
// the core of an existing (branchless) binarization routine for the
main loop.
{$mode objfpc}{$H+}
{$asmmode intel}
{define asmdebug}
uses SysUtils,StrUtils;
const
mask3 : array[0..15] of byte = ( $C0,$C0,$C0,$C0,
$C0,$C0,$C0,$C0,
$C0,$C0,$C0,$C0,
$C0,$C0,$C0,$C0);
mask4 : array[0..15] of byte = ( $80,$80,$80,$80,
$80,$80,$80,$80,
$80,$80,$80,$80,
$80,$80,$80,$80);
mask2 : array[0..15] of byte = ( $1,$1,$1,$1,
$1,$1,$1,$1,
$1,$1,$1,$1,
$1,$1,$1,$1);
// Integer arguments are passed in registers RCX, RDX, R8, and R9.
// Floating point arguments are passed in XMM0L, XMM1L, XMM2L, and XMM3L.
// volatile: RAX, RCX, RDX, R8, R9, R10, R11
// nonvolatile RBX, RBP, RDI, RSI, RSP, R12, R13, R14, and R15 are considered
nonvolatile
// volatile xmm0-xmm3 (params) en xmm4,5
// https://msdn.microsoft.com/en-us/library/ms235286.aspx
{$ifdef asmdebug}
function asmutf8length(const s : pchar;res:pbyte;len:integer):int64;
{$else}
function asmutf8length(const s : pchar;len:integer):int64;
{$endif}
begin
asm
{$ifndef asmdebug}
mov r8,rdx
{$endif}
// using broadcast etc raises requirements?
mov rax,r8
mov r9,r8
// tuning for short strings:
// ------
// test rax,rax
// je @theend
// cmp r9,128 // difference between long and short
// jl @restbytes
and r9,15
shr r8,4
pxor xmm5,xmm5 // always zero
pxor xmm6,xmm6 // dword counts
movdqu xmm1,[rip+mask3]
movdqu xmm2,[rip+mask4]
movdqu xmm3,[rip+mask2]
test r8,r8
je @restbytes
@outer:
mov r10,127 // max iterations
cmp r10,r8 // more or less left?
jl @last // more
mov r10,r8 // less
@last:
sub r8,r10 // iterations left - iterations to do
pxor xmm4,xmm4
@inner:
movdqu xmm0, [rcx]
pand xmm0,xmm1 // mask out top 2 bits
pcmpeqb xmm0,xmm2 // compare with $80.
pand xmm0,xmm3 // change to $1 per byte.
paddb xmm4,xmm0 // add to cumulative
add rcx,16
dec r10
jne @inner
// process 127 iterations
movdqa xmm0,xmm4
PUNPCKLBW xmm0,xmm5 // zero extend to words
PUNPCKHBW xmm4,xmm5
paddw xmm0,xmm4 // add, now 8 16-bit words.
movdqa xmm4,xmm0
PUNPCKLWD xmm0,xmm5 // zero extend to dwords
paddd xmm6,xmm0
PUNPCKHWD xmm4,xmm5
paddd xmm6,xmm4 // add both to cumulative
test r8,r8
jne @outer
MOVHLPS xmm4,xmm6 // move high 8 bytes to low (float->int
penalty?)
paddd xmm6,xmm4 // add both 2*dwords (high doesn't matter)
pshufd xmm4,xmm6,1 // mov 2nd dword in xmm6 to first in xmm4
paddd xmm6,xmm4 // add
movd r8d,xmm6 // to int alu reg
sub rax,r8 // subtract from length in bytes.
@restbytes:
test r9,r9
je @theend // Done!
@restloop:
movzx r8d, byte [rcx] // unaligned bytes after sse loop
mov r10,r8
shr r10,7
not r8
shr r8,6
and r10,r8
sub rax,r10
inc rcx
dec r9
jne @restloop
@theend:
end['xmm5','xmm6']; // volatile registers used.
end;
function countmask(nx:int64):integer;
begin
nx := (nx and $00FF00FF00FF00FF) + ((nx >> 8) and $00FF00FF00FF00FF);
nx := (nx and $0000FFFF0000FFFF) + ((nx >> 16) and $0000FFFF0000FFFF);
result := (nx and $00000000FFFFFFFF) + ((nx >> 32) and $00000000FFFFFFFF);
end;
function UTF8LengthFast(p: PChar; ByteCount: PtrInt): PtrInt;
const
{$ifdef CPU32}
ONEMASK =$01010101;
EIGHTYMASK=$80808080;
{$endif}
{$ifdef CPU64}
ONEMASK =$0101010101010101;
EIGHTYMASK=$8080808080808080;
{$endif}
var
pnx: PPtrInt absolute p; // To get contents of text in PtrInt blocks. x
refers to 32 or 64 bits
pn8: pint8 absolute pnx; // To read text as Int8 in the initial and final
loops
ix: PtrInt absolute pnx; // To read text as PtrInt in the block loop
nx: PtrInt; // values processed in block loop
i,cnt,e: PtrInt;
begin
Result := 0;
e := ix+ByteCount; // End marker
// Handle any initial misaligned bytes.
cnt := (not (ix-1)) and (sizeof(PtrInt)-1);
if cnt>ByteCount then
cnt := ByteCount;
for i := 1 to cnt do
begin
// Is this byte NOT the first byte of a character?
Result += (pn8^ shr 7) and ((not pn8^) shr 6);
inc(pn8);
end;
// Handle complete blocks
for i := 1 to (ByteCount-cnt) div sizeof(PtrInt) do
begin
// Count bytes which are NOT the first byte of a character.
nx := ((pnx^ and EIGHTYMASK) shr 7) and ((not pnx^) shr 6);
{$push}{$overflowchecks off} // "nx * ONEMASK" causes an arithmetic
overflow.
Result += (nx * ONEMASK) >> ((sizeof(PtrInt) - 1) * 8);
{$pop}
inc(pnx);
end;
// Take care of any left-over bytes.
while ix<e do
begin
// Is this byte NOT the first byte of a character?
Result += (pn8^ shr 7) and ((not pn8^) shr 6);
inc(pn8);
end;
Result := ByteCount - Result;
end;
function UTF8LengthPop(p: PChar; ByteCount: PtrInt): PtrInt;
const
{$ifdef CPU32}
ONEMASK =$01010101;
EIGHTYMASK=$80808080;
{$endif}
{$ifdef CPU64}
ONEMASK =$0101010101010101;
EIGHTYMASK=$8080808080808080;
{$endif}
var
pnx: PPtrInt absolute p; // To get contents of text in PtrInt blocks. x
refers to 32 or 64 bits
pn8: pint8 absolute pnx; // To read text as Int8 in the initial and final
loops
ix: PtrInt absolute pnx; // To read text as PtrInt in the block loop
nx: PtrInt; // values processed in block loop
i,cnt,e: PtrInt;
begin
Result := 0;
e := ix+ByteCount; // End marker
// Handle any initial misaligned bytes.
cnt := (not (ix-1)) and (sizeof(PtrInt)-1);
if cnt>ByteCount then
cnt := ByteCount;
for i := 1 to cnt do
begin
// Is this byte NOT the first byte of a character?
Result += (pn8^ shr 7) and ((not pn8^) shr 6);
inc(pn8);
end;
// Handle complete blocks
for i := 1 to (ByteCount-cnt) div sizeof(PtrInt) do
begin
// Count bytes which are NOT the first byte of a character.
nx := ((pnx^ and EIGHTYMASK) shr 7) and ((not pnx^) shr 6);
{$push}{$overflowchecks off} // "nx * ONEMASK" causes an arithmetic
overflow.
//Result += (nx * ONEMASK) >> ((sizeof(PtrInt) - 1) * 8);
Result += PopCnt(qword(nx));
{$pop}
inc(pnx);
end;
// Take care of any left-over bytes.
while ix<e do
begin
// Is this byte NOT the first byte of a character?
Result += (pn8^ shr 7) and ((not pn8^) shr 6);
inc(pn8);
end;
Result := ByteCount - Result;
end;
function UTF8LengthAdd(p: PChar; ByteCount: PtrInt): PtrInt;
const
{$ifdef CPU32}
ONEMASK =$01010101;
EIGHTYMASK=$80808080;
{$endif}
{$ifdef CPU64}
ONEMASK =$0101010101010101;
EIGHTYMASK=$8080808080808080;
{$endif}
var
pnx: PPtrInt absolute p; // To get contents of text in PtrInt blocks. x
refers to 32 or 64 bits
pn8: pint8 absolute pnx; // To read text as Int8 in the initial and final
loops
ix: PtrInt absolute pnx; // To read text as PtrInt in the block loop
nx: PtrInt; // values processed in block loop
i,j,cnt,e, bc: PtrInt;
begin
Result := 0;
e := ix+ByteCount; // End marker
// Handle any initial misaligned bytes.
cnt := (not (ix-1)) and (sizeof(PtrInt)-1);
if cnt>ByteCount then
cnt := ByteCount;
for i := 1 to cnt do
begin
// Is this byte NOT the first byte of a character?
Result += (pn8^ shr 7) and ((not pn8^) shr 6);
inc(pn8);
end;
// Handle complete blocks
bc := (ByteCount-cnt) div sizeof(PtrInt);
for j := 1 to bc >> 7 do begin
nx := 0;
for i := 0 to 127 do
begin
// Count bytes which are NOT the first byte of a character.
nx += ((pnx^ and EIGHTYMASK) shr 7) and ((not pnx^) shr 6);
inc(pnx);
end;
nx := (nx and $00FF00FF00FF00FF) + ((nx >> 8) and $00FF00FF00FF00FF);
nx := (nx and $0000FFFF0000FFFF) + ((nx >> 16) and $0000FFFF0000FFFF);
nx := (nx and $00000000FFFFFFFF) + ((nx >> 32) and $00000000FFFFFFFF);
Result := Result + nx;
end;
if (bc and 127) > 0 then begin
nx := 0;
for i := 1 to bc and 127 do
begin
// Count bytes which are NOT the first byte of a character.
nx += ((pnx^ and EIGHTYMASK) shr 7) and ((not pnx^) shr 6);
inc(pnx);
end;
nx := (nx and $00FF00FF00FF00FF) + ((nx >> 8) and $00FF00FF00FF00FF);
nx := (nx and $0000FFFF0000FFFF) + ((nx >> 16) and $0000FFFF0000FFFF);
nx := (nx and $00000000FFFFFFFF) + ((nx >> 32) and $00000000FFFFFFFF);
Result := Result + nx;
end;
// Take care of any left-over bytes.
while ix<e do
begin
// Is this byte NOT the first byte of a character?
Result += (pn8^ shr 7) and ((not pn8^) shr 6);
inc(pn8);
end;
Result := ByteCount - Result;
end;
// one of each pattern.
const pattern : array[0..3] of char = (chr(%11001001),chr(%10001001),
chr(%00001001),chr(%01001001));
function pseudorandomutf8string(len:integer;var cnt:integer):string;
// random string but keep a count of bytes with high value %10
var lcnt:integer;
i,j:integer;
begin
setlength(result,len);
lcnt:=0;
for i:=1 to length(result) do
begin
j:=random(4);
//j:=i and 3;
if j=1 then inc(lcnt);
result[i]:=pattern[j];
end;
cnt:=length(result)-lcnt;
end;
var r : array[0..10000] of byte; // FPC "registers" dialog is poor, we use
this for writeln like dumping
procedure testasmutf8length;
const testlen = 64*100;
var s : string;
i,j,cnt : integer;
rx : int64;
begin
randomize;
s:=pseudorandomutf8string(testlen+Random(50),cnt);
rx:=asmutf8length(pchar(s),{$ifdef asmdebug}@r[0],{$endif}length(s));
writeln(inttohex(cnt,2),' = ',inttohex(rx,2),' ',inttohex(length(s)-cnt,2),'
= ',inttohex(length(s)-rx,2)); // hex because most register dumps are easier
in hex.
{$ifdef asmdebug}
for i:=0 to 6 do
begin
write(i:2,' ');
for j:=0 to 3 do
write(inttohex(pdword(@r[i*16+j*4])^,8), ' ');
writeln;
end;
{$endif}
end;
var
a: ansistring;
t: QWord;
i, j, ii: Integer;
begin
{$ifdef asmdebug}
testasmutf8length;
{$else}
a := 'اربك تكست هو اول موقع يسمح لزواره الكرام بتحويل الكتابة العربي الى
كتابة مفهومة من قبل اغلب برامج التصميم مثل الفوتوشوب و';
a :=
a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A;
a :=
a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A;
a :=
a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A;
a := a+a+A+a+a+A+a+a;
writeln(Length(a));
writeln('fst:',UTF8LengthFast(@a[1], Length(a)));
writeln('pop:',UTF8LengthPop(@a[1], Length(a)));
writeln('add:',UTF8LengthAdd(@a[1], Length(a)));
writeln('asm:',asmUTF8Length(@a[1], {$ifdef asmdebug}@r[0],{$endif}
Length(a)));
WriteLn();
writeln(Length(a) div 8);
WriteLn();
for ii := 0 to 1 do begin
for i := 0 to 3 do begin
t := GetTickCount64;
for j := 0 to 19 do
UTF8LengthFast(@a[1], Length(a));
t := GetTickCount64 - t;
writeln('fst ',t);
end;
for i := 0 to 3 do begin
t := GetTickCount64;
for j := 0 to 19 do
UTF8LengthPop(@a[1], Length(a));
t := GetTickCount64 - t;
writeln('pop ',t);
end;
for i := 0 to 3 do begin
t := GetTickCount64;
for j := 0 to 19 do
UTF8LengthAdd(@a[1], Length(a));
t := GetTickCount64 - t;
writeln('add ',t);
end;
for i := 0 to 3 do begin
t := GetTickCount64;
for j := 0 to 19 do
asmUTF8Length(@a[1], {$ifdef asmdebug}@r[0],{$endif} Length(a));
t := GetTickCount64 - t;
writeln('asm ',t);
end;
end;
{$endif}
{$ifndef FPC}
if debughook<>nil then // runtime debugger detection
{$endif}
readln;
end.
--
_______________________________________________
lazarus mailing list
lazarus@lists.lazarus-ide.org
https://lists.lazarus-ide.org/listinfo/lazarus