On 30-12-2021 10:15, Florian Klämpfl via lazarus wrote:
Linux uses different calling conventions, please check with the patch
below.
Linux is quite generous with the volatile registers, so luckily it
matches quite closely.
I first tried the approach of your patch, but [s] has problems on
windows, so would require ifdef on every "s"use, so I simply move [s] to rcx
{$ifndef Windows}
// we can't use [s] as an alias for the pointer parameter, because
the non assembler procedure on Windows
// changes that into a stack reference. FPC doesn't support non
volatile frame management for assembler procs like Delphi does.
mov rcx,s // rdi
mov edx,len // rsi
{$endif}
and the ifdeffing of the assembler procedure on linux vs inline asm
block on Windows. Then it works on Linux x86_64.
Funnily, our server AMD Athlon 200GE (Zen1, 3.2GHz?) nearly the exact
same timings as my i7-3770 3.4GHz
I did some other minor work after last post, so here is now the entire
program://
// (C) 2021 Martin Friebe and Marco van de Voort.
// attempt to accelerate utf8lengthfast which is a length(s) in utf8 codepoints
without integrity checking
//
// 4 versions.
// - Original,
// - with popcount and
// - the "add" variant that accumulates 127 iterations of ptrints and only adds
// the intermeidates outside that loop
// - a SSE2 version loosely inspired by the add variant combined with
// the core of an existing (branchless) binarization routine for the
main loop.
{$mode objfpc}{$H+}
{$asmmode intel}
{$coperators on}
{define asmdebug}
uses SysUtils,StrUtils;
const
mask3 : array[0..15] of byte = ( $C0,$C0,$C0,$C0,
$C0,$C0,$C0,$C0,
$C0,$C0,$C0,$C0,
$C0,$C0,$C0,$C0);
mask4 : array[0..15] of byte = ( $80,$80,$80,$80,
$80,$80,$80,$80,
$80,$80,$80,$80,
$80,$80,$80,$80);
mask2 : array[0..15] of byte = ( $1,$1,$1,$1,
$1,$1,$1,$1,
$1,$1,$1,$1,
$1,$1,$1,$1);
// Integer arguments are passed in registers RCX, RDX, R8, and R9.
// Floating point arguments are passed in XMM0L, XMM1L, XMM2L, and XMM3L.
// volatile: RAX, RCX, RDX, R8, R9, R10, R11
// nonvolatile RBX, RBP, RDI, RSI, RSP, R12, R13, R14, and R15 are considered
nonvolatile
// volatile xmm0-xmm3 (params) en xmm4,5
// https://msdn.microsoft.com/en-us/library/ms235286.aspx
{$ifdef asmdebug}
function asmutf8length(const s : pchar;len:integer,res:pbyte):int64;
{$else}
function asmutf8length(const s : pchar;len:integer):int64;{$ifndef
Windows}assembler; nostackframe;{$endif}
{$endif}
{$ifdef Windows}
begin
{$endif}
asm
// tuning for short strings:
// ------
{$ifndef Windows}
// we can't use [s] as an alias for the pointer parameter, because the non
assembler procedure on Windows
// changes that into a stack reference. FPC doesn't support non volatile
frame management for assembler procs like Delphi does.
mov rcx,s // rdi
mov edx,len // rsi
{$endif}
test rax,rax
je @theend
cmp rdx,128 // threshold between long and short.
jl @restbytes
mov rax,rdx
mov r10,rcx
and r10,15
mov r9,16
sub r9,r10
and r9,15
test r9,r9
je @nopreloop
sub rdx,r9
@preloop: // roughly 2 cycles per iteration on ivy bridge
movzx r11d, byte [rcx] // unaligned bytes after sse loop
mov r10,r11
shr r10,7
not r11
shr r11,6
and r10,r11
sub rax,r10
inc rcx
dec r9
jne @preloop
@nopreloop:
mov r9,rdx
and r9,15
shr rdx,4
pxor xmm5,xmm5 // always zero
pxor xmm6,xmm6 // dword counts
// using broadcast etc raises requirements? -> use constant loads.
movdqu xmm1,[rip+mask3]
movdqu xmm2,[rip+mask4]
movdqu xmm3,[rip+mask2]
test rdx,rdx
je @restbytes
@outer:
mov r10,127 // max iterations per inner loop
cmp r10,rdx // more or less left?
jl @last // more
mov r10,rdx // less
@last:
sub rdx,r10 // iterations left - iterations to do
pxor xmm4,xmm4
// process 127 iterations (limit of signed int8)
@inner: // +/- 2.2 cycles per iteration for 16 bytes on ivy
bridge
movdqu xmm0, [rcx]
pand xmm0,xmm1 // mask out top 2 bits
pcmpeqb xmm0,xmm2 // compare with $80.
pand xmm0,xmm3 // change to $1 per byte.
paddb xmm4,xmm0 // add to cumulative
add rcx,16
dec r10
jne @inner
// SSSE3 vertical adds might help this, but increase CPU reqs.
movdqa xmm0,xmm4
PUNPCKLBW xmm0,xmm5 // zero extend to words
PUNPCKHBW xmm4,xmm5
paddw xmm0,xmm4 // add, now 8 16-bit words.
movdqa xmm4,xmm0
PUNPCKLWD xmm0,xmm5 // zero extend to dwords
paddd xmm6,xmm0
PUNPCKHWD xmm4,xmm5
paddd xmm6,xmm4 // add both L and H to cumulative 4x dword xmm6
reg
test rdx,rdx
jne @outer
MOVHLPS xmm4,xmm6 // move high 8 bytes to low (no float->int
penalty for move only?)
paddd xmm6,xmm4 // add both 2*dwords (high doesn't matter)
pshufd xmm4,xmm6,1 // mov 2nd dword in xmm6 to first in xmm4
paddd xmm6,xmm4 // add
movd edx,xmm6 // to int alu reg
sub rax,rdx // subtract from length in bytes.
@restbytes:
test r9,r9
je @theend // Done!
@restloop:
movzx edx, byte [rcx] // unaligned bytes after sse loop
mov r10,rdx
shr r10,7
not rdx
shr rdx,6
and r10,rdx
sub rax,r10
inc rcx
dec r9
jne @restloop
@theend:
{$ifdef Windows}
end['xmm6']; // volatile registers used.
{$endif}
end;
function countmask(nx:int64):integer;
begin
nx := (nx and $00FF00FF00FF00FF) + ((nx >> 8) and $00FF00FF00FF00FF);
nx := (nx and $0000FFFF0000FFFF) + ((nx >> 16) and $0000FFFF0000FFFF);
result := (nx and $00000000FFFFFFFF) + ((nx >> 32) and $00000000FFFFFFFF);
end;
function UTF8LengthFast(p: PChar; ByteCount: PtrInt): PtrInt;
const
{$ifdef CPU32}
ONEMASK =$01010101;
EIGHTYMASK=$80808080;
{$endif}
{$ifdef CPU64}
ONEMASK =$0101010101010101;
EIGHTYMASK=$8080808080808080;
{$endif}
var
pnx: PPtrInt absolute p; // To get contents of text in PtrInt blocks. x
refers to 32 or 64 bits
pn8: pint8 absolute pnx; // To read text as Int8 in the initial and final
loops
ix: PtrInt absolute pnx; // To read text as PtrInt in the block loop
nx: PtrInt; // values processed in block loop
i,cnt,e: PtrInt;
begin
Result := 0;
e := ix+ByteCount; // End marker
// Handle any initial misaligned bytes.
cnt := (not (ix-1)) and (sizeof(PtrInt)-1);
if cnt>ByteCount then
cnt := ByteCount;
for i := 1 to cnt do
begin
// Is this byte NOT the first byte of a character?
Result += (pn8^ shr 7) and ((not pn8^) shr 6);
inc(pn8);
end;
// Handle complete blocks
for i := 1 to (ByteCount-cnt) div sizeof(PtrInt) do
begin
// Count bytes which are NOT the first byte of a character.
nx := ((pnx^ and EIGHTYMASK) shr 7) and ((not pnx^) shr 6);
{$push}{$overflowchecks off} // "nx * ONEMASK" causes an arithmetic
overflow.
Result += (nx * ONEMASK) >> ((sizeof(PtrInt) - 1) * 8);
{$pop}
inc(pnx);
end;
// Take care of any left-over bytes.
while ix<e do
begin
// Is this byte NOT the first byte of a character?
Result += (pn8^ shr 7) and ((not pn8^) shr 6);
inc(pn8);
end;
Result := ByteCount - Result;
end;
function UTF8LengthPop(p: PChar; ByteCount: PtrInt): PtrInt;
const
{$ifdef CPU32}
ONEMASK =$01010101;
EIGHTYMASK=$80808080;
{$endif}
{$ifdef CPU64}
ONEMASK =$0101010101010101;
EIGHTYMASK=$8080808080808080;
{$endif}
var
pnx: PPtrInt absolute p; // To get contents of text in PtrInt blocks. x
refers to 32 or 64 bits
pn8: pint8 absolute pnx; // To read text as Int8 in the initial and final
loops
ix: PtrInt absolute pnx; // To read text as PtrInt in the block loop
nx: PtrInt; // values processed in block loop
i,cnt,e: PtrInt;
begin
Result := 0;
e := ix+ByteCount; // End marker
// Handle any initial misaligned bytes.
cnt := (not (ix-1)) and (sizeof(PtrInt)-1);
if cnt>ByteCount then
cnt := ByteCount;
for i := 1 to cnt do
begin
// Is this byte NOT the first byte of a character?
Result += (pn8^ shr 7) and ((not pn8^) shr 6);
inc(pn8);
end;
// Handle complete blocks
for i := 1 to (ByteCount-cnt) div sizeof(PtrInt) do
begin
// Count bytes which are NOT the first byte of a character.
nx := ((pnx^ and EIGHTYMASK) shr 7) and ((not pnx^) shr 6);
{$push}{$overflowchecks off} // "nx * ONEMASK" causes an arithmetic
overflow.
//Result += (nx * ONEMASK) >> ((sizeof(PtrInt) - 1) * 8);
Result += PopCnt(qword(nx));
{$pop}
inc(pnx);
end;
// Take care of any left-over bytes.
while ix<e do
begin
// Is this byte NOT the first byte of a character?
Result += (pn8^ shr 7) and ((not pn8^) shr 6);
inc(pn8);
end;
Result := ByteCount - Result;
end;
function UTF8LengthAdd(p: PChar; ByteCount: PtrInt): PtrInt;
const
{$ifdef CPU32}
ONEMASK =$01010101;
EIGHTYMASK=$80808080;
{$endif}
{$ifdef CPU64}
ONEMASK =$0101010101010101;
EIGHTYMASK=$8080808080808080;
{$endif}
var
pnx: PPtrInt absolute p; // To get contents of text in PtrInt blocks. x
refers to 32 or 64 bits
pn8: pint8 absolute pnx; // To read text as Int8 in the initial and final
loops
ix: PtrInt absolute pnx; // To read text as PtrInt in the block loop
nx: PtrInt; // values processed in block loop
i,j,cnt,e, bc: PtrInt;
begin
Result := 0;
e := ix+ByteCount; // End marker
// Handle any initial misaligned bytes.
cnt := (not (ix-1)) and (sizeof(PtrInt)-1);
if cnt>ByteCount then
cnt := ByteCount;
for i := 1 to cnt do
begin
// Is this byte NOT the first byte of a character?
Result += (pn8^ shr 7) and ((not pn8^) shr 6);
inc(pn8);
end;
// Handle complete blocks
bc := (ByteCount-cnt) div sizeof(PtrInt);
for j := 1 to bc >> 7 do begin
nx := 0;
for i := 0 to 127 do
begin
// Count bytes which are NOT the first byte of a character.
nx += ((pnx^ and EIGHTYMASK) shr 7) and ((not pnx^) shr 6);
inc(pnx);
end;
nx := (nx and $00FF00FF00FF00FF) + ((nx >> 8) and $00FF00FF00FF00FF);
nx := (nx and $0000FFFF0000FFFF) + ((nx >> 16) and $0000FFFF0000FFFF);
nx := (nx and $00000000FFFFFFFF) + ((nx >> 32) and $00000000FFFFFFFF);
Result := Result + nx;
end;
if (bc and 127) > 0 then begin
nx := 0;
for i := 1 to bc and 127 do
begin
// Count bytes which are NOT the first byte of a character.
nx += ((pnx^ and EIGHTYMASK) shr 7) and ((not pnx^) shr 6);
inc(pnx);
end;
nx := (nx and $00FF00FF00FF00FF) + ((nx >> 8) and $00FF00FF00FF00FF);
nx := (nx and $0000FFFF0000FFFF) + ((nx >> 16) and $0000FFFF0000FFFF);
nx := (nx and $00000000FFFFFFFF) + ((nx >> 32) and $00000000FFFFFFFF);
Result := Result + nx;
end;
// Take care of any left-over bytes.
while ix<e do
begin
// Is this byte NOT the first byte of a character?
Result += (pn8^ shr 7) and ((not pn8^) shr 6);
inc(pn8);
end;
Result := ByteCount - Result;
end;
// one of each pattern.
const pattern : array[0..3] of char = (chr(%11001001),chr(%10001001),
chr(%00001001),chr(%01001001));
function pseudorandomutf8string(len:integer;var cnt:integer):string;
// random string but keep a count of bytes with high value %10
var lcnt:integer;
i,j:integer;
begin
setlength(result,len);
lcnt:=0;
for i:=1 to length(result) do
begin
j:=random(4);
//j:=i and 3;
if j=1 then inc(lcnt);
result[i]:=pattern[j];
end;
cnt:=length(result)-lcnt;
end;
var r : array[0..10000] of byte; // FPC "registers" dialog is poor, we use
this for writeln like dumping
procedure testasmutf8length;
const testlen = 64*100;
var s : string;
cnt : integer;
rx : int64;
begin
randomize;
s:=pseudorandomutf8string(testlen+Random(50),cnt);
rx:=asmutf8length(pchar(s),length(s){$ifdef asmdebug},@r[0]{$endif});
writeln(inttohex(cnt,2),' = ',inttohex(rx,2),' ',inttohex(length(s)-cnt,2),'
= ',inttohex(length(s)-rx,2)); // hex because most register dumps are easier
in hex.
{$ifdef asmdebug}
for i:=0 to 6 do
begin
write(i:2,' ');
for j:=0 to 3 do
write(inttohex(pdword(@r[i*16+j*4])^,8), ' ');
writeln;
end;
{$endif}
end;
var
a: ansistring;
t: QWord;
i, j, ii: Integer;
begin
{$ifdef asmdebug}
testasmutf8length;
{$else}
a := 'اربك تكست هو اول موقع يسمح لزواره الكرام بتحويل الكتابة العربي الى
كتابة مفهومة من قبل اغلب برامج التصميم مثل الفوتوشوب و';
a :=
a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A;
a :=
a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A;
a :=
a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A+a+a+A;
a := a+a+A+a+a+A+a+a;
writeln(Length(a));
writeln('fst:',UTF8LengthFast(@a[1], Length(a)));
writeln('pop:',UTF8LengthPop(@a[1], Length(a)));
writeln('add:',UTF8LengthAdd(@a[1], Length(a)));
writeln('asm:',asmUTF8Length(@a[1], Length(a){$ifdef
asmdebug},@r[0]{$endif}));
WriteLn();
writeln(Length(a) div 8);
WriteLn();
for ii := 0 to 1 do begin
for i := 0 to 3 do begin
t := GetTickCount64;
for j := 0 to 19 do
UTF8LengthFast(@a[1], Length(a));
t := GetTickCount64 - t;
writeln('fst ',t);
end;
for i := 0 to 3 do begin
t := GetTickCount64;
for j := 0 to 19 do
UTF8LengthPop(@a[1], Length(a));
t := GetTickCount64 - t;
writeln('pop ',t);
end;
for i := 0 to 3 do begin
t := GetTickCount64;
for j := 0 to 19 do
UTF8LengthAdd(@a[1], Length(a));
t := GetTickCount64 - t;
writeln('add ',t);
end;
for i := 0 to 3 do begin
t := GetTickCount64;
for j := 0 to 19 do
asmUTF8Length(@a[1], {$ifdef asmdebug}@r[0],{$endif} Length(a));
t := GetTickCount64 - t;
writeln('asm ',t);
end;
end;
{$endif}
{$ifndef FPC}
if debughook<>nil then // runtime debugger detection
{$endif}
readln;
end.
--
_______________________________________________
lazarus mailing list
lazarus@lists.lazarus-ide.org
https://lists.lazarus-ide.org/listinfo/lazarus