Re: [Qemu-devel] [PATCH v4] exec: Fix non-power-of-2 sized accesses

Alex Williamson Sat, 17 Aug 2013 08:15:41 -0700

On Sat, 2013-08-17 at 10:23 +0200, Laszlo Ersek wrote:
> On 08/16/13 23:58, Alex Williamson wrote:
> > Since commit 23326164 we align access sizes to match the alignment of
> > the address, but we don't align the access size itself.  This means we
> > let illegal access sizes (ex. 3) slip through if the address is
> > sufficiently aligned (ex. 4).  This results in an abort which would be
> > easy for a guest to trigger.  Account for aligning the access size.
> > 
> > Signed-off-by: Alex Williamson <alex.william...@redhat.com>
> > Cc: qemu-sta...@nongnu.org
> > ---
> > 
> > v4: KISS
> > v3: Highest power of 2, not lowest
> > v2: Remove unnecessary loop condition
> > 
> >  exec.c |   18 +++++++++++++-----
> >  1 file changed, 13 insertions(+), 5 deletions(-)
> > 
> > diff --git a/exec.c b/exec.c
> > index 3ca9381..67a822c 100644
> > --- a/exec.c
> > +++ b/exec.c
> > @@ -1924,12 +1924,20 @@ static int memory_access_size(MemoryRegion *mr, 
> > unsigned l, hwaddr addr)
> >          }
> >      }
> >  
> > -    /* Don't attempt accesses larger than the maximum.  */
> > -    if (l > access_size_max) {
> > -        l = access_size_max;
> > +    /* Don't attempt accesses larger than the maximum or unsupported 
> > sizes.  */
> > +    if (l >= access_size_max) {
> > +        return access_size_max;
> > +    } else {
> > +        if (l >= 8) {
> > +            return 8;
> > +        } else if (l >= 4) {
> > +            return 4;
> > +        } else if (l >= 2) {
> > +            return 2;
> > +        } else {
> > +            return 1;
> > +        }
> >      }
> > -
> > -    return l;
> >  }
> >  
> >  bool address_space_rw(AddressSpace *as, hwaddr addr, uint8_t *buf,
> > 
> 
> Considering that each block contains a return statement, I'd drop the
> else's:
> 
>     if (l >= access_size_max) {
>         return access_size_max;
>     }
>     if (l >= 8) {
>         return 8;
>     }
>     if (l >= 4) {
>         return 4;
>     }
>     if (l >= 2) {
>         return 2;
>     }
>     return 1;
> 
> Or even
> 
>     return l >= access_size_max ? access_size_max :
>            l >= 8               ? 8               :
>            l >= 4               ? 4               :
>            l >= 2               ? 2               :
>            1;
> 
> But this is just bikeshedding, so I'm not suggesting it.
> 
> Regarding function... I can at least understand this code. So, you want
> to find the most significant bit set in "l", and clear everything else.
> If said leftmost bit is to the left of bit#3, then use bit#3 instead.
> 
> This idea should work if "l" is already a whole power of two.
> 
>     if (l >= access_size_max) {
>         return access_size_max;
>     }
>     return 1 << max(3, lmb(l));
> 
> What Paolo posted seems almost identical.
> 
> clz32(l):                     leading zeros in "l"
> qemu_fls(l) == 32 - clz32(l): position of leftmost bit set, 1-based
> qemu_fls(l) - 1:              position of leftmost bit set, 0-based
> 
> Not sure if the (l & (l - 1)) check is needed in Paolo's patch. clz32()
> is not generally usable when l==0, so maybe that's (too) what the check
> is for. OTOH maybe l==0 is not even possible when entering
> memory_access_size().
> 
> Second, Paolo's patch might lack the "max(3, ...)" part. Since you
> didn't call my previous example with l==9 retarded, I guess clamping
> (qemu_fls(l) - 1) at 3 would be necessary.


Whether we need to clamp on 3 really depends on the caller.  I'm
actually doubtful that this function ever gets called with l > 8.  So I
think Paolo's code works ok.  It's possible your example of l == 9 was a
red herring for my code, but I didn't have enough faith in it anyway.

> Third, clz32() is probably very fast when gcc has a builtin for it, and
> probably slower than your open-coded version otherwsie.

Nope, the open coded version in v4 is significantly faster.  See the
attached test programs.  On my laptop I get these results (compiled with
-O):

$ time ./test-open

real    0m7.442s
user    0m7.412s
sys     0m0.005s

$ time ./test-fls

real    0m9.202s
user    0m9.117s
sys     0m0.024s

$ time ./test-pow2floor

real    0m13.884s
user    0m13.796s
sys     0m0.013s


At higher optimization levels the race gets a lot closer, but the open
coded version still seems to have an advantage (assuming the test code
even remains relevant at higher levels).  So, I conclude that it's
faster to open code for the very limited range of a power-of-2 function
we need here.

> I still don't know enough about this topic, but I like this patch
> because I can understand the intent at least :)
> 
> Reviewed-by: Laszlo Ersek <ler...@redhat.com>

Thanks!
Alex

unsigned foo(unsigned max, unsigned size)
{
	if (size >= max)
		return max;
	else {
		if (size >= 8)
			return 8;
		else if (size >= 4)
			return 4;
		else if (size >= 2)
			return 2;
		else
			return 1;
	}
}

int main(void)
{
	unsigned i, l, max;
	volatile val;

	for (i = 0; i < 100000000; i++) {
		for (max = 1; max <= 8; max <<= 1) {
			for (l = 1; l <= 8; l++)
				val = foo(max, l);
		}
	}
	return 0;
}

static inline int clz32(unsigned long val)
{
	return val ? __builtin_clz(val) : 32;
}

int fls(int value)
{
	return 32 - clz32(value);
}

unsigned foo(unsigned max, unsigned size)
{
	if (size > max)
		size = max;
	if (size & (size - 1))
		size = 1 << (fls(size) - 1);
	return size;
}

int main(void)
{
	unsigned i, l, max;
	volatile val;

	for (i = 0; i < 100000000; i++) {
		for (max = 1; max <= 8; max <<= 1) {
			for (l = 1; l <= 8; l++)
				val = foo(max, l);
		}
	}
	return 0;
}

static inline int clz64(unsigned long val)
{
	return val ? __builtin_clzll(val) : 64;
}

static inline int is_power_of_2(unsigned long value)
{
	if (!value)
		return 0;

	return !(value & (value - 1));
}

long pow2floor(long value)
{
	if (!is_power_of_2(value))
		value = 0x8000000000000000ULL >> clz64(value);

	return value;
}

unsigned foo(unsigned max, unsigned size)
{
	if (size > max)
		size = max;
	size = pow2floor(size);
	return size;
}

int main(void)
{
	unsigned i, l, max;
	volatile val;

	for (i = 0; i < 100000000; i++) {
		for (max = 1; max <= 8; max <<= 1) {
			for (l = 1; l <= 8; l++)
				val = foo(max, l);
		}
	}
	return 0;
}

Re: [Qemu-devel] [PATCH v4] exec: Fix non-power-of-2 sized accesses

Reply via email to