On 04/16/2015 02:39 AM, Mario Kleiner wrote: > On 04/16/2015 03:29 AM, Peter Hurley wrote: >> On 04/15/2015 05:26 PM, Mario Kleiner wrote: >>> A couple of questions to educate me and one review comment. >>> >>> On 04/15/2015 07:34 PM, Daniel Vetter wrote: >>>> This was a bit too much cargo-culted, so lets make it solid: >>>> - vblank->count doesn't need to be an atomic, writes are always done >>>> under the protection of dev->vblank_time_lock. Switch to an unsigned >>>> long instead and update comments. Note that atomic_read is just a >>>> normal read of a volatile variable, so no need to audit all the >>>> read-side access specifically. >>>> >>>> - The barriers for the vblank counter seqlock weren't complete: The >>>> read-side was missing the first barrier between the counter read and >>>> the timestamp read, it only had a barrier between the ts and the >>>> counter read. We need both. >>>> >>>> - Barriers weren't properly documented. Since barriers only work if >>>> you have them on boths sides of the transaction it's prudent to >>>> reference where the other side is. To avoid duplicating the >>>> write-side comment 3 times extract a little store_vblank() helper. >>>> In that helper also assert that we do indeed hold >>>> dev->vblank_time_lock, since in some cases the lock is acquired a >>>> few functions up in the callchain. >>>> >>>> Spotted while reviewing a patch from Chris Wilson to add a fastpath to >>>> the vblank_wait ioctl. >>>> >>>> v2: Add comment to better explain how store_vblank works, suggested by >>>> Chris. >>>> >>>> v3: Peter noticed that as-is the 2nd smp_wmb is redundant with the >>>> implicit barrier in the spin_unlock. But that can only be proven by >>>> auditing all callers and my point in extracting this little helper was >>>> to localize all the locking into just one place. Hence I think that >>>> additional optimization is too risky. 
>>>> >>>> Cc: Chris Wilson <chris at chris-wilson.co.uk> >>>> Cc: Mario Kleiner <mario.kleiner.de at gmail.com> >>>> Cc: Ville Syrjälä <ville.syrjala at linux.intel.com> >>>> Cc: Michel Dänzer <michel at daenzer.net> >>>> Cc: Peter Hurley <peter at hurleysoftware.com> >>>> Signed-off-by: Daniel Vetter <daniel.vetter at intel.com> >>>> --- >>>> drivers/gpu/drm/drm_irq.c | 95 >>>> +++++++++++++++++++++++++---------------------- >>>> include/drm/drmP.h | 8 +++- >>>> 2 files changed, 57 insertions(+), 46 deletions(-) >>>> >>>> diff --git a/drivers/gpu/drm/drm_irq.c b/drivers/gpu/drm/drm_irq.c >>>> index c8a34476570a..8694b77d0002 100644 >>>> --- a/drivers/gpu/drm/drm_irq.c >>>> +++ b/drivers/gpu/drm/drm_irq.c >>>> @@ -74,6 +74,36 @@ module_param_named(vblankoffdelay, drm_vblank_offdelay, >>>> int, 0600); >>>> module_param_named(timestamp_precision_usec, drm_timestamp_precision, >>>> int, 0600); >>>> module_param_named(timestamp_monotonic, drm_timestamp_monotonic, int, >>>> 0600); >>>> >>>> +static void store_vblank(struct drm_device *dev, int crtc, >>>> + unsigned vblank_count_inc, >>>> + struct timeval *t_vblank) >>>> +{ >>>> + struct drm_vblank_crtc *vblank = &dev->vblank[crtc]; >>>> + u32 tslot; >>>> + >>>> + assert_spin_locked(&dev->vblank_time_lock); >>>> + >>>> + if (t_vblank) { >>>> + /* All writers hold the spinlock, but readers are serialized by >>>> + * the latching of vblank->count below. >>>> + */ >>>> + tslot = vblank->count + vblank_count_inc; >>>> + vblanktimestamp(dev, crtc, tslot) = *t_vblank; >>>> + } >>>> + >>>> + /* >>>> + * vblank timestamp updates are protected on the write side with >>>> + * vblank_time_lock, but on the read side done locklessly using a >>>> + * sequence-lock on the vblank counter. Ensure correct ordering using >>>> + * memory barrriers. We need the barrier both before and also after >>>> the >>>> + * counter update to synchronize with the next timestamp write. 
>>>> + * The read-side barriers for this are in drm_vblank_count_and_time. >>>> + */ >>>> + smp_wmb(); >>>> + vblank->count += vblank_count_inc; >>>> + smp_wmb(); >>>> +} >>>> + >>>> /** >>>> * drm_update_vblank_count - update the master vblank counter >>>> * @dev: DRM device >>>> @@ -93,7 +123,7 @@ module_param_named(timestamp_monotonic, >>>> drm_timestamp_monotonic, int, 0600); >>>> static void drm_update_vblank_count(struct drm_device *dev, int crtc) >>>> { >>>> struct drm_vblank_crtc *vblank = &dev->vblank[crtc]; >>>> - u32 cur_vblank, diff, tslot; >>>> + u32 cur_vblank, diff; >>>> bool rc; >>>> struct timeval t_vblank; >>>> >>>> @@ -129,18 +159,12 @@ static void drm_update_vblank_count(struct >>>> drm_device *dev, int crtc) >>>> if (diff == 0) >>>> return; >>>> >>>> - /* Reinitialize corresponding vblank timestamp if high-precision query >>>> - * available. Skip this step if query unsupported or failed. Will >>>> - * reinitialize delayed at next vblank interrupt in that case. >>>> + /* >>>> + * Only reinitialize corresponding vblank timestamp if high-precision >>>> query >>>> + * available and didn't fail. Will reinitialize delayed at next vblank >>>> + * interrupt in that case. >>>> */ >>>> - if (rc) { >>>> - tslot = atomic_read(&vblank->count) + diff; >>>> - vblanktimestamp(dev, crtc, tslot) = t_vblank; >>>> - } >>>> - >>>> - smp_mb__before_atomic(); >>>> - atomic_add(diff, &vblank->count); >>>> - smp_mb__after_atomic(); >>>> + store_vblank(dev, crtc, diff, rc ? &t_vblank : NULL); >>>> } >>>> >>>> /* >>>> @@ -218,7 +242,7 @@ static void vblank_disable_and_save(struct drm_device >>>> *dev, int crtc) >>>> /* Compute time difference to stored timestamp of last vblank >>>> * as updated by last invocation of drm_handle_vblank() in vblank >>>> irq. 
>>>> */ >>>> - vblcount = atomic_read(&vblank->count); >>>> + vblcount = vblank->count; >>>> diff_ns = timeval_to_ns(&tvblank) - >>>> timeval_to_ns(&vblanktimestamp(dev, crtc, vblcount)); >>>> >>>> @@ -234,17 +258,8 @@ static void vblank_disable_and_save(struct drm_device >>>> *dev, int crtc) >>>> * available. In that case we can't account for this and just >>>> * hope for the best. >>>> */ >>>> - if (vblrc && (abs64(diff_ns) > 1000000)) { >>>> - /* Store new timestamp in ringbuffer. */ >>>> - vblanktimestamp(dev, crtc, vblcount + 1) = tvblank; >>>> - >>>> - /* Increment cooked vblank count. This also atomically commits >>>> - * the timestamp computed above. >>>> - */ >>>> - smp_mb__before_atomic(); >>>> - atomic_inc(&vblank->count); >>>> - smp_mb__after_atomic(); >>>> - } >>>> + if (vblrc && (abs64(diff_ns) > 1000000)) >>>> + store_vblank(dev, crtc, 1, &tvblank); >>>> >>>> spin_unlock_irqrestore(&dev->vblank_time_lock, irqflags); >>>> } >>>> @@ -852,7 +867,7 @@ u32 drm_vblank_count(struct drm_device *dev, int crtc) >>>> >>>> if (WARN_ON(crtc >= dev->num_crtcs)) >>>> return 0; >>>> - return atomic_read(&vblank->count); >>>> + return vblank->count; >>> >>> I wrongly assumed atomic_read would guarantee more than it actually does, >>> so please help me to learn something. Why don't we need some smp_rmb() here >>> before returning vblank->count? What guarantees that drm_vblank_count() >>> does return the latest value assigned to vblank->count in store_vblank()? >>> In store_vblank() there is a smp_wmb(), but why don't we need a matching >>> smp_rmb() here to benefit from it? >> >> Well, you could but the vblank counter resolution is very low (60HZ), >> so practically speaking, what would be the point? >> > > Thanks for the explanations Peter. > > Depends on the latency induced. A few microseconds don't matter, a > millisecond or more would matter for some applications. 
> >> The trailing write barrier in store_vblank() is only necessary to >> force the ordering of the timestamp wrt to vblank count *in case there >> was no write barrier executed between two calls to store_vblank()*, >> not to ensure the cpu forces the write to main memory immediately. >> > > That part i understand, the possibly slightly earlier write of some store > buffers to main memory is just a possible nice side effect. Bits and pieces > of my memory about how cache coherency etc. work slowly come back to my own > memory... > >> Because the time scales for these events don't require that level of >> resolution; consider how much code has to get executed between a >> hardware vblank irq triggering and the vblank counter being updated. >> >> Realistically, the only relevant requirement is that the timestamp >> match the counter. >> > > Yes that is the really important part. A msec delay would possibly matter for > some timing sensitive apps like mine - some more exotic displays run at 200 > Hz, and some apps need to synchronize to the vblank not strictly for > graphics. But i assume potential delays here are more on the order of a few > microseconds if some pending loads from the cache would get reordered for > overall efficiency? > >>> >>>> } >>>> EXPORT_SYMBOL(drm_vblank_count); >>>> >>>> @@ -897,16 +912,17 @@ u32 drm_vblank_count_and_time(struct drm_device >>>> *dev, int crtc, >>>> if (WARN_ON(crtc >= dev->num_crtcs)) >>>> return 0; >>>> >>>> - /* Read timestamp from slot of _vblank_time ringbuffer >>>> - * that corresponds to current vblank count. Retry if >>>> - * count has incremented during readout. This works like >>>> - * a seqlock. >>>> + /* >>>> + * Vblank timestamps are read lockless. To ensure consistency the >>>> vblank >>>> + * counter is rechecked and ordering is ensured using memory barriers. >>>> + * This works like a seqlock. The write-side barriers are in >>>> store_vblank. 
>>>> */ >>>> do { >>>> - cur_vblank = atomic_read(&vblank->count); >>>> + cur_vblank = vblank->count; >>>> + smp_rmb(); >>>> *vblanktime = vblanktimestamp(dev, crtc, cur_vblank); >>>> smp_rmb(); >>>> - } while (cur_vblank != atomic_read(&vblank->count)); >>>> + } while (cur_vblank != vblank->count); >>>> >>> >>> Similar question as above. We have a new smp_rmb() after the cur_vblank >>> assignment and then after *vblanktime assignment. My original wrong >>> assumption was that the first smp_rmb() wouldn't be needed because >>> atomic_read() would imply that, and that the compiler/cpu couldn't reorder >>> anything here because the *vblanktime assignment depends on cur_vblank from >>> the preceeding atomic_read. >>> >>> But why can we now do the comparison while(cur_vblank != vblank->count) >>> without needing something like >>> >>> new_vblank = vblank->count; >>> smp_rmb(); >>> } while (cur_vblank != new_vblank); >>> >>> to make sure the value from the 2nd vblank->count read isn't stale wrt. to >>> potential updates from store_vblank()? >> >> Similar response here. >> >> What matters is that the timestamp read is consistent with the >> vblank count; not that you "just missed" new values. >> >>> Another question is why the drm_vblank_count_and_time() code ever worked >>> without triggering any of my own tests and consistency checks in my >>> software, or any of your igt tests? I run my tests very often, but only on >>> Intel architecture cpus. I assume the same is true for the igt tests? Is >>> there anything specific about Intel cpu's that makes this still work or >>> very unlikely to break? Or are the tests insufficient to catch this? Or >>> just luck? >> >> Intel x86 cpus are strongly-ordered, so smp_rmb() and smp_wmb() are only >> compiler barriers on x86 arch. >> > > Ok, that makes sense as part of the explanation why stuff still worked, at > least on the tested x86 arch. 
> >> The compiler could have hoisted the vblanktimestamp read in front of >> the vblank count read, but chose not to. >> > > This one still confuses me. I know why the smp_rmb after the vblanktimestamp > read is there - i placed one there in the current code myself. But why could > the compiler - in absence of the new smp_rmb compiler barrier in this patch > - reorder those two loads without violating the semantic of the code? > > Why could it have turned this > > cur_vblank = vblank->count; > // smp_rmb(); > *vblanktime = vblanktimestamp(dev, crtc, cur_vblank); > > into this instead > > *vblanktime = vblanktimestamp(dev, crtc, cur_vblank); > // smp_rmb(); > cur_vblank = vblank->count; > > when the first load would need the value of cur_vblank - and thereby the > result of the 2nd load - as input to calculate its address for the > *vblanktime = ... load? > > I think i'm still not getting something about why the compiler would be > allowed to reorder in this way in absence of the additional smp_rmb? Or is > that barrier required for other archs which are less strongly ordered?
Apologies for the confusion; I missed that it was data-dependent load. Regards, Peter Hurley > thanks, > -mario > >>> I looked through kernels back to 3.16 and most uses of the function would >>> be safe from races due to the locking around it, holding of vblank >>> refcounts, or the place and order of execution, e.g., from within >>> drm_handle_vblank(). But in some tight test loop just calling the >>> drmWaitVblank ioctl to query current values i'd expect it to at least >>> occassionally return corrupted timestamps, e.g., time jumping forward or >>> backward, etc.? >> >> As a test, reverse the load order of vblanktimestamp wrt vblank count >> and see if your tests trip; if not, you know the tests are insufficient. >> >> Regards, >> Peter Hurley >> >>> >>>> return cur_vblank; >>>> } >>>> @@ -1715,7 +1731,7 @@ bool drm_handle_vblank(struct drm_device *dev, int >>>> crtc) >>>> */ >>>> >>>> /* Get current timestamp and count. */ >>>> - vblcount = atomic_read(&vblank->count); >>>> + vblcount = vblank->count; >>>> drm_get_last_vbltimestamp(dev, crtc, &tvblank, >>>> DRM_CALLED_FROM_VBLIRQ); >>>> >>>> /* Compute time difference to timestamp of last vblank */ >>>> @@ -1731,20 +1747,11 @@ bool drm_handle_vblank(struct drm_device *dev, int >>>> crtc) >>>> * e.g., due to spurious vblank interrupts. We need to >>>> * ignore those for accounting. >>>> */ >>>> - if (abs64(diff_ns) > DRM_REDUNDANT_VBLIRQ_THRESH_NS) { >>>> - /* Store new timestamp in ringbuffer. */ >>>> - vblanktimestamp(dev, crtc, vblcount + 1) = tvblank; >>>> - >>>> - /* Increment cooked vblank count. This also atomically commits >>>> - * the timestamp computed above. >>>> - */ >>>> - smp_mb__before_atomic(); >>>> - atomic_inc(&vblank->count); >>>> - smp_mb__after_atomic(); >>>> - } else { >>>> + if (abs64(diff_ns) > DRM_REDUNDANT_VBLIRQ_THRESH_NS) >>>> + store_vblank(dev, crtc, 1, &tvblank); >>>> + else >>>> DRM_DEBUG("crtc %d: Redundant vblirq ignored. 
diff_ns = %d\n", >>>> crtc, (int) diff_ns); >>>> - } >>>> >>>> spin_unlock(&dev->vblank_time_lock); >>>> >>>> diff --git a/include/drm/drmP.h b/include/drm/drmP.h >>>> index 62c40777c009..4c31a2cc5a33 100644 >>>> --- a/include/drm/drmP.h >>>> +++ b/include/drm/drmP.h >>>> @@ -686,9 +686,13 @@ struct drm_pending_vblank_event { >>>> struct drm_vblank_crtc { >>>> struct drm_device *dev; /* pointer to the drm_device */ >>>> wait_queue_head_t queue; /**< VBLANK wait queue */ >>>> - struct timeval time[DRM_VBLANKTIME_RBSIZE]; /**< timestamp of >>>> current count */ >>>> struct timer_list disable_timer; /* delayed disable timer */ >>>> - atomic_t count; /**< number of VBLANK interrupts */ >>>> + >>>> + /* vblank counter, protected by dev->vblank_time_lock for writes */ >>>> + unsigned long count; >>> >>> Why is count an unsigned long (= 64 bit on 64-bit kernels) instead of u32 >>> when all users of count are u32? Is this intentional? >>> >>> >>>> + /* vblank timestamps, protected by dev->vblank_time_lock for writes */ >>>> + struct timeval time[DRM_VBLANKTIME_RBSIZE]; >>>> + >>>> atomic_t refcount; /* number of users of vblank >>>> interrupts per crtc */ >>>> u32 last; /* protected by dev->vbl_lock, used */ >>>> /* for wraparound handling */ >>>> >>> >>> Thanks, >>> -mario >>