On Tue, 24 Jun 2025 07:33:50 +0000 Dongsheng Yang <dongsheng.y...@linux.dev> wrote:
> Add cache_dev.{c,h} to manage the persistent-memory device that stores > all pcache metadata and data segments. Splitting this logic out keeps > the main dm-pcache code focused on policy while cache_dev handles the > low-level interaction with the DAX block device. > > * DAX mapping > - Opens the underlying device via dm_get_device(). > - Uses dax_direct_access() to obtain a direct linear mapping; falls > back to vmap() when the range is fragmented. > > * On-disk layout > ┌─ 4 KB ─┐ super-block (SB) > ├─ 4 KB ─┤ cache_info[0] > ├─ 4 KB ─┤ cache_info[1] > ├─ 4 KB ─┤ cache_ctrl > └─ ... ─┘ segments > Constants and macros in the header expose offsets and sizes. > > * Super-block handling > - sb_read(), sb_validate(), sb_init() verify magic, CRC32 and host > endianness (flag *PCACHE_SB_F_BIGENDIAN*). > - Formatting zeroes the metadata replicas and initialises the segment > bitmap when the SB is blank. > > * Segment allocator > - Bitmap protected by seg_lock; find_next_zero_bit() yields the next > free 16 MB segment. > > * Lifecycle helpers > - cache_dev_start()/stop() encapsulate init/exit and are invoked by > dm-pcache core. > - Gracefully handles errors: CRC mismatch, wrong endianness, device > too small (< 512 MB), or failed DAX mapping. 
> > Signed-off-by: Dongsheng Yang <dongsheng.y...@linux.dev> > --- > drivers/md/dm-pcache/cache_dev.c | 299 +++++++++++++++++++++++++++++++ > drivers/md/dm-pcache/cache_dev.h | 70 ++++++++ > 2 files changed, 369 insertions(+) > create mode 100644 drivers/md/dm-pcache/cache_dev.c > create mode 100644 drivers/md/dm-pcache/cache_dev.h > > diff --git a/drivers/md/dm-pcache/cache_dev.c > b/drivers/md/dm-pcache/cache_dev.c > new file mode 100644 > index 000000000000..4dcebc9c167e > --- /dev/null > +++ b/drivers/md/dm-pcache/cache_dev.c > @@ -0,0 +1,299 @@ > +static int build_vmap(struct dax_device *dax_dev, long total_pages, void > **vaddr) > +{ > + struct page **pages; > + long i = 0, chunk; > + pfn_t pfn; > + int ret; > + > + pages = vmalloc_array(total_pages, sizeof(struct page *)); Perhaps if DM allows it, use __free() here to avoid the need to manually clean it up and allow early returns on errors. > + if (!pages) > + return -ENOMEM; > + > + do { > + chunk = dax_direct_access(dax_dev, i, total_pages - i, > + DAX_ACCESS, NULL, &pfn); > + if (chunk <= 0) { > + ret = chunk ? chunk : -EINVAL; > + goto out_free; > + } > + > + if (!pfn_t_has_page(pfn)) { > + ret = -EOPNOTSUPP; > + goto out_free; > + } > + > + while (chunk-- && i < total_pages) { > + pages[i++] = pfn_t_to_page(pfn); > + pfn.val++; > + if (!(i & 15)) > + cond_resched(); > + } > + } while (i < total_pages); > + > + *vaddr = vmap(pages, total_pages, VM_MAP, PAGE_KERNEL); > + if (!*vaddr) > + ret = -ENOMEM; > +out_free: > + vfree(pages); > + return ret; > +} > + > +static int cache_dev_dax_init(struct pcache_cache_dev *cache_dev) > +{ > + struct dm_pcache *pcache = CACHE_DEV_TO_PCACHE(cache_dev); > + struct dax_device *dax_dev; > + long total_pages, mapped_pages; > + u64 bdev_size; > + void *vaddr; > + int ret; > + int id; Combine ret and id on one line. 
> + pfn_t pfn; > + > + dax_dev = cache_dev->dm_dev->dax_dev; > + /* total size check */ > + bdev_size = bdev_nr_bytes(cache_dev->dm_dev->bdev); > + if (bdev_size < PCACHE_CACHE_DEV_SIZE_MIN) { > + pcache_dev_err(pcache, "dax device is too small, required at > least %llu", > + PCACHE_CACHE_DEV_SIZE_MIN); > + ret = -ENOSPC; > + goto out; return -ENOSPC; > +int cache_dev_start(struct dm_pcache *pcache) > +{ > + struct pcache_cache_dev *cache_dev = &pcache->cache_dev; > + struct pcache_sb sb; > + bool format = false; > + int ret; > + > + mutex_init(&cache_dev->seg_lock); > + > + ret = cache_dev_dax_init(cache_dev); > + if (ret) { > + pcache_dev_err(pcache, "failed to init cache_dev %s via dax > way: %d.", > + cache_dev->dm_dev->name, ret); > + goto err; > + } > + > + ret = sb_read(cache_dev, &sb); > + if (ret) > + goto dax_release; > + > + if (le64_to_cpu(sb.magic) == 0) { > + format = true; > + ret = sb_init(cache_dev, &sb); > + if (ret < 0) > + goto dax_release; > + } > + > + ret = sb_validate(cache_dev, &sb); > + if (ret) > + goto dax_release; > + > + cache_dev->sb_flags = le32_to_cpu(sb.flags); > + ret = cache_dev_init(cache_dev, sb.seg_num); > + if (ret) > + goto dax_release; > + > + if (format) > + sb_write(cache_dev, &sb); > + > + return 0; > + > +dax_release: > + cache_dev_dax_exit(cache_dev); > +err: In these cases just return instead of going to the label. It generally gives more readable code. > + return ret; > +} > + > +int cache_dev_get_empty_segment_id(struct pcache_cache_dev *cache_dev, u32 > *seg_id) > +{ > + int ret; > + > + mutex_lock(&cache_dev->seg_lock); If DM is fine with guard(), use it here. > + *seg_id = find_next_zero_bit(cache_dev->seg_bitmap, cache_dev->seg_num, > 0); > + if (*seg_id == cache_dev->seg_num) { > + ret = -ENOSPC; > + goto unlock; > + } > + > + set_bit(*seg_id, cache_dev->seg_bitmap); > + ret = 0; > +unlock: > + mutex_unlock(&cache_dev->seg_lock); > + return ret; > +}