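perf on fsck.erofs reports that z_erofs_load_compact_lcluster spends
about 20% of its time in the div instruction, while the function itself
accounts for ~40% of the fsck.erofs runtime. In the source code, it
seems that the compiler can't optimize the division by vcnt into a
shift, even though vcnt only ever holds powers of two. Record log2(vcnt)
in a new variable, vdiv, wherever vcnt is assigned, and replace the
division with a right shift by vdiv.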

Running a benchmark on an LZMA-compressed FreeBSD source tree yields a
~3% increase in performance. The following test was run locally on an
x86 machine.

$ hyperfine -w 10 -p "echo 3 > /proc/sys/vm/drop_caches; sleep 1" \
  "./fsck.erofs ./bsd.erofs.lzma"

With shift optimization
Time (mean ± σ):     360.0 ms ±  12.0 ms    \
  [User: 236.3 ms, System: 120.6 ms]
Range (min … max):   342.3 ms … 379.8 ms    10 runs

Original Dev Branch
Time (mean ± σ):     371.1 ms ±  16.1 ms    \
  [User: 254.8 ms, System: 115.0 ms]
Range (min … max):   354.8 ms … 404.4 ms    10 runs

Signed-off-by: Ashley Lee <[email protected]>

---
 lib/zmap.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/lib/zmap.c b/lib/zmap.c
index baec278..1ba52b5 100644
--- a/lib/zmap.c
+++ b/lib/zmap.c
@@ -112,7 +112,7 @@ static int z_erofs_load_compact_lcluster(struct 
z_erofs_maprecorder *m,
        const unsigned int lclusterbits = vi->z_lclusterbits;
        const unsigned int totalidx = BLK_ROUND_UP(sbi, vi->i_size);
        unsigned int compacted_4b_initial, compacted_2b, amortizedshift;
-       unsigned int vcnt, lo, lobits, encodebits, nblk, bytes;
+       unsigned int vcnt, vdiv, lo, lobits, encodebits, nblk, bytes;
        bool big_pcluster = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1;
        erofs_off_t pos;
        u8 *in, type;
@@ -144,13 +144,16 @@ static int z_erofs_load_compact_lcluster(struct 
z_erofs_maprecorder *m,
        pos += lcn * (1 << amortizedshift);
 
        /* figure out the lcluster count in this pack */
-       if (1 << amortizedshift == 4 && lclusterbits <= 14)
+       if (1 << amortizedshift == 4 && lclusterbits <= 14) {
                vcnt = 2;
-       else if (1 << amortizedshift == 2 && lclusterbits <= 12)
+               vdiv = 1;
+       } else if (1 << amortizedshift == 2 && lclusterbits <= 12) {
                vcnt = 16;
-       else
+               vdiv = 4;
+       } else {
                return -EOPNOTSUPP;
-
+       }
+
        in = erofs_read_metabuf(&m->map->buf, sbi, pos,
                                erofs_inode_in_metabox(vi));
        if (IS_ERR(in))
@@ -160,7 +163,7 @@ static int z_erofs_load_compact_lcluster(struct 
z_erofs_maprecorder *m,
        m->nextpackoff = round_down(pos, vcnt << amortizedshift) +
                         (vcnt << amortizedshift);
        lobits = max(lclusterbits, ilog2(Z_EROFS_LI_D0_CBLKCNT) + 1U);
-       encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt;
+       encodebits = (((vcnt << amortizedshift) - sizeof(__le32)) * 8) >> vdiv;
        bytes = pos & ((vcnt << amortizedshift) - 1);
        in -= bytes;
        i = bytes >> amortizedshift;
-- 
2.53.0

Reply via email to