Hi Sandeep,

On 2024/4/10 06:14, Sandeep Dhavale wrote:
Add optimization to treat data blocks filled with 0s as a hole.
Even though diskspace savings are comparable to chunk based or dedupe,
having no block assigned saves us redundant disk IOs during read.

To detect blocks filled with zeros during chunking, we insert block
filled with zeros (zerochunk) in the hashmap. If we detect a possible
dedupe, we map it to the hole so there is no physical block assigned.

Signed-off-by: Sandeep Dhavale <dhav...@google.com>
---
Changes since v1:
        - Instead of checking every block for 0s word by word,
          add a zerochunk in blobs during init. So we effectively
          detect the zero blocks by comparing the hash.
  include/erofs/blobchunk.h |  2 +-
  lib/blobchunk.c           | 41 ++++++++++++++++++++++++++++++++++++---
  mkfs/main.c               |  2 +-
  3 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/include/erofs/blobchunk.h b/include/erofs/blobchunk.h
index a674640..ebe2efe 100644
--- a/include/erofs/blobchunk.h
+++ b/include/erofs/blobchunk.h
@@ -23,7 +23,7 @@ int erofs_write_zero_inode(struct erofs_inode *inode);
  int tarerofs_write_chunkes(struct erofs_inode *inode, erofs_off_t 
data_offset);
  int erofs_mkfs_dump_blobs(struct erofs_sb_info *sbi);
  void erofs_blob_exit(void);
-int erofs_blob_init(const char *blobfile_path);
+int erofs_blob_init(const char *blobfile_path, erofs_off_t chunksize);
  int erofs_mkfs_init_devices(struct erofs_sb_info *sbi, unsigned int devices);
#ifdef __cplusplus
diff --git a/lib/blobchunk.c b/lib/blobchunk.c
index 641e3d4..87c153f 100644
--- a/lib/blobchunk.c
+++ b/lib/blobchunk.c
@@ -323,13 +323,21 @@ int erofs_blob_write_chunked_file(struct erofs_inode 
*inode, int fd,
                        ret = -EIO;
                        goto err;
                }
-
                chunk = erofs_blob_getchunk(sbi, chunkdata, len);
                if (IS_ERR(chunk)) {
                        ret = PTR_ERR(chunk);
                        goto err;
                }


Sorry for late reply since I'm working on multi-threaded mkfs.

Can erofs_blob_getchunk directly return &erofs_holechunk? I mean,

static struct erofs_blobchunk *erofs_blob_getchunk(struct erofs_sb_info *sbi,
                                                u8 *buf, erofs_off_t chunksize)
{
...
        chunk = hashmap_get_from_hash(&blob_hashmap, hash, sha256);
        if (chunk) {
                DBG_BUGON(chunksize != chunk->chunksize);

                if (chunk->blkaddr == erofs_holechunk.blkaddr)
                        chunk = &erofs_holechunk;

                sbi->saved_by_deduplication += chunksize;
                erofs_dbg("Found duplicated chunk at %u", chunk->blkaddr);
                return chunk;
        }
...
}

+ if (chunk->blkaddr == erofs_holechunk.blkaddr) {
+                       *(void **)idx++ = &erofs_holechunk;
+                       erofs_update_minextblks(sbi, interval_start, pos,
+                                               &minextblks);
+                       interval_start = pos + len;


I guess several zerochunks can also be merged?  is this line
an expected behavior?

+                       lastch = NULL;
+                       continue;
+               }
+
                if (lastch && (lastch->device_id != chunk->device_id ||
                    erofs_pos(sbi, lastch->blkaddr) + lastch->chunksize !=
                    erofs_pos(sbi, chunk->blkaddr))) {

I guess we could form a helper like
static bool erofs_blob_can_merge(struct erofs_sb_info *sbi,
                            struct erofs_blobchunk *lastch,
                            struct erofs_blobchunk *chunk)
{
        if (lastch == &erofs_holechunk && chunk == &erofs_holechunk)
                return true;
        if (lastch->device_id == chunk->device_id &&
            erofs_pos(sbi, lastch->blkaddr) + lastch->chunksize ==
            erofs_pos(sbi, chunk->blkaddr))
                return true;
        return false;
}

                if (lastch && erofs_blob_can_merge(sbi, lastch, chunk)) {
                        ...
                }



@@ -540,7 +548,34 @@ void erofs_blob_exit(void)
        }
  }
-int erofs_blob_init(const char *blobfile_path)
+static int erofs_insert_zerochunk(erofs_off_t chunksize)
+{
+       u8 *zeros;
+       struct erofs_blobchunk *chunk;
+       u8 sha256[32];
+       unsigned int hash;
+
+       zeros = calloc(1, chunksize);
+       if (!zeros)
+               return -ENOMEM;
+
+       erofs_sha256(zeros, chunksize, sha256);
+       hash = memhash(sha256, sizeof(sha256));


`zeros` needs to be freed here I guess:
        free(zeros);

Thanks,
Gao Xiang

+       chunk = malloc(sizeof(struct erofs_blobchunk));
+       if (!chunk)
+               return -ENOMEM;> +
+       chunk->chunksize = chunksize;
+       /* treat chunk filled with zeros as hole */
+       chunk->blkaddr = erofs_holechunk.blkaddr;
+       memcpy(chunk->sha256, sha256, sizeof(sha256));
+
+       hashmap_entry_init(&chunk->ent, hash);
+       hashmap_add(&blob_hashmap, chunk);
+       return 0;
+}
+

Reply via email to