I am now seeing the exact same issues you are reporting. A heap release did
nothing for me.

The only odd thing I'm doing is migrating data in cephfs from one pool to
another. The process looks something like the following:
TARGET_DIR=/media/cephfs/labs/
TARGET_POOL="cephfs_ec_data"
setfattr -n ceph.dir.layout.pool -v ${TARGET_POOL} ${TARGET_DIR}
#for every file
##NEWFILE="${file}.ec"
##cp "${file}" "${NEWFILE}"
##mv "${NEWFILE}" "${file}"

I fear that this process may not be releasing the inode of ${file}
and deleting its objects from RADOS. But I'm not sure that would have much
to do with the MDS, aside from it tracking an inode that isn't accessible anymore.



[root@mds0 ~]# rpm -qa | grep ceph
ceph-mgr-12.2.4-0.el7.x86_64
ceph-12.2.4-0.el7.x86_64
ceph-osd-12.2.4-0.el7.x86_64
ceph-release-1-1.el7.noarch
libcephfs2-12.2.4-0.el7.x86_64
ceph-base-12.2.4-0.el7.x86_64
ceph-mds-12.2.4-0.el7.x86_64
ceph-deploy-2.0.0-0.noarch
ceph-common-12.2.4-0.el7.x86_64
ceph-mon-12.2.4-0.el7.x86_64
ceph-radosgw-12.2.4-0.el7.x86_64
python-cephfs-12.2.4-0.el7.x86_64
ceph-selinux-12.2.4-0.el7.x86_64


[root@mds0 ~]# ceph daemon mds.mds0 config get mds_cache_memory_limit
{
    "mds_cache_memory_limit": "80530636800"
}


[root@mds0 ~]# ceph daemon mds.mds0 perf dump
{
    "AsyncMessenger::Worker-0": {
        "msgr_recv_messages": 48568037,
        "msgr_send_messages": 51895350,
        "msgr_recv_bytes": 50001752194,
        "msgr_send_bytes": 59667899407,
        "msgr_created_connections": 28522,
        "msgr_active_connections": 939,
        "msgr_running_total_time": 9158.145665485,
        "msgr_running_send_time": 3270.445768873,
        "msgr_running_recv_time": 8951.883602486,
        "msgr_running_fast_dispatch_time": 684.964408603
    },
    "AsyncMessenger::Worker-1": {
        "msgr_recv_messages": 81557461,
        "msgr_send_messages": 88149491,
        "msgr_recv_bytes": 59543645402,
        "msgr_send_bytes": 99790426210,
        "msgr_created_connections": 28705,
        "msgr_active_connections": 881,
        "msgr_running_total_time": 14513.332929088,
        "msgr_running_send_time": 5214.994372044,
        "msgr_running_recv_time": 13891.320681575,
        "msgr_running_fast_dispatch_time": 682.921363330
    },
    "AsyncMessenger::Worker-2": {
        "msgr_recv_messages": 104018424,
        "msgr_send_messages": 117265828,
        "msgr_recv_bytes": 70248474177,
        "msgr_send_bytes": 175930469394,
        "msgr_created_connections": 30034,
        "msgr_active_connections": 1043,
        "msgr_running_total_time": 18836.813930876,
        "msgr_running_send_time": 7227.884643396,
        "msgr_running_recv_time": 17825.385233846,
        "msgr_running_fast_dispatch_time": 692.710777921
    },
    "finisher-PurgeQueue": {
        "queue_len": 0,
        "complete_latency": {
            "avgcount": 22554047,
            "sum": 2515.425093728,
            "avgtime": 0.000111528
        }
    },
    "mds": {
        "request": 156766118,
        "reply": 156766111,
        "reply_latency": {
            "avgcount": 156766111,
            "sum": 337276.533677320,
            "avgtime": 0.002151463
        },
        "forward": 0,
        "dir_fetch": 6468158,
        "dir_commit": 539656,
        "dir_split": 0,
        "dir_merge": 0,
        "inode_max": 2147483647,
        "inodes": 35853368,
        "inodes_top": 23669670,
        "inodes_bottom": 12165298,
        "inodes_pin_tail": 18400,
        "inodes_pinned": 2039553,
        "inodes_expired": 142389542,
        "inodes_with_caps": 831824,
        "caps": 881384,
        "subtrees": 2,
        "traverse": 167546977,
        "traverse_hit": 53323050,
        "traverse_forward": 0,
        "traverse_discover": 0,
        "traverse_dir_fetch": 4853,
        "traverse_remote_ino": 0,
        "traverse_lock": 39597,
        "load_cent": 15676533928,
        "q": 0,
        "exported": 0,
        "exported_inodes": 0,
        "imported": 0,
        "imported_inodes": 0
    },
    "mds_cache": {
        "num_strays": 1369,
        "num_strays_delayed": 12,
        "num_strays_enqueuing": 0,
        "strays_created": 2667808,
        "strays_enqueued": 2666306,
        "strays_reintegrated": 246,
        "strays_migrated": 0,
        "num_recovering_processing": 0,
        "num_recovering_enqueued": 0,
        "num_recovering_prioritized": 0,
        "recovery_started": 524,
        "recovery_completed": 524,
        "ireq_enqueue_scrub": 0,
        "ireq_exportdir": 0,
        "ireq_flush": 0,
        "ireq_fragmentdir": 0,
        "ireq_fragstats": 0,
        "ireq_inodestats": 0
    },
    "mds_log": {
        "evadd": 34813343,
        "evex": 34809732,
        "evtrm": 34809732,
        "ev": 22489,
        "evexg": 0,
        "evexd": 728,
        "segadd": 47980,
        "segex": 47980,
        "segtrm": 47980,
        "seg": 31,
        "segexg": 0,
        "segexd": 1,
        "expos": 8687078876712,
        "wrpos": 8687143594883,
        "rdpos": 8586648077163,
        "jlat": {
            "avgcount": 12732690,
            "sum": 371322.453160705,
            "avgtime": 0.029162922
        },
        "replayed": 18878
    },
    "mds_mem": {
        "ino": 35852761,
        "ino+": 174413168,
        "ino-": 138560407,
        "dir": 1288886,
        "dir+": 6398671,
        "dir-": 5109785,
        "dn": 35853455,
        "dn+": 181545805,
        "dn-": 145692350,
        "cap": 881384,
        "cap+": 225924791,
        "cap-": 225043407,
        "rss": 124952096,
        "heap": 313964,
        "buf": 0
    },
    "mds_server": {
        "dispatch_client_request": 169327566,
        "dispatch_server_request": 0,
        "handle_client_request": 156766118,
        "handle_client_session": 1446020,
        "handle_slave_request": 0,
        "req_create": 2782862,
        "req_getattr": 7529707,
        "req_getfilelock": 5,
        "req_link": 298,
        "req_lookup": 123401139,
        "req_lookuphash": 0,
        "req_lookupino": 0,
        "req_lookupname": 89226,
        "req_lookupparent": 0,
        "req_lookupsnap": 0,
        "req_lssnap": 0,
        "req_mkdir": 42729,
        "req_mknod": 7,
        "req_mksnap": 0,
        "req_open": 5781795,
        "req_readdir": 8823398,
        "req_rename": 2066887,
        "req_renamesnap": 0,
        "req_rmdir": 32196,
        "req_rmsnap": 0,
        "req_rmxattr": 364883,
        "req_setattr": 161338,
        "req_setdirlayout": 0,
        "req_setfilelock": 5038771,
        "req_setlayout": 0,
        "req_setxattr": 2657833,
        "req_symlink": 3617,
        "req_unlink": 772280
    },
    "mds_sessions": {
        "session_count": 20,
        "session_add": 47,
        "session_remove": 27
    },
    "objecter": {
        "op_active": 2,
        "op_laggy": 0,
        "op_send": 47982687,
        "op_send_bytes": 105127128306,
        "op_resend": 13,
        "op_reply": 47982672,
        "op": 47982674,
        "op_r": 6735267,
        "op_w": 41247407,
        "op_rmw": 0,
        "op_pg": 0,
        "osdop_stat": 537779,
        "osdop_create": 4155246,
        "osdop_read": 173832,
        "osdop_write": 12906485,
        "osdop_writefull": 199372,
        "osdop_writesame": 0,
        "osdop_append": 0,
        "osdop_zero": 2,
        "osdop_truncate": 0,
        "osdop_delete": 22440537,
        "osdop_mapext": 0,
        "osdop_sparse_read": 0,
        "osdop_clonerange": 0,
        "osdop_getxattr": 6531688,
        "osdop_setxattr": 6577232,
        "osdop_cmpxattr": 0,
        "osdop_rmxattr": 0,
        "osdop_resetxattrs": 0,
        "osdop_tmap_up": 0,
        "osdop_tmap_put": 0,
        "osdop_tmap_get": 0,
        "osdop_call": 0,
        "osdop_watch": 0,
        "osdop_notify": 0,
        "osdop_src_cmpxattr": 0,
        "osdop_pgls": 0,
        "osdop_pgls_filter": 0,
        "osdop_other": 999516,
        "linger_active": 0,
        "linger_send": 0,
        "linger_resend": 0,
        "linger_ping": 0,
        "poolop_active": 0,
        "poolop_send": 0,
        "poolop_resend": 0,
        "poolstat_active": 0,
        "poolstat_send": 0,
        "poolstat_resend": 0,
        "statfs_active": 0,
        "statfs_send": 0,
        "statfs_resend": 0,
        "command_active": 0,
        "command_send": 0,
        "command_resend": 0,
        "map_epoch": 450530,
        "map_full": 0,
        "map_inc": 27226,
        "osd_sessions": 374,
        "osd_session_open": 87167,
        "osd_session_close": 86793,
        "osd_laggy": 0,
        "omap_wr": 1062388,
        "omap_rd": 12936360,
        "omap_del": 313476
    },
    "purge_queue": {
        "pq_executing_ops": 0,
        "pq_executing": 0,
        "pq_executed": 2666323
    },
    "throttle-msgr_dispatch_throttler-mds": {
        "val": 0,
        "max": 104857600,
        "get_started": 0,
        "get": 234143922,
        "get_sum": 162467221545,
        "get_or_fail_fail": 0,
        "get_or_fail_success": 234143922,
        "take": 0,
        "take_sum": 0,
        "put": 234143922,
        "put_sum": 162467221545,
        "wait": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        }
    },
    "throttle-objecter_bytes": {
        "val": 18070,
        "max": 104857600,
        "get_started": 0,
        "get": 0,
        "get_sum": 0,
        "get_or_fail_fail": 0,
        "get_or_fail_success": 0,
        "take": 47982674,
        "take_sum": 105435633841,
        "put": 24512906,
        "put_sum": 105435615771,
        "wait": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        }
    },
    "throttle-objecter_ops": {
        "val": 2,
        "max": 1024,
        "get_started": 0,
        "get": 0,
        "get_sum": 0,
        "get_or_fail_fail": 0,
        "get_or_fail_success": 0,
        "take": 47982674,
        "take_sum": 47982674,
        "put": 47982672,
        "put_sum": 47982672,
        "wait": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        }
    },
    "throttle-write_buf_throttle": {
        "val": 0,
        "max": 3758096384,
        "get_started": 0,
        "get": 2666306,
        "get_sum": 247966490,
        "get_or_fail_fail": 0,
        "get_or_fail_success": 2666306,
        "take": 0,
        "take_sum": 0,
        "put": 173754,
        "put_sum": 247966490,
        "wait": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        }
    },
    "throttle-write_buf_throttle-0x5601defe43a0": {
        "val": 0,
        "max": 3758096384,
        "get_started": 0,
        "get": 34813343,
        "get_sum": 100495515382,
        "get_or_fail_fail": 0,
        "get_or_fail_success": 34813343,
        "take": 0,
        "take_sum": 0,
        "put": 12732692,
        "put_sum": 100495515382,
        "wait": {
            "avgcount": 0,
            "sum": 0.000000000,
            "avgtime": 0.000000000
        }
    }
}



On Thu, Apr 19, 2018 at 12:49 AM, Alexandre DERUMIER <aderum...@odiso.com>
wrote:

> >>I don't find any clue. Next time it happens, could you please try
> >>"ceph tell mds.xxx heap release"
>
> It doesn't seem to work.
>
>
>
>
> USER         PID %CPU %MEM    VSZ   RSS TTY      STAT START   TIME COMMAND
> ceph     1211357 13.1 18.6 12676452 12286508 ?   Ssl  avril05 2567:28
> /usr/bin/ceph-mds -f --cluster ceph --id ceph4-2.odiso.net --setuser ceph
> --setgroup ceph
>
>
> # ceph tell mds.ceph4-2.odiso.net heap release
> mds.ceph4-2.odiso.net releasing free RAM back to system.
>
>
> USER         PID %CPU %MEM    VSZ   RSS TTY      STAT START   TIME COMMAND
> ceph     1211357 13.1 18.6 12676452 12286508 ?   Ssl  avril05 2567:36
> /usr/bin/ceph-mds -f --cluster ceph --id ceph4-2.odiso.net --setuser ceph
> --setgroup ceph
>
>
>
> I'll try to monitor memory to see exactly when it's growing; it seems to
> grow from time to time, but not continuously.
>
>
> Here are the stats after heap release:
>
>
> # ceph daemon mds.ceph4-2.odiso.net cache status
> {
>     "pool": {
>         "items": 15123841,
>         "bytes": 5167594872
>     }
> }
>
> #  ceph daemon mds.ceph4-2.odiso.net perf dump
> {
>     "AsyncMessenger::Worker-0": {
>         "msgr_recv_messages": 327887491,
>         "msgr_send_messages": 329555332,
>         "msgr_recv_bytes": 1530854660227,
>         "msgr_send_bytes": 1174619728658,
>         "msgr_created_connections": 146,
>         "msgr_active_connections": 145,
>         "msgr_running_total_time": 16828.645933488,
>         "msgr_running_send_time": 6368.459700090,
>         "msgr_running_recv_time": 14406.742804542,
>         "msgr_running_fast_dispatch_time": 1378.745242725
>     },
>     "AsyncMessenger::Worker-1": {
>         "msgr_recv_messages": 177837885,
>         "msgr_send_messages": 170796581,
>         "msgr_recv_bytes": 1426860751988,
>         "msgr_send_bytes": 166774861696,
>         "msgr_created_connections": 145,
>         "msgr_active_connections": 144,
>         "msgr_running_total_time": 9298.266921246,
>         "msgr_running_send_time": 3129.373504230,
>         "msgr_running_recv_time": 7895.052894375,
>         "msgr_running_fast_dispatch_time": 1322.886415635
>     },
>     "AsyncMessenger::Worker-2": {
>         "msgr_recv_messages": 325631551,
>         "msgr_send_messages": 314206515,
>         "msgr_recv_bytes": 1403013169198,
>         "msgr_send_bytes": 308787752784,
>         "msgr_created_connections": 138,
>         "msgr_active_connections": 133,
>         "msgr_running_total_time": 15012.588633448,
>         "msgr_running_send_time": 5510.205039583,
>         "msgr_running_recv_time": 14002.408569714,
>         "msgr_running_fast_dispatch_time": 1260.624028645
>     },
>     "finisher-PurgeQueue": {
>         "queue_len": 0,
>         "complete_latency": {
>             "avgcount": 731407,
>             "sum": 8003.599511421,
>             "avgtime": 0.010942743
>         }
>     },
>     "mds": {
>         "request": 608911096,
>         "reply": 608910899,
>         "reply_latency": {
>             "avgcount": 608910899,
>             "sum": 1641293.658633345,
>             "avgtime": 0.002695457
>         },
>         "forward": 0,
>         "dir_fetch": 32598533,
>         "dir_commit": 1230989,
>         "dir_split": 9535,
>         "dir_merge": 9523,
>         "inode_max": 2147483647,
>         "inodes": 2087784,
>         "inodes_top": 190211,
>         "inodes_bottom": 154977,
>         "inodes_pin_tail": 1742596,
>         "inodes_pinned": 1816054,
>         "inodes_expired": 5550735649,
>         "inodes_with_caps": 1814707,
>         "caps": 3170853,
>         "subtrees": 2,
>         "traverse": 645302921,
>         "traverse_hit": 390729564,
>         "traverse_forward": 0,
>         "traverse_discover": 0,
>         "traverse_dir_fetch": 26620216,
>         "traverse_remote_ino": 1968,
>         "traverse_lock": 573,
>         "load_cent": 60931206319,
>         "q": 18,
>         "exported": 0,
>         "exported_inodes": 0,
>         "imported": 0,
>         "imported_inodes": 0
>     },
>     "mds_cache": {
>         "num_strays": 1885,
>         "num_strays_delayed": 0,
>         "num_strays_enqueuing": 0,
>         "strays_created": 621082,
>         "strays_enqueued": 619458,
>         "strays_reintegrated": 72,
>         "strays_migrated": 0,
>         "num_recovering_processing": 0,
>         "num_recovering_enqueued": 0,
>         "num_recovering_prioritized": 0,
>         "recovery_started": 1,
>         "recovery_completed": 1,
>         "ireq_enqueue_scrub": 0,
>         "ireq_exportdir": 0,
>         "ireq_flush": 0,
>         "ireq_fragmentdir": 19058,
>         "ireq_fragstats": 0,
>         "ireq_inodestats": 0
>     },
>     "mds_log": {
>         "evadd": 108025412,
>         "evex": 108027485,
>         "evtrm": 108026461,
>         "ev": 25484,
>         "evexg": 0,
>         "evexd": 1024,
>         "segadd": 131605,
>         "segex": 131609,
>         "segtrm": 131608,
>         "seg": 31,
>         "segexg": 0,
>         "segexd": 1,
>         "expos": 5222483101644,
>         "wrpos": 5222526671740,
>         "rdpos": 5036811490502,
>         "jlat": {
>             "avgcount": 19597987,
>             "sum": 41720.071108694,
>             "avgtime": 0.002128793
>         },
>         "replayed": 26533
>     },
>     "mds_mem": {
>         "ino": 2087350,
>         "ino+": 5533126211,
>         "ino-": 5531038861,
>         "dir": 321262,
>         "dir+": 5672027,
>         "dir-": 5350765,
>         "dn": 2087920,
>         "dn+": 5553775487,
>         "dn-": 5551687567,
>         "cap": 3170853,
>         "cap+": 646307641,
>         "cap-": 643136788,
>         "rss": 12286508,
>         "heap": 313916,
>         "buf": 0
>     },
>     "mds_server": {
>         "dispatch_client_request": 651833084,
>         "dispatch_server_request": 0,
>         "handle_client_request": 608911096,
>         "handle_client_session": 5163844,
>         "handle_slave_request": 0,
>         "req_create": 754987,
>         "req_getattr": 5199299,
>         "req_getfilelock": 0,
>         "req_link": 170,
>         "req_lookup": 476304151,
>         "req_lookuphash": 0,
>         "req_lookupino": 0,
>         "req_lookupname": 16868,
>         "req_lookupparent": 0,
>         "req_lookupsnap": 0,
>         "req_lssnap": 0,
>         "req_mkdir": 12204,
>         "req_mknod": 0,
>         "req_mksnap": 0,
>         "req_open": 106156167,
>         "req_readdir": 20293077,
>         "req_rename": 28443,
>         "req_renamesnap": 0,
>         "req_rmdir": 17522,
>         "req_rmsnap": 0,
>         "req_rmxattr": 0,
>         "req_setattr": 34735,
>         "req_setdirlayout": 0,
>         "req_setfilelock": 238574,
>         "req_setlayout": 0,
>         "req_setxattr": 2,
>         "req_symlink": 122,
>         "req_unlink": 609565
>     },
>     "mds_sessions": {
>         "session_count": 307,
>         "session_add": 398,
>         "session_remove": 91
>     },
>     "objecter": {
>         "op_active": 0,
>         "op_laggy": 0,
>         "op_send": 60152761,
>         "op_send_bytes": 189780235877,
>         "op_resend": 4,
>         "op_reply": 60152757,
>         "op": 60152757,
>         "op_r": 32760612,
>         "op_w": 27392145,
>         "op_rmw": 0,
>         "op_pg": 0,
>         "osdop_stat": 1131412,
>         "osdop_create": 791110,
>         "osdop_read": 27868,
>         "osdop_write": 19625820,
>         "osdop_writefull": 81003,
>         "osdop_writesame": 0,
>         "osdop_append": 0,
>         "osdop_zero": 2,
>         "osdop_truncate": 4161,
>         "osdop_delete": 931372,
>         "osdop_mapext": 0,
>         "osdop_sparse_read": 0,
>         "osdop_clonerange": 0,
>         "osdop_getxattr": 9914736,
>         "osdop_setxattr": 1582220,
>         "osdop_cmpxattr": 0,
>         "osdop_rmxattr": 0,
>         "osdop_resetxattrs": 0,
>         "osdop_tmap_up": 0,
>         "osdop_tmap_put": 0,
>         "osdop_tmap_get": 0,
>         "osdop_call": 0,
>         "osdop_watch": 0,
>         "osdop_notify": 0,
>         "osdop_src_cmpxattr": 0,
>         "osdop_pgls": 0,
>         "osdop_pgls_filter": 0,
>         "osdop_other": 4645746,
>         "linger_active": 0,
>         "linger_send": 0,
>         "linger_resend": 0,
>         "linger_ping": 0,
>         "poolop_active": 0,
>         "poolop_send": 0,
>         "poolop_resend": 0,
>         "poolstat_active": 0,
>         "poolstat_send": 0,
>         "poolstat_resend": 0,
>         "statfs_active": 0,
>         "statfs_send": 0,
>         "statfs_resend": 0,
>         "command_active": 0,
>         "command_send": 0,
>         "command_resend": 0,
>         "map_epoch": 3121,
>         "map_full": 0,
>         "map_inc": 76,
>         "osd_sessions": 18,
>         "osd_session_open": 20,
>         "osd_session_close": 2,
>         "osd_laggy": 0,
>         "omap_wr": 2227270,
>         "omap_rd": 65197068,
>         "omap_del": 48058
>     },
>     "purge_queue": {
>         "pq_executing_ops": 0,
>         "pq_executing": 0,
>         "pq_executed": 619458
>     },
>     "throttle-msgr_dispatch_throttler-mds": {
>         "val": 0,
>         "max": 104857600,
>         "get_started": 0,
>         "get": 831356927,
>         "get_sum": 4299208168815,
>         "get_or_fail_fail": 0,
>         "get_or_fail_success": 831356927,
>         "take": 0,
>         "take_sum": 0,
>         "put": 831356927,
>         "put_sum": 4299208168815,
>         "wait": {
>             "avgcount": 0,
>             "sum": 0.000000000,
>             "avgtime": 0.000000000
>         }
>     },
>     "throttle-objecter_bytes": {
>         "val": 0,
>         "max": 104857600,
>         "get_started": 0,
>         "get": 0,
>         "get_sum": 0,
>         "get_or_fail_fail": 0,
>         "get_or_fail_success": 0,
>         "take": 60152757,
>         "take_sum": 189890861007,
>         "put": 54571445,
>         "put_sum": 189890861007,
>         "wait": {
>             "avgcount": 0,
>             "sum": 0.000000000,
>             "avgtime": 0.000000000
>         }
>     },
>     "throttle-objecter_ops": {
>         "val": 0,
>         "max": 1024,
>         "get_started": 0,
>         "get": 0,
>         "get_sum": 0,
>         "get_or_fail_fail": 0,
>         "get_or_fail_success": 0,
>         "take": 60152757,
>         "take_sum": 60152757,
>         "put": 60152757,
>         "put_sum": 60152757,
>         "wait": {
>             "avgcount": 0,
>             "sum": 0.000000000,
>             "avgtime": 0.000000000
>         }
>     },
>     "throttle-write_buf_throttle": {
>         "val": 0,
>         "max": 3758096384,
>         "get_started": 0,
>         "get": 619458,
>         "get_sum": 57609986,
>         "get_or_fail_fail": 0,
>         "get_or_fail_success": 619458,
>         "take": 0,
>         "take_sum": 0,
>         "put": 27833,
>         "put_sum": 57609986,
>         "wait": {
>             "avgcount": 0,
>             "sum": 0.000000000,
>             "avgtime": 0.000000000
>         }
>     },
>     "throttle-write_buf_throttle-0x559471d00140": {
>         "val": 105525,
>         "max": 3758096384,
>         "get_started": 0,
>         "get": 108025412,
>         "get_sum": 185715179864,
>         "get_or_fail_fail": 0,
>         "get_or_fail_success": 108025412,
>         "take": 0,
>         "take_sum": 0,
>         "put": 19597987,
>         "put_sum": 185715074339,
>         "wait": {
>             "avgcount": 0,
>             "sum": 0.000000000,
>             "avgtime": 0.000000000
>         }
>     }
> }
>
> ----- Mail original -----
> De: "Zheng Yan" <uker...@gmail.com>
> À: "aderumier" <aderum...@odiso.com>
> Cc: "Patrick Donnelly" <pdonn...@redhat.com>, "ceph-users" <
> ceph-users@lists.ceph.com>
> Envoyé: Mardi 17 Avril 2018 05:20:18
> Objet: Re: [ceph-users] ceph mds memory usage 20GB : is it normal ?
>
> On Sat, Apr 14, 2018 at 9:23 PM, Alexandre DERUMIER <aderum...@odiso.com>
> wrote:
> > Hi,
> >
> > Still leaking again after update to 12.2.4, around 17G after 9 days
> >
> >
> >
> >
> > USER PID %CPU %MEM VSZ RSS TTY STAT START TIME COMMAND
> >
> > ceph 629903 50.7 25.9 17473680 17082432 ? Ssl avril05 6498:21
> /usr/bin/ceph-mds -f --cluster ceph --id ceph4-1.odiso.net --setuser ceph
> --setgroup ceph
> >
> >
> >
> >
> >
> > ~# ceph daemon mds.ceph4-1.odiso.net cache status
> > {
> > "pool": {
> > "items": 16019302,
> > "bytes": 5100941968
> > }
> > }
> >
> >
> >
> >
> >
> > # ceph daemon mds.ceph4-1.odiso.net perf dump
> > {
> > "AsyncMessenger::Worker-0": {
> > "msgr_recv_messages": 648541059,
> > "msgr_send_messages": 666102301,
> > "msgr_recv_bytes": 4943336751206,
> > "msgr_send_bytes": 868468165048,
> > "msgr_created_connections": 167,
> > "msgr_active_connections": 166,
> > "msgr_running_total_time": 33884.943400671,
> > "msgr_running_send_time": 12229.226645264,
> > "msgr_running_recv_time": 26234.680757843,
> > "msgr_running_fast_dispatch_time": 4650.248980986
> > },
> > "AsyncMessenger::Worker-1": {
> > "msgr_recv_messages": 732301444,
> > "msgr_send_messages": 750526966,
> > "msgr_recv_bytes": 4248782228635,
> > "msgr_send_bytes": 2379403291660,
> > "msgr_created_connections": 172,
> > "msgr_active_connections": 171,
> > "msgr_running_total_time": 38490.093448635,
> > "msgr_running_send_time": 14692.222019414,
> > "msgr_running_recv_time": 31000.304091618,
> > "msgr_running_fast_dispatch_time": 3945.573521893
> > },
> > "AsyncMessenger::Worker-2": {
> > "msgr_recv_messages": 503228767,
> > "msgr_send_messages": 485729577,
> > "msgr_recv_bytes": 3644656184942,
> > "msgr_send_bytes": 526380645708,
> > "msgr_created_connections": 156,
> > "msgr_active_connections": 156,
> > "msgr_running_total_time": 26566.051442840,
> > "msgr_running_send_time": 9335.249687474,
> > "msgr_running_recv_time": 22643.927960456,
> > "msgr_running_fast_dispatch_time": 3426.566334706
> > },
> > "finisher-PurgeQueue": {
> > "queue_len": 0,
> > "complete_latency": {
> > "avgcount": 2077128,
> > "sum": 10029.468276512,
> > "avgtime": 0.004828526
> > }
> > },
> > "mds": {
> > "request": 1320419754,
> > "reply": 1320418963,
> > "reply_latency": {
> > "avgcount": 1320418963,
> > "sum": 3567340.917522550,
> > "avgtime": 0.002701673
> > },
> > "forward": 0,
> > "dir_fetch": 95955541,
> > "dir_commit": 5380286,
> > "dir_split": 29080,
> > "dir_merge": 28453,
> > "inode_max": 2147483647,
> > "inodes": 2049324,
> > "inodes_top": 55759,
> > "inodes_bottom": 118910,
> > "inodes_pin_tail": 1874655,
> > "inodes_pinned": 1969667,
> > "inodes_expired": 14225864524,
> > "inodes_with_caps": 1969030,
> > "caps": 3010600,
> > "subtrees": 2,
> > "traverse": 1433042396,
> > "traverse_hit": 855810795,
> > "traverse_forward": 0,
> > "traverse_discover": 0,
> > "traverse_dir_fetch": 75553963,
> > "traverse_remote_ino": 5462,
> > "traverse_lock": 217,
> > "load_cent": 132079451933,
> > "q": 41,
> > "exported": 0,
> > "exported_inodes": 0,
> > "imported": 0,
> > "imported_inodes": 0
> > },
> > "mds_cache": {
> > "num_strays": 150,
> > "num_strays_delayed": 0,
> > "num_strays_enqueuing": 0,
> > "strays_created": 2317004,
> > "strays_enqueued": 2316671,
> > "strays_reintegrated": 288,
> > "strays_migrated": 0,
> > "num_recovering_processing": 0,
> > "num_recovering_enqueued": 0,
> > "num_recovering_prioritized": 0,
> > "recovery_started": 0,
> > "recovery_completed": 0,
> > "ireq_enqueue_scrub": 0,
> > "ireq_exportdir": 0,
> > "ireq_flush": 0,
> > "ireq_fragmentdir": 57533,
> > "ireq_fragstats": 0,
> > "ireq_inodestats": 0
> > },
> > "mds_log": {
> > "evadd": 293928039,
> > "evex": 293928281,
> > "evtrm": 293926233,
> > "ev": 26595,
> > "evexg": 0,
> > "evexd": 2048,
> > "segadd": 365381,
> > "segex": 365382,
> > "segtrm": 365380,
> > "seg": 32,
> > "segexg": 0,
> > "segexd": 2,
> > "expos": 4997676796422,
> > "wrpos": 4997732797135,
> > "rdpos": 4232612352311,
> > "jlat": {
> > "avgcount": 62629276,
> > "sum": 260619.838247062,
> > "avgtime": 0.004161310
> > },
> > "replayed": 24789
> > },
> > "mds_mem": {
> > "ino": 2048405,
> > "ino+": 14160488289,
> > "ino-": 14158439884,
> > "dir": 377882,
> > "dir+": 15421679,
> > "dir-": 15043797,
> > "dn": 2049614,
> > "dn+": 14231703198,
> > "dn-": 14229653584,
> > "cap": 3010600,
> > "cap+": 1555206662,
> > "cap-": 1552196062,
> > "rss": 17082432,
> > "heap": 313916,
> > "buf": 0
> > },
> > "mds_server": {
> > "dispatch_client_request": 1437033326,
> > "dispatch_server_request": 0,
> > "handle_client_request": 1320419754,
> > "handle_client_session": 11542297,
> > "handle_slave_request": 0,
> > "req_create": 18618128,
> > "req_getattr": 11195570,
> > "req_getfilelock": 0,
> > "req_link": 411,
> > "req_lookup": 1005844421,
> > "req_lookuphash": 0,
> > "req_lookupino": 0,
> > "req_lookupname": 37344,
> > "req_lookupparent": 0,
> > "req_lookupsnap": 0,
> > "req_lssnap": 0,
> > "req_mkdir": 691747,
> > "req_mknod": 18,
> > "req_mksnap": 0,
> > "req_open": 230213054,
> > "req_readdir": 50618109,
> > "req_rename": 17377032,
> > "req_renamesnap": 0,
> > "req_rmdir": 463707,
> > "req_rmsnap": 0,
> > "req_rmxattr": 0,
> > "req_setattr": 1963949,
> > "req_setdirlayout": 0,
> > "req_setfilelock": 210187,
> > "req_setlayout": 0,
> > "req_setxattr": 8,
> > "req_symlink": 1971,
> > "req_unlink": 1801435
> > },
> > "mds_sessions": {
> > "session_count": 305,
> > "session_add": 473,
> > "session_remove": 168
> > },
> > "objecter": {
> > "op_active": 0,
> > "op_laggy": 0,
> > "op_send": 197270397,
> > "op_send_bytes": 796275884964,
> > "op_resend": 7,
> > "op_reply": 197270390,
> > "op": 197270390,
> > "op_r": 96075672,
> > "op_w": 101194718,
> > "op_rmw": 0,
> > "op_pg": 0,
> > "osdop_stat": 4428036,
> > "osdop_create": 19400797,
> > "osdop_read": 31288,
> > "osdop_write": 62709547,
> > "osdop_writefull": 165583,
> > "osdop_writesame": 0,
> > "osdop_append": 0,
> > "osdop_zero": 2,
> > "osdop_truncate": 13280,
> > "osdop_delete": 3185444,
> > "osdop_mapext": 0,
> > "osdop_sparse_read": 0,
> > "osdop_clonerange": 0,
> > "osdop_getxattr": 27007173,
> > "osdop_setxattr": 38801594,
> > "osdop_cmpxattr": 0,
> > "osdop_rmxattr": 0,
> > "osdop_resetxattrs": 0,
> > "osdop_tmap_up": 0,
> > "osdop_tmap_put": 0,
> > "osdop_tmap_get": 0,
> > "osdop_call": 0,
> > "osdop_watch": 0,
> > "osdop_notify": 0,
> > "osdop_src_cmpxattr": 0,
> > "osdop_pgls": 0,
> > "osdop_pgls_filter": 0,
> > "osdop_other": 10143158,
> > "linger_active": 0,
> > "linger_send": 0,
> > "linger_resend": 0,
> > "linger_ping": 0,
> > "poolop_active": 0,
> > "poolop_send": 0,
> > "poolop_resend": 0,
> > "poolstat_active": 0,
> > "poolstat_send": 0,
> > "poolstat_resend": 0,
> > "statfs_active": 0,
> > "statfs_send": 0,
> > "statfs_resend": 0,
> > "command_active": 0,
> > "command_send": 0,
> > "command_resend": 0,
> > "map_epoch": 3044,
> > "map_full": 0,
> > "map_inc": 160,
> > "osd_sessions": 18,
> > "osd_session_open": 20,
> > "osd_session_close": 2,
> > "osd_laggy": 0,
> > "omap_wr": 9743114,
> > "omap_rd": 191911089,
> > "omap_del": 684272
> > },
> > "purge_queue": {
> > "pq_executing_ops": 0,
> > "pq_executing": 0,
> > "pq_executed": 2316671
> > },
> > "throttle-msgr_dispatch_throttler-mds": {
> > "val": 0,
> > "max": 104857600,
> > "get_started": 0,
> > "get": 1884071270,
> > "get_sum": 12697353890803,
> > "get_or_fail_fail": 0,
> > "get_or_fail_success": 1884071270,
> > "take": 0,
> > "take_sum": 0,
> > "put": 1884071270,
> > "put_sum": 12697353890803,
> > "wait": {
> > "avgcount": 0,
> > "sum": 0.000000000,
> > "avgtime": 0.000000000
> > }
> > },
> > "throttle-objecter_bytes": {
> > "val": 0,
> > "max": 104857600,
> > "get_started": 0,
> > "get": 0,
> > "get_sum": 0,
> > "get_or_fail_fail": 0,
> > "get_or_fail_success": 0,
> > "take": 197270390,
> > "take_sum": 796529593788,
> > "put": 183928495,
> > "put_sum": 796529593788,
> > "wait": {
> > "avgcount": 0,
> > "sum": 0.000000000,
> > "avgtime": 0.000000000
> > }
> > },
> > "throttle-objecter_ops": {
> > "val": 0,
> > "max": 1024,
> > "get_started": 0,
> > "get": 0,
> > "get_sum": 0,
> > "get_or_fail_fail": 0,
> > "get_or_fail_success": 0,
> > "take": 197270390,
> > "take_sum": 197270390,
> > "put": 197270390,
> > "put_sum": 197270390,
> > "wait": {
> > "avgcount": 0,
> > "sum": 0.000000000,
> > "avgtime": 0.000000000
> > }
> > },
> > "throttle-write_buf_throttle": {
> > "val": 0,
> > "max": 3758096384,
> > "get_started": 0,
> > "get": 2316671,
> > "get_sum": 215451035,
> > "get_or_fail_fail": 0,
> > "get_or_fail_success": 2316671,
> > "take": 0,
> > "take_sum": 0,
> > "put": 31223,
> > "put_sum": 215451035,
> > "wait": {
> > "avgcount": 0,
> > "sum": 0.000000000,
> > "avgtime": 0.000000000
> > }
> > },
> > "throttle-write_buf_throttle-0x563c33bea220": {
> > "val": 29763,
> > "max": 3758096384,
> > "get_started": 0,
> > "get": 293928039,
> > "get_sum": 765120443785,
> > "get_or_fail_fail": 0,
> > "get_or_fail_success": 293928039,
> > "take": 0,
> > "take_sum": 0,
> > "put": 62629276,
> > "put_sum": 765120414022,
> > "wait": {
> > "avgcount": 0,
> > "sum": 0.000000000,
> > "avgtime": 0.000000000
> > }
> > }
> > }
> >
>
> I don't find any clue. Next time it happens, could you please try
> "ceph tell mds.xxx heap release"
>
> >
> >
> > # ceph status
> > cluster:
> > id: e22b8e83-3036-4fe5-8fd5-5ce9d539beca
> > health: HEALTH_OK
> >
> > services:
> > mon: 3 daemons, quorum ceph4-1,ceph4-2,ceph4-3
> > mgr: ceph4-2.odiso.net(active), standbys: ceph4-3.odiso.net,
> ceph4-1.odiso.net
> > mds: cephfs4-1/1/1 up {0=ceph4-1.odiso.net=up:active}, 2 up:standby
> > osd: 18 osds: 18 up, 18 in
> >
> > data:
> > pools: 11 pools, 1992 pgs
> > objects: 72258k objects, 5918 GB
> > usage: 20088 GB used, 6737 GB / 26825 GB avail
> > pgs: 1992 active+clean
> >
> > io:
> > client: 3099 kB/s rd, 6412 kB/s wr, 108 op/s rd, 481 op/s wr
> >
> >
> > ----- Mail original -----
> > De: "Patrick Donnelly" <pdonn...@redhat.com>
> > À: "aderumier" <aderum...@odiso.com>
> > Cc: "ceph-users" <ceph-users@lists.ceph.com>
> > Envoyé: Mardi 27 Mars 2018 20:35:08
> > Objet: Re: [ceph-users] ceph mds memory usage 20GB : is it normal ?
> >
> > Hello Alexandre,
> >
> > On Thu, Mar 22, 2018 at 2:29 AM, Alexandre DERUMIER <aderum...@odiso.com>
> wrote:
> >> Hi,
> >>
> >> I've been running cephfs for 2 months now,
> >>
> >> and my active mds memory usage is around 20G now (still growing).
> >>
> >> ceph 1521539 10.8 31.2 20929836 20534868 ? Ssl janv.26 8573:34
> /usr/bin/ceph-mds -f --cluster ceph --id 2 --setuser ceph --setgroup ceph
> >> USER PID %CPU %MEM VSZ RSS TTY STAT START TIME COMMAND
> >>
> >>
> >> this is on luminous 12.2.2
> >>
> >> only tuning done is:
> >>
> >> mds_cache_memory_limit = 5368709120
> >>
> >>
> >> (5GB). I know it's a soft limit, but 20G seems quite huge vs 5GB ....
> >>
> >>
> >> Is it normal ?
> >
> > No, that's definitely not normal!
> >
> >
> >> # ceph daemon mds.2 perf dump mds
> >> {
> >> "mds": {
> >> "request": 1444009197,
> >> "reply": 1443999870,
> >> "reply_latency": {
> >> "avgcount": 1443999870,
> >> "sum": 1657849.656122933,
> >> "avgtime": 0.001148095
> >> },
> >> "forward": 0,
> >> "dir_fetch": 51740910,
> >> "dir_commit": 9069568,
> >> "dir_split": 64367,
> >> "dir_merge": 58016,
> >> "inode_max": 2147483647,
> >> "inodes": 2042975,
> >> "inodes_top": 152783,
> >> "inodes_bottom": 138781,
> >> "inodes_pin_tail": 1751411,
> >> "inodes_pinned": 1824714,
> >> "inodes_expired": 7258145573,
> >> "inodes_with_caps": 1812018,
> >> "caps": 2538233,
> >> "subtrees": 2,
> >> "traverse": 1591668547,
> >> "traverse_hit": 1259482170,
> >> "traverse_forward": 0,
> >> "traverse_discover": 0,
> >> "traverse_dir_fetch": 30827836,
> >> "traverse_remote_ino": 7510,
> >> "traverse_lock": 86236,
> >> "load_cent": 144401980319,
> >> "q": 49,
> >> "exported": 0,
> >> "exported_inodes": 0,
> >> "imported": 0,
> >> "imported_inodes": 0
> >> }
> >> }
> >
> > Can you also share `ceph daemon mds.2 cache status`, the full `ceph
> > daemon mds.2 perf dump`, and `ceph status`?
> >
> > Note [1] will be in 12.2.5 and may help with your issue.
> >
> > [1] https://github.com/ceph/ceph/pull/20527
> >
> > --
> > Patrick Donnelly
> >
> > _______________________________________________
> > ceph-users mailing list
> > ceph-users@lists.ceph.com
> > http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com
>
> _______________________________________________
> ceph-users mailing list
> ceph-users@lists.ceph.com
> http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com
>
_______________________________________________
ceph-users mailing list
ceph-users@lists.ceph.com
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com

Reply via email to