Hi,
We have a CephFS Quincy cluster (17.2.7) used by OpenShift for PVC
provisioning (and a lot of snapshots) with the ceph-csi driver.
On the metadata pool, we observed a continuous increase in write
throughput, from 10MB/s just after a restart of the MDS to
more than 200MB/s after 2 weeks.
We first thought it was related to the snapshot deletion activity (lots
of creations and deletions every day) because we saw the same kind of
increase in the number of strays.
We tried to evaluate strays using the recursive scrub [1]. Strays
decreased from 450K to 350K, but there was no impact on the metadata
write throughput.
Only an MDS restart brings the write throughput back to normal.
I saw this very similar issue https://tracker.ceph.com/issues/53542 and
I'm wondering if this has been fixed in Quincy?
I tried to increase "mds_log_events_per_segment" and
"mds_log_max_segments" but nothing helped.
As I answered in the tracker, I also observed large 4M objects in
objecter_requests.
Is there anything we can do to prevent this and avoid restarting the MDS
every week? If we don't restart it, PVC operations stay blocked on the
OpenShift side.
Cheers,
Adrien
[1]
https://docs.ceph.com/en/quincy/cephfs/scrub/#evaluate-strays-using-recursive-scrub
Here is the MDS perf dump, in case it helps:
{
"AsyncMessenger::Worker-0": {
"msgr_recv_messages": 452889868,
"msgr_send_messages": 210932005,
"msgr_recv_bytes": 381554341884,
"msgr_send_bytes": 31897539519042,
"msgr_created_connections": 5392,
"msgr_active_connections": 213,
"msgr_running_total_time": 121964.941943159,
"msgr_running_send_time": 85193.342708858,
"msgr_running_recv_time": 40274.249544605,
"msgr_running_fast_dispatch_time": 15899.702855460,
"msgr_send_messages_queue_lat": {
"avgcount": 210932003,
"sum": 140532.121408536,
"avgtime": 0.000666243
},
"msgr_handle_ack_lat": {
"avgcount": 110313532,
"sum": 120.513184629,
"avgtime": 0.000001092
}
},
"AsyncMessenger::Worker-1": {
"msgr_recv_messages": 393334601,
"msgr_send_messages": 170841678,
"msgr_recv_bytes": 307649876710,
"msgr_send_bytes": 23017109865662,
"msgr_created_connections": 12712,
"msgr_active_connections": 218,
"msgr_running_total_time": 93136.265793790,
"msgr_running_send_time": 63604.143090523,
"msgr_running_recv_time": 30881.136884349,
"msgr_running_fast_dispatch_time": 12763.126679902,
"msgr_send_messages_queue_lat": {
"avgcount": 170841678,
"sum": 153574.265735747,
"avgtime": 0.000898927
},
"msgr_handle_ack_lat": {
"avgcount": 81594240,
"sum": 65.480936528,
"avgtime": 0.000000802
}
},
"AsyncMessenger::Worker-2": {
"msgr_recv_messages": 481956104,
"msgr_send_messages": 301348856,
"msgr_recv_bytes": 349838769013,
"msgr_send_bytes": 26783354654792,
"msgr_created_connections": 34394,
"msgr_active_connections": 215,
"msgr_running_total_time": 108807.807447203,
"msgr_running_send_time": 74577.666991790,
"msgr_running_recv_time": 41392.667149426,
"msgr_running_fast_dispatch_time": 13971.142479134,
"msgr_send_messages_queue_lat": {
"avgcount": 301348847,
"sum": 138655.862052108,
"avgtime": 0.000460117
},
"msgr_handle_ack_lat": {
"avgcount": 156375610,
"sum": 144.168097813,
"avgtime": 0.000000921
}
},
"cct": {
"total_workers": 1,
"unhealthy_workers": 0
},
"finisher-MDSRank": {
"queue_len": 0,
"complete_latency": {
"avgcount": 121564865,
"sum": 160601.142778677,
"avgtime": 0.001321114
}
},
"finisher-PurgeQueue": {
"queue_len": 0,
"complete_latency": {
"avgcount": 27338289,
"sum": 8785.146872773,
"avgtime": 0.000321349
}
},
"mds": {
"request": 210298249,
"reply": 210297956,
"reply_latency": {
"avgcount": 210297956,
"sum": 1148337.019419176,
"avgtime": 0.005460523
},
"slow_reply": 15,
"forward": 0,
"dir_fetch": 49843551,
"dir_commit": 65281430,
"dir_split": 6,
"dir_merge": 17,
"inodes": 5113817,
"inodes_top": 2080514,
"inodes_bottom": 2263929,
"inodes_pin_tail": 769374,
"inodes_pinned": 2141653,
"inodes_expired": 538454667,
"inodes_with_caps": 413837,
"caps": 457168,
"subtrees": 2,
"traverse": 262113637,
"traverse_hit": 245551660,
"traverse_forward": 0,
"traverse_discover": 0,
"traverse_dir_fetch": 1642726,
"traverse_remote_ino": 0,
"traverse_lock": 140893,
"load_cent": 392174,
"q": 0,
"exported": 0,
"exported_inodes": 0,
"imported": 0,
"imported_inodes": 0,
"openino_dir_fetch": 407633,
"openino_backtrace_fetch": 7011755,
"openino_peer_discover": 0,
"root_rfiles": 11618539,
"root_rbytes": 11106041675168,
"root_rsnaps": 7576,
"scrub_backtrace_fetch": 7009788,
"scrub_set_tag": 0,
"scrub_backtrace_repaired": 0,
"scrub_inotable_repaired": 0,
"scrub_dir_inodes": 1526050,
"scrub_dir_base_inodes": 3,
"scrub_dirfrag_rstats": 1526047,
"scrub_file_inodes": 5483733,
"handle_inode_file_caps": 0,
"ceph_cap_op_revoke": 5076186,
"ceph_cap_op_grant": 60192925,
"ceph_cap_op_trunc": 658164,
"ceph_cap_op_flushsnap_ack": 0,
"ceph_cap_op_flush_ack": 6,
"handle_client_caps": 70349707,
"handle_client_caps_dirty": 21547109,
"handle_client_cap_release": 11468771,
"process_request_cap_release": 83631146
},
"mds_cache": {
"num_strays": 347989,
"num_strays_delayed": 0,
"num_strays_enqueuing": 0,
"strays_created": 19259955,
"strays_enqueued": 20275628,
"strays_reintegrated": 469,
"strays_migrated": 0,
"num_recovering_processing": 0,
"num_recovering_enqueued": 0,
"num_recovering_prioritized": 0,
"recovery_started": 484,
"recovery_completed": 484,
"ireq_enqueue_scrub": 2,
"ireq_exportdir": 0,
"ireq_flush": 0,
"ireq_fragmentdir": 23,
"ireq_fragstats": 0,
"ireq_inodestats": 0
},
"mds_log": {
"evadd": 148604638,
"evex": 148687537,
"evtrm": 148686957,
"ev": 4640,
"evexg": 0,
"evexd": 580,
"segadd": 5143165,
"segex": 5143142,
"segtrm": 5143128,
"seg": 166,
"segexg": 0,
"segexd": 14,
"expos": 2655552712972056,
"wrpos": 2655556750517207,
"rdpos": 2574717722176006,
"jlat": {
"avgcount": 23948614,
"sum": 1026045.477642289,
"avgtime": 0.042843626
},
"replayed": 86959
},
"mds_mem": {
"ino": 5091233,
"ino+": 537662754,
"ino-": 532571521,
"dir": 385511,
"dir+": 48473070,
"dir-": 48087559,
"dn": 5114534,
"dn+": 564239550,
"dn-": 559125016,
"cap": 457198,
"cap+": 296065469,
"cap-": 295608271,
"rss": 21947020,
"heap": 223516
},
"mds_server": {
"dispatch_client_request": 265192774,
"dispatch_server_request": 0,
"handle_client_request": 210298249,
"handle_client_session": 44456292,
"handle_peer_request": 0,
"req_create_latency": {
"avgcount": 19045215,
"sum": 243874.341572325,
"avgtime": 0.012805019
},
"req_getattr_latency": {
"avgcount": 13244393,
"sum": 122372.637049784,
"avgtime": 0.009239580
},
"req_getfilelock_latency": {
"avgcount": 0,
"sum": 0.000000000,
"avgtime": 0.000000000
},
"req_link_latency": {
"avgcount": 108,
"sum": 0.089209875,
"avgtime": 0.000826017
},
"req_lookup_latency": {
"avgcount": 42591076,
"sum": 61153.611919024,
"avgtime": 0.001435831
},
"req_lookuphash_latency": {
"avgcount": 0,
"sum": 0.000000000,
"avgtime": 0.000000000
},
"req_lookupino_latency": {
"avgcount": 6119,
"sum": 12.051986148,
"avgtime": 0.001969600
},
"req_lookupname_latency": {
"avgcount": 0,
"sum": 0.000000000,
"avgtime": 0.000000000
},
"req_lookupparent_latency": {
"avgcount": 0,
"sum": 0.000000000,
"avgtime": 0.000000000
},
"req_lookupsnap_latency": {
"avgcount": 11789,
"sum": 1.916955778,
"avgtime": 0.000162605
},
"req_lssnap_latency": {
"avgcount": 174999,
"sum": 59.628852495,
"avgtime": 0.000340738
},
"req_mkdir_latency": {
"avgcount": 386018,
"sum": 5323.027974258,
"avgtime": 0.013789584
},
"req_mknod_latency": {
"avgcount": 0,
"sum": 0.000000000,
"avgtime": 0.000000000
},
"req_mksnap_latency": {
"avgcount": 11784,
"sum": 4342.774640033,
"avgtime": 0.368531452
},
"req_open_latency": {
"avgcount": 1719118,
"sum": 2957.961208770,
"avgtime": 0.001720627
},
"req_readdir_latency": {
"avgcount": 49303877,
"sum": 153658.488737674,
"avgtime": 0.003116559
},
"req_rename_latency": {
"avgcount": 1021332,
"sum": 4173.200914205,
"avgtime": 0.004086037
},
"req_renamesnap_latency": {
"avgcount": 0,
"sum": 0.000000000,
"avgtime": 0.000000000
},
"req_rmdir_latency": {
"avgcount": 355348,
"sum": 6609.629921457,
"avgtime": 0.018600442
},
"req_rmsnap_latency": {
"avgcount": 11476,
"sum": 4647.171801363,
"avgtime": 0.404947002
},
"req_rmxattr_latency": {
"avgcount": 0,
"sum": 0.000000000,
"avgtime": 0.000000000
},
"req_setattr_latency": {
"avgcount": 951092,
"sum": 60918.695526648,
"avgtime": 0.064051317
},
"req_setdirlayout_latency": {
"avgcount": 0,
"sum": 0.000000000,
"avgtime": 0.000000000
},
"req_setfilelock_latency": {
"avgcount": 42236246,
"sum": 87407.285106607,
"avgtime": 0.002069485
},
"req_setlayout_latency": {
"avgcount": 0,
"sum": 0.000000000,
"avgtime": 0.000000000
},
"req_setxattr_latency": {
"avgcount": 20450454,
"sum": 351779.625311317,
"avgtime": 0.017201555
},
"req_symlink_latency": {
"avgcount": 2,
"sum": 0.002286490,
"avgtime": 0.001143245
},
"req_unlink_latency": {
"avgcount": 18777510,
"sum": 39044.878444925,
"avgtime": 0.002079342
},
"cap_revoke_eviction": 0,
"cap_acquisition_throttle": 0,
"req_getvxattr_latency": {
"avgcount": 0,
"sum": 0.000000000,
"avgtime": 0.000000000
}
},
"mds_sessions": {
"session_count": 545,
"session_add": 26598,
"session_remove": 26053,
"sessions_open": 545,
"sessions_stale": 0,
"total_load": 11076,
"average_load": 20,
"avg_session_uptime": 37409721,
"mdthresh_evicted": 0
},
"mempool": {
"bloom_filter_bytes": 10309112,
"bloom_filter_items": 10309112,
"bluestore_alloc_bytes": 0,
"bluestore_alloc_items": 0,
"bluestore_cache_data_bytes": 0,
"bluestore_cache_data_items": 0,
"bluestore_cache_onode_bytes": 0,
"bluestore_cache_onode_items": 0,
"bluestore_cache_meta_bytes": 0,
"bluestore_cache_meta_items": 0,
"bluestore_cache_other_bytes": 0,
"bluestore_cache_other_items": 0,
"bluestore_Buffer_bytes": 0,
"bluestore_Buffer_items": 0,
"bluestore_Extent_bytes": 0,
"bluestore_Extent_items": 0,
"bluestore_Blob_bytes": 0,
"bluestore_Blob_items": 0,
"bluestore_SharedBlob_bytes": 0,
"bluestore_SharedBlob_items": 0,
"bluestore_inline_bl_bytes": 0,
"bluestore_inline_bl_items": 0,
"bluestore_fsck_bytes": 0,
"bluestore_fsck_items": 0,
"bluestore_txc_bytes": 0,
"bluestore_txc_items": 0,
"bluestore_writing_deferred_bytes": 0,
"bluestore_writing_deferred_items": 0,
"bluestore_writing_bytes": 0,
"bluestore_writing_items": 0,
"bluefs_bytes": 0,
"bluefs_items": 0,
"bluefs_file_reader_bytes": 0,
"bluefs_file_reader_items": 0,
"bluefs_file_writer_bytes": 0,
"bluefs_file_writer_items": 0,
"buffer_anon_bytes": 284731060,
"buffer_anon_items": 5959770,
"buffer_meta_bytes": 616,
"buffer_meta_items": 7,
"osd_bytes": 0,
"osd_items": 0,
"osd_mapbl_bytes": 0,
"osd_mapbl_items": 0,
"osd_pglog_bytes": 0,
"osd_pglog_items": 0,
"osdmap_bytes": 81480,
"osdmap_items": 2396,
"osdmap_mapping_bytes": 0,
"osdmap_mapping_items": 0,
"pgmap_bytes": 0,
"pgmap_items": 0,
"mds_co_bytes": 16277298996,
"mds_co_items": 260156681,
"unittest_1_bytes": 0,
"unittest_1_items": 0,
"unittest_2_bytes": 0,
"unittest_2_items": 0
},
"objecter": {
"op_active": 11,
"op_laggy": 0,
"op_send": 253406063,
"op_send_bytes": 81014361089067,
"op_resend": 0,
"op_reply": 253406052,
"oplen_avg": {
"avgcount": 253406063,
"sum": 451959672
},
"op": 253406063,
"op_r": 58269533,
"op_w": 195136530,
"op_rmw": 0,
"op_pg": 0,
"osdop_stat": 64974069,
"osdop_create": 11505623,
"osdop_read": 1275145,
"osdop_write": 34882891,
"osdop_writefull": 1169760,
"osdop_writesame": 0,
"osdop_append": 0,
"osdop_zero": 2,
"osdop_truncate": 4,
"osdop_delete": 79836916,
"osdop_mapext": 0,
"osdop_sparse_read": 0,
"osdop_clonerange": 0,
"osdop_getxattr": 56700621,
"osdop_setxattr": 17474636,
"osdop_cmpxattr": 0,
"osdop_rmxattr": 0,
"osdop_resetxattrs": 0,
"osdop_call": 0,
"osdop_watch": 0,
"osdop_notify": 0,
"osdop_src_cmpxattr": 0,
"osdop_pgls": 0,
"osdop_pgls_filter": 0,
"osdop_other": 683839,
"linger_active": 0,
"linger_send": 0,
"linger_resend": 0,
"linger_ping": 0,
"poolop_active": 0,
"poolop_send": 0,
"poolop_resend": 0,
"poolstat_active": 0,
"poolstat_send": 0,
"poolstat_resend": 0,
"statfs_active": 0,
"statfs_send": 0,
"statfs_resend": 0,
"command_active": 0,
"command_send": 0,
"command_resend": 0,
"map_epoch": 685274,
"map_full": 0,
"map_inc": 19880,
"osd_sessions": 96,
"osd_session_open": 96,
"osd_session_close": 0,
"osd_laggy": 0,
"omap_wr": 78737471,
"omap_rd": 99825478,
"omap_del": 4893217
},
"oft": {
"omap_total_objs": 4,
"omap_total_kv_pairs": 105334,
"omap_total_updates": 80286291,
"omap_total_removes": 72648222
},
"purge_queue": {
"pq_executing_ops": 0,
"pq_executing_ops_high_water": 1559,
"pq_executing": 0,
"pq_executing_high_water": 64,
"pq_executed": 20275628,
"pq_item_in_journal": 0
},
"throttle-msgr_dispatch_throttler-mds": {
"val": 0,
"max": 104857600,
"get_started": 0,
"get": 1328180571,
"get_sum": 935681276002,
"get_or_fail_fail": 0,
"get_or_fail_success": 1328180571,
"take": 0,
"take_sum": 0,
"put": 1328180571,
"put_sum": 935681276002,
"wait": {
"avgcount": 0,
"sum": 0.000000000,
"avgtime": 0.000000000
}
},
"throttle-objecter_bytes": {
"val": 21734389,
"max": 104857600,
"get_started": 0,
"get": 0,
"get_sum": 0,
"get_or_fail_fail": 0,
"get_or_fail_success": 0,
"take": 253406063,
"take_sum": 81151894362586,
"put": 253406052,
"put_sum": 81151872628197,
"wait": {
"avgcount": 0,
"sum": 0.000000000,
"avgtime": 0.000000000
}
},
"throttle-objecter_ops": {
"val": 11,
"max": 1024,
"get_started": 0,
"get": 0,
"get_sum": 0,
"get_or_fail_fail": 0,
"get_or_fail_success": 0,
"take": 253406063,
"take_sum": 253406063,
"put": 253406052,
"put_sum": 253406052,
"wait": {
"avgcount": 0,
"sum": 0.000000000,
"avgtime": 0.000000000
}
},
"throttle-write_buf_throttle": {
"val": 0,
"max": 3758096384,
"get_started": 0,
"get": 20275628,
"get_sum": 2230259884,
"get_or_fail_fail": 0,
"get_or_fail_success": 20275628,
"take": 0,
"take_sum": 0,
"put": 1242717,
"put_sum": 2230259884,
"wait": {
"avgcount": 0,
"sum": 0.000000000,
"avgtime": 0.000000000
}
},
"throttle-write_buf_throttle-0x56401344c0a0": {
"val": 0,
"max": 3758096384,
"get_started": 0,
"get": 148604638,
"get_sum": 80839025768070,
"get_or_fail_fail": 0,
"get_or_fail_success": 148604638,
"take": 0,
"take_sum": 0,
"put": 23948617,
"put_sum": 80839025768070,
"wait": {
"avgcount": 0,
"sum": 0.000000000,
"avgtime": 0.000000000
}
}
}
_______________________________________________
ceph-users mailing list -- ceph-users@ceph.io
To unsubscribe send an email to ceph-users-le...@ceph.io