An update: we were able to recover our filesystem (minus the two days between when the ZFS swap occurred and when we detected it and shut down the filesystem). Simply promoting the cloned ZFS volume (which was really our primary volume) and cleaning up the snapshot and clone got us back to normal. Afterwards, I ran an lfsck to clean up any problems left over from the two days we were running on the February clone of the MDT (orphaned files on the OSTs). I believe the lfsck has finished – lots of details below. I don’t think I see anything concerning, but the output is a bit hard to interpret, so if anyone sees otherwise, please let me know. Any other follow-up advice would be appreciated as well.
[root@hpfs-fsl-mds1 ~]# lctl lfsck_query -M scratch-MDT0000
layout_mdts_init: 0
layout_mdts_scanning-phase1: 0
layout_mdts_scanning-phase2: 0
layout_mdts_completed: 0
layout_mdts_failed: 0
layout_mdts_stopped: 0
layout_mdts_paused: 0
layout_mdts_crashed: 0
layout_mdts_partial: 1
layout_mdts_co-failed: 0
layout_mdts_co-stopped: 0
layout_mdts_co-paused: 0
layout_mdts_unknown: 0
layout_osts_init: 0
layout_osts_scanning-phase1: 0
layout_osts_scanning-phase2: 0
layout_osts_completed: 24
layout_osts_failed: 0
layout_osts_stopped: 0
layout_osts_paused: 0
layout_osts_crashed: 0
layout_osts_partial: 0
layout_osts_co-failed: 0
layout_osts_co-stopped: 0
layout_osts_co-paused: 0
layout_osts_unknown: 0
layout_repaired: 6350036
namespace_mdts_init: 0
namespace_mdts_scanning-phase1: 0
namespace_mdts_scanning-phase2: 0
namespace_mdts_completed: 1
namespace_mdts_failed: 0
namespace_mdts_stopped: 0
namespace_mdts_paused: 0
namespace_mdts_crashed: 0
namespace_mdts_partial: 0
namespace_mdts_co-failed: 0
namespace_mdts_co-stopped: 0
namespace_mdts_co-paused: 0
namespace_mdts_unknown: 0
namespace_osts_init: 0
namespace_osts_scanning-phase1: 0
namespace_osts_scanning-phase2: 0
namespace_osts_completed: 0
namespace_osts_failed: 0
namespace_osts_stopped: 0
namespace_osts_paused: 0
namespace_osts_crashed: 0
namespace_osts_partial: 0
namespace_osts_co-failed: 0
namespace_osts_co-stopped: 0
namespace_osts_co-paused: 0
namespace_osts_unknown: 0
namespace_repaired: 1430801
[root@hpfs-fsl-mds1 ~]#
[root@hpfs-fsl-mds1 ~]# lctl get_param -n mdd.scratch-MDT0000.lfsck_namespace
name: lfsck_namespace
magic: 0xa06249ff
version: 2
status: completed
flags:
param:
last_completed_time: 1656429356
time_since_last_completed: 26787 seconds
latest_start_time: 1656367470
time_since_latest_start: 88673 seconds
last_checkpoint_time: 1656429356
time_since_last_checkpoint: 26787 seconds
latest_start_position: 15, N/A, N/A
last_checkpoint_position: 983045597, N/A, N/A
first_failure_position: N/A, N/A, N/A
checked_phase1: 242090614
checked_phase2: 260856
updated_phase1: 1430801
updated_phase2: 0
failed_phase1: 0
failed_phase2: 0
directories: 12889816
dirent_repaired: 0
linkea_repaired: 1430801
nlinks_repaired: 0
multiple_linked_checked: 1870740
multiple_linked_repaired: 0
unknown_inconsistency: 0
unmatched_pairs_repaired: 0
dangling_repaired: 0
multiple_referenced_repaired: 0
bad_file_type_repaired: 0
lost_dirent_repaired: 0
local_lost_found_scanned: 0
local_lost_found_moved: 0
local_lost_found_skipped: 0
local_lost_found_failed: 0
striped_dirs_scanned: 0
striped_dirs_repaired: 0
striped_dirs_failed: 0
striped_dirs_disabled: 0
striped_dirs_skipped: 0
striped_shards_scanned: 0
striped_shards_repaired: 0
striped_shards_failed: 0
striped_shards_skipped: 0
name_hash_repaired: 0
linkea_overflow_cleared: 3
agent_entries_repaired: 0
success_count: 1
run_time_phase1: 60927 seconds
run_time_phase2: 958 seconds
average_speed_phase1: 3973 items/sec
average_speed_phase2: 272 objs/sec
average_speed_total: 3916 items/sec
real_time_speed_phase1: N/A
real_time_speed_phase2: N/A
current_position: N/A
[root@hpfs-fsl-mds1 ~]#
[root@hpfs-fsl-mds1 ~]# lctl get_param -n mdd.scratch-MDT0000.lfsck_layout
name: lfsck_layout
magic: 0xb1732fed
version: 2
status: partial
flags:
param:
last_completed_time: 1656428398
time_since_last_completed: 27798 seconds
latest_start_time: 1656367470
time_since_latest_start: 88726 seconds
last_checkpoint_time: 1656428398
time_since_last_checkpoint: 27798 seconds
latest_start_position: 15
last_checkpoint_position: 983045597
first_failure_position: 0
success_count: 1
repaired_dangling: 287730
repaired_unmatched_pair: 0
repaired_multiple_referenced: 0
repaired_orphan: 0
repaired_inconsistent_owner: 6062306
repaired_others: 0
skipped: 0
failed_phase1: 0
failed_phase2: 0
checked_phase1: 300484421
checked_phase2: 0
run_time_phase1: 60907 seconds
run_time_phase2: 1 seconds
average_speed_phase1: 4933 items/sec
average_speed_phase2: 0 objs/sec
real_time_speed_phase1: N/A
real_time_speed_phase2: N/A
current_position: N/A
[root@hpfs-fsl-mds1 ~]#
[root@hpfs-fsl-mds1 ~]# lctl get_param -n osd-ldiskfs.scratch-MDT0000.oi_scrub
name: OI_scrub
magic: 0x4c5fd252
oi_files: 64
status: completed
flags:
param:
time_since_last_completed: 27850 seconds
time_since_latest_start: 88778 seconds
time_since_last_checkpoint: 27850 seconds
latest_start_position: 224931713
last_checkpoint_position: 1073741825
first_failure_position: N/A
checked: 233631851
updated: 0
failed: 0
prior_updated: 0
noscrub: 9527
igif: 0
success_count: 9
run_time: 75526 seconds
average_speed: 3093 objects/sec
real_time_speed: N/A
current_position: N/A
lf_scanned: 0
lf_repaired: 0
lf_failed: 0
[root@hpfs-fsl-mds1 ~]#
_______________________________________________ lustre-discuss mailing list [email protected] http://lists.lustre.org/listinfo.cgi/lustre-discuss-lustre.org
