Hi all, I see a node that has been stalled at 'joining' for a good 8 hours now. Setup: 3-node cluster, v1.3.1, 512 vnodes (way too high, but that's another matter), leveldb backend. The cluster was originally 2 nodes only, and after upgrading to 1.3.1 we attached another node. There are no active transfers on the nodes at the moment, but from the error log it looks like the vnode transfer stopped.
riak-admin member-status Attempting to restart script through sudo -H -u riak ================================= Membership ================================== Status Ring Pending Node ------------------------------------------------------------------------------- joining 33.2% -- 'riak@10.173.240.3' valid 33.6% -- 'riak@10.173.240.2' valid 33.2% -- 'riak@10.173.240.21' ------------------------------------------------------------------------------- Valid:2 / Leaving:0 / Exiting:0 / Joining:1 / Down:0 riak-admin ring-status Attempting to restart script through sudo -H -u riak ================================== Claimant =================================== Claimant: 'riak@10.173.240.21' Status: up Ring Ready: true ============================== Ownership Handoff ============================== No pending changes. ============================== Unreachable Nodes ============================== All nodes are up and reachable Node that is joining shows *error.log* 2013-06-26 16:35:50.351 [error] <0.7222.0> gen_fsm <0.7222.0> in state ready terminated with reason: {timeout,{gen_fsm,sync_send_all_state_event,[<0.7223.0>,stop]}} 2013-06-26 16:35:51.549 [error] <0.7222.0> CRASH REPORT Process <0.7222.0> with 0 neighbours exited with reason: {timeout,{gen_fsm,sync_send_all_state_event,[<0.7223.0>,stop]}} in gen_fsm:handle_msg/7 line 511 2013-06-26 16:35:52.460 [error] <0.5316.0> gen_fsm <0.5316.0> in state ready terminated with reason: {timeout,{gen_fsm,sync_send_all_state_event,[<0.5317.0>,stop]}} 2013-06-26 16:35:52.696 [error] <0.5316.0> CRASH REPORT Process <0.5316.0> with 0 neighbours exited with reason: {timeout,{gen_fsm,sync_send_all_state_event,[<0.5317.0>,stop]}} in gen_fsm:handle_msg/7 line 511 2013-06-26 16:35:52.867 [error] <0.9088.0> gen_fsm <0.9088.0> in state ready terminated with reason: {timeout,{gen_fsm,sync_send_all_state_event,[<0.9089.0>,stop]}} 2013-06-26 16:35:53.007 [error] <0.9088.0> CRASH REPORT Process <0.9088.0> with 0 neighbours exited with reason: 
{timeout,{gen_fsm,sync_send_all_state_event,[<0.9089.0>,stop]}} in gen_fsm:handle_msg/7 line 511 2013-06-26 16:35:53.127 [error] <0.5794.0> gen_fsm <0.5794.0> in state ready terminated with reason: {timeout,{gen_fsm,sync_send_all_state_event,[<0.5795.0>,stop]}} 2013-06-26 16:35:53.248 [error] <0.5794.0> CRASH REPORT Process <0.5794.0> with 0 neighbours exited with reason: {timeout,{gen_fsm,sync_send_all_state_event,[<0.5795.0>,stop]}} in gen_fsm:handle_msg/7 line 511 2013-06-26 16:35:53.368 [error] <0.3990.0> gen_fsm <0.3990.0> in state ready terminated with reason: {timeout,{gen_fsm,sync_send_all_state_event,[<0.3991.0>,stop]}} 2013-06-26 16:35:53.498 [error] <0.3990.0> CRASH REPORT Process <0.3990.0> with 0 neighbours exited with reason: {timeout,{gen_fsm,sync_send_all_state_event,[<0.3991.0>,stop]}} in gen_fsm:handle_msg/7 line 511 2013-06-26 16:35:53.619 [error] <0.7017.0> gen_fsm <0.7017.0> in state ready terminated with reason: {timeout,{gen_fsm,sync_send_all_state_event,[<0.7018.0>,stop]}} 2013-06-26 16:35:53.739 [error] <0.7017.0> CRASH REPORT Process <0.7017.0> with 0 neighbours exited with reason: {timeout,{gen_fsm,sync_send_all_state_event,[<0.7018.0>,stop]}} in gen_fsm:handle_msg/7 line 511 2013-06-26 16:35:53.859 [error] <0.5120.0> gen_fsm <0.5120.0> in state ready terminated with reason: {timeout,{gen_fsm,sync_send_all_state_event,[<0.5121.0>,stop]}} 2013-06-26 16:35:53.980 [error] <0.5120.0> CRASH REPORT Process <0.5120.0> with 0 neighbours exited with reason: {timeout,{gen_fsm,sync_send_all_state_event,[<0.5121.0>,stop]}} in gen_fsm:handle_msg/7 line 511 2013-06-26 16:41:27.819 [error] <0.10046.0>@riak_core_handoff_sender:start_fold:226 hinted_handoff transfer of riak_kv_vnode from 'riak@10.173.240.3' 961964944883816959833284743408959721921971224576 to 'riak@10.173.240.2' 961964944883816959833284743408959721921971224576 failed because of exit:{noproc,{riak_core_gen_server,call,[{riak_kv_handoff_listener,' riak@10.173.240.2'},handoff_port,infinity]}} 
[{riak_core_gen_server,call,3,[{file,"src/riak_core_gen_server.erl"},{line,214}]},{riak_core_handoff_sender,start_fold,5,[{file,"src/riak_core_handoff_sender.erl"},{line,84}]}] 2013-06-26 16:41:27.819 [error] <0.10045.0>@riak_core_handoff_sender:start_fold:226 hinted_handoff transfer of riak_kv_vnode from 'riak@10.173.240.3' 790695221759101774106290427075020305556041629696 to 'riak@10.173.240.2' 790695221759101774106290427075020305556041629696 failed because of exit:{noproc,{riak_core_gen_server,call,[{riak_kv_handoff_listener,' riak@10.173.240.2'},handoff_port,infinity]}} [{riak_core_gen_server,call,3,[{file,"src/riak_core_gen_server.erl"},{line,214}]},{riak_core_handoff_sender,start_fold,5,[{file,"src/riak_core_handoff_sender.erl"},{line,84}]}] 2013-06-26 16:41:31.153 [error] <0.10061.0>@riak_core_handoff_sender:start_fold:226 hinted_handoff transfer of riak_kv_vnode from 'riak@10.173.240.3' 379647886259785328361504067873565706277810601984 to 'riak@10.173.240.2' 379647886259785328361504067873565706277810601984 failed because of exit:{noproc,{riak_core_gen_server,call,[{riak_kv_handoff_listener,' riak@10.173.240.2'},handoff_port,infinity]}} [{riak_core_gen_server,call,3,[{file,"src/riak_core_gen_server.erl"},{line,214}]},{riak_core_handoff_sender,start_fold,5,[{file,"src/riak_core_handoff_sender.erl"},{line,84}]}] 2013-06-26 16:45:23.242 [error] emulator Error in process <0.25919.0> on node 'riak@10.173.240.3' with exit value: {badarg,[{riak_core_stat,'-vnodeq_stats/0-lc$^0/1-0-',1,[{file,"src/riak_core_stat.erl"},{line,168}]},{riak_core_stat,'-vnodeq_stats/0-lc$^0/1-0-',1,[{file,"src/riak_core_stat.erl"},{line,169}]},{riak_core_stat,vnodeq_stats... 
*console log:* 2013-06-26 16:52:14.834 [info] <0.30751.0>@riak_core_handoff_receiver:process_message:99 Receiving handoff data for partition riak_kv_vnode:822094670998632891489572718402909198556462055424 2013-06-26 16:52:14.875 [info] <0.30751.0>@riak_core_handoff_receiver:handle_info:69 Handoff receiver for partition 822094670998632891489572718402909198556462055424 exited after processing 0 objects 2013-06-26 16:52:16.693 [info] <0.30763.0>@riak_core_handoff_receiver:process_message:99 Receiving handoff data for partition riak_kv_vnode:125597796958124469533129165311555572001681702912 2013-06-26 16:52:16.731 [info] <0.30763.0>@riak_core_handoff_receiver:handle_info:69 Handoff receiver for partition 125597796958124469533129165311555572001681702912 exited after processing 0 objects 2013-06-26 18:52:20.507 [info] <0.18639.3>@riak_kv_exchange_fsm:key_exchange:204 Repaired 2010 keys during active anti-entropy exchange of {436737793968023723603835506651545511733120466944,3} between {439592289353435643365952078590444502005885960192,'riak@10.173.240.3'} and {442446784738847563128068650529343492278651453440,'riak@10.173.240.21'} 2013-06-26 18:52:49.282 [info] <0.22964.3>@riak_kv_exchange_fsm:key_exchange:204 Repaired 1809 keys during active anti-entropy exchange of {439592289353435643365952078590444502005885960192,3} between {439592289353435643365952078590444502005885960192,'riak@10.173.240.3'} and {442446784738847563128068650529343492278651453440,'riak@10.173.240.21'} 2013-06-26 21:43:21.418 [info] <0.17334.7>@riak_kv_exchange_fsm:key_exchange:204 Repaired 1809 keys during active anti-entropy exchange of {48526421552002635955981722961282834637013385216,3} between {51380916937414555718098294900181824909778878464,'riak@10.173.240.3'} and {54235412322826475480214866839080815182544371712,'riak@10.173.240.21'} 2013-06-26 21:43:51.259 [info] <0.21268.7>@riak_kv_exchange_fsm:key_exchange:204 Repaired 1965 keys during active anti-entropy exchange of 
{51380916937414555718098294900181824909778878464,3} between {51380916937414555718098294900181824909778878464,'riak@10.173.240.3'} and {54235412322826475480214866839080815182544371712,'riak@10.173.240.21'} 2013-06-26 22:22:22.352 [info] <0.20533.8>@riak_kv_exchange_fsm:key_exchange:204 Repaired 1961 keys during active anti-entropy exchange of {271177061614132377401074334195404075912721858560,3} between {274031556999544297163190906134303066185487351808,'riak@10.173.240.3'} and {276886052384956216925307478073202056458252845056,'riak@10.173.240.2'} 2013-06-26 22:22:51.652 [info] <0.24761.8>@riak_kv_exchange_fsm:key_exchange:204 Repaired 2010 keys during active anti-entropy exchange of {274031556999544297163190906134303066185487351808,3} between {274031556999544297163190906134303066185487351808,'riak@10.173.240.3'} and {276886052384956216925307478073202056458252845056,'riak@10.173.240.2'} *riak-admin status* Attempting to restart script through sudo -H -u riak 1-minute stats for 'riak@10.173.240.3' ------------------------------------------- riak_kv_stat_ts : 1372288487 vnode_gets : 254 vnode_gets_total : 159164 vnode_puts : 0 vnode_puts_total : 42811 vnode_index_reads : 0 vnode_index_reads_total : 3706 vnode_index_writes : 0 vnode_index_writes_total : 42811 vnode_index_writes_postings : 0 vnode_index_writes_postings_total : 0 vnode_index_deletes : 0 vnode_index_deletes_total : 0 vnode_index_deletes_postings : 0 vnode_index_deletes_postings_total : 0 node_gets : 95 node_gets_total : 58191 node_get_fsm_siblings_mean : 0 node_get_fsm_siblings_median : 0 node_get_fsm_siblings_95 : 0 node_get_fsm_siblings_99 : 0 node_get_fsm_siblings_100 : 0 node_get_fsm_objsize_mean : 0 node_get_fsm_objsize_median : 0 node_get_fsm_objsize_95 : 0 node_get_fsm_objsize_99 : 0 node_get_fsm_objsize_100 : 0 node_get_fsm_time_mean : 1266 node_get_fsm_time_median : 1309 node_get_fsm_time_95 : 1526 node_get_fsm_time_99 : 2097 node_get_fsm_time_100 : 3879 node_puts : 0 node_puts_total : 192 
node_put_fsm_time_mean : 0 node_put_fsm_time_median : 0 node_put_fsm_time_95 : 0 node_put_fsm_time_99 : 0 node_put_fsm_time_100 : 0 read_repairs : 0 read_repairs_total : 15325 coord_redirs_total : 126 executing_mappers : 0 precommit_fail : 0 postcommit_fail : 0 pbc_active : 0 pbc_connects : 8 pbc_connects_total : 2834 read_repairs_primary_notfound_one : 0 read_repairs_primary_notfound_count : 23123 read_repairs_fallback_notfound_one : 0 read_repairs_fallback_notfound_count : 1869 leveldb_read_block_error : 0 riak_pipe_stat_ts : 1372288486 pipeline_active : 0 pipeline_create_count : 0 pipeline_create_one : 0 pipeline_create_error_count : 0 pipeline_create_error_one : 0 cpu_nprocs : 418 cpu_avg1 : 5 cpu_avg5 : 18 cpu_avg15 : 26 mem_total : 2082766848 mem_allocated : 938852352 disk : [{"/",974010980,1}, {"/dev",1008180,1}, {"/run",406792,1}, {"/run/lock",5120,0}, {"/run/shm",1016976,0}, {"/boot",186663,17}] nodename : 'riak@10.173.240.3' connected_nodes : ['riak@10.173.240.21','riak@10.173.240.2'] sys_driver_version : <<"2.0">> sys_global_heaps_size : 0 sys_heap_type : private sys_logical_processors : 4 sys_otp_release : <<"R15B01">> sys_process_count : 4665 sys_smp_support : true sys_system_version : <<"Erlang R15B01 (erts-5.9.1) [source] [64-bit] [smp:4:4] [async-threads:64] [kernel-poll:true]">> sys_system_architecture : <<"x86_64-unknown-linux-gnu">> sys_threads_enabled : true sys_thread_pool_size : 64 sys_wordsize : 8 ring_members : ['riak@10.173.240.2','riak@10.173.240.21','riak@10.173.240.3 '] ring_num_partitions : 512 ring_ownership : <<"[{'riak@10.173.240.21',170},\n {'riak@10.173.240.2',172},\n {'riak@10.173.240.3',170}]">> ring_creation_size : 512 storage_backend : riak_kv_eleveldb_backend erlydtl_version : <<"0.7.0">> riak_control_version : <<"1.3.0">> cluster_info_version : <<"1.2.3">> riak_search_version : <<"1.3.0">> merge_index_version : <<"1.3.0">> riak_kv_version : <<"1.3.1">> riak_api_version : <<"1.3.1">> riak_pipe_version : <<"1.3.1">> 
riak_core_version : <<"1.3.1">> bitcask_version : <<"1.6.1">> basho_stats_version : <<"1.0.3">> webmachine_version : <<"1.9.3">> mochiweb_version : <<"1.5.1p3">> inets_version : <<"5.9">> erlang_js_version : <<"1.2.2">> runtime_tools_version : <<"1.8.8">> os_mon_version : <<"2.2.9">> riak_sysmon_version : <<"1.1.3">> ssl_version : <<"5.0.1">> public_key_version : <<"0.15">> crypto_version : <<"2.1">> sasl_version : <<"2.2.1">> lager_version : <<"1.2.2">> syntax_tools_version : <<"1.6.8">> compiler_version : <<"4.8.1">> stdlib_version : <<"1.18.1">> kernel_version : <<"2.15.1">> memory_total : 75985000 memory_processes : 44614494 memory_processes_used : 44614480 memory_system : 31370506 memory_atom : 569961 memory_atom_used : 539061 memory_binary : 3642608 memory_code : 10232207 memory_ets : 4826952 *Data size in Riak:* du -h --max-depth=1 /var/lib/riak/ ***** 10.0.20.21 ***** 0 /var/lib/riak/bitcask 132K /var/lib/riak/ring 8.0K /var/lib/riak/mr_queue 2.1M /var/lib/riak/kv_vnode 899M /var/lib/riak/leveldb 180M /var/lib/riak/anti_entropy 1.1G /var/lib/riak/ ***** 10.0.20.22 ***** 0 /var/lib/riak/bitcask 132K /var/lib/riak/ring 8.0K /var/lib/riak/mr_queue 2.1M /var/lib/riak/kv_vnode 292M /var/lib/riak/leveldb 220M /var/lib/riak/anti_entropy 513M /var/lib/riak/ ***** 10.0.20.23 ***** 0 /var/lib/riak/bitcask 64K /var/lib/riak/ring 2.1M /var/lib/riak/kv_vnode 53M /var/lib/riak/leveldb 23M /var/lib/riak/anti_entropy 77M /var/lib/riak/
_______________________________________________ riak-users mailing list riak-users@lists.basho.com http://lists.basho.com/mailman/listinfo/riak-users_lists.basho.com