Hi
I am using Flink retained checkpoints, together with the
jobs/:jobid/checkpoints REST API, to retrieve the latest retained checkpoint.
The response of the Flink checkpoints API is included at the end of this message.
My job's restart attempts are set to 5.
In the checkpoints API response, under the "latest" key, the checkpoint paths of
the "restored" and "completed" values show the following behavior:
1) Suppose the job fails 3 times and recovers on the 4th attempt; then both
values are the same.
2) Suppose the job fails 4 times and recovers on the 5th attempt; then both
values are the same.
3) Suppose the job fails 5 times and recovers on the 6th attempt; then both
values are the same.
4) Suppose the job fails all 6 times and is marked as failed; then
both values are also the same.
5) Suppose the job fails a 6th time after having recovered from 5 attempts
and completed a few checkpoints; then the two values are different.
In cases (1), (2), (3) and (4) I never had any issue.
Only in case (5) did I have a severe issue in production, because the checkpoint
referenced by the "restored" field no longer exists.
Please suggest any solution or workaround.
{
"counts":{
"restored":6,
"total":3,
"in_progress":0,
"completed":3,
"failed":0
},
"summary":{
"state_size":{
"min":4879,
"max":4879,
"avg":4879
},
"end_to_end_duration":{
"min":25,
"max":130,
"avg":87
},
"alignment_buffered":{
"min":0,
"max":0,
"avg":0
}
},
"latest":{
"completed":{
"@class":"completed",
"id":7094,
"status":"COMPLETED",
"is_savepoint":false,
"trigger_timestamp":1590382502772,
"latest_ack_timestamp":1590382502902,
"state_size":4879,
"end_to_end_duration":130,
"alignment_buffered":0,
"num_subtasks":2,
"num_acknowledged_subtasks":2,
"tasks":{
},
"external_path":"file:/var/lib/persist/flink/checkpoints/29ae7600aa4f7d53a0dc1a0a7b257c85/chk-7094",
"discarded":false
},
"savepoint":null,
"failed":null,
"restored":{
"id":7093,
"restore_timestamp":1590382478448,
"is_savepoint":false,
"external_path":"file:/var/lib/persist/flink/checkpoints/29ae7600aa4f7d53a0dc1a0a7b257c85/chk-7093"
}
},
"history":[
{
"@class":"completed",
"id":7094,
"status":"COMPLETED",
"is_savepoint":false,
"trigger_timestamp":1590382502772,
"latest_ack_timestamp":1590382502902,
"state_size":4879,
"end_to_end_duration":130,
"alignment_buffered":0,
"num_subtasks":2,
"num_acknowledged_subtasks":2,
"tasks":{
},
"external_path":"file:/var/lib/persist/flink/checkpoints/29ae7600aa4f7d53a0dc1a0a7b257c85/chk-7094",
"discarded":false
},
{
"@class":"completed",
"id":7093,
"status":"COMPLETED",
"is_savepoint":false,
"trigger_timestamp":1590382310195,
"latest_ack_timestamp":1590382310220,
"state_size":4879,
"end_to_end_duration":25,
"alignment_buffered":0,
"num_subtasks":2,
"num_acknowledged_subtasks":2,
"tasks":{
},
"external_path":"file:/var/lib/persist/flink/checkpoints/29ae7600aa4f7d53a0dc1a0a7b257c85/chk-7093",
"discarded":false
},
{
"@class":"completed",
"id":7092,
"status":"COMPLETED",
"is_savepoint":false,
"trigger_timestamp":1590382190195,
"latest_ack_timestamp":1590382190303,
"state_size":4879,
"end_to_end_duration":108,
"alignment_buffered":0,
"num_subtasks":2,
"num_acknowledged_subtasks":2,
"tasks":{
},
"external_path":"file:/var/lib/persist/flink/checkpoints/29ae7600aa4f7d53a0dc1a0a7b257c85/chk-7092",
"discarded":true
}
]
}