gyfora commented on code in PR #978: URL: https://github.com/apache/flink-kubernetes-operator/pull/978#discussion_r2100627481
########## flink-kubernetes-operator/src/main/java/org/apache/flink/kubernetes/operator/observer/JobStatusObserver.java: ########## @@ -95,6 +105,141 @@ public boolean observe(FlinkResourceContext<R> ctx) { return false; } + /** + * Observe the exceptions raised in the job manager and take appropriate action. + * + * @param ctx the context with which the operation is executed + */ + protected void observeJobManagerExceptions(FlinkResourceContext<R> ctx) { + var resource = ctx.getResource(); + var operatorConfig = ctx.getOperatorConfig(); + var jobStatus = resource.getStatus().getJobStatus(); + + try { + var jobId = JobID.fromHexString(jobStatus.getJobId()); + // TODO: Ideally the best way to restrict the number of events is to use the query param + // `maxExceptions` + // but the JobExceptionsMessageParameters does not expose the parameters and nor does + // it have setters. + var history = + ctx.getFlinkService().getJobExceptions(resource, jobId, ctx.getObserveConfig()); + + if (history == null || history.getExceptionHistory() == null) { + return; + } + + var exceptionHistory = history.getExceptionHistory(); + var exceptions = exceptionHistory.getEntries(); + if (exceptions.isEmpty()) { + return; + } + + if (exceptionHistory.isTruncated()) { + LOG.warn( + "Job exception history is truncated for jobId '{}'. Some exceptions may be missing.", + jobId); + } + + String currentJobId = jobStatus.getJobId(); + Instant lastRecorded = null; // first reconciliation + + var cacheEntry = ctx.getExceptionCacheEntry(); + // a cache entry is created should always be present. The timestamp for the first + // reconciliation would be + // when the job was created. This check is still necessary because even though there + // might be an entry, + // the jobId could have changed since the job was first created. 
+ if (cacheEntry.getJobId().equals(currentJobId)) { + lastRecorded = Instant.ofEpochMilli(cacheEntry.getLastTimestamp()); + } + + Instant now = Instant.now(); + int maxEvents = operatorConfig.getReportedExceptionEventsMaxCount(); + int maxStackTraceLines = operatorConfig.getReportedExceptionEventsMaxStackTraceLength(); + + int count = 0; + for (var exception : exceptions) { + Instant exceptionTime = Instant.ofEpochMilli(exception.getTimestamp()); + if (lastRecorded != null && exceptionTime.isBefore(lastRecorded)) { + continue; + } + + emitJobManagerExceptionEvent(ctx, exception, exceptionTime, maxStackTraceLines); + if (++count >= maxEvents) { + break; + } + } Review Comment: I wonder if we should take the last `maxEvents` exceptions from the list and start with those first, so that new errors are always reported. However, that may cause some of the older exceptions to never be reported. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@flink.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org