[ https://issues.apache.org/jira/browse/FLINK-36867?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Xintong Song closed FLINK-36867. -------------------------------- Resolution: Invalid > flink on k8s operator could not create TM > ----------------------------------------- > > Key: FLINK-36867 > URL: https://issues.apache.org/jira/browse/FLINK-36867 > Project: Flink > Issue Type: Bug > Components: Kubernetes Operator > Affects Versions: 1.8.0 > Environment: Client Version: version.Info{Major:"1", Minor:"24", > GitVersion:"v1.24.1", GitCommit:"3ddd0f45aa91e2f30c70734b175631bec5b5825a", > GitTreeState:"clean", BuildDate:"2022-05-24T12:26:19Z", GoVersion:"go1.18.2", > Compiler:"gc", Platform:"linux/amd64"} > Kustomize Version: v4.5.4 > Server Version: version.Info{Major:"1", Minor:"24", GitVersion:"v1.24.1", > GitCommit:"3ddd0f45aa91e2f30c70734b175631bec5b5825a", GitTreeState:"clean", > BuildDate:"2022-05-24T12:18:48Z", GoVersion:"go1.18.2", Compiler:"gc", > Platform:"linux/amd64"} > > flink 1.16.3 > operator 1.8.0 > Reporter: 周龙华 > Priority: Blocker > > 创建JobManager 成功,但是在提交ResourceManager 创建TaskManager时,一直不成功。报错如下 > 2024-12-08 13:55:20,302 INFO > org.apache.flink.runtime.externalresource.ExternalResourceUtils [] - Enabled > external resources: [] > 2024-12-08 13:55:20,302 INFO org.apache.flink.configuration.Configuration > [] - Config uses fallback configuration key > 'kubernetes.service-account' instead of key > 'kubernetes.taskmanager.service-account' > 2024-12-08 13:55:20,303 INFO > org.apache.flink.kubernetes.KubernetesResourceManagerDriver [] - Creating > new TaskManager pod with name > flink-ess-recall-mqtt-to-kafka-job-taskmanager-1-154 and resource <2048,0.5>. > 2024-12-08 13:55:30,305 WARN > org.apache.flink.kubernetes.KubernetesResourceManagerDriver [] - Could not > create pod flink-ess-recall-mqtt-to-kafka-job-taskmanager-1-154, exception: > java.util.concurrent.CompletionException: > io.fabric8.kubernetes.client.KubernetesClientException: An error has occurred. 
> 2024-12-08 13:55:30,306 WARN > org.apache.flink.runtime.resourcemanager.active.ActiveResourceManager [] - > Failed requesting worker with resource spec WorkerResourceSpec > {cpuCores=0.5, taskHeapSize=1013.760mb (1063004400 bytes), taskOffHeapSize=0 > bytes, networkMemSize=158.720mb (166429984 bytes), managedMemSize=158.720mb > (166429984 bytes), numSlots=2}, current pending count: 0 > java.util.concurrent.CompletionException: > io.fabric8.kubernetes.client.KubernetesClientException: An error has occurred. > at java.util.concurrent.CompletableFuture.encodeThrowable(Unknown Source) > ~[?:?] > at java.util.concurrent.CompletableFuture.completeThrowable(Unknown Source) > ~[?:?] > at java.util.concurrent.CompletableFuture$AsyncRun.run(Unknown Source) ~[?:?] > at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source) ~[?:?] > at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source) ~[?:?] > at java.lang.Thread.run(Unknown Source) ~[?:?] > Caused by: io.fabric8.kubernetes.client.KubernetesClientException: An error > has occurred. > at > io.fabric8.kubernetes.client.KubernetesClientException.launderThrowable(KubernetesClientException.java:129) > ~[flink-dist-1.16.1.jar:1.16.1] > at > io.fabric8.kubernetes.client.KubernetesClientException.launderThrowable(KubernetesClientException.java:122) > ~[flink-dist-1.16.1.jar:1.16.1] > at > io.fabric8.kubernetes.client.dsl.base.CreateOnlyResourceOperation.create(CreateOnlyResourceOperation.java:63) > ~[flink-dist-1.16.1.jar:1.16.1] > at > org.apache.flink.kubernetes.kubeclient.Fabric8FlinkKubeClient.lambda$createTaskManagerPod$1(Fabric8FlinkKubeClient.java:163) > ~[flink-dist-1.16.1.jar:1.16.1] > ... 
4 more > Caused by: java.net.SocketTimeoutException: timeout > at > org.apache.flink.kubernetes.shaded.okhttp3.internal.http2.Http2Stream$StreamTimeout.newTimeoutException(Http2Stream.java:678) > ~[flink-dist-1.16.1.jar:1.16.1] > at > org.apache.flink.kubernetes.shaded.okhttp3.internal.http2.Http2Stream$StreamTimeout.exitAndThrowIfTimedOut(Http2Stream.java:686) > ~[flink-dist-1.16.1.jar:1.16.1] > at > org.apache.flink.kubernetes.shaded.okhttp3.internal.http2.Http2Stream.takeHeaders(Http2Stream.java:154) > ~[flink-dist-1.16.1.jar:1.16.1] > at > org.apache.flink.kubernetes.shaded.okhttp3.internal.http2.Http2ExchangeCodec.readResponseHeaders(Http2ExchangeCodec.java:136) > ~[flink-dist-1.16.1.jar:1.16.1] > at > org.apache.flink.kubernetes.shaded.okhttp3.internal.connection.Exchange.readResponseHeaders(Exchange.java:115) > ~[flink-dist-1.16.1.jar:1.16.1] > at > org.apache.flink.kubernetes.shaded.okhttp3.internal.http.CallServerInterceptor.intercept(CallServerInterceptor.java:94) > ~[flink-dist-1.16.1.jar:1.16.1] > at > org.apache.flink.kubernetes.shaded.okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:142) > ~[flink-dist-1.16.1.jar:1.16.1] > at > org.apache.flink.kubernetes.shaded.okhttp3.internal.connection.ConnectInterceptor.intercept(ConnectInterceptor.java:43) > ~[flink-dist-1.16.1.jar:1.16.1] > at > org.apache.flink.kubernetes.shaded.okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:142) > ~[flink-dist-1.16.1.jar:1.16.1] > at > org.apache.flink.kubernetes.shaded.okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:117) > ~[flink-dist-1.16.1.jar:1.16.1] > at > org.apache.flink.kubernetes.shaded.okhttp3.internal.cache.CacheInterceptor.intercept(CacheInterceptor.java:94) > ~[flink-dist-1.16.1.jar:1.16.1] > at > org.apache.flink.kubernetes.shaded.okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:142) > ~[flink-dist-1.16.1.jar:1.16.1] > at > 
org.apache.flink.kubernetes.shaded.okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:117) > ~[flink-dist-1.16.1.jar:1.16.1] > at > org.apache.flink.kubernetes.shaded.okhttp3.internal.http.BridgeInterceptor.intercept(BridgeInterceptor.java:93) > ~[flink-dist-1.16.1.jar:1.16.1] > at > org.apache.flink.kubernetes.shaded.okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:142) > ~[flink-dist-1.16.1.jar:1.16.1] > at > org.apache.flink.kubernetes.shaded.okhttp3.internal.http.RetryAndFollowUpInterceptor.intercept(RetryAndFollowUpInterceptor.java:88) > ~[flink-dist-1.16.1.jar:1.16.1] > at > org.apache.flink.kubernetes.shaded.okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:142) > ~[flink-dist-1.16.1.jar:1.16.1] > at > org.apache.flink.kubernetes.shaded.okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:117) > ~[flink-dist-1.16.1.jar:1.16.1] > at > io.fabric8.kubernetes.client.okhttp.OkHttpClientBuilderImpl$InteceptorAdapter.intercept(OkHttpClientBuilderImpl.java:62) > ~[flink-dist-1.16.1.jar:1.16.1] > at > org.apache.flink.kubernetes.shaded.okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:142) > ~[flink-dist-1.16.1.jar:1.16.1] > at > org.apache.flink.kubernetes.shaded.okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:117) > ~[flink-dist-1.16.1.jar:1.16.1] > at > io.fabric8.kubernetes.client.okhttp.OkHttpClientBuilderImpl$InteceptorAdapter.intercept(OkHttpClientBuilderImpl.java:62) > ~[flink-dist-1.16.1.jar:1.16.1] > at > org.apache.flink.kubernetes.shaded.okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:142) > ~[flink-dist-1.16.1.jar:1.16.1] > at > org.apache.flink.kubernetes.shaded.okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:117) > ~[flink-dist-1.16.1.jar:1.16.1] > at > 
io.fabric8.kubernetes.client.okhttp.OkHttpClientBuilderImpl$InteceptorAdapter.intercept(OkHttpClientBuilderImpl.java:62) > ~[flink-dist-1.16.1.jar:1.16.1] > at > org.apache.flink.kubernetes.shaded.okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:142) > ~[flink-dist-1.16.1.jar:1.16.1] > at > org.apache.flink.kubernetes.shaded.okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:117) > ~[flink-dist-1.16.1.jar:1.16.1] > at > io.fabric8.kubernetes.client.okhttp.OkHttpClientBuilderImpl$InteceptorAdapter.intercept(OkHttpClientBuilderImpl.java:62) > ~[flink-dist-1.16.1.jar:1.16.1] > at > org.apache.flink.kubernetes.shaded.okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:142) > ~[flink-dist-1.16.1.jar:1.16.1] > at > org.apache.flink.kubernetes.shaded.okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:117) > ~[flink-dist-1.16.1.jar:1.16.1] > at > org.apache.flink.kubernetes.shaded.okhttp3.RealCall.getResponseWithInterceptorChain(RealCall.java:229) > ~[flink-dist-1.16.1.jar:1.16.1] > at > org.apache.flink.kubernetes.shaded.okhttp3.RealCall.execute(RealCall.java:81) > ~[flink-dist-1.16.1.jar:1.16.1] > at > io.fabric8.kubernetes.client.okhttp.OkHttpClientImpl.send(OkHttpClientImpl.java:138) > ~[flink-dist-1.16.1.jar:1.16.1] > at > io.fabric8.kubernetes.client.dsl.base.OperationSupport.retryWithExponentialBackoff(OperationSupport.java:577) > ~[flink-dist-1.16.1.jar:1.16.1] > at > io.fabric8.kubernetes.client.dsl.base.OperationSupport.handleResponse(OperationSupport.java:556) > ~[flink-dist-1.16.1.jar:1.16.1] > at > io.fabric8.kubernetes.client.dsl.base.OperationSupport.handleResponse(OperationSupport.java:521) > ~[flink-dist-1.16.1.jar:1.16.1] > at > io.fabric8.kubernetes.client.dsl.base.OperationSupport.handleCreate(OperationSupport.java:308) > ~[flink-dist-1.16.1.jar:1.16.1] > at > io.fabric8.kubernetes.client.dsl.base.BaseOperation.handleCreate(BaseOperation.java:644) > 
~[flink-dist-1.16.1.jar:1.16.1] > at > io.fabric8.kubernetes.client.dsl.base.BaseOperation.handleCreate(BaseOperation.java:83) > ~[flink-dist-1.16.1.jar:1.16.1] > at > io.fabric8.kubernetes.client.dsl.base.CreateOnlyResourceOperation.create(CreateOnlyResourceOperation.java:61) > ~[flink-dist-1.16.1.jar:1.16.1] > at > org.apache.flink.kubernetes.kubeclient.Fabric8FlinkKubeClient.lambda$createTaskManagerPod$1(Fabric8FlinkKubeClient.java:163) > ~[flink-dist-1.16.1.jar:1.16.1] > ... 4 more > 2024-12-08 13:55:30,306 INFO > org.apache.flink.runtime.resourcemanager.active.ActiveResourceManager [] - > Requesting new worker with resource spec WorkerResourceSpec {cpuCores=0.5, > taskHeapSize=1013.760mb (1063004400 bytes), taskOffHeapSize=0 bytes, > networkMemSize=158.720mb (166429984 bytes), managedMemSize=158.720mb > (166429984 bytes), numSlots=2}, current pending count: 1. > > > > api-server 报错如下: > E1208 05:57:40.368984 1 writers.go:118] apiserver was unable to write a > JSON response: http: Handler timeout > E1208 05:57:40.369002 1 status.go:71] apiserver received an error that > is not an metav1.Status: &errors.errorString{s:"http: Handler timeout"}: > http: Handler timeout > I1208 05:57:40.369025 1 trace.go:205] Trace[2098658489]: "Call > validating webhook" > configuration:resourcesquotas.quota.kubesphere.io,webhook:resourcesquotas.quota.kubesphere.io,resource:/v1, > > Resource=pods,subresource:,operation:CREATE,UID:3ee5f429-b549-4df6-849f-6695a7103e7a > (08-Dec-2024 05:57:30.370) (total time: 9998ms): > Trace[2098658489]: [9.998582768s] [9.998582768s] END > W1208 05:57:40.369042 1 dispatcher.go:142] Failed calling webhook, > failing open resourcesquotas.quota.kubesphere.io: failed calling webhook > "resourcesquotas.quota.kubesphere.io": failed to call webhook: Post > "https://ks-controller-manager.kubesphere-system.svc:443/validate-quota-kubesphere-io-v1alpha2?timeout=30s": > context canceled > E1208 05:57:40.369068 1 dispatcher.go:149] failed calling webhook > 
"resourcesquotas.quota.kubesphere.io": failed to call webhook: Post > "https://ks-controller-manager.kubesphere-system.svc:443/validate-quota-kubesphere-io-v1alpha2?timeout=30s": > context canceled > E1208 05:57:40.370331 1 writers.go:131] apiserver was unable to write a > fallback JSON response: http: Handler timeout > I1208 05:57:40.371563 1 trace.go:205] Trace[722200725]: "Create" > url:/api/v1/namespaces/base/pods,user-agent:flink,audit-id:74251d65-3538-40db-90b3-fd4d6dddb8d0,client:172.18.180.225,accept:,protocol:HTTP/2.0 > (08-Dec-2024 05:57:30.369) (total time: 10002ms): > Trace[722200725]: [10.002063455s] [10.002063455s] END > E1208 05:57:40.372108 1 timeout.go:141] post-timeout activity - > time-elapsed: 3.109877ms, POST "/api/v1/namespaces/base/pods" result: <nil> > {"level":"warn","ts":"2024-12-08T05:57:40.373Z","logger":"etcd-client","caller":"v3/retry_interceptor.go:62","msg":"retrying > of unary invoker > failed","target":"etcd-endpoints://0xc002dcd6c0/127.0.0.1:2379","attempt":0,"error":"rpc > error: code = Canceled desc = context canceled"} > E1208 05:57:40.373379 1 finisher.go:175] FinishRequest: post-timeout > activity - time-elapsed: 4.357591ms, panicked: false, err: context canceled, > panic-reason: <nil> > > > 一直如此循环,无法创建。 > yaml文件如下: > apiVersion: flink.apache.org/v1beta1 > kind: FlinkDeployment > metadata: > namespace: base > name: flink-ess-recall-mqtt-to-kafka-job > spec: > image: harbor.junengcloud.com/openfaas/flink:1.16.3 > flinkVersion: v1_16 > flinkConfiguration: > taskmanager.numberOfTaskSlots: "2" > taskmanager.memory.managed.fraction: "0.1" > serviceAccount: flink > podTemplate: > apiVersion: v1 > kind: Pod > metadata: > name: pod-template > spec: > containers: > # Do not change the main container name > - name: flink-main-container > volumeMounts: > - mountPath: /opt/flink/chk > name: checkpoint > - mountPath: /opt/flink/lib > name: flinklibs > - mountPath: /opt/flink/userlib > name: flinkuserlibs > - mountPath: /opt/flink/usercfg 
> name: flinkusercfgs > env: > - name: TZ > value: Asia/Shanghai > volumes: > - name: checkpoint > persistentVolumeClaim: > claimName: flink-chk > - name: flinklibs > persistentVolumeClaim: > claimName: flink-lib-jar > - name: flinkuserlibs > persistentVolumeClaim: > claimName: flink-user-jar > - name: flinkusercfgs > persistentVolumeClaim: > claimName: flink-user-conf > jobManager: > resource: > memory: "2048m" > cpu: 0.5 > taskManager: > resource: > memory: "2048m" > cpu: 0.5 > job: > jarURI: local:///opt/flink/userlib/flink-ess-1.0-SNAPSHOT.jar > entryClass: com.huidian.flink.ess.core.JobRunner > parallelism: 1 > args: > ["-config","/opt/flink/usercfg/flink-ess-config.json","-jobName","flink-ess-recall-mqtt-to-kafka-job"] > -- This message was sent by Atlassian Jira (v8.20.10#820010)