[ 
https://issues.apache.org/jira/browse/FLINK-36867?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Xintong Song closed FLINK-36867.
--------------------------------
    Resolution: Invalid

> flink on k8s operator could not create TM
> -----------------------------------------
>
>                 Key: FLINK-36867
>                 URL: https://issues.apache.org/jira/browse/FLINK-36867
>             Project: Flink
>          Issue Type: Bug
>          Components: Kubernetes Operator
>    Affects Versions: 1.8.0
>         Environment: Client Version: version.Info{Major:"1", Minor:"24", 
> GitVersion:"v1.24.1", GitCommit:"3ddd0f45aa91e2f30c70734b175631bec5b5825a", 
> GitTreeState:"clean", BuildDate:"2022-05-24T12:26:19Z", GoVersion:"go1.18.2", 
> Compiler:"gc", Platform:"linux/amd64"}
> Kustomize Version: v4.5.4
> Server Version: version.Info{Major:"1", Minor:"24", GitVersion:"v1.24.1", 
> GitCommit:"3ddd0f45aa91e2f30c70734b175631bec5b5825a", GitTreeState:"clean", 
> BuildDate:"2022-05-24T12:18:48Z", GoVersion:"go1.18.2", Compiler:"gc", 
> Platform:"linux/amd64"}
>  
> flink 1.16.3
> operator  1.8.0
>            Reporter: 周龙华
>            Priority: Blocker
>
> The JobManager is created successfully, but when the ResourceManager requests creation of the TaskManager, it keeps failing. The error is as follows:
> 2024-12-08 13:55:20,302 INFO  
> org.apache.flink.runtime.externalresource.ExternalResourceUtils [] - Enabled 
> external resources: []
> 2024-12-08 13:55:20,302 INFO  org.apache.flink.configuration.Configuration    
>              [] - Config uses fallback configuration key 
> 'kubernetes.service-account' instead of key 
> 'kubernetes.taskmanager.service-account'
> 2024-12-08 13:55:20,303 INFO  
> org.apache.flink.kubernetes.KubernetesResourceManagerDriver  [] - Creating 
> new TaskManager pod with name 
> flink-ess-recall-mqtt-to-kafka-job-taskmanager-1-154 and resource <2048,0.5>.
> 2024-12-08 13:55:30,305 WARN  
> org.apache.flink.kubernetes.KubernetesResourceManagerDriver  [] - Could not 
> create pod flink-ess-recall-mqtt-to-kafka-job-taskmanager-1-154, exception: 
> java.util.concurrent.CompletionException: 
> io.fabric8.kubernetes.client.KubernetesClientException: An error has occurred.
> 2024-12-08 13:55:30,306 WARN  
> org.apache.flink.runtime.resourcemanager.active.ActiveResourceManager [] - 
> Failed requesting worker with resource spec WorkerResourceSpec 
> {cpuCores=0.5, taskHeapSize=1013.760mb (1063004400 bytes), taskOffHeapSize=0 
> bytes, networkMemSize=158.720mb (166429984 bytes), managedMemSize=158.720mb 
> (166429984 bytes), numSlots=2}, current pending count: 0
> java.util.concurrent.CompletionException: 
> io.fabric8.kubernetes.client.KubernetesClientException: An error has occurred.
> at java.util.concurrent.CompletableFuture.encodeThrowable(Unknown Source) 
> ~[?:?]
> at java.util.concurrent.CompletableFuture.completeThrowable(Unknown Source) 
> ~[?:?]
> at java.util.concurrent.CompletableFuture$AsyncRun.run(Unknown Source) ~[?:?]
> at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source) ~[?:?]
> at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source) ~[?:?]
> at java.lang.Thread.run(Unknown Source) ~[?:?]
> Caused by: io.fabric8.kubernetes.client.KubernetesClientException: An error 
> has occurred.
> at 
> io.fabric8.kubernetes.client.KubernetesClientException.launderThrowable(KubernetesClientException.java:129)
>  ~[flink-dist-1.16.1.jar:1.16.1]
> at 
> io.fabric8.kubernetes.client.KubernetesClientException.launderThrowable(KubernetesClientException.java:122)
>  ~[flink-dist-1.16.1.jar:1.16.1]
> at 
> io.fabric8.kubernetes.client.dsl.base.CreateOnlyResourceOperation.create(CreateOnlyResourceOperation.java:63)
>  ~[flink-dist-1.16.1.jar:1.16.1]
> at 
> org.apache.flink.kubernetes.kubeclient.Fabric8FlinkKubeClient.lambda$createTaskManagerPod$1(Fabric8FlinkKubeClient.java:163)
>  ~[flink-dist-1.16.1.jar:1.16.1]
> ... 4 more
> Caused by: java.net.SocketTimeoutException: timeout
> at 
> org.apache.flink.kubernetes.shaded.okhttp3.internal.http2.Http2Stream$StreamTimeout.newTimeoutException(Http2Stream.java:678)
>  ~[flink-dist-1.16.1.jar:1.16.1]
> at 
> org.apache.flink.kubernetes.shaded.okhttp3.internal.http2.Http2Stream$StreamTimeout.exitAndThrowIfTimedOut(Http2Stream.java:686)
>  ~[flink-dist-1.16.1.jar:1.16.1]
> at 
> org.apache.flink.kubernetes.shaded.okhttp3.internal.http2.Http2Stream.takeHeaders(Http2Stream.java:154)
>  ~[flink-dist-1.16.1.jar:1.16.1]
> at 
> org.apache.flink.kubernetes.shaded.okhttp3.internal.http2.Http2ExchangeCodec.readResponseHeaders(Http2ExchangeCodec.java:136)
>  ~[flink-dist-1.16.1.jar:1.16.1]
> at 
> org.apache.flink.kubernetes.shaded.okhttp3.internal.connection.Exchange.readResponseHeaders(Exchange.java:115)
>  ~[flink-dist-1.16.1.jar:1.16.1]
> at 
> org.apache.flink.kubernetes.shaded.okhttp3.internal.http.CallServerInterceptor.intercept(CallServerInterceptor.java:94)
>  ~[flink-dist-1.16.1.jar:1.16.1]
> at 
> org.apache.flink.kubernetes.shaded.okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:142)
>  ~[flink-dist-1.16.1.jar:1.16.1]
> at 
> org.apache.flink.kubernetes.shaded.okhttp3.internal.connection.ConnectInterceptor.intercept(ConnectInterceptor.java:43)
>  ~[flink-dist-1.16.1.jar:1.16.1]
> at 
> org.apache.flink.kubernetes.shaded.okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:142)
>  ~[flink-dist-1.16.1.jar:1.16.1]
> at 
> org.apache.flink.kubernetes.shaded.okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:117)
>  ~[flink-dist-1.16.1.jar:1.16.1]
> at 
> org.apache.flink.kubernetes.shaded.okhttp3.internal.cache.CacheInterceptor.intercept(CacheInterceptor.java:94)
>  ~[flink-dist-1.16.1.jar:1.16.1]
> at 
> org.apache.flink.kubernetes.shaded.okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:142)
>  ~[flink-dist-1.16.1.jar:1.16.1]
> at 
> org.apache.flink.kubernetes.shaded.okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:117)
>  ~[flink-dist-1.16.1.jar:1.16.1]
> at 
> org.apache.flink.kubernetes.shaded.okhttp3.internal.http.BridgeInterceptor.intercept(BridgeInterceptor.java:93)
>  ~[flink-dist-1.16.1.jar:1.16.1]
> at 
> org.apache.flink.kubernetes.shaded.okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:142)
>  ~[flink-dist-1.16.1.jar:1.16.1]
> at 
> org.apache.flink.kubernetes.shaded.okhttp3.internal.http.RetryAndFollowUpInterceptor.intercept(RetryAndFollowUpInterceptor.java:88)
>  ~[flink-dist-1.16.1.jar:1.16.1]
> at 
> org.apache.flink.kubernetes.shaded.okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:142)
>  ~[flink-dist-1.16.1.jar:1.16.1]
> at 
> org.apache.flink.kubernetes.shaded.okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:117)
>  ~[flink-dist-1.16.1.jar:1.16.1]
> at 
> io.fabric8.kubernetes.client.okhttp.OkHttpClientBuilderImpl$InteceptorAdapter.intercept(OkHttpClientBuilderImpl.java:62)
>  ~[flink-dist-1.16.1.jar:1.16.1]
> at 
> org.apache.flink.kubernetes.shaded.okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:142)
>  ~[flink-dist-1.16.1.jar:1.16.1]
> at 
> org.apache.flink.kubernetes.shaded.okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:117)
>  ~[flink-dist-1.16.1.jar:1.16.1]
> at 
> io.fabric8.kubernetes.client.okhttp.OkHttpClientBuilderImpl$InteceptorAdapter.intercept(OkHttpClientBuilderImpl.java:62)
>  ~[flink-dist-1.16.1.jar:1.16.1]
> at 
> org.apache.flink.kubernetes.shaded.okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:142)
>  ~[flink-dist-1.16.1.jar:1.16.1]
> at 
> org.apache.flink.kubernetes.shaded.okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:117)
>  ~[flink-dist-1.16.1.jar:1.16.1]
> at 
> io.fabric8.kubernetes.client.okhttp.OkHttpClientBuilderImpl$InteceptorAdapter.intercept(OkHttpClientBuilderImpl.java:62)
>  ~[flink-dist-1.16.1.jar:1.16.1]
> at 
> org.apache.flink.kubernetes.shaded.okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:142)
>  ~[flink-dist-1.16.1.jar:1.16.1]
> at 
> org.apache.flink.kubernetes.shaded.okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:117)
>  ~[flink-dist-1.16.1.jar:1.16.1]
> at 
> io.fabric8.kubernetes.client.okhttp.OkHttpClientBuilderImpl$InteceptorAdapter.intercept(OkHttpClientBuilderImpl.java:62)
>  ~[flink-dist-1.16.1.jar:1.16.1]
> at 
> org.apache.flink.kubernetes.shaded.okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:142)
>  ~[flink-dist-1.16.1.jar:1.16.1]
> at 
> org.apache.flink.kubernetes.shaded.okhttp3.internal.http.RealInterceptorChain.proceed(RealInterceptorChain.java:117)
>  ~[flink-dist-1.16.1.jar:1.16.1]
> at 
> org.apache.flink.kubernetes.shaded.okhttp3.RealCall.getResponseWithInterceptorChain(RealCall.java:229)
>  ~[flink-dist-1.16.1.jar:1.16.1]
> at 
> org.apache.flink.kubernetes.shaded.okhttp3.RealCall.execute(RealCall.java:81) 
> ~[flink-dist-1.16.1.jar:1.16.1]
> at 
> io.fabric8.kubernetes.client.okhttp.OkHttpClientImpl.send(OkHttpClientImpl.java:138)
>  ~[flink-dist-1.16.1.jar:1.16.1]
> at 
> io.fabric8.kubernetes.client.dsl.base.OperationSupport.retryWithExponentialBackoff(OperationSupport.java:577)
>  ~[flink-dist-1.16.1.jar:1.16.1]
> at 
> io.fabric8.kubernetes.client.dsl.base.OperationSupport.handleResponse(OperationSupport.java:556)
>  ~[flink-dist-1.16.1.jar:1.16.1]
> at 
> io.fabric8.kubernetes.client.dsl.base.OperationSupport.handleResponse(OperationSupport.java:521)
>  ~[flink-dist-1.16.1.jar:1.16.1]
> at 
> io.fabric8.kubernetes.client.dsl.base.OperationSupport.handleCreate(OperationSupport.java:308)
>  ~[flink-dist-1.16.1.jar:1.16.1]
> at 
> io.fabric8.kubernetes.client.dsl.base.BaseOperation.handleCreate(BaseOperation.java:644)
>  ~[flink-dist-1.16.1.jar:1.16.1]
> at 
> io.fabric8.kubernetes.client.dsl.base.BaseOperation.handleCreate(BaseOperation.java:83)
>  ~[flink-dist-1.16.1.jar:1.16.1]
> at 
> io.fabric8.kubernetes.client.dsl.base.CreateOnlyResourceOperation.create(CreateOnlyResourceOperation.java:61)
>  ~[flink-dist-1.16.1.jar:1.16.1]
> at 
> org.apache.flink.kubernetes.kubeclient.Fabric8FlinkKubeClient.lambda$createTaskManagerPod$1(Fabric8FlinkKubeClient.java:163)
>  ~[flink-dist-1.16.1.jar:1.16.1]
> ... 4 more
> 2024-12-08 13:55:30,306 INFO  
> org.apache.flink.runtime.resourcemanager.active.ActiveResourceManager [] - 
> Requesting new worker with resource spec WorkerResourceSpec {cpuCores=0.5, 
> taskHeapSize=1013.760mb (1063004400 bytes), taskOffHeapSize=0 bytes, 
> networkMemSize=158.720mb (166429984 bytes), managedMemSize=158.720mb 
> (166429984 bytes), numSlots=2}, current pending count: 1.
>  
>  
>  
> The api-server reports the following errors:
> E1208 05:57:40.368984       1 writers.go:118] apiserver was unable to write a 
> JSON response: http: Handler timeout
> E1208 05:57:40.369002       1 status.go:71] apiserver received an error that 
> is not an metav1.Status: &errors.errorString{s:"http: Handler timeout"}: 
> http: Handler timeout
> I1208 05:57:40.369025       1 trace.go:205] Trace[2098658489]: "Call 
> validating webhook" 
> configuration:resourcesquotas.quota.kubesphere.io,webhook:resourcesquotas.quota.kubesphere.io,resource:/v1,
>  
> Resource=pods,subresource:,operation:CREATE,UID:3ee5f429-b549-4df6-849f-6695a7103e7a
>  (08-Dec-2024 05:57:30.370) (total time: 9998ms):
> Trace[2098658489]: [9.998582768s] [9.998582768s] END
> W1208 05:57:40.369042       1 dispatcher.go:142] Failed calling webhook, 
> failing open resourcesquotas.quota.kubesphere.io: failed calling webhook 
> "resourcesquotas.quota.kubesphere.io": failed to call webhook: Post 
> "https://ks-controller-manager.kubesphere-system.svc:443/validate-quota-kubesphere-io-v1alpha2?timeout=30s":
>  context canceled
> E1208 05:57:40.369068       1 dispatcher.go:149] failed calling webhook 
> "resourcesquotas.quota.kubesphere.io": failed to call webhook: Post 
> "https://ks-controller-manager.kubesphere-system.svc:443/validate-quota-kubesphere-io-v1alpha2?timeout=30s":
>  context canceled
> E1208 05:57:40.370331       1 writers.go:131] apiserver was unable to write a 
> fallback JSON response: http: Handler timeout
> I1208 05:57:40.371563       1 trace.go:205] Trace[722200725]: "Create" 
> url:/api/v1/namespaces/base/pods,user-agent:flink,audit-id:74251d65-3538-40db-90b3-fd4d6dddb8d0,client:172.18.180.225,accept:,protocol:HTTP/2.0
>  (08-Dec-2024 05:57:30.369) (total time: 10002ms):
> Trace[722200725]: [10.002063455s] [10.002063455s] END
> E1208 05:57:40.372108       1 timeout.go:141] post-timeout activity - 
> time-elapsed: 3.109877ms, POST "/api/v1/namespaces/base/pods" result: <nil>
> {"level":"warn","ts":"2024-12-08T05:57:40.373Z","logger":"etcd-client","caller":"v3/retry_interceptor.go:62","msg":"retrying
>  of unary invoker 
> failed","target":"etcd-endpoints://0xc002dcd6c0/127.0.0.1:2379","attempt":0,"error":"rpc
>  error: code = Canceled desc = context canceled"}
> E1208 05:57:40.373379       1 finisher.go:175] FinishRequest: post-timeout 
> activity - time-elapsed: 4.357591ms, panicked: false, err: context canceled, 
> panic-reason: <nil>
>  
>  
> It keeps cycling like this and the TaskManager pod can never be created.
> The deployment YAML is as follows:
> apiVersion: flink.apache.org/v1beta1
> kind: FlinkDeployment
> metadata:
>   namespace: base
>   name: flink-ess-recall-mqtt-to-kafka-job
> spec:
>   image: harbor.junengcloud.com/openfaas/flink:1.16.3
>   flinkVersion: v1_16
>   flinkConfiguration:
>     taskmanager.numberOfTaskSlots: "2"
>     taskmanager.memory.managed.fraction: "0.1"
>   serviceAccount: flink
>   podTemplate:
>     apiVersion: v1
>     kind: Pod
>     metadata:
>       name: pod-template
>     spec:
>       containers:
>         # Do not change the main container name
>         - name: flink-main-container
>           volumeMounts:
>             - mountPath: /opt/flink/chk
>               name: checkpoint
>             - mountPath: /opt/flink/lib
>               name: flinklibs
>             - mountPath: /opt/flink/userlib
>               name: flinkuserlibs
>             - mountPath: /opt/flink/usercfg
>               name: flinkusercfgs
>           env:
>             - name: TZ
>               value: Asia/Shanghai
>       volumes:
>         - name: checkpoint
>           persistentVolumeClaim:
>             claimName: flink-chk
>         - name: flinklibs
>           persistentVolumeClaim:
>             claimName: flink-lib-jar
>         - name: flinkuserlibs
>           persistentVolumeClaim:
>             claimName: flink-user-jar
>         - name: flinkusercfgs
>           persistentVolumeClaim:
>             claimName: flink-user-conf
>   jobManager:
>     resource:
>       memory: "2048m"
>       cpu: 0.5
>   taskManager:
>     resource:
>       memory: "2048m"
>       cpu: 0.5
>   job:
>     jarURI: local:///opt/flink/userlib/flink-ess-1.0-SNAPSHOT.jar
>     entryClass: com.huidian.flink.ess.core.JobRunner
>     parallelism: 1
>     args: 
> ["-config","/opt/flink/usercfg/flink-ess-config.json","-jobName","flink-ess-recall-mqtt-to-kafka-job"]
>  
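> From the two logs above, the Flink client's pod-create request appears to time out after roughly 10 seconds (13:55:20,303 "Creating new TaskManager pod" to 13:55:30,305 "Could not create pod"), while the api-server is still waiting on the KubeSphere resource-quota validating webhook, whose own timeout is 30 seconds. A minimal way to inspect that webhook, assuming the configuration name and namespace shown in the api-server trace above:
> # is the webhook backend (ks-controller-manager) up and reachable?
> kubectl -n kubesphere-system get pods
> # dump the validating webhook configuration named in the trace
> kubectl get validatingwebhookconfiguration resourcesquotas.quota.kubesphere.io -o yaml
> If the webhook backend turns out to be unhealthy, fixing ks-controller-manager (or, as a stopgap, lowering that webhook's timeoutSeconds or setting failurePolicy: Ignore) should unblock pod creation; this is a KubeSphere/cluster-side setting, not a Flink or operator one.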



--
This message was sent by Atlassian Jira
(v8.20.10#820010)
