一个job manager pod 提交job后,申请taskmanager失败
Taskmanager 的异常 Fatal error occurred in TaskExecutor akka.tcp://flink@179.10.251.70:6122/user/rpc/taskmanager_0. org.apache.flink.runtime.taskexecutor.exceptions.RegistrationTimeoutException: Could not register at the ResourceManager within the specified maximum registration duration 300000 ms. This indicates a problem with this instance. Terminating now. at org.apache.flink.runtime.taskexecutor.TaskExecutor.registrationTimeout(TaskExecutor.java:1251) ~[flink-dist_2.12-1.11.1.jar:1.11.1] at org.apache.flink.runtime.taskexecutor.TaskExecutor.lambda$startRegistrationTimeout$18(TaskExecutor.java:1237) ~[flink-dist_2.12-1.11.1.jar:1.11.1] at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRunAsync(AkkaRpcActor.java:402) ~[flink-dist_2.12-1.11.1.jar:1.11.1] at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcMessage(AkkaRpcActor.java:195) ~[flink-dist_2.12-1.11.1.jar:1.11.1] at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleMessage(AkkaRpcActor.java:152) ~[flink-dist_2.12-1.11.1.jar:1.11.1] at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:26) [flink-dist_2.12-1.11.1.jar:1.11.1] at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:21) [flink-dist_2.12-1.11.1.jar:1.11.1] at scala.PartialFunction.applyOrElse(PartialFunction.scala:123) [flink-dist_2.12-1.11.1.jar:1.11.1] at scala.PartialFunction.applyOrElse$(PartialFunction.scala:122) [flink-dist_2.12-1.11.1.jar:1.11.1] at akka.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:21) [flink-dist_2.12-1.11.1.jar:1.11.1] at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171) [flink-dist_2.12-1.11.1.jar:1.11.1] at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172) [flink-dist_2.12-1.11.1.jar:1.11.1] at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172) [flink-dist_2.12-1.11.1.jar:1.11.1] at akka.actor.Actor.aroundReceive(Actor.scala:517) [flink-dist_2.12-1.11.1.jar:1.11.1] at akka.actor.Actor.aroundReceive$(Actor.scala:515) 
[flink-dist_2.12-1.11.1.jar:1.11.1] at akka.actor.AbstractActor.aroundReceive(AbstractActor.scala:225) [flink-dist_2.12-1.11.1.jar:1.11.1] at akka.actor.ActorCell.receiveMessage(ActorCell.scala:592) [flink-dist_2.12-1.11.1.jar:1.11.1] at akka.actor.ActorCell.invoke(ActorCell.scala:561) [flink-dist_2.12-1.11.1.jar:1.11.1] at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:258) [flink-dist_2.12-1.11.1.jar:1.11.1] at akka.dispatch.Mailbox.run(Mailbox.scala:225) [flink-dist_2.12-1.11.1.jar:1.11.1] at akka.dispatch.Mailbox.exec(Mailbox.scala:235) [flink-dist_2.12-1.11.1.jar:1.11.1] at akka.dispatch.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260) [flink-dist_2.12-1.11.1.jar:1.11.1] at akka.dispatch.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339) [flink-dist_2.12-1.11.1.jar:1.11.1] at akka.dispatch.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979) [flink-dist_2.12-1.11.1.jar:1.11.1] at akka.dispatch.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107) [flink-dist_2.12-1.11.1.jar:1.11.1] 2020-09-16 09:14:39,077 ERROR org.apache.flink.runtime.taskexecutor.TaskManagerRunner [] - Fatal error occurred while executing the TaskManager. Shutting it down... org.apache.flink.runtime.taskexecutor.exceptions.RegistrationTimeoutException: Could not register at the ResourceManager within the specified maximum registration duration 300000 ms. This indicates a problem with this instance. Terminating now. at org.apache.flink.runtime.taskexecutor.TaskExecutor.registrationTimeout(TaskExecutor.java:1251) ~[flink-dist_2.12-1.11.1.jar:1.11.1] at org.apache.flink.runtime.taskexecutor.TaskExecutor.lambda$startRegistrationTimeout$18(TaskExecutor.java:1237) ~[flink-dist_2.12-1.11.1.jar:1.11.1] at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRunAsync(AkkaRpcActor.java:402) ~ Jobmanger 异常 0d5f8478a2ab4e17d816810752f669eb) switched from SCHEDULED to FAILED on not deployed. 
org.apache.flink.runtime.jobmanager.scheduler.NoResourceAvailableException: Could not allocate the required slot within slot request timeout. Please make sure that the cluster has enough resources. at org.apache.flink.runtime.scheduler.DefaultScheduler.maybeWrapWithNoResourceAvailableException(DefaultScheduler.java:441) ~[flink-dist_2.12-1.11.1.jar:1.11.1] at org.apache.flink.runtime.scheduler.DefaultScheduler.lambda$assignResourceOrHandleError$6(DefaultScheduler.java:422) ~[flink-dist_2.12-1.11.1.jar:1.11.1] at java.util.concurrent.CompletableFuture.uniHandle(CompletableFuture.java:836) ~[?:1.8.0_265] at java.util.concurrent.CompletableFuture$UniHandle.tryFire(CompletableFuture.java:811) ~[?:1.8.0_265] at java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488) ~[?:1.8.0_265] at java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:1990) ~[?:1.8.0_265] at org.apache.flink.runtime.jobmaster.slotpool.SchedulerImpl.lambda$internalAllocateSlot$0(SchedulerImpl.java:168) ~[flink-dist_2.12-1.11.1.jar:1.11.1] at java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:774) ~[?:1.8.0_265] at java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:750) ~[?:1.8.0_265] at java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488) ~[?:1.8.0_265] at java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:1990) ~[?:1.8.0_265] at org.apache.flink.runtime.jobmaster.slotpool.SlotSharingManager$SingleTaskSlot.release(SlotSharingManager.java:726) ~[flink-dist_2.12-1.11.1.jar:1.11.1] at org.apache.flink.runtime.jobmaster.slotpool.SlotSharingManager$MultiTaskSlot.release(SlotSharingManager.java:537) ~[flink-dist_2.12-1.11.1.jar:1.11.1] at org.apache.flink.runtime.jobmaster.slotpool.SlotSharingManager$MultiTaskSlot.lambda$new$0(SlotSharingManager.java:432) ~[flink-dist_2.12-1.11.1.jar:1.11.1] at 
java.util.concurrent.CompletableFuture.uniHandle(CompletableFuture.java:836) ~[?:1.8.0_265] at java.util.concurrent.CompletableFuture$UniHandle.tryFire(CompletableFuture.java:811) ~[?:1.8.0_265] at java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488) ~[?:1.8.0_265] at java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:1990) ~[?:1.8.0_265] at org.apache.flink.runtime.concurrent.FutureUtils.lambda$forwardTo$21(FutureUtils.java:1120) ~[flink-dist_2.12-1.11.1.jar:1.11.1] at java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:774) ~[?:1.8.0_265] at java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:750) ~[?:1.8.0_265] at java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488) ~[?:1.8.0_265] at java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:1990) ~[?:1.8.0_265] at org.apache.flink.runtime.concurrent.FutureUtils$Timeout.run(FutureUtils.java:1036) ~[flink-dist_2.12-1.11.1.jar:1.11.1] at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRunAsync(AkkaRpcActor.java:402) ~[flink-dist_2.12-1.11.1.jar:1.11.1] at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcMessage(AkkaRpcActor.java:195) ~[flink-dist_2.12-1.11.1.jar:1.11.1] at org.apache.flink.runtime.rpc.akka.FencedAkkaRpcActor.handleRpcMessage(FencedAkkaRpcActor.java:74) ~[flink-dist_2.12-1.11.1.jar:1.11.1] at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleMessage(AkkaRpcActor.java:152) ~[flink-dist_2.12-1.11.1.jar:1.11.1] at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:26) [flink-dist_2.12-1.11.1.jar:1.11.1] at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:21) [flink-dist_2.12-1.11.1.jar:1.11.1] at scala.PartialFunction.applyOrElse(PartialFunction.scala:123) [flink-dist_2.12-1.11.1.jar:1.11.1] at scala.PartialFunction.applyOrElse$(PartialFunction.scala:122) [flink-dist_2.12-1.11.1.jar:1.11.1] at 
akka.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:21) [flink-dist_2.12-1.11.1.jar:1.11.1] at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171) [flink-dist_2.12-1.11.1.jar:1.11.1] at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172) [flink-dist_2.12-1.11.1.jar:1.11.1] at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172) [flink-dist_2.12-1.11.1.jar:1.11.1] at akka.actor.Actor.aroundReceive(Actor.scala:517) [flink-dist_2.12-1.11.1.jar:1.11.1] at akka.actor.Actor.aroundReceive$(Actor.scala:515) [flink-dist_2.12-1.11.1.jar:1.11.1] at akka.actor.AbstractActor.aroundReceive(AbstractActor.scala:225) [flink-dist_2.12-1.11.1.jar:1.11.1] at akka.actor.ActorCell.receiveMessage(ActorCell.scala:592) [flink-dist_2.12-1.11.1.jar:1.11.1] at akka.actor.ActorCell.invoke(ActorCell.scala:561) [flink-dist_2.12-1.11.1.jar:1.11.1] at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:258) [flink-dist_2.12-1.11.1.jar:1.11.1] at akka.dispatch.Mailbox.run(Mailbox.scala:225) [flink-dist_2.12-1.11.1.jar:1.11.1] at akka.dispatch.Mailbox.exec(Mailbox.scala:235) [flink-dist_2.12-1.11.1.jar:1.11.1] at akka.dispatch.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260) [flink-dist_2.12-1.11.1.jar:1.11.1] at akka.dispatch.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339) [flink-dist_2.12-1.11.1.jar:1.11.1] at akka.dispatch.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979) [flink-dist_2.12-1.11.1.jar:1.11.1] at akka.dispatch.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107) [flink-dist_2.12-1.11.1.jar:1.11.1] Caused by: java.util.concurrent.CompletionException: java.util.concurrent.TimeoutException at java.util.concurrent.CompletableFuture.encodeThrowable(CompletableFuture.java:292) ~[?:1.8.0_265] at java.util.concurrent.CompletableFuture.completeThrowable(CompletableFuture.java:308) ~[?:1.8.0_265] at java.util.concurrent.CompletableFuture.uniApply(CompletableFuture.java:607) ~[?:1.8.0_265] at 
java.util.concurrent.CompletableFuture$UniApply.tryFire(CompletableFuture.java:591) ~[?:1.8.0_265] ... 27 more Caused by: java.util.concurrent.TimeoutException ... 25 more 2020-09-16 09:14:35,603 INFO org.apache.flink.runtime.executiongraph.failover.flip1.RestartPipelinedRegionFailoverStrategy [] - Calculating tasks to restart to recover the failed task cbc357ccb763df2852fee8c4fc7d55f2_0. 2020-09-16 09:14:35,604 INFO org.apache.flink.runtime.executiongraph.failover.flip1.RestartPipelinedRegionFailoverStrategy [] - 7 tasks should be restarted to recover the failed task cbc357ccb763df2852fee8c4fc7d55f2_0. 2020-09-16 09:14:35,606 INFO org.apache.flink.runtime.executiongraph.ExecutionGraph [] - Job Dragonsnake Indicator Accumulator Job (c60f4f384f904467e4c54416d634852f) switched from state RUNNING to FAILING. org.apache.flink.runtime.JobException: Recovery is suppressed by NoRestartBackoffTimeStrategy at org.apache.flink.runtime.executiongraph.failover.flip1.ExecutionFailureHandler.handleFailure(ExecutionFailureHandler.java:116) ~[flink-dist_2.12-1.11.1.jar:1.11.1] at org.apache.flink.runtime.executiongraph.failover.flip1. |
从报错来看,是TM向JM注册超时了。请问你使用的是HA部署还是非HA部署呢?
如果是HA的话,TM是直接使用JM的Pod ip进行通信的,这个时候需要登录pod确认一下网络是否是通的 如果是非HA的话,TM是使用service来向JM注册,你需要检查一下K8s的kube proxy是否正常 另外,是所有TM都注册不上来,还是只有个别的。这个也可以排除网络问题 Best, Yang yanzhibo <[hidden email]> 于2020年9月16日周三 下午5:25写道: > 一个job manager pod 提交job后,申请taskmanager失败 > > > Taskmanager 的异常 > > Fatal error occurred in TaskExecutor akka.tcp:// > flink@179.10.251.70:6122/user/rpc/taskmanager_0. > org.apache.flink.runtime.taskexecutor.exceptions.RegistrationTimeoutException: > Could not register at the ResourceManager within the specified maximum > registration duration 300000 ms. This indicates a problem with this > instance. Terminating now. > at > org.apache.flink.runtime.taskexecutor.TaskExecutor.registrationTimeout(TaskExecutor.java:1251) > ~[flink-dist_2.12-1.11.1.jar:1.11.1] > at > org.apache.flink.runtime.taskexecutor.TaskExecutor.lambda$startRegistrationTimeout$18(TaskExecutor.java:1237) > ~[flink-dist_2.12-1.11.1.jar:1.11.1] > at > org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRunAsync(AkkaRpcActor.java:402) > ~[flink-dist_2.12-1.11.1.jar:1.11.1] > at > org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcMessage(AkkaRpcActor.java:195) > ~[flink-dist_2.12-1.11.1.jar:1.11.1] > at > org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleMessage(AkkaRpcActor.java:152) > ~[flink-dist_2.12-1.11.1.jar:1.11.1] > at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:26) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:21) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at scala.PartialFunction.applyOrElse(PartialFunction.scala:123) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at scala.PartialFunction.applyOrElse$(PartialFunction.scala:122) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at akka.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:21) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at > scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at > 
scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at > scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at akka.actor.Actor.aroundReceive(Actor.scala:517) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at akka.actor.Actor.aroundReceive$(Actor.scala:515) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at akka.actor.AbstractActor.aroundReceive(AbstractActor.scala:225) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at akka.actor.ActorCell.receiveMessage(ActorCell.scala:592) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at akka.actor.ActorCell.invoke(ActorCell.scala:561) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:258) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at akka.dispatch.Mailbox.run(Mailbox.scala:225) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at akka.dispatch.Mailbox.exec(Mailbox.scala:235) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at > akka.dispatch.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at > akka.dispatch.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at > akka.dispatch.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at > akka.dispatch.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107) > [flink-dist_2.12-1.11.1.jar:1.11.1] > 2020-09-16 09:14:39,077 ERROR > org.apache.flink.runtime.taskexecutor.TaskManagerRunner [] - Fatal > error occurred while executing the TaskManager. Shutting it down... > org.apache.flink.runtime.taskexecutor.exceptions.RegistrationTimeoutException: > Could not register at the ResourceManager within the specified maximum > registration duration 300000 ms. This indicates a problem with this > instance. Terminating now. 
> at > org.apache.flink.runtime.taskexecutor.TaskExecutor.registrationTimeout(TaskExecutor.java:1251) > ~[flink-dist_2.12-1.11.1.jar:1.11.1] > at > org.apache.flink.runtime.taskexecutor.TaskExecutor.lambda$startRegistrationTimeout$18(TaskExecutor.java:1237) > ~[flink-dist_2.12-1.11.1.jar:1.11.1] > at > org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRunAsync(AkkaRpcActor.java:402) > ~ > > > Jobmanger 异常 > > 0d5f8478a2ab4e17d816810752f669eb) switched from SCHEDULED to FAILED on not > deployed. > org.apache.flink.runtime.jobmanager.scheduler.NoResourceAvailableException: > Could not allocate the required slot within slot request timeout. Please > make sure that the cluster has enough resources. > at > org.apache.flink.runtime.scheduler.DefaultScheduler.maybeWrapWithNoResourceAvailableException(DefaultScheduler.java:441) > ~[flink-dist_2.12-1.11.1.jar:1.11.1] > at > org.apache.flink.runtime.scheduler.DefaultScheduler.lambda$assignResourceOrHandleError$6(DefaultScheduler.java:422) > ~[flink-dist_2.12-1.11.1.jar:1.11.1] > at > java.util.concurrent.CompletableFuture.uniHandle(CompletableFuture.java:836) > ~[?:1.8.0_265] > at > java.util.concurrent.CompletableFuture$UniHandle.tryFire(CompletableFuture.java:811) > ~[?:1.8.0_265] > at > java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488) > ~[?:1.8.0_265] > at > java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:1990) > ~[?:1.8.0_265] > at > org.apache.flink.runtime.jobmaster.slotpool.SchedulerImpl.lambda$internalAllocateSlot$0(SchedulerImpl.java:168) > ~[flink-dist_2.12-1.11.1.jar:1.11.1] > at > java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:774) > ~[?:1.8.0_265] > at > java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:750) > ~[?:1.8.0_265] > at > java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488) > ~[?:1.8.0_265] > at > 
java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:1990) > ~[?:1.8.0_265] > at > org.apache.flink.runtime.jobmaster.slotpool.SlotSharingManager$SingleTaskSlot.release(SlotSharingManager.java:726) > ~[flink-dist_2.12-1.11.1.jar:1.11.1] > at > org.apache.flink.runtime.jobmaster.slotpool.SlotSharingManager$MultiTaskSlot.release(SlotSharingManager.java:537) > ~[flink-dist_2.12-1.11.1.jar:1.11.1] > at > org.apache.flink.runtime.jobmaster.slotpool.SlotSharingManager$MultiTaskSlot.lambda$new$0(SlotSharingManager.java:432) > ~[flink-dist_2.12-1.11.1.jar:1.11.1] > at > java.util.concurrent.CompletableFuture.uniHandle(CompletableFuture.java:836) > ~[?:1.8.0_265] > at > java.util.concurrent.CompletableFuture$UniHandle.tryFire(CompletableFuture.java:811) > ~[?:1.8.0_265] > at > java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488) > ~[?:1.8.0_265] > at > java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:1990) > ~[?:1.8.0_265] > at > org.apache.flink.runtime.concurrent.FutureUtils.lambda$forwardTo$21(FutureUtils.java:1120) > ~[flink-dist_2.12-1.11.1.jar:1.11.1] > at > java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:774) > ~[?:1.8.0_265] > at > java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:750) > ~[?:1.8.0_265] > at > java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488) > ~[?:1.8.0_265] > at > java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:1990) > ~[?:1.8.0_265] > at > org.apache.flink.runtime.concurrent.FutureUtils$Timeout.run(FutureUtils.java:1036) > ~[flink-dist_2.12-1.11.1.jar:1.11.1] > at > org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRunAsync(AkkaRpcActor.java:402) > ~[flink-dist_2.12-1.11.1.jar:1.11.1] > at > org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcMessage(AkkaRpcActor.java:195) > ~[flink-dist_2.12-1.11.1.jar:1.11.1] > at > 
org.apache.flink.runtime.rpc.akka.FencedAkkaRpcActor.handleRpcMessage(FencedAkkaRpcActor.java:74) > ~[flink-dist_2.12-1.11.1.jar:1.11.1] > at > org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleMessage(AkkaRpcActor.java:152) > ~[flink-dist_2.12-1.11.1.jar:1.11.1] > at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:26) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:21) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at scala.PartialFunction.applyOrElse(PartialFunction.scala:123) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at scala.PartialFunction.applyOrElse$(PartialFunction.scala:122) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at akka.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:21) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at akka.actor.Actor.aroundReceive(Actor.scala:517) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at akka.actor.Actor.aroundReceive$(Actor.scala:515) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at akka.actor.AbstractActor.aroundReceive(AbstractActor.scala:225) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at akka.actor.ActorCell.receiveMessage(ActorCell.scala:592) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at akka.actor.ActorCell.invoke(ActorCell.scala:561) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:258) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at akka.dispatch.Mailbox.run(Mailbox.scala:225) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at akka.dispatch.Mailbox.exec(Mailbox.scala:235) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at akka.dispatch.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260) > [flink-dist_2.12-1.11.1.jar:1.11.1] > 
at > akka.dispatch.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at > akka.dispatch.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at > akka.dispatch.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107) > [flink-dist_2.12-1.11.1.jar:1.11.1] > Caused by: java.util.concurrent.CompletionException: > java.util.concurrent.TimeoutException > at > java.util.concurrent.CompletableFuture.encodeThrowable(CompletableFuture.java:292) > ~[?:1.8.0_265] > at > java.util.concurrent.CompletableFuture.completeThrowable(CompletableFuture.java:308) > ~[?:1.8.0_265] > at > java.util.concurrent.CompletableFuture.uniApply(CompletableFuture.java:607) > ~[?:1.8.0_265] > at > java.util.concurrent.CompletableFuture$UniApply.tryFire(CompletableFuture.java:591) > ~[?:1.8.0_265] > ... 27 more > Caused by: java.util.concurrent.TimeoutException > ... 25 more > 2020-09-16 09:14:35,603 INFO > org.apache.flink.runtime.executiongraph.failover.flip1.RestartPipelinedRegionFailoverStrategy > [] - Calculating tasks to restart to recover the failed task > cbc357ccb763df2852fee8c4fc7d55f2_0. > 2020-09-16 09:14:35,604 INFO > org.apache.flink.runtime.executiongraph.failover.flip1.RestartPipelinedRegionFailoverStrategy > [] - 7 tasks should be restarted to recover the failed task > cbc357ccb763df2852fee8c4fc7d55f2_0. > 2020-09-16 09:14:35,606 INFO > org.apache.flink.runtime.executiongraph.ExecutionGraph [] - Job > Dragonsnake Indicator Accumulator Job (c60f4f384f904467e4c54416d634852f) > switched from state RUNNING to FAILING. > org.apache.flink.runtime.JobException: Recovery is suppressed by > NoRestartBackoffTimeStrategy > at > org.apache.flink.runtime.executiongraph.failover.flip1.ExecutionFailureHandler.handleFailure(ExecutionFailureHandler.java:116) > ~[flink-dist_2.12-1.11.1.jar:1.11.1] > at org.apache.flink.runtime.executiongraph.failover.flip1. > > > |
是非HA部署,所有TM都注册不上来;但是在TM的pod中,通过service是可以ping通JobManager的。
> 2020年9月17日 上午11:10,Yang Wang <[hidden email]> 写道: > > 你这个报错看着是TM向JM注册超时了,使用的HA还是非HA部署呢 > > 如果是HA的话,TM是直接使用JM的Pod ip进行通信的,这个时候需要登录pod确认一下网络是否是通的 > 如果是非HA的话,TM是使用service来向JM注册,你需要检查一下K8s的kube proxy是否正常 > > 另外,是所有TM都注册不上来,还是只有个别的。这个也可以排除网络问题 > > > Best, > Yang > > yanzhibo <[hidden email]> 于2020年9月16日周三 下午5:25写道: > >> 一个job manager pod 提交job后,申请taskmanager失败 >> >> >> Taskmanager 的异常 >> >> Fatal error occurred in TaskExecutor akka.tcp:// >> flink@179.10.251.70:6122/user/rpc/taskmanager_0. >> org.apache.flink.runtime.taskexecutor.exceptions.RegistrationTimeoutException: >> Could not register at the ResourceManager within the specified maximum >> registration duration 300000 ms. This indicates a problem with this >> instance. Terminating now. >> at >> org.apache.flink.runtime.taskexecutor.TaskExecutor.registrationTimeout(TaskExecutor.java:1251) >> ~[flink-dist_2.12-1.11.1.jar:1.11.1] >> at >> org.apache.flink.runtime.taskexecutor.TaskExecutor.lambda$startRegistrationTimeout$18(TaskExecutor.java:1237) >> ~[flink-dist_2.12-1.11.1.jar:1.11.1] >> at >> org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRunAsync(AkkaRpcActor.java:402) >> ~[flink-dist_2.12-1.11.1.jar:1.11.1] >> at >> org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcMessage(AkkaRpcActor.java:195) >> ~[flink-dist_2.12-1.11.1.jar:1.11.1] >> at >> org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleMessage(AkkaRpcActor.java:152) >> ~[flink-dist_2.12-1.11.1.jar:1.11.1] >> at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:26) >> [flink-dist_2.12-1.11.1.jar:1.11.1] >> at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:21) >> [flink-dist_2.12-1.11.1.jar:1.11.1] >> at scala.PartialFunction.applyOrElse(PartialFunction.scala:123) >> [flink-dist_2.12-1.11.1.jar:1.11.1] >> at scala.PartialFunction.applyOrElse$(PartialFunction.scala:122) >> [flink-dist_2.12-1.11.1.jar:1.11.1] >> at akka.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:21) >> [flink-dist_2.12-1.11.1.jar:1.11.1] >> 
at >> scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171) >> [flink-dist_2.12-1.11.1.jar:1.11.1] >> at >> scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172) >> [flink-dist_2.12-1.11.1.jar:1.11.1] >> at >> scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172) >> [flink-dist_2.12-1.11.1.jar:1.11.1] >> at akka.actor.Actor.aroundReceive(Actor.scala:517) >> [flink-dist_2.12-1.11.1.jar:1.11.1] >> at akka.actor.Actor.aroundReceive$(Actor.scala:515) >> [flink-dist_2.12-1.11.1.jar:1.11.1] >> at akka.actor.AbstractActor.aroundReceive(AbstractActor.scala:225) >> [flink-dist_2.12-1.11.1.jar:1.11.1] >> at akka.actor.ActorCell.receiveMessage(ActorCell.scala:592) >> [flink-dist_2.12-1.11.1.jar:1.11.1] >> at akka.actor.ActorCell.invoke(ActorCell.scala:561) >> [flink-dist_2.12-1.11.1.jar:1.11.1] >> at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:258) >> [flink-dist_2.12-1.11.1.jar:1.11.1] >> at akka.dispatch.Mailbox.run(Mailbox.scala:225) >> [flink-dist_2.12-1.11.1.jar:1.11.1] >> at akka.dispatch.Mailbox.exec(Mailbox.scala:235) >> [flink-dist_2.12-1.11.1.jar:1.11.1] >> at >> akka.dispatch.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260) >> [flink-dist_2.12-1.11.1.jar:1.11.1] >> at >> akka.dispatch.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339) >> [flink-dist_2.12-1.11.1.jar:1.11.1] >> at >> akka.dispatch.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979) >> [flink-dist_2.12-1.11.1.jar:1.11.1] >> at >> akka.dispatch.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107) >> [flink-dist_2.12-1.11.1.jar:1.11.1] >> 2020-09-16 09:14:39,077 ERROR >> org.apache.flink.runtime.taskexecutor.TaskManagerRunner [] - Fatal >> error occurred while executing the TaskManager. Shutting it down... >> org.apache.flink.runtime.taskexecutor.exceptions.RegistrationTimeoutException: >> Could not register at the ResourceManager within the specified maximum >> registration duration 300000 ms. 
This indicates a problem with this >> instance. Terminating now. >> at >> org.apache.flink.runtime.taskexecutor.TaskExecutor.registrationTimeout(TaskExecutor.java:1251) >> ~[flink-dist_2.12-1.11.1.jar:1.11.1] >> at >> org.apache.flink.runtime.taskexecutor.TaskExecutor.lambda$startRegistrationTimeout$18(TaskExecutor.java:1237) >> ~[flink-dist_2.12-1.11.1.jar:1.11.1] >> at >> org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRunAsync(AkkaRpcActor.java:402) >> ~ >> >> >> Jobmanger 异常 >> >> 0d5f8478a2ab4e17d816810752f669eb) switched from SCHEDULED to FAILED on not >> deployed. >> org.apache.flink.runtime.jobmanager.scheduler.NoResourceAvailableException: >> Could not allocate the required slot within slot request timeout. Please >> make sure that the cluster has enough resources. >> at >> org.apache.flink.runtime.scheduler.DefaultScheduler.maybeWrapWithNoResourceAvailableException(DefaultScheduler.java:441) >> ~[flink-dist_2.12-1.11.1.jar:1.11.1] >> at >> org.apache.flink.runtime.scheduler.DefaultScheduler.lambda$assignResourceOrHandleError$6(DefaultScheduler.java:422) >> ~[flink-dist_2.12-1.11.1.jar:1.11.1] >> at >> java.util.concurrent.CompletableFuture.uniHandle(CompletableFuture.java:836) >> ~[?:1.8.0_265] >> at >> java.util.concurrent.CompletableFuture$UniHandle.tryFire(CompletableFuture.java:811) >> ~[?:1.8.0_265] >> at >> java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488) >> ~[?:1.8.0_265] >> at >> java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:1990) >> ~[?:1.8.0_265] >> at >> org.apache.flink.runtime.jobmaster.slotpool.SchedulerImpl.lambda$internalAllocateSlot$0(SchedulerImpl.java:168) >> ~[flink-dist_2.12-1.11.1.jar:1.11.1] >> at >> java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:774) >> ~[?:1.8.0_265] >> at >> java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:750) >> ~[?:1.8.0_265] >> at >> 
java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488) >> ~[?:1.8.0_265] >> at >> java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:1990) >> ~[?:1.8.0_265] >> at >> org.apache.flink.runtime.jobmaster.slotpool.SlotSharingManager$SingleTaskSlot.release(SlotSharingManager.java:726) >> ~[flink-dist_2.12-1.11.1.jar:1.11.1] >> at >> org.apache.flink.runtime.jobmaster.slotpool.SlotSharingManager$MultiTaskSlot.release(SlotSharingManager.java:537) >> ~[flink-dist_2.12-1.11.1.jar:1.11.1] >> at >> org.apache.flink.runtime.jobmaster.slotpool.SlotSharingManager$MultiTaskSlot.lambda$new$0(SlotSharingManager.java:432) >> ~[flink-dist_2.12-1.11.1.jar:1.11.1] >> at >> java.util.concurrent.CompletableFuture.uniHandle(CompletableFuture.java:836) >> ~[?:1.8.0_265] >> at >> java.util.concurrent.CompletableFuture$UniHandle.tryFire(CompletableFuture.java:811) >> ~[?:1.8.0_265] >> at >> java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488) >> ~[?:1.8.0_265] >> at >> java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:1990) >> ~[?:1.8.0_265] >> at >> org.apache.flink.runtime.concurrent.FutureUtils.lambda$forwardTo$21(FutureUtils.java:1120) >> ~[flink-dist_2.12-1.11.1.jar:1.11.1] >> at >> java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:774) >> ~[?:1.8.0_265] >> at >> java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:750) >> ~[?:1.8.0_265] >> at >> java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488) >> ~[?:1.8.0_265] >> at >> java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:1990) >> ~[?:1.8.0_265] >> at >> org.apache.flink.runtime.concurrent.FutureUtils$Timeout.run(FutureUtils.java:1036) >> ~[flink-dist_2.12-1.11.1.jar:1.11.1] >> at >> org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRunAsync(AkkaRpcActor.java:402) >> 
~[flink-dist_2.12-1.11.1.jar:1.11.1] >> at >> org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcMessage(AkkaRpcActor.java:195) >> ~[flink-dist_2.12-1.11.1.jar:1.11.1] >> at >> org.apache.flink.runtime.rpc.akka.FencedAkkaRpcActor.handleRpcMessage(FencedAkkaRpcActor.java:74) >> ~[flink-dist_2.12-1.11.1.jar:1.11.1] >> at >> org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleMessage(AkkaRpcActor.java:152) >> ~[flink-dist_2.12-1.11.1.jar:1.11.1] >> at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:26) >> [flink-dist_2.12-1.11.1.jar:1.11.1] >> at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:21) >> [flink-dist_2.12-1.11.1.jar:1.11.1] >> at scala.PartialFunction.applyOrElse(PartialFunction.scala:123) >> [flink-dist_2.12-1.11.1.jar:1.11.1] >> at scala.PartialFunction.applyOrElse$(PartialFunction.scala:122) >> [flink-dist_2.12-1.11.1.jar:1.11.1] >> at akka.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:21) >> [flink-dist_2.12-1.11.1.jar:1.11.1] >> at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171) >> [flink-dist_2.12-1.11.1.jar:1.11.1] >> at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172) >> [flink-dist_2.12-1.11.1.jar:1.11.1] >> at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172) >> [flink-dist_2.12-1.11.1.jar:1.11.1] >> at akka.actor.Actor.aroundReceive(Actor.scala:517) >> [flink-dist_2.12-1.11.1.jar:1.11.1] >> at akka.actor.Actor.aroundReceive$(Actor.scala:515) >> [flink-dist_2.12-1.11.1.jar:1.11.1] >> at akka.actor.AbstractActor.aroundReceive(AbstractActor.scala:225) >> [flink-dist_2.12-1.11.1.jar:1.11.1] >> at akka.actor.ActorCell.receiveMessage(ActorCell.scala:592) >> [flink-dist_2.12-1.11.1.jar:1.11.1] >> at akka.actor.ActorCell.invoke(ActorCell.scala:561) >> [flink-dist_2.12-1.11.1.jar:1.11.1] >> at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:258) >> [flink-dist_2.12-1.11.1.jar:1.11.1] >> at akka.dispatch.Mailbox.run(Mailbox.scala:225) >> 
[flink-dist_2.12-1.11.1.jar:1.11.1] >> at akka.dispatch.Mailbox.exec(Mailbox.scala:235) >> [flink-dist_2.12-1.11.1.jar:1.11.1] >> at akka.dispatch.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260) >> [flink-dist_2.12-1.11.1.jar:1.11.1] >> at >> akka.dispatch.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339) >> [flink-dist_2.12-1.11.1.jar:1.11.1] >> at >> akka.dispatch.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979) >> [flink-dist_2.12-1.11.1.jar:1.11.1] >> at >> akka.dispatch.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107) >> [flink-dist_2.12-1.11.1.jar:1.11.1] >> Caused by: java.util.concurrent.CompletionException: >> java.util.concurrent.TimeoutException >> at >> java.util.concurrent.CompletableFuture.encodeThrowable(CompletableFuture.java:292) >> ~[?:1.8.0_265] >> at >> java.util.concurrent.CompletableFuture.completeThrowable(CompletableFuture.java:308) >> ~[?:1.8.0_265] >> at >> java.util.concurrent.CompletableFuture.uniApply(CompletableFuture.java:607) >> ~[?:1.8.0_265] >> at >> java.util.concurrent.CompletableFuture$UniApply.tryFire(CompletableFuture.java:591) >> ~[?:1.8.0_265] >> ... 27 more >> Caused by: java.util.concurrent.TimeoutException >> ... 25 more >> 2020-09-16 09:14:35,603 INFO >> org.apache.flink.runtime.executiongraph.failover.flip1.RestartPipelinedRegionFailoverStrategy >> [] - Calculating tasks to restart to recover the failed task >> cbc357ccb763df2852fee8c4fc7d55f2_0. >> 2020-09-16 09:14:35,604 INFO >> org.apache.flink.runtime.executiongraph.failover.flip1.RestartPipelinedRegionFailoverStrategy >> [] - 7 tasks should be restarted to recover the failed task >> cbc357ccb763df2852fee8c4fc7d55f2_0. >> 2020-09-16 09:14:35,606 INFO >> org.apache.flink.runtime.executiongraph.ExecutionGraph [] - Job >> Dragonsnake Indicator Accumulator Job (c60f4f384f904467e4c54416d634852f) >> switched from state RUNNING to FAILING. 
>> org.apache.flink.runtime.JobException: Recovery is suppressed by >> NoRestartBackoffTimeStrategy >> at >> org.apache.flink.runtime.executiongraph.failover.flip1.ExecutionFailureHandler.handleFailure(ExecutionFailureHandler.java:116) >> ~[flink-dist_2.12-1.11.1.jar:1.11.1] >> at org.apache.flink.runtime.executiongraph.failover.flip1. >> >> >> |
从你发的报错栈来看,TM 是用 IP 地址去连接 JM 的;正常情况下,如果是非 HA 部署,应该是通过 service 来连接的,
因为JM在非HA情况下rpc地址是bind到service上的 你是否有对Flink的代码做修改呢,或者用native模式起来以后,修改过ConfigMap等 Best, Yang yanzhibo <[hidden email]> 于2020年9月17日周四 下午3:55写道: > 是非ha,所有tm都注册不上来,但是在tm的pod中 根据service 是可以ping 通 jobmanager的 > > > > 2020年9月17日 上午11:10,Yang Wang <[hidden email]> 写道: > > > > 你这个报错看着是TM向JM注册超时了,使用的HA还是非HA部署呢 > > > > 如果是HA的话,TM是直接使用JM的Pod ip进行通信的,这个时候需要登录pod确认一下网络是否是通的 > > 如果是非HA的话,TM是使用service来向JM注册,你需要检查一下K8s的kube proxy是否正常 > > > > 另外,是所有TM都注册不上来,还是只有个别的。这个也可以排除网络问题 > > > > > > Best, > > Yang > > > > yanzhibo <[hidden email]> 于2020年9月16日周三 下午5:25写道: > > > >> 一个job manager pod 提交job后,申请taskmanager失败 > >> > >> > >> Taskmanager 的异常 > >> > >> Fatal error occurred in TaskExecutor akka.tcp:// > >> flink@179.10.251.70:6122/user/rpc/taskmanager_0. > >> > org.apache.flink.runtime.taskexecutor.exceptions.RegistrationTimeoutException: > >> Could not register at the ResourceManager within the specified maximum > >> registration duration 300000 ms. This indicates a problem with this > >> instance. Terminating now. 
> >> at > >> > org.apache.flink.runtime.taskexecutor.TaskExecutor.registrationTimeout(TaskExecutor.java:1251) > >> ~[flink-dist_2.12-1.11.1.jar:1.11.1] > >> at > >> > org.apache.flink.runtime.taskexecutor.TaskExecutor.lambda$startRegistrationTimeout$18(TaskExecutor.java:1237) > >> ~[flink-dist_2.12-1.11.1.jar:1.11.1] > >> at > >> > org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRunAsync(AkkaRpcActor.java:402) > >> ~[flink-dist_2.12-1.11.1.jar:1.11.1] > >> at > >> > org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcMessage(AkkaRpcActor.java:195) > >> ~[flink-dist_2.12-1.11.1.jar:1.11.1] > >> at > >> > org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleMessage(AkkaRpcActor.java:152) > >> ~[flink-dist_2.12-1.11.1.jar:1.11.1] > >> at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:26) > >> [flink-dist_2.12-1.11.1.jar:1.11.1] > >> at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:21) > >> [flink-dist_2.12-1.11.1.jar:1.11.1] > >> at scala.PartialFunction.applyOrElse(PartialFunction.scala:123) > >> [flink-dist_2.12-1.11.1.jar:1.11.1] > >> at scala.PartialFunction.applyOrElse$(PartialFunction.scala:122) > >> [flink-dist_2.12-1.11.1.jar:1.11.1] > >> at akka.japi.pf > .UnitCaseStatement.applyOrElse(CaseStatements.scala:21) > >> [flink-dist_2.12-1.11.1.jar:1.11.1] > >> at > >> scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171) > >> [flink-dist_2.12-1.11.1.jar:1.11.1] > >> at > >> scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172) > >> [flink-dist_2.12-1.11.1.jar:1.11.1] > >> at > >> scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172) > >> [flink-dist_2.12-1.11.1.jar:1.11.1] > >> at akka.actor.Actor.aroundReceive(Actor.scala:517) > >> [flink-dist_2.12-1.11.1.jar:1.11.1] > >> at akka.actor.Actor.aroundReceive$(Actor.scala:515) > >> [flink-dist_2.12-1.11.1.jar:1.11.1] > >> at > akka.actor.AbstractActor.aroundReceive(AbstractActor.scala:225) > >> [flink-dist_2.12-1.11.1.jar:1.11.1] > >> at 
akka.actor.ActorCell.receiveMessage(ActorCell.scala:592) > >> [flink-dist_2.12-1.11.1.jar:1.11.1] > >> at akka.actor.ActorCell.invoke(ActorCell.scala:561) > >> [flink-dist_2.12-1.11.1.jar:1.11.1] > >> at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:258) > >> [flink-dist_2.12-1.11.1.jar:1.11.1] > >> at akka.dispatch.Mailbox.run(Mailbox.scala:225) > >> [flink-dist_2.12-1.11.1.jar:1.11.1] > >> at akka.dispatch.Mailbox.exec(Mailbox.scala:235) > >> [flink-dist_2.12-1.11.1.jar:1.11.1] > >> at > >> akka.dispatch.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260) > >> [flink-dist_2.12-1.11.1.jar:1.11.1] > >> at > >> > akka.dispatch.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339) > >> [flink-dist_2.12-1.11.1.jar:1.11.1] > >> at > >> akka.dispatch.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979) > >> [flink-dist_2.12-1.11.1.jar:1.11.1] > >> at > >> > akka.dispatch.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107) > >> [flink-dist_2.12-1.11.1.jar:1.11.1] > >> 2020-09-16 09:14:39,077 ERROR > >> org.apache.flink.runtime.taskexecutor.TaskManagerRunner [] - Fatal > >> error occurred while executing the TaskManager. Shutting it down... > >> > org.apache.flink.runtime.taskexecutor.exceptions.RegistrationTimeoutException: > >> Could not register at the ResourceManager within the specified maximum > >> registration duration 300000 ms. This indicates a problem with this > >> instance. Terminating now. 
> >> at > >> > org.apache.flink.runtime.taskexecutor.TaskExecutor.registrationTimeout(TaskExecutor.java:1251) > >> ~[flink-dist_2.12-1.11.1.jar:1.11.1] > >> at > >> > org.apache.flink.runtime.taskexecutor.TaskExecutor.lambda$startRegistrationTimeout$18(TaskExecutor.java:1237) > >> ~[flink-dist_2.12-1.11.1.jar:1.11.1] > >> at > >> > org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRunAsync(AkkaRpcActor.java:402) > >> ~ > >> > >> > >> Jobmanger 异常 > >> > >> 0d5f8478a2ab4e17d816810752f669eb) switched from SCHEDULED to FAILED on > not > >> deployed. > >> > org.apache.flink.runtime.jobmanager.scheduler.NoResourceAvailableException: > >> Could not allocate the required slot within slot request timeout. Please > >> make sure that the cluster has enough resources. > >> at > >> > org.apache.flink.runtime.scheduler.DefaultScheduler.maybeWrapWithNoResourceAvailableException(DefaultScheduler.java:441) > >> ~[flink-dist_2.12-1.11.1.jar:1.11.1] > >> at > >> > org.apache.flink.runtime.scheduler.DefaultScheduler.lambda$assignResourceOrHandleError$6(DefaultScheduler.java:422) > >> ~[flink-dist_2.12-1.11.1.jar:1.11.1] > >> at > >> > java.util.concurrent.CompletableFuture.uniHandle(CompletableFuture.java:836) > >> ~[?:1.8.0_265] > >> at > >> > java.util.concurrent.CompletableFuture$UniHandle.tryFire(CompletableFuture.java:811) > >> ~[?:1.8.0_265] > >> at > >> > java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488) > >> ~[?:1.8.0_265] > >> at > >> > java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:1990) > >> ~[?:1.8.0_265] > >> at > >> > org.apache.flink.runtime.jobmaster.slotpool.SchedulerImpl.lambda$internalAllocateSlot$0(SchedulerImpl.java:168) > >> ~[flink-dist_2.12-1.11.1.jar:1.11.1] > >> at > >> > java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:774) > >> ~[?:1.8.0_265] > >> at > >> > java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:750) > >> 
~[?:1.8.0_265] > >> at > >> > java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488) > >> ~[?:1.8.0_265] > >> at > >> > java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:1990) > >> ~[?:1.8.0_265] > >> at > >> > org.apache.flink.runtime.jobmaster.slotpool.SlotSharingManager$SingleTaskSlot.release(SlotSharingManager.java:726) > >> ~[flink-dist_2.12-1.11.1.jar:1.11.1] > >> at > >> > org.apache.flink.runtime.jobmaster.slotpool.SlotSharingManager$MultiTaskSlot.release(SlotSharingManager.java:537) > >> ~[flink-dist_2.12-1.11.1.jar:1.11.1] > >> at > >> > org.apache.flink.runtime.jobmaster.slotpool.SlotSharingManager$MultiTaskSlot.lambda$new$0(SlotSharingManager.java:432) > >> ~[flink-dist_2.12-1.11.1.jar:1.11.1] > >> at > >> > java.util.concurrent.CompletableFuture.uniHandle(CompletableFuture.java:836) > >> ~[?:1.8.0_265] > >> at > >> > java.util.concurrent.CompletableFuture$UniHandle.tryFire(CompletableFuture.java:811) > >> ~[?:1.8.0_265] > >> at > >> > java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488) > >> ~[?:1.8.0_265] > >> at > >> > java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:1990) > >> ~[?:1.8.0_265] > >> at > >> > org.apache.flink.runtime.concurrent.FutureUtils.lambda$forwardTo$21(FutureUtils.java:1120) > >> ~[flink-dist_2.12-1.11.1.jar:1.11.1] > >> at > >> > java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:774) > >> ~[?:1.8.0_265] > >> at > >> > java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:750) > >> ~[?:1.8.0_265] > >> at > >> > java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488) > >> ~[?:1.8.0_265] > >> at > >> > java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:1990) > >> ~[?:1.8.0_265] > >> at > >> > org.apache.flink.runtime.concurrent.FutureUtils$Timeout.run(FutureUtils.java:1036) > >> 
~[flink-dist_2.12-1.11.1.jar:1.11.1] > >> at > >> > org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRunAsync(AkkaRpcActor.java:402) > >> ~[flink-dist_2.12-1.11.1.jar:1.11.1] > >> at > >> > org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcMessage(AkkaRpcActor.java:195) > >> ~[flink-dist_2.12-1.11.1.jar:1.11.1] > >> at > >> > org.apache.flink.runtime.rpc.akka.FencedAkkaRpcActor.handleRpcMessage(FencedAkkaRpcActor.java:74) > >> ~[flink-dist_2.12-1.11.1.jar:1.11.1] > >> at > >> > org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleMessage(AkkaRpcActor.java:152) > >> ~[flink-dist_2.12-1.11.1.jar:1.11.1] > >> at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:26) > >> [flink-dist_2.12-1.11.1.jar:1.11.1] > >> at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:21) > >> [flink-dist_2.12-1.11.1.jar:1.11.1] > >> at scala.PartialFunction.applyOrElse(PartialFunction.scala:123) > >> [flink-dist_2.12-1.11.1.jar:1.11.1] > >> at scala.PartialFunction.applyOrElse$(PartialFunction.scala:122) > >> [flink-dist_2.12-1.11.1.jar:1.11.1] > >> at akka.japi.pf > .UnitCaseStatement.applyOrElse(CaseStatements.scala:21) > >> [flink-dist_2.12-1.11.1.jar:1.11.1] > >> at > scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171) > >> [flink-dist_2.12-1.11.1.jar:1.11.1] > >> at > scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172) > >> [flink-dist_2.12-1.11.1.jar:1.11.1] > >> at > scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172) > >> [flink-dist_2.12-1.11.1.jar:1.11.1] > >> at akka.actor.Actor.aroundReceive(Actor.scala:517) > >> [flink-dist_2.12-1.11.1.jar:1.11.1] > >> at akka.actor.Actor.aroundReceive$(Actor.scala:515) > >> [flink-dist_2.12-1.11.1.jar:1.11.1] > >> at akka.actor.AbstractActor.aroundReceive(AbstractActor.scala:225) > >> [flink-dist_2.12-1.11.1.jar:1.11.1] > >> at akka.actor.ActorCell.receiveMessage(ActorCell.scala:592) > >> [flink-dist_2.12-1.11.1.jar:1.11.1] > >> at 
akka.actor.ActorCell.invoke(ActorCell.scala:561) > >> [flink-dist_2.12-1.11.1.jar:1.11.1] > >> at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:258) > >> [flink-dist_2.12-1.11.1.jar:1.11.1] > >> at akka.dispatch.Mailbox.run(Mailbox.scala:225) > >> [flink-dist_2.12-1.11.1.jar:1.11.1] > >> at akka.dispatch.Mailbox.exec(Mailbox.scala:235) > >> [flink-dist_2.12-1.11.1.jar:1.11.1] > >> at akka.dispatch.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260) > >> [flink-dist_2.12-1.11.1.jar:1.11.1] > >> at > >> > akka.dispatch.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339) > >> [flink-dist_2.12-1.11.1.jar:1.11.1] > >> at > >> akka.dispatch.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979) > >> [flink-dist_2.12-1.11.1.jar:1.11.1] > >> at > >> > akka.dispatch.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107) > >> [flink-dist_2.12-1.11.1.jar:1.11.1] > >> Caused by: java.util.concurrent.CompletionException: > >> java.util.concurrent.TimeoutException > >> at > >> > java.util.concurrent.CompletableFuture.encodeThrowable(CompletableFuture.java:292) > >> ~[?:1.8.0_265] > >> at > >> > java.util.concurrent.CompletableFuture.completeThrowable(CompletableFuture.java:308) > >> ~[?:1.8.0_265] > >> at > >> > java.util.concurrent.CompletableFuture.uniApply(CompletableFuture.java:607) > >> ~[?:1.8.0_265] > >> at > >> > java.util.concurrent.CompletableFuture$UniApply.tryFire(CompletableFuture.java:591) > >> ~[?:1.8.0_265] > >> ... 27 more > >> Caused by: java.util.concurrent.TimeoutException > >> ... 25 more > >> 2020-09-16 09:14:35,603 INFO > >> > org.apache.flink.runtime.executiongraph.failover.flip1.RestartPipelinedRegionFailoverStrategy > >> [] - Calculating tasks to restart to recover the failed task > >> cbc357ccb763df2852fee8c4fc7d55f2_0. 
> >> 2020-09-16 09:14:35,604 INFO > >> > org.apache.flink.runtime.executiongraph.failover.flip1.RestartPipelinedRegionFailoverStrategy > >> [] - 7 tasks should be restarted to recover the failed task > >> cbc357ccb763df2852fee8c4fc7d55f2_0. > >> 2020-09-16 09:14:35,606 INFO > >> org.apache.flink.runtime.executiongraph.ExecutionGraph [] - Job > >> Dragonsnake Indicator Accumulator Job (c60f4f384f904467e4c54416d634852f) > >> switched from state RUNNING to FAILING. > >> org.apache.flink.runtime.JobException: Recovery is suppressed by > >> NoRestartBackoffTimeStrategy > >> at > >> > org.apache.flink.runtime.executiongraph.failover.flip1.ExecutionFailureHandler.handleFailure(ExecutionFailureHandler.java:116) > >> ~[flink-dist_2.12-1.11.1.jar:1.11.1] > >> at org.apache.flink.runtime.executiongraph.failover.flip1. > >> > >> > >> > > |
In reply to this post by yanzhibo
跟jobmanager在同一个node上的tm是可以注册到jm上的,其他的node是不行的 Jm是单机模式部署 > 2020年9月17日 下午3:55,yanzhibo <[hidden email]> 写道: > > 是非ha,所有tm都注册不上来,但是在tm的pod中 根据service 是可以ping 通 jobmanager的 > > >> 2020年9月17日 上午11:10,Yang Wang <[hidden email]> 写道: >> >> 你这个报错看着是TM向JM注册超时了,使用的HA还是非HA部署呢 >> >> 如果是HA的话,TM是直接使用JM的Pod ip进行通信的,这个时候需要登录pod确认一下网络是否是通的 >> 如果是非HA的话,TM是使用service来向JM注册,你需要检查一下K8s的kube proxy是否正常 >> >> 另外,是所有TM都注册不上来,还是只有个别的。这个也可以排除网络问题 >> >> >> Best, >> Yang >> >> yanzhibo <[hidden email]> 于2020年9月16日周三 下午5:25写道: >> >>> 一个job manager pod 提交job后,申请taskmanager失败 >>> >>> >>> Taskmanager 的异常 >>> >>> Fatal error occurred in TaskExecutor akka.tcp:// >>> flink@179.10.251.70:6122/user/rpc/taskmanager_0. >>> org.apache.flink.runtime.taskexecutor.exceptions.RegistrationTimeoutException: >>> Could not register at the ResourceManager within the specified maximum >>> registration duration 300000 ms. This indicates a problem with this >>> instance. Terminating now. >>> at >>> org.apache.flink.runtime.taskexecutor.TaskExecutor.registrationTimeout(TaskExecutor.java:1251) >>> ~[flink-dist_2.12-1.11.1.jar:1.11.1] >>> at >>> org.apache.flink.runtime.taskexecutor.TaskExecutor.lambda$startRegistrationTimeout$18(TaskExecutor.java:1237) >>> ~[flink-dist_2.12-1.11.1.jar:1.11.1] >>> at >>> org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRunAsync(AkkaRpcActor.java:402) >>> ~[flink-dist_2.12-1.11.1.jar:1.11.1] >>> at >>> org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcMessage(AkkaRpcActor.java:195) >>> ~[flink-dist_2.12-1.11.1.jar:1.11.1] >>> at >>> org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleMessage(AkkaRpcActor.java:152) >>> ~[flink-dist_2.12-1.11.1.jar:1.11.1] >>> at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:26) >>> [flink-dist_2.12-1.11.1.jar:1.11.1] >>> at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:21) >>> [flink-dist_2.12-1.11.1.jar:1.11.1] >>> at scala.PartialFunction.applyOrElse(PartialFunction.scala:123) >>> [flink-dist_2.12-1.11.1.jar:1.11.1] >>> 
at scala.PartialFunction.applyOrElse$(PartialFunction.scala:122) >>> [flink-dist_2.12-1.11.1.jar:1.11.1] >>> at akka.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:21) >>> [flink-dist_2.12-1.11.1.jar:1.11.1] >>> at >>> scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171) >>> [flink-dist_2.12-1.11.1.jar:1.11.1] >>> at >>> scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172) >>> [flink-dist_2.12-1.11.1.jar:1.11.1] >>> at >>> scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172) >>> [flink-dist_2.12-1.11.1.jar:1.11.1] >>> at akka.actor.Actor.aroundReceive(Actor.scala:517) >>> [flink-dist_2.12-1.11.1.jar:1.11.1] >>> at akka.actor.Actor.aroundReceive$(Actor.scala:515) >>> [flink-dist_2.12-1.11.1.jar:1.11.1] >>> at akka.actor.AbstractActor.aroundReceive(AbstractActor.scala:225) >>> [flink-dist_2.12-1.11.1.jar:1.11.1] >>> at akka.actor.ActorCell.receiveMessage(ActorCell.scala:592) >>> [flink-dist_2.12-1.11.1.jar:1.11.1] >>> at akka.actor.ActorCell.invoke(ActorCell.scala:561) >>> [flink-dist_2.12-1.11.1.jar:1.11.1] >>> at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:258) >>> [flink-dist_2.12-1.11.1.jar:1.11.1] >>> at akka.dispatch.Mailbox.run(Mailbox.scala:225) >>> [flink-dist_2.12-1.11.1.jar:1.11.1] >>> at akka.dispatch.Mailbox.exec(Mailbox.scala:235) >>> [flink-dist_2.12-1.11.1.jar:1.11.1] >>> at >>> akka.dispatch.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260) >>> [flink-dist_2.12-1.11.1.jar:1.11.1] >>> at >>> akka.dispatch.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339) >>> [flink-dist_2.12-1.11.1.jar:1.11.1] >>> at >>> akka.dispatch.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979) >>> [flink-dist_2.12-1.11.1.jar:1.11.1] >>> at >>> akka.dispatch.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107) >>> [flink-dist_2.12-1.11.1.jar:1.11.1] >>> 2020-09-16 09:14:39,077 ERROR >>> org.apache.flink.runtime.taskexecutor.TaskManagerRunner [] - Fatal >>> error occurred while 
executing the TaskManager. Shutting it down... >>> org.apache.flink.runtime.taskexecutor.exceptions.RegistrationTimeoutException: >>> Could not register at the ResourceManager within the specified maximum >>> registration duration 300000 ms. This indicates a problem with this >>> instance. Terminating now. >>> at >>> org.apache.flink.runtime.taskexecutor.TaskExecutor.registrationTimeout(TaskExecutor.java:1251) >>> ~[flink-dist_2.12-1.11.1.jar:1.11.1] >>> at >>> org.apache.flink.runtime.taskexecutor.TaskExecutor.lambda$startRegistrationTimeout$18(TaskExecutor.java:1237) >>> ~[flink-dist_2.12-1.11.1.jar:1.11.1] >>> at >>> org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRunAsync(AkkaRpcActor.java:402) >>> ~ >>> >>> >>> Jobmanger 异常 >>> >>> 0d5f8478a2ab4e17d816810752f669eb) switched from SCHEDULED to FAILED on not >>> deployed. >>> org.apache.flink.runtime.jobmanager.scheduler.NoResourceAvailableException: >>> Could not allocate the required slot within slot request timeout. Please >>> make sure that the cluster has enough resources. 
>>> at >>> org.apache.flink.runtime.scheduler.DefaultScheduler.maybeWrapWithNoResourceAvailableException(DefaultScheduler.java:441) >>> ~[flink-dist_2.12-1.11.1.jar:1.11.1] >>> at >>> org.apache.flink.runtime.scheduler.DefaultScheduler.lambda$assignResourceOrHandleError$6(DefaultScheduler.java:422) >>> ~[flink-dist_2.12-1.11.1.jar:1.11.1] >>> at >>> java.util.concurrent.CompletableFuture.uniHandle(CompletableFuture.java:836) >>> ~[?:1.8.0_265] >>> at >>> java.util.concurrent.CompletableFuture$UniHandle.tryFire(CompletableFuture.java:811) >>> ~[?:1.8.0_265] >>> at >>> java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488) >>> ~[?:1.8.0_265] >>> at >>> java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:1990) >>> ~[?:1.8.0_265] >>> at >>> org.apache.flink.runtime.jobmaster.slotpool.SchedulerImpl.lambda$internalAllocateSlot$0(SchedulerImpl.java:168) >>> ~[flink-dist_2.12-1.11.1.jar:1.11.1] >>> at >>> java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:774) >>> ~[?:1.8.0_265] >>> at >>> java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:750) >>> ~[?:1.8.0_265] >>> at >>> java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488) >>> ~[?:1.8.0_265] >>> at >>> java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:1990) >>> ~[?:1.8.0_265] >>> at >>> org.apache.flink.runtime.jobmaster.slotpool.SlotSharingManager$SingleTaskSlot.release(SlotSharingManager.java:726) >>> ~[flink-dist_2.12-1.11.1.jar:1.11.1] >>> at >>> org.apache.flink.runtime.jobmaster.slotpool.SlotSharingManager$MultiTaskSlot.release(SlotSharingManager.java:537) >>> ~[flink-dist_2.12-1.11.1.jar:1.11.1] >>> at >>> org.apache.flink.runtime.jobmaster.slotpool.SlotSharingManager$MultiTaskSlot.lambda$new$0(SlotSharingManager.java:432) >>> ~[flink-dist_2.12-1.11.1.jar:1.11.1] >>> at >>> 
java.util.concurrent.CompletableFuture.uniHandle(CompletableFuture.java:836) >>> ~[?:1.8.0_265] >>> at >>> java.util.concurrent.CompletableFuture$UniHandle.tryFire(CompletableFuture.java:811) >>> ~[?:1.8.0_265] >>> at >>> java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488) >>> ~[?:1.8.0_265] >>> at >>> java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:1990) >>> ~[?:1.8.0_265] >>> at >>> org.apache.flink.runtime.concurrent.FutureUtils.lambda$forwardTo$21(FutureUtils.java:1120) >>> ~[flink-dist_2.12-1.11.1.jar:1.11.1] >>> at >>> java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:774) >>> ~[?:1.8.0_265] >>> at >>> java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:750) >>> ~[?:1.8.0_265] >>> at >>> java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488) >>> ~[?:1.8.0_265] >>> at >>> java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:1990) >>> ~[?:1.8.0_265] >>> at >>> org.apache.flink.runtime.concurrent.FutureUtils$Timeout.run(FutureUtils.java:1036) >>> ~[flink-dist_2.12-1.11.1.jar:1.11.1] >>> at >>> org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRunAsync(AkkaRpcActor.java:402) >>> ~[flink-dist_2.12-1.11.1.jar:1.11.1] >>> at >>> org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcMessage(AkkaRpcActor.java:195) >>> ~[flink-dist_2.12-1.11.1.jar:1.11.1] >>> at >>> org.apache.flink.runtime.rpc.akka.FencedAkkaRpcActor.handleRpcMessage(FencedAkkaRpcActor.java:74) >>> ~[flink-dist_2.12-1.11.1.jar:1.11.1] >>> at >>> org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleMessage(AkkaRpcActor.java:152) >>> ~[flink-dist_2.12-1.11.1.jar:1.11.1] >>> at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:26) >>> [flink-dist_2.12-1.11.1.jar:1.11.1] >>> at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:21) >>> [flink-dist_2.12-1.11.1.jar:1.11.1] >>> at 
scala.PartialFunction.applyOrElse(PartialFunction.scala:123) >>> [flink-dist_2.12-1.11.1.jar:1.11.1] >>> at scala.PartialFunction.applyOrElse$(PartialFunction.scala:122) >>> [flink-dist_2.12-1.11.1.jar:1.11.1] >>> at akka.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:21) >>> [flink-dist_2.12-1.11.1.jar:1.11.1] >>> at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171) >>> [flink-dist_2.12-1.11.1.jar:1.11.1] >>> at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172) >>> [flink-dist_2.12-1.11.1.jar:1.11.1] >>> at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172) >>> [flink-dist_2.12-1.11.1.jar:1.11.1] >>> at akka.actor.Actor.aroundReceive(Actor.scala:517) >>> [flink-dist_2.12-1.11.1.jar:1.11.1] >>> at akka.actor.Actor.aroundReceive$(Actor.scala:515) >>> [flink-dist_2.12-1.11.1.jar:1.11.1] >>> at akka.actor.AbstractActor.aroundReceive(AbstractActor.scala:225) >>> [flink-dist_2.12-1.11.1.jar:1.11.1] >>> at akka.actor.ActorCell.receiveMessage(ActorCell.scala:592) >>> [flink-dist_2.12-1.11.1.jar:1.11.1] >>> at akka.actor.ActorCell.invoke(ActorCell.scala:561) >>> [flink-dist_2.12-1.11.1.jar:1.11.1] >>> at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:258) >>> [flink-dist_2.12-1.11.1.jar:1.11.1] >>> at akka.dispatch.Mailbox.run(Mailbox.scala:225) >>> [flink-dist_2.12-1.11.1.jar:1.11.1] >>> at akka.dispatch.Mailbox.exec(Mailbox.scala:235) >>> [flink-dist_2.12-1.11.1.jar:1.11.1] >>> at akka.dispatch.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260) >>> [flink-dist_2.12-1.11.1.jar:1.11.1] >>> at >>> akka.dispatch.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339) >>> [flink-dist_2.12-1.11.1.jar:1.11.1] >>> at >>> akka.dispatch.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979) >>> [flink-dist_2.12-1.11.1.jar:1.11.1] >>> at >>> akka.dispatch.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107) >>> [flink-dist_2.12-1.11.1.jar:1.11.1] >>> Caused by: 
java.util.concurrent.CompletionException: >>> java.util.concurrent.TimeoutException >>> at >>> java.util.concurrent.CompletableFuture.encodeThrowable(CompletableFuture.java:292) >>> ~[?:1.8.0_265] >>> at >>> java.util.concurrent.CompletableFuture.completeThrowable(CompletableFuture.java:308) >>> ~[?:1.8.0_265] >>> at >>> java.util.concurrent.CompletableFuture.uniApply(CompletableFuture.java:607) >>> ~[?:1.8.0_265] >>> at >>> java.util.concurrent.CompletableFuture$UniApply.tryFire(CompletableFuture.java:591) >>> ~[?:1.8.0_265] >>> ... 27 more >>> Caused by: java.util.concurrent.TimeoutException >>> ... 25 more >>> 2020-09-16 09:14:35,603 INFO >>> org.apache.flink.runtime.executiongraph.failover.flip1.RestartPipelinedRegionFailoverStrategy >>> [] - Calculating tasks to restart to recover the failed task >>> cbc357ccb763df2852fee8c4fc7d55f2_0. >>> 2020-09-16 09:14:35,604 INFO >>> org.apache.flink.runtime.executiongraph.failover.flip1.RestartPipelinedRegionFailoverStrategy >>> [] - 7 tasks should be restarted to recover the failed task >>> cbc357ccb763df2852fee8c4fc7d55f2_0. >>> 2020-09-16 09:14:35,606 INFO >>> org.apache.flink.runtime.executiongraph.ExecutionGraph [] - Job >>> Dragonsnake Indicator Accumulator Job (c60f4f384f904467e4c54416d634852f) >>> switched from state RUNNING to FAILING. >>> org.apache.flink.runtime.JobException: Recovery is suppressed by >>> NoRestartBackoffTimeStrategy >>> at >>> org.apache.flink.runtime.executiongraph.failover.flip1.ExecutionFailureHandler.handleFailure(ExecutionFailureHandler.java:116) >>> ~[flink-dist_2.12-1.11.1.jar:1.11.1] >>> at org.apache.flink.runtime.executiongraph.failover.flip1. >>> >>> >>> > |
请确认一下挂载给各个 TM 的 ConfigMap 是否都是一样的?因为从你给的 log 来看,应该不是用社区文档里面的 yaml[1] 来运行的。
另外,如果能够把JobManager和TaskManager的log分享一下的话,查问题会更方便一些 [1]. https://ci.apache.org/projects/flink/flink-docs-master/ops/deployment/kubernetes.html Best, Yang yanzhibo <[hidden email]> 于2020年9月18日周五 下午6:57写道: > > 跟jobmanager在同一个node上的tm是可以注册到jm上的,其他的node是不行的 > > Jm是单机模式部署 > > > 2020年9月17日 下午3:55,yanzhibo <[hidden email]> 写道: > > > > 是非ha,所有tm都注册不上来,但是在tm的pod中 根据service 是可以ping 通 jobmanager的 > > > > > >> 2020年9月17日 上午11:10,Yang Wang <[hidden email]> 写道: > >> > >> 你这个报错看着是TM向JM注册超时了,使用的HA还是非HA部署呢 > >> > >> 如果是HA的话,TM是直接使用JM的Pod ip进行通信的,这个时候需要登录pod确认一下网络是否是通的 > >> 如果是非HA的话,TM是使用service来向JM注册,你需要检查一下K8s的kube proxy是否正常 > >> > >> 另外,是所有TM都注册不上来,还是只有个别的。这个也可以排除网络问题 > >> > >> > >> Best, > >> Yang > >> > >> yanzhibo <[hidden email]> 于2020年9月16日周三 下午5:25写道: > >> > >>> 一个job manager pod 提交job后,申请taskmanager失败 > >>> > >>> > >>> Taskmanager 的异常 > >>> > >>> Fatal error occurred in TaskExecutor akka.tcp:// > >>> flink@179.10.251.70:6122/user/rpc/taskmanager_0. > >>> > org.apache.flink.runtime.taskexecutor.exceptions.RegistrationTimeoutException: > >>> Could not register at the ResourceManager within the specified maximum > >>> registration duration 300000 ms. This indicates a problem with this > >>> instance. Terminating now. 
> >>> at > >>> > org.apache.flink.runtime.taskexecutor.TaskExecutor.registrationTimeout(TaskExecutor.java:1251) > >>> ~[flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at > >>> > org.apache.flink.runtime.taskexecutor.TaskExecutor.lambda$startRegistrationTimeout$18(TaskExecutor.java:1237) > >>> ~[flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at > >>> > org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRunAsync(AkkaRpcActor.java:402) > >>> ~[flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at > >>> > org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcMessage(AkkaRpcActor.java:195) > >>> ~[flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at > >>> > org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleMessage(AkkaRpcActor.java:152) > >>> ~[flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:26) > >>> [flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:21) > >>> [flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at scala.PartialFunction.applyOrElse(PartialFunction.scala:123) > >>> [flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at scala.PartialFunction.applyOrElse$(PartialFunction.scala:122) > >>> [flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at akka.japi.pf > .UnitCaseStatement.applyOrElse(CaseStatements.scala:21) > >>> [flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at > >>> scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171) > >>> [flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at > >>> scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172) > >>> [flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at > >>> scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172) > >>> [flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at akka.actor.Actor.aroundReceive(Actor.scala:517) > >>> [flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at akka.actor.Actor.aroundReceive$(Actor.scala:515) > >>> [flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at > akka.actor.AbstractActor.aroundReceive(AbstractActor.scala:225) > >>> 
[flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at akka.actor.ActorCell.receiveMessage(ActorCell.scala:592) > >>> [flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at akka.actor.ActorCell.invoke(ActorCell.scala:561) > >>> [flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:258) > >>> [flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at akka.dispatch.Mailbox.run(Mailbox.scala:225) > >>> [flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at akka.dispatch.Mailbox.exec(Mailbox.scala:235) > >>> [flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at > >>> akka.dispatch.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260) > >>> [flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at > >>> > akka.dispatch.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339) > >>> [flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at > >>> akka.dispatch.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979) > >>> [flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at > >>> > akka.dispatch.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107) > >>> [flink-dist_2.12-1.11.1.jar:1.11.1] > >>> 2020-09-16 09:14:39,077 ERROR > >>> org.apache.flink.runtime.taskexecutor.TaskManagerRunner [] - Fatal > >>> error occurred while executing the TaskManager. Shutting it down... > >>> > org.apache.flink.runtime.taskexecutor.exceptions.RegistrationTimeoutException: > >>> Could not register at the ResourceManager within the specified maximum > >>> registration duration 300000 ms. This indicates a problem with this > >>> instance. Terminating now. 
> >>> at > >>> > org.apache.flink.runtime.taskexecutor.TaskExecutor.registrationTimeout(TaskExecutor.java:1251) > >>> ~[flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at > >>> > org.apache.flink.runtime.taskexecutor.TaskExecutor.lambda$startRegistrationTimeout$18(TaskExecutor.java:1237) > >>> ~[flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at > >>> > org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRunAsync(AkkaRpcActor.java:402) > >>> ~ > >>> > >>> > >>> JobManager 异常 > >>> > >>> 0d5f8478a2ab4e17d816810752f669eb) switched from SCHEDULED to FAILED on > not > >>> deployed. > >>> > org.apache.flink.runtime.jobmanager.scheduler.NoResourceAvailableException: > >>> Could not allocate the required slot within slot request timeout. > Please > >>> make sure that the cluster has enough resources. > >>> at > >>> > org.apache.flink.runtime.scheduler.DefaultScheduler.maybeWrapWithNoResourceAvailableException(DefaultScheduler.java:441) > >>> ~[flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at > >>> > org.apache.flink.runtime.scheduler.DefaultScheduler.lambda$assignResourceOrHandleError$6(DefaultScheduler.java:422) > >>> ~[flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at > >>> > java.util.concurrent.CompletableFuture.uniHandle(CompletableFuture.java:836) > >>> ~[?:1.8.0_265] > >>> at > >>> > java.util.concurrent.CompletableFuture$UniHandle.tryFire(CompletableFuture.java:811) > >>> ~[?:1.8.0_265] > >>> at > >>> > java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488) > >>> ~[?:1.8.0_265] > >>> at > >>> > java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:1990) > >>> ~[?:1.8.0_265] > >>> at > >>> > org.apache.flink.runtime.jobmaster.slotpool.SchedulerImpl.lambda$internalAllocateSlot$0(SchedulerImpl.java:168) > >>> ~[flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at > >>> > java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:774) > >>> ~[?:1.8.0_265] > >>> at > >>> > 
java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:750) > >>> ~[?:1.8.0_265] > >>> at > >>> > java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488) > >>> ~[?:1.8.0_265] > >>> at > >>> > java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:1990) > >>> ~[?:1.8.0_265] > >>> at > >>> > org.apache.flink.runtime.jobmaster.slotpool.SlotSharingManager$SingleTaskSlot.release(SlotSharingManager.java:726) > >>> ~[flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at > >>> > org.apache.flink.runtime.jobmaster.slotpool.SlotSharingManager$MultiTaskSlot.release(SlotSharingManager.java:537) > >>> ~[flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at > >>> > org.apache.flink.runtime.jobmaster.slotpool.SlotSharingManager$MultiTaskSlot.lambda$new$0(SlotSharingManager.java:432) > >>> ~[flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at > >>> > java.util.concurrent.CompletableFuture.uniHandle(CompletableFuture.java:836) > >>> ~[?:1.8.0_265] > >>> at > >>> > java.util.concurrent.CompletableFuture$UniHandle.tryFire(CompletableFuture.java:811) > >>> ~[?:1.8.0_265] > >>> at > >>> > java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488) > >>> ~[?:1.8.0_265] > >>> at > >>> > java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:1990) > >>> ~[?:1.8.0_265] > >>> at > >>> > org.apache.flink.runtime.concurrent.FutureUtils.lambda$forwardTo$21(FutureUtils.java:1120) > >>> ~[flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at > >>> > java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:774) > >>> ~[?:1.8.0_265] > >>> at > >>> > java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:750) > >>> ~[?:1.8.0_265] > >>> at > >>> > java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488) > >>> ~[?:1.8.0_265] > >>> at > >>> > java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:1990) > 
>>> ~[?:1.8.0_265] > >>> at > >>> > org.apache.flink.runtime.concurrent.FutureUtils$Timeout.run(FutureUtils.java:1036) > >>> ~[flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at > >>> > org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRunAsync(AkkaRpcActor.java:402) > >>> ~[flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at > >>> > org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcMessage(AkkaRpcActor.java:195) > >>> ~[flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at > >>> > org.apache.flink.runtime.rpc.akka.FencedAkkaRpcActor.handleRpcMessage(FencedAkkaRpcActor.java:74) > >>> ~[flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at > >>> > org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleMessage(AkkaRpcActor.java:152) > >>> ~[flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:26) > >>> [flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:21) > >>> [flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at scala.PartialFunction.applyOrElse(PartialFunction.scala:123) > >>> [flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at scala.PartialFunction.applyOrElse$(PartialFunction.scala:122) > >>> [flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at akka.japi.pf > .UnitCaseStatement.applyOrElse(CaseStatements.scala:21) > >>> [flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at > scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171) > >>> [flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at > scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172) > >>> [flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at > scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172) > >>> [flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at akka.actor.Actor.aroundReceive(Actor.scala:517) > >>> [flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at akka.actor.Actor.aroundReceive$(Actor.scala:515) > >>> [flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at akka.actor.AbstractActor.aroundReceive(AbstractActor.scala:225) > >>> 
[flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at akka.actor.ActorCell.receiveMessage(ActorCell.scala:592) > >>> [flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at akka.actor.ActorCell.invoke(ActorCell.scala:561) > >>> [flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:258) > >>> [flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at akka.dispatch.Mailbox.run(Mailbox.scala:225) > >>> [flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at akka.dispatch.Mailbox.exec(Mailbox.scala:235) > >>> [flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at akka.dispatch.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260) > >>> [flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at > >>> > akka.dispatch.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339) > >>> [flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at > >>> akka.dispatch.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979) > >>> [flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at > >>> > akka.dispatch.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107) > >>> [flink-dist_2.12-1.11.1.jar:1.11.1] > >>> Caused by: java.util.concurrent.CompletionException: > >>> java.util.concurrent.TimeoutException > >>> at > >>> > java.util.concurrent.CompletableFuture.encodeThrowable(CompletableFuture.java:292) > >>> ~[?:1.8.0_265] > >>> at > >>> > java.util.concurrent.CompletableFuture.completeThrowable(CompletableFuture.java:308) > >>> ~[?:1.8.0_265] > >>> at > >>> > java.util.concurrent.CompletableFuture.uniApply(CompletableFuture.java:607) > >>> ~[?:1.8.0_265] > >>> at > >>> > java.util.concurrent.CompletableFuture$UniApply.tryFire(CompletableFuture.java:591) > >>> ~[?:1.8.0_265] > >>> ... 27 more > >>> Caused by: java.util.concurrent.TimeoutException > >>> ... 25 more > >>> 2020-09-16 09:14:35,603 INFO > >>> > org.apache.flink.runtime.executiongraph.failover.flip1.RestartPipelinedRegionFailoverStrategy > >>> [] - Calculating tasks to restart to recover the failed task > >>> cbc357ccb763df2852fee8c4fc7d55f2_0. 
> >>> 2020-09-16 09:14:35,604 INFO > >>> > org.apache.flink.runtime.executiongraph.failover.flip1.RestartPipelinedRegionFailoverStrategy > >>> [] - 7 tasks should be restarted to recover the failed task > >>> cbc357ccb763df2852fee8c4fc7d55f2_0. > >>> 2020-09-16 09:14:35,606 INFO > >>> org.apache.flink.runtime.executiongraph.ExecutionGraph [] - Job > >>> Dragonsnake Indicator Accumulator Job > (c60f4f384f904467e4c54416d634852f) > >>> switched from state RUNNING to FAILING. > >>> org.apache.flink.runtime.JobException: Recovery is suppressed by > >>> NoRestartBackoffTimeStrategy > >>> at > >>> > org.apache.flink.runtime.executiongraph.failover.flip1.ExecutionFailureHandler.handleFailure(ExecutionFailureHandler.java:116) > >>> ~[flink-dist_2.12-1.11.1.jar:1.11.1] > >>> at org.apache.flink.runtime.executiongraph.failover.flip1. > >>> > >>> > >>> > > > > |
Free forum by Nabble | Edit this page |