使用 Flink 监听 MySQL 的 binlog 做实时计算，其中有一个任务，表数据量只有 2 万，但它的 checkpoint 总是失败，而其他任务没有这种情况。JobManager 日志如下：
2025-09-05 09:49:18,724 WARN org.apache.flink.runtime.checkpoint.CheckpointFailureManager [] - Failed to trigger or complete checkpoint 114 for job 07a364cb8fea5fb1953fd9fb148b8925. (0 consecutive failed attempts so far)
org.apache.flink.runtime.checkpoint.CheckpointException: Checkpoint expired before completing.
at org.apache.flink.runtime.checkpoint.CheckpointCoordinator$CheckpointCanceller.run(CheckpointCoordinator.java:2346) ~[flink-dist-1.20.1.jar:1.20.1]
at org.apache.flink.util.MdcUtils.lambda$wrapRunnable$1(MdcUtils.java:70) ~[flink-dist-1.20.1.jar:1.20.1]
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) [?:1.8.0_361]
at java.util.concurrent.FutureTask.run(FutureTask.java:266) [?:1.8.0_361]
at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThreadPoolExecutor.java:180) [?:1.8.0_361]
at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293) [?:1.8.0_361]
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) [?:1.8.0_361]
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) [?:1.8.0_361]
at java.lang.Thread.run(Thread.java:750) [?:1.8.0_361]
2025-09-05 09:49:18,725 INFO org.apache.flink.runtime.checkpoint.CheckpointRequestDecider [] - checkpoint request time in queue: 660001
2025-09-05 09:49:18,727 INFO org.apache.flink.runtime.jobmaster.JobMaster [] - Trying to recover from a global failure.
org.apache.flink.util.FlinkRuntimeException: Exceeded checkpoint tolerable failure threshold. The latest checkpoint failed due to Checkpoint expired before completing., view the Checkpoint History tab or the Job Manager log to find out why continuous checkpoints failed.
at org.apache.flink.runtime.checkpoint.CheckpointFailureManager.checkFailureAgainstCounter(CheckpointFailureManager.java:212) ~[flink-dist-1.20.1.jar:1.20.1]
at org.apache.flink.runtime.checkpoint.CheckpointFailureManager.handleJobLevelCheckpointException(CheckpointFailureManager.java:169) ~[flink-dist-1.20.1.jar:1.20.1]
at org.apache.flink.runtime.checkpoint.CheckpointFailureManager.handleCheckpointException(CheckpointFailureManager.java:122) ~[flink-dist-1.20.1.jar:1.20.1]
at org.apache.flink.runtime.checkpoint.CheckpointCoordinator.abortPendingCheckpoint(CheckpointCoordinator.java:2281) ~[flink-dist-1.20.1.jar:1.20.1]
at org.apache.flink.runtime.checkpoint.CheckpointCoordinator.abortPendingCheckpoint(CheckpointCoordinator.java:2260) ~[flink-dist-1.20.1.jar:1.20.1]
at org.apache.flink.runtime.checkpoint.CheckpointCoordinator.access$1200(CheckpointCoordinator.java:102) ~[flink-dist-1.20.1.jar:1.20.1]
at org.apache.flink.runtime.checkpoint.CheckpointCoordinator$CheckpointCanceller.run(CheckpointCoordinator.java:2346) ~[flink-dist-1.20.1.jar:1.20.1]
at org.apache.flink.util.MdcUtils.lambda$wrapRunnable$1(MdcUtils.java:70) ~[flink-dist-1.20.1.jar:1.20.1]
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) ~[?:1.8.0_361]
at java.util.concurrent.FutureTask.run(FutureTask.java:266) ~[?:1.8.0_361]
at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThreadPoolExecutor.java:180) ~[?:1.8.0_361]
at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293) ~[?:1.8.0_361]
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) ~[?:1.8.0_361]
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) ~[?:1.8.0_361]
at java.lang.Thread.run(Thread.java:750) ~[?:1.8.0_361]
2025-09-05 09:49:18,728 INFO org.apache.flink.runtime.jobmaster.JobMaster [] - 4 tasks will be restarted to recover from a global failure.
2025-09-05 09:49:18,728 INFO org.apache.flink.runtime.executiongraph.ExecutionGraph [] - Job SapOrderInfo-rt_dwh_ods.ODS_S001_BUT000 (07a364cb8fea5fb1953fd9fb148b8925) switched from state RUNNING to RESTARTING.
2025-09-05 09:49:18,728 WARN org.apache.flink.runtime.checkpoint.CheckpointFailureManager [] - Failed to trigger or complete checkpoint 115 for job 07a364cb8fea5fb1953fd9fb148b8925. (0 consecutive failed attempts so far)
org.apache.flink.runtime.checkpoint.CheckpointException: Checkpoint Coordinator is suspending.
at org.apache.flink.runtime.checkpoint.CheckpointCoordinator.stopCheckpointScheduler(CheckpointCoordinator.java:2068) ~[flink-dist-1.20.1.jar:1.20.1]
at org.apache.flink.runtime.checkpoint.CheckpointCoordinatorDeActivator.jobStatusChanges(CheckpointCoordinatorDeActivator.java:49) ~[flink-dist-1.20.1.jar:1.20.1]
at org.apache.flink.runtime.executiongraph.DefaultExecutionGraph.notifyJobStatusChange(DefaultExecutionGraph.java:1609) ~[flink-dist-1.20.1.jar:1.20.1]
at org.apache.flink.runtime.executiongraph.DefaultExecutionGraph.transitionState(DefaultExecutionGraph.java:1167) ~[flink-dist-1.20.1.jar:1.20.1]
at org.apache.flink.runtime.executiongraph.DefaultExecutionGraph.transitionState(DefaultExecutionGraph.java:1139) ~[flink-dist-1.20.1.jar:1.20.1]
at org.apache.flink.runtime.scheduler.SchedulerBase.transitionExecutionGraphState(SchedulerBase.java:601) ~[flink-dist-1.20.1.jar:1.20.1]
at org.apache.flink.runtime.scheduler.DefaultScheduler.addVerticesToRestartPending(DefaultScheduler.java:386) ~[flink-dist-1.20.1.jar:1.20.1]
at org.apache.flink.runtime.scheduler.DefaultScheduler.restartTasksWithDelay(DefaultScheduler.java:362) ~[flink-dist-1.20.1.jar:1.20.1]
at org.apache.flink.runtime.scheduler.DefaultScheduler.maybeRestartTasks(DefaultScheduler.java:330) ~[flink-dist-1.20.1.jar:1.20.1]
at org.apache.flink.runtime.scheduler.DefaultScheduler.handleGlobalFailure(DefaultScheduler.java:325) ~[flink-dist-1.20.1.jar:1.20.1]
at org.apache.flink.runtime.scheduler.UpdateSchedulerNgOnInternalFailuresListener.notifyGlobalFailure(UpdateSchedulerNgOnInternalFailuresListener.java:57) ~[flink-dist-1.20.1.jar:1.20.1]
at org.apache.flink.runtime.executiongraph.DefaultExecutionGraph.failGlobal(DefaultExecutionGraph.java:1092) ~[flink-dist-1.20.1.jar:1.20.1]
at org.apache.flink.runtime.executiongraph.DefaultExecutionGraph$1.lambda$failJob$0(DefaultExecutionGraph.java:477) ~[flink-dist-1.20.1.jar:1.20.1]
at org.apache.flink.runtime.rpc.pekko.PekkoRpcActor.lambda$handleRunAsync$4(PekkoRpcActor.java:460) ~[flink-rpc-akka365eff67-1c9c-4c09-a6d0-3bcb63efea84.jar:1.20.1]
at org.apache.flink.runtime.concurrent.ClassLoadingUtils.runWithContextClassLoader(ClassLoadingUtils.java:68) ~[flink-dist-1.20.1.jar:1.20.1]
at org.apache.flink.runtime.rpc.pekko.PekkoRpcActor.handleRunAsync(PekkoRpcActor.java:460) ~[flink-rpc-akka365eff67-1c9c-4c09-a6d0-3bcb63efea84.jar:1.20.1]
at org.apache.flink.runtime.rpc.pekko.PekkoRpcActor.handleRpcMessage(PekkoRpcActor.java:225) ~[flink-rpc-akka365eff67-1c9c-4c09-a6d0-3bcb63efea84.jar:1.20.1]
at org.apache.flink.runtime.rpc.pekko.FencedPekkoRpcActor.handleRpcMessage(FencedPekkoRpcActor.java:88) ~[flink-rpc-akka365eff67-1c9c-4c09-a6d0-3bcb63efea84.jar:1.20.1]
at org.apache.flink.runtime.rpc.pekko.PekkoRpcActor.handleMessage(PekkoRpcActor.java:174) ~[flink-rpc-akka365eff67-1c9c-4c09-a6d0-3bcb63efea84.jar:1.20.1]
at org.apache.pekko.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:33) [flink-rpc-akka365eff67-1c9c-4c09-a6d0-3bcb63efea84.jar:1.20.1]
at org.apache.pekko.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:29) [flink-rpc-akka365eff67-1c9c-4c09-a6d0-3bcb63efea84.jar:1.20.1]
at scala.PartialFunction.applyOrElse(PartialFunction.scala:127) [flink-rpc-akka365eff67-1c9c-4c09-a6d0-3bcb63efea84.jar:1.20.1]
at scala.PartialFunction.applyOrElse$(PartialFunction.scala:126) [flink-rpc-akka365eff67-1c9c-4c09-a6d0-3bcb63efea84.jar:1.20.1]
at org.apache.pekko.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:29) [flink-rpc-akka365eff67-1c9c-4c09-a6d0-3bcb63efea84.jar:1.20.1]
at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:175) [flink-rpc-akka365eff67-1c9c-4c09-a6d0-3bcb63efea84.jar:1.20.1]
at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:176) [flink-rpc-akka365eff67-1c9c-4c09-a6d0-3bcb63efea84.jar:1.20.1]
at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:176) [flink-rpc-akka365eff67-1c9c-4c09-a6d0-3bcb63efea84.jar:1.20.1]
at org.apache.pekko.actor.Actor.aroundReceive(Actor.scala:547) [flink-rpc-akka365eff67-1c9c-4c09-a6d0-3bcb63efea84.jar:1.20.1]
at org.apache.pekko.actor.Actor.aroundReceive$(Actor.scala:545) [flink-rpc-akka365eff67-1c9c-4c09-a6d0-3bcb63efea84.jar:1.20.1]
at org.apache.pekko.actor.AbstractActor.aroundReceive(AbstractActor.scala:229) [flink-rpc-akka365eff67-1c9c-4c09-a6d0-3bcb63efea84.jar:1.20.1]
at org.apache.pekko.actor.ActorCell.receiveMessage(ActorCell.scala:590) [flink-rpc-akka365eff67-1c9c-4c09-a6d0-3bcb63efea84.jar:1.20.1]
at org.apache.pekko.actor.ActorCell.invoke(ActorCell.scala:557) [flink-rpc-akka365eff67-1c9c-4c09-a6d0-3bcb63efea84.jar:1.20.1]
at org.apache.pekko.dispatch.Mailbox.processMailbox(Mailbox.scala:272) [flink-rpc-akka365eff67-1c9c-4c09-a6d0-3bcb63efea84.jar:1.20.1]
at org.apache.pekko.dispatch.Mailbox.run(Mailbox.scala:233) [flink-rpc-akka365eff67-1c9c-4c09-a6d0-3bcb63efea84.jar:1.20.1]
at org.apache.pekko.dispatch.Mailbox.exec(Mailbox.scala:245) [flink-rpc-akka365eff67-1c9c-4c09-a6d0-3bcb63efea84.jar:1.20.1]
at java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:289) [?:1.8.0_361]
at java.util.concurrent.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1067) [?:1.8.0_361]
at java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1703) [?:1.8.0_361]
at java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:172) [?:1.8.0_361]