txm119161336 2026-01-05 21:11 采纳率: 0%
浏览 5

自定义执行器executor被mesos agent自动杀掉

我写了一个自定义的 Mesos 执行器（一个简单的示例），代码如下：

package com.example;

import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.mesos.Executor;
import org.apache.mesos.ExecutorDriver;
import org.apache.mesos.Protos;
import org.apache.mesos.Protos.TaskInfo;

import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;


@Slf4j
public final class TaskExecutor implements Executor {
    


    public TaskExecutor() {
        log.info("task executor .........");
    }
    
    @Override
    public void registered(final ExecutorDriver executorDriver, final Protos.ExecutorInfo executorInfo, final Protos.FrameworkInfo frameworkInfo, final Protos.SlaveInfo slaveInfo) {
        log.info("task registered......");

    }
    
    @Override
    public void reregistered(final ExecutorDriver executorDriver, final Protos.SlaveInfo slaveInfo) {
        log.info("task reregistered......");
    }
    
    @Override
    public void disconnected(final ExecutorDriver executorDriver) {
        log.info("disconnected .........");
    }
    
    @Override
    public void launchTask(final ExecutorDriver executorDriver, final TaskInfo taskInfo) {
        log.info("launchTask......");
        Protos.TaskStatus runningStatus = Protos.TaskStatus.newBuilder()
                .setTaskId(taskInfo.getTaskId())
                .setState(Protos.TaskState.TASK_RUNNING)
                .build();
        executorDriver.sendStatusUpdate(runningStatus);

        // 2. 执行实际任务(以下任选一种实现)
        // 方式A: 执行shell命令(如果TaskInfo中有command)
        String command = taskInfo.getData().toStringUtf8();
        try {
            Process process = Runtime.getRuntime().exec(command);

            // 等待任务完成
            int exitCode = process.waitFor();

            // 3. 根据结果发送最终状态
            Protos.TaskState finalState = (exitCode == 0)
                    ? Protos.TaskState.TASK_FINISHED
                    : Protos.TaskState.TASK_FAILED;

            Protos.TaskStatus finalStatus = Protos.TaskStatus.newBuilder()
                    .setTaskId(taskInfo.getTaskId())
                    .setState(finalState)
                    .build();
            executorDriver.sendStatusUpdate(finalStatus);

        } catch (Exception e) {
            log.error("Task execution failed", e);
            Protos.TaskStatus errorStatus = Protos.TaskStatus.newBuilder()
                    .setTaskId(taskInfo.getTaskId())
                    .setState(Protos.TaskState.TASK_FAILED)
                    .setMessage("Exception: " + e.getMessage())
                    .build();
            executorDriver.sendStatusUpdate(errorStatus);
        }
    }
    
    @Override
    public void killTask(final ExecutorDriver executorDriver, final Protos.TaskID taskID) {
        log.info("task got killed......");
        executorDriver.sendStatusUpdate(Protos.TaskStatus.newBuilder().setTaskId(taskID).setState(Protos.TaskState.TASK_KILLED).build());
    }
    
    @Override
    public void frameworkMessage(final ExecutorDriver executorDriver, final byte[] bytes) {
    }
    
    @Override
    public void shutdown(final ExecutorDriver executorDriver) {
    }
    
    @Override
    public void error(final ExecutorDriver executorDriver, final String message) {
        log.error("call executor error, message is: {}", message);
    }

}

启动以后，进程会被 Mesos 主动杀掉，下面是我的日志。我是在 Docker 容器里部署的 mesos-agent。考虑到这只是一个简单的例子，基本不存在资源问题（比如 OOM），让我困惑的是：究竟是什么原因导致进程被杀？后来我又用 Docker 容器启动了一个 nginx 镜像的例子，进程启动约一分钟后同样被 Mesos 自动杀掉。请教一下，这是什么原因造成的呢？

I0103 16:46:27.701495   109 exec.cpp:162] Version: 1.7.1
I0103 16:46:27.717497   116 exec.cpp:236] Executor registered on agent 157ff8a1-f680-453c-ab7d-2b4200dac5f0-S0
I0103 16:46:27.720517   116 executor.cpp:184] Received SUBSCRIBED event
I0103 16:46:27.721859   116 executor.cpp:188] Subscribed executor on mesos-agent
I0103 16:46:27.723505   116 executor.cpp:184] Received LAUNCH event
I0103 16:46:27.725533   116 executor.cpp:687] Starting task executor2-v1-1767429987406-1-mesos_agent-DEFAULT
I0103 16:46:27.730633   116 executor.cpp:502] Running '/usr/libexec/mesos/mesos-containerizer launch <POSSIBLY-SENSITIVE-DATA>'
I0103 16:46:27.733171   116 executor.cpp:702] Forked command at 119
I0103 16:46:28.437938   121 exec.cpp:162] Version: 1.7.1
I0103 16:46:28.451352   144 exec.cpp:445] Executor asked to shutdown
I0103 16:46:33.470431   110 executor.cpp:1003] Command terminated with signal Killed (pid: 119)
I0103 16:46:34.474253   118 process.cpp:926] Stopped the socket accept loop

  • 写回答

2条回答 默认 最新

  • 紫雾凌寒 新星创作者: 人工智能技术领域 2026-01-05 21:12
    关注

    让 “紫雾凌寒” 来帮你回答,本回答结合 DeepSeek

    如果还有问题,请评论或点击头像私信,如果有帮助到您, 请关注、采纳、点赞

    1. 关键点分析
      - 自定义Mesos执行器被Mesos agent自动杀掉,原因可能有多种。比如资源不足被强制回收,执行器运行过程中出现异常导致Mesos agent认为其运行不健康而杀掉,或者网络等相关环境因素影响导致通信异常进而被清理等。
    2. 解决方案
      - 检查资源分配
      • 确保在 Mesos 框架配置中，为自定义执行器分配了足够的资源。例如，在 Mesos 调度器配置中设置相应的资源请求，如 cpus、mem 等。
      • 示例代码如下(假设使用的是Mesos原生Java API):
    import org.apache.mesos.Protos;
    import org.apache.mesos.SchedulerDriver;
    import org.apache.mesos.Util;
    import org.apache.mesos.executor.ExecutorDriver;
    import org.apache.mesos.executor.ExecutorDriverImpl;
    import org.apache.mesos.executor.ProtosExecutorDriver;
    import org.apache.mesos.framework.Protos.FrameworkInfo;
    import org.apache.mesos.framework.Protos.MasterInfo;
    import org.apache.mesos.scheduler.Protos.SchedulerInfo;
    
    /**
     * Entry point that builds a framework description plus an executor description and
     * runs a scheduler driver with them until the driver returns.
     *
     * NOTE(review): CustomScheduler is not defined anywhere in the visible snippet, and
     * MasterInfo / FrameworkInfo / SchedulerInfo are imported from
     * org.apache.mesos.framework.Protos and org.apache.mesos.scheduler.Protos, whereas the
     * question's TaskExecutor takes its message types from org.apache.mesos.Protos.
     * Confirm these types exist in the Mesos Java bindings actually in use before relying
     * on this example.
     */
    public class CustomExecutorMain {
        public static void main(String[] args) {
            // Master endpoint: host "mesos-master-host", port 5050.
            MasterInfo masterInfo = MasterInfo.newBuilder()
                  .setHost("mesos-master-host")
                  .setPort(5050)
                  .build();
            // Framework identity; the user is left as an empty string.
            // NOTE(review): the unit of the failover timeout value (5000) is not visible here — confirm.
            FrameworkInfo frameworkInfo = FrameworkInfo.newBuilder()
                  .setName("CustomExecutorFramework")
                  .setUser("")
                  .setFailoverTimeout(5000)
                  .build();
            // Executor description: launch command plus two scalar resources
            // ("cpus" = 0.1 and "mem" = 64).
            // NOTE(review): no executor id is set on the ExecutorInfo builder — verify whether one is required.
            SchedulerInfo schedulerInfo = SchedulerInfo.newBuilder()
                  .setExecutor(Protos.ExecutorInfo.newBuilder()
                      .setName("CustomExecutor")
                      .setSource("custom-executor-source")
                      .setCommand(Protos.CommandInfo.newBuilder()
                          .setValue("java -cp target/custom-executor-1.0.jar com.example.CustomExecutor")
                          .build())
                      .addResources(Protos.Resource.newBuilder()
                          .setName("cpus")
                          .setType(Protos.Value.Type.SCALAR)
                          .setScalar(Protos.Value.Scalar.newBuilder().setValue(0.1))
                          .build())
                      .addResources(Protos.Resource.newBuilder()
                          .setName("mem")
                          .setType(Protos.Value.Type.SCALAR)
                          .setScalar(Protos.Value.Scalar.newBuilder().setValue(64))
                          .build())
                      .build())
                  .build();
            // NOTE(review): SchedulerDriver is instantiated directly with four arguments — verify that it
            // is a concrete class with such a constructor in the bindings in use.
            SchedulerDriver schedulerDriver = new org.apache.mesos.SchedulerDriver(new CustomScheduler(), frameworkInfo, masterInfo, schedulerInfo);
            // The value returned by run() is stored as an int and used as the process exit code.
            // NOTE(review): confirm run() really returns an int.
            int status = schedulerDriver.run();
            schedulerDriver.stop();
            System.exit(status);
        }
    }
    
    • 排查执行器异常
      • 在执行器代码中添加更详细的日志记录,以便在被杀掉时能获取更多信息。例如在CustomExecutor类中添加日志:
    import org.apache.mesos.executor.ExecutorDriver;
    import org.apache.mesos.executor.ExecutorDriverImpl;
    import org.apache.mesos.executor.ProtosExecutorDriver;
    import org.apache.mesos.protos.TaskStatus;
    import org.slf4j.Logger;
    import org.slf4j.LoggerFactory;
    
    public class CustomExecutor implements org.apache.mesos.executor.Executor {
        private static final Logger logger = LoggerFactory.getLogger(CustomExecutor.class);
    
        @Override
        public void registered(ExecutorDriver driver, org.apache.mesos.protos.ExecutorInfo executorInfo, org.apache.mesos.protos.FrameworkInfo frameworkInfo, org.apache.mesos.protos.MasterInfo masterInfo) {
            logger.info("Executor registered");
        }
    
        @Override
        public void reregistered(ExecutorDriver driver, org.apache.mesos.protos.MasterInfo masterInfo) {
            logger.info("Executor reregistered");
        }
    
        @Override
        public void disconnected(ExecutorDriver driver) {
            logger.info("Executor disconnected");
        }
    
        @Override
        public void launchTask(ExecutorDriver driver, org.apache.mesos.protos.TaskInfo task) {
            logger.info("Launching task: {}", task.getName());
            // 模拟任务执行
            try {
                Thread.sleep(1000);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
            TaskStatus status = TaskStatus.newBuilder()
                  .setTaskId(task.getTaskId())
                  .setState(TaskStatus.State.TASK_FINISHED)
                  .build();
            driver.sendStatusUpdate(status);
        }
    
        @Override
        public void killTask(ExecutorDriver driver, org.apache.mesos.protos.TaskID taskId) {
            logger.info("Killing task: {}", taskId.getValue());
        }
    
        @Override
        public void frameworkMessage(ExecutorDriver driver, byte[] bytes) {
            logger.info("Received framework message: {}", bytes);
        }
    
        @Override
        public void shutdown(ExecutorDriver driver) {
            logger.info("Executor shutting down");
        }
    
        @Override
        public void error(ExecutorDriver driver, org.apache.mesos.protos.ExecutorError error) {
            logger.error("Executor error: {}", error.getMessage());
        }
    }
    
    • 检查网络和通信
      • 确保Mesos agent和执行器之间网络畅通,没有防火墙等限制阻止通信。
      • 检查执行器和Mesos agent之间的心跳机制是否正常工作,例如在执行器代码中定期发送心跳消息。
        3. 多种解决方案优缺点
    • 检查资源分配
      • 优点:从根本上解决资源不足导致被强制杀掉的问题,确保执行器有足够资源运行。
      • 缺点:如果资源分配合理但仍被杀掉,可能无法解决根本问题。
    • 排查执行器异常
      • 优点:能详细了解执行器运行过程中的情况,有助于定位具体异常原因。
      • 缺点:可能需要花费较多时间分析日志,且如果异常复杂可能难以快速定位。
    • 检查网络和通信
      • 优点:解决因网络问题导致的通信异常进而被清理的情况。
      • 缺点:如果不是网络问题,可能无法解决问题。
        4. 总结
    • 自定义Mesos执行器被自动杀掉,首先要检查资源分配是否合理,确保有足够资源。然后排查执行器代码中的异常,通过添加日志等方式定位问题。最后检查网络和通信情况,保证两者之间能正常交互。综合这几个方面的检查和处理,有助于找到并解决执行器被自动杀掉的问题。

    希望以上解答对您有所帮助。如果您有任何疑问,欢迎在评论区提出。

    评论

报告相同问题?

问题事件

  • 修改了问题 1月5日
  • 创建了问题 1月5日