
Notes: Spark Worker internals and source code analysis

2019-12-17


 
Worker class source location: org.apache.spark.deploy.worker
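
The Worker is an RPC endpoint: every request it serves (launch a driver, launch an executor, a state change) arrives as a case-class message that is pattern-matched in its receive method. Below is a minimal, self-contained sketch of that dispatch pattern; it is plain Scala with simplified message fields, not Spark's actual RpcEndpoint API.

sealed trait WorkerMessage
case class LaunchDriver(driverId: String) extends WorkerMessage
case class LaunchExecutor(appId: String, execId: Int) extends WorkerMessage

def receive(msg: WorkerMessage): Unit = msg match {
  case LaunchDriver(driverId) =>
    println(s"Asked to launch driver $driverId")        // the real handler builds a DriverRunner
  case LaunchExecutor(appId, execId) =>
    println(s"Asked to launch executor $appId/$execId") // the real handler builds an ExecutorRunner
}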
 
 
/**
* Source analysis: launching the driver
*/
case LaunchDriver(driverId, driverDesc) =>
  logInfo(s"Asked to launch driver $driverId")
 
// Create a DriverRunner, which will manage the driver process in a dedicated thread
  val driver = new DriverRunner(
    conf,
    driverId,
    workDir,
    sparkHome,
    driverDesc.copy(command = Worker.maybeUpdateSSLSettings(driverDesc.command, conf)),
    self,
    workerUri,
    securityMgr)
 
// Register the DriverRunner in the drivers HashMap, keyed by driverId
  drivers(driverId) = driver
 
// Start the driver
  driver.start() // see Snippet 1
 
 
  // Account for the CPU cores and memory the driver now occupies
  coresUsed += driverDesc.cores
  memoryUsed += driverDesc.mem
 
 
Snippet 1
/** Starts a thread to run and manage the driver. */
private[worker] def start() = {
 
  // DriverRunner mechanics
  // Spawn a dedicated Java thread to run and supervise the driver
  new Thread("DriverRunner for " + driverId) {
    override def run() {
      var shutdownHook: AnyRef = null
      try {
        shutdownHook = ShutdownHookManager.addShutdownHook { () =>
          logInfo(s"Worker shutting down, killing driver $driverId")
          kill()
        }
 
        // prepare driver jars and run driver
        // Step 1: create the DriverRunner's working directory
        // Step 2: download the user-uploaded jar (the packaged Spark
        // application: a Java app is typically packaged with Maven, a Scala
        // app exported or assembled as a jar)
        // Step 3: build the ProcessBuilder
        val exitCode = prepareAndRunDriver() // see Snippet 2
 
 
        // Set the final state depending on whether the driver was forcibly
        // killed, and on the process exit code
        finalState = if (exitCode == 0) {
          Some(DriverState.FINISHED)
        } else if (killed) {
          Some(DriverState.KILLED)
        } else {
          Some(DriverState.FAILED)
        }
      } catch {
        case e: Exception =>
          kill()
          finalState = Some(DriverState.ERROR)
          finalException = Some(e)
      } finally {
        if (shutdownHook != null) {
          ShutdownHookManager.removeShutdownHook(shutdownHook)
        }
      }
 
 
      // notify worker of final driver state, possible exception
      // The DriverRunner thread sends a DriverStateChanged message to the
      // Worker endpoint that owns it
      worker.send(DriverStateChanged(driverId, finalState.get, finalException)) // see Snippet 3
    }
  }.start()
}
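
The register-then-remove shutdown-hook pattern above ensures the driver process is killed if the Worker JVM goes down mid-run, without leaking the hook on a normal exit. A minimal sketch of the same pattern using the plain JDK API (Spark's ShutdownHookManager wraps this and adds hook priorities):

val hook = new Thread(() => println("Worker shutting down, killing driver"))
Runtime.getRuntime.addShutdownHook(hook)
try {
  // ... launch the driver process and wait for it to exit ...
} finally {
  // On a normal exit the hook is no longer needed; note removeShutdownHook
  // throws IllegalStateException if the JVM is already shutting down
  Runtime.getRuntime.removeShutdownHook(hook)
}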
 
 
 
Snippet 2
private[worker] def prepareAndRunDriver(): Int = {
  val driverDir = createWorkingDirectory() // Step 1: create the DriverRunner's working directory
  val localJarFilename = downloadUserJar(driverDir) // Step 2: download the user-uploaded jar
 
 
  def substituteVariables(argument: String): String = argument match {
    case "{{WORKER_URL}}" => workerUrl
    case "{{USER_JAR}}" => localJarFilename
    case other => other
  }
 
 
  // TODO: If we add ability to submit multiple jars they should also be added here
 
  // Build the ProcessBuilder, passing in the driver's launch command,
  // the memory it needs, and related settings
  val builder = CommandUtils.buildProcessBuilder(driverDesc.command, securityManager,
    driverDesc.mem, sparkHome.getAbsolutePath, substituteVariables)
 
 
  runDriver(builder, driverDir, driverDesc.supervise)
}
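
substituteVariables rewrites placeholder arguments ({{WORKER_URL}}, {{USER_JAR}}) in the driver's command line before the ProcessBuilder is built. An illustrative, self-contained run of the same idea (the URL and jar path below are made-up values):

val workerUrl = "spark://worker-host:7078"       // made-up value
val localJarFilename = "/work/driver-0/app.jar"  // made-up value

def substituteVariables(argument: String): String = argument match {
  case "{{WORKER_URL}}" => workerUrl
  case "{{USER_JAR}}"   => localJarFilename
  case other            => other
}

val rawArguments = Seq("--worker-url", "{{WORKER_URL}}", "--jar", "{{USER_JAR}}")
val resolved = rawArguments.map(substituteVariables)
// resolved: Seq("--worker-url", "spark://worker-host:7078", "--jar", "/work/driver-0/app.jar")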
 
 
Snippet 3
// When the driver finishes, the DriverRunner thread sends its final state to the Worker;
// the Worker then forwards a DriverStateChanged message on to the Master
case driverStateChanged @ DriverStateChanged(driverId, state, exception) =>
  handleDriverStateChanged(driverStateChanged) // see Snippet 4
 
 
Snippet 4
private[worker] def handleDriverStateChanged(driverStateChanged: DriverStateChanged): Unit = {
  val driverId = driverStateChanged.driverId
  val exception = driverStateChanged.exception
  val state = driverStateChanged.state
  state match {
    case DriverState.ERROR =>
      logWarning(s"Driver $driverId failed with unrecoverable exception: ${exception.get}")
    case DriverState.FAILED =>
      logWarning(s"Driver $driverId exited with failure")
    case DriverState.FINISHED =>
      logInfo(s"Driver $driverId exited successfully")
    case DriverState.KILLED =>
      logInfo(s"Driver $driverId was killed by user")
    case _ =>
      logDebug(s"Driver $driverId changed state to $state")
  }
 
// Forward the DriverStateChanged message to the Master,
// which updates its own record of the driver's state
  sendToMaster(driverStateChanged)
 
// Remove the driver from the local cache
  val driver = drivers.remove(driverId).get
 
// Archive the driver in the finished-drivers map
  finishedDrivers(driverId) = driver
  trimFinishedDriversIfNecessary()
 
// Release the memory and CPU cores the driver was using
  memoryUsed -= driver.driverDesc.mem
  coresUsed -= driver.driverDesc.cores
}
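
The remove / archive / trim sequence above keeps the Worker's bookkeeping bounded: finished drivers are moved to a separate map that is trimmed when it grows too large. A minimal sketch of that pattern (the runner type is stubbed as String and the retention limit is illustrative, not Spark's configured value):

import scala.collection.mutable

val drivers = mutable.HashMap[String, String]()               // driverId -> runner (stubbed)
val finishedDrivers = mutable.LinkedHashMap[String, String]() // insertion order tracks age
val retainedDrivers = 1000                                    // illustrative limit

def onDriverFinished(driverId: String): Unit = {
  drivers.remove(driverId).foreach { runner =>
    finishedDrivers(driverId) = runner
    // drop the oldest entries so the finished map stays bounded
    while (finishedDrivers.size > retainedDrivers) {
      finishedDrivers -= finishedDrivers.head._1
    }
  }
}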

 

 
/**
* Source analysis: launching an executor
*/
case LaunchExecutor(masterUrl, appId, execId, appDesc, cores_, memory_) =>
  if (masterUrl != activeMasterUrl) {
    logWarning("Invalid Master (" + masterUrl + ") attempted to launch executor.")
  } else {
    try {
      logInfo("Asked to launch executor %s/%d for %s".format(appId, execId, appDesc.name))
 
 
 
      // Create the executor's working directory (workDir/appId/execId)
      val executorDir = new File(workDir, appId + "/" + execId)
      if (!executorDir.mkdirs()) {
        throw new IOException("Failed to create directory " + executorDir)
      }
 
 
      // Create local dirs for the executor. These are passed to the executor via the
      // SPARK_EXECUTOR_DIRS environment variable, and deleted by the Worker when the
      // application finishes.
      val appLocalDirs = appDirectories.getOrElse(appId,
        Utils.getOrCreateLocalRootDirs(conf).map { dir =>
          val appDir = Utils.createDirectory(dir, namePrefix = "executor")
          Utils.chmod700(appDir)
          appDir.getAbsolutePath()
        }.toSeq)
      appDirectories(appId) = appLocalDirs
 
      // Create the ExecutorRunner
      val manager = new ExecutorRunner(
        appId,
        execId,
        appDesc.copy(command = Worker.maybeUpdateSSLSettings(appDesc.command, conf)),
        cores_,
        memory_,
        self,
        workerId,
        host,
        webUi.boundPort,
        publicAddress,
        sparkHome,
        executorDir,
        workerUri,
        conf,
        appLocalDirs, ExecutorState.RUNNING)
 
    // Register the ExecutorRunner in the local executors map
      executors(appId + "/" + execId) = manager
 
    // Start the ExecutorRunner
      manager.start() // see Snippet 5
 
    // Account for the CPU cores and memory the executor will use
      coresUsed += cores_
      memoryUsed += memory_
 
    // Report an ExecutorStateChanged event to the Master so it can update its state
      sendToMaster(ExecutorStateChanged(appId, execId, manager.state, None, None))
    } catch {
      case e: Exception =>
        logError(s"Failed to launch executor $appId/$execId for ${appDesc.name}.", e)
        if (executors.contains(appId + "/" + execId)) {
          executors(appId + "/" + execId).kill()
          executors -= appId + "/" + execId
        }
        sendToMaster(ExecutorStateChanged(appId, execId, ExecutorState.FAILED,
          Some(e.toString), None))
    }
  }
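
The working directory created above follows the layout <workDir>/<appId>/<execId>, one directory per executor. A standalone sketch of that step (all paths and ids below are made up):

import java.io.{File, IOException}

val workDir = new File("/tmp/spark-work") // made-up path
val appId = "app-20191217152400-0000"     // made-up application id
val execId = 0

val executorDir = new File(workDir, appId + "/" + execId)
// mkdirs() returns false when nothing was created (for example, the
// directory already exists), which the Worker surfaces as an IOException
if (!executorDir.mkdirs()) {
  throw new IOException("Failed to create directory " + executorDir)
}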
 
 
Snippet 5
private[worker] def start() {
 
    // Create a dedicated Java thread to fetch and run the executor
  workerThread = new Thread("ExecutorRunner for " + fullId) {
    override def run() { fetchAndRunExecutor() } // see Snippet 6
  }
  workerThread.start()
  // Shutdown hook that kills actors on shutdown.
  shutdownHook = ShutdownHookManager.addShutdownHook { () =>
    // It's possible that we arrive here before calling `fetchAndRunExecutor`, then `state` will
    // be `ExecutorState.RUNNING`. In this case, we should set `state` to `FAILED`.
    if (state == ExecutorState.RUNNING) {
      state = ExecutorState.FAILED
    }
    killProcess(Some("Worker shutting down")) }
}
 
 
 
Snippet 6
/**
* Download and run the executor described in our ApplicationDescription
*/
private def fetchAndRunExecutor() {
  try {
    // Launch the process
 
    // Build a ProcessBuilder for the executor's launch command
    val builder = CommandUtils.buildProcessBuilder(appDesc.command, new SecurityManager(conf),
      memory, sparkHome.getAbsolutePath, substituteVariables)
    val command = builder.command()
    val formattedCommand = command.asScala.mkString("\"", "\" \"", "\"")
    logInfo(s"Launch command: $formattedCommand")
 
 
    builder.directory(executorDir)
    builder.environment.put("SPARK_EXECUTOR_DIRS", appLocalDirs.mkString(File.pathSeparator))
    // In case we are running this from within the Spark Shell, avoid creating a "scala"
    // parent process for the executor command
    builder.environment.put("SPARK_LAUNCH_WITH_SCALA", "0")
 
 
    // Add webUI log urls
    val baseUrl =
      if (conf.getBoolean("spark.ui.reverseProxy", false)) {
        s"/proxy/$workerId/logPage/?appId=$appId&executorId=$execId&logType="
      } else {
        s"http://$publicAddress:$webUiPort/logPage/?appId=$appId&executorId=$execId&logType="
      }
    builder.environment.put("SPARK_LOG_URL_STDERR", s"${baseUrl}stderr")
    builder.environment.put("SPARK_LOG_URL_STDOUT", s"${baseUrl}stdout")
 
 
    process = builder.start()
 
    // Redirect the executor process's output to files in its working
    // directory: the process's InputStream (the child's stdout) goes to the
    // stdout file, and its ErrorStream to the stderr file
    val header = "Spark Executor Command: %s\n%s\n\n".format(
      formattedCommand, "=" * 40)
 
 
    // Redirect its stdout and stderr to files
    val stdout = new File(executorDir, "stdout")
    stdoutAppender = FileAppender(process.getInputStream, stdout, conf)
 
 
    val stderr = new File(executorDir, "stderr")
    Files.write(header, stderr, StandardCharsets.UTF_8)
    stderrAppender = FileAppender(process.getErrorStream, stderr, conf)
 
 
    // Wait for it to exit; executor may exit with code 0 (when driver instructs it to shutdown)
    // or with nonzero exit code
    // Note: the process was already started by builder.start() above;
    // waitFor() just blocks until the executor process exits
    val exitCode = process.waitFor()
 
    // Once the executor has exited, record its state and exit message
    state = ExecutorState.EXITED
    val message = "Command exited with code " + exitCode
   
    // Send an ExecutorStateChanged message to the Worker endpoint that owns
    // this ExecutorRunner thread
    worker.send(ExecutorStateChanged(appId, execId, state, Some(message), Some(exitCode))) // see Snippet 7
  } catch {
    case interrupted: InterruptedException =>
      logInfo("Runner thread for executor " + fullId + " interrupted")
      state = ExecutorState.KILLED
      killProcess(None)
    case e: Exception =>
      logError("Error running executor", e)
      state = ExecutorState.FAILED
      killProcess(Some(e.toString))
  }
}
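
Spark pipes the child process's streams through its FileAppender so the log files can be managed, but the underlying launch, redirect, and wait pattern can be sketched with nothing but the JDK (the directory and command below are made up):

import java.io.File
import java.lang.ProcessBuilder.Redirect

val executorDir = new File("/tmp/executor-0") // made-up path
executorDir.mkdirs()

val builder = new ProcessBuilder("echo", "hello from the executor")
builder.directory(executorDir)
builder.redirectOutput(Redirect.to(new File(executorDir, "stdout")))
builder.redirectError(Redirect.to(new File(executorDir, "stderr")))

val process = builder.start()    // the child process starts here
val exitCode = process.waitFor() // ... and this blocks until it exits
println(s"Command exited with code $exitCode")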
 
Snippet 7
// The Worker receives the ExecutorStateChanged event and forwards it to the Master
case executorStateChanged @ ExecutorStateChanged(appId, execId, state, message, exitStatus) =>
  handleExecutorStateChanged(executorStateChanged) // see Snippet 8
 
 
Snippet 8
private[worker] def handleExecutorStateChanged(executorStateChanged: ExecutorStateChanged):
  Unit = {
 
// Forward the ExecutorStateChanged message directly to the Master
  sendToMaster(executorStateChanged)
  val state = executorStateChanged.state
 
// If the executor has reached a terminal (finished) state
  if (ExecutorState.isFinished(state)) {
    val appId = executorStateChanged.appId
    val fullId = appId + "/" + executorStateChanged.execId
    val message = executorStateChanged.message
    val exitStatus = executorStateChanged.exitStatus
    executors.get(fullId) match {
      case Some(executor) =>
        logInfo("Executor " + fullId + " finished with state " + state +
          message.map(" message " + _).getOrElse("") +
          exitStatus.map(" exitStatus " + _).getOrElse(""))
 
        // Remove the executor from the in-memory map and archive it
        // in finishedExecutors
        executors -= fullId
        finishedExecutors(fullId) = executor
        trimFinishedExecutorsIfNecessary()
 
        // Release the memory and CPU cores the executor was using
        coresUsed -= executor.cores
        memoryUsed -= executor.memory
      case None =>
        logInfo("Unknown Executor " + fullId + " finished with state " + state +
          message.map(" message " + _).getOrElse("") +
          exitStatus.map(" exitStatus " + _).getOrElse(""))
    }
    maybeCleanupApplication(appId)
  }
}
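
ExecutorState.isFinished treats KILLED, FAILED, LOST, and EXITED as terminal states. An illustrative enumeration with that predicate, mirroring (not reproducing) Spark's ExecutorState:

object DemoExecutorState extends Enumeration {
  val LAUNCHING, RUNNING, KILLED, FAILED, LOST, EXITED = Value

  // an executor is "finished" once it reaches any terminal state
  def isFinished(state: Value): Boolean =
    Seq(KILLED, FAILED, LOST, EXITED).contains(state)
}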

 

 


Original article: https://www.cnblogs.com/yzqyxq/p/12054358.html
