剖析runC源码：创建、初始化与启动过程详解（02）

最编程 2024-02-29 19:36:33

...

0、回顾

runC 是一个OCI 规范的实现，容器标准化的产物。今天开始逐步从容器创建开始深入分析源码，尽可能的深入到自己的知识盲区，且短时间不能弄懂的区域；
往期：
- 【runC】01-runC-介绍与命令

1、runC 目录结构介绍

runC 的代码结构是非常清晰，只要具备一点golang基础便可以轻松入坑，下面大致介绍下结构：

runc/main.go 文件是命令的入口文件，其余子命令模块也存在于该层目录，如 create.go，delete.go, exec.go 等；整个命令模块的组装使用了 github.com/urfave/cli 这个库
runc/libcontainer 目录是主要存放每个子命令模块的工作主体逻辑；

以上是比较重要的两点，其他的有兴趣自行查看，下面进入主题；

2、源码分析

2.1、简要介绍

一个容器启动主要分为三大部分，如文章题目所示
- create: 主要是为了解析、组装容器启动的配置和与子进程的消息通道等；
- init : 主要根据容器配置启动容器整个运行环境，包括熟知ns，cgroups, seccomp, apparmor, caps等;
- start : 主要是为了通知init 进程启动容器；
所以启动流程大致如下

runc create --> runc init --> runc start

create | init | start 三者的关系时序图
源码下载

$ git clone https://github.com/opencontainers/runc

2.2、runc create

参数展示

$ runc create -h

runc/create.go

create 命令入口

var createCommand = cli.Command{
	Name:  "create",
	Usage: "create a container",
	...
	},
	Action: func(context *cli.Context) error {
		.....
		// 获取规范配置文件，并进行简单的检查及赋值
		spec, err := setupSpec(context)
		if err != nil {
			return err
		}
		// 开始组装container 配置，动作为【create】
        // [进入 startContainer]
		status, err := startContainer(context, spec, CT_ACT_CREATE, nil)
		if err != nil {
			return err
		}
		os.Exit(status)
		return nil
	},

后面进入 startContainer

runc/utils_linux.go startContainer (1)

startContainer，这部分代码主要分为两部分
- 一个是 createContainer 函数；
- 一个是runner结构体的run方法；

// startContainer creates the container named by the first CLI argument and
// hands it to a runner that performs the requested action. It returns the
// runner's status code and error; -1 is returned for every failure detected
// here. (Excerpt: the "..." line stands for statements elided in this listing.)
func startContainer(context *cli.Context, spec *specs.Spec, action CtAct, criuOpts *libcontainer.CriuOpts) (int, error) {
	id := context.Args().First()
	if id == "" {
		return -1, errEmptyID
	}
	// Build the optional notify socket from the NOTIFY_SOCKET environment
	// variable; when one is returned, let it adjust the spec.
	notifySocket := newNotifySocket(context, os.Getenv("NOTIFY_SOCKET"), id)
	if notifySocket != nil {
		if err := notifySocket.setupSpec(context, spec); err != nil {
			return -1, err
		}
	}
	// Walkthrough step: createContainer is examined first.
	container, err := createContainer(context, id, spec)
	if err != nil {
		return -1, err
	}
	...
	// The remaining code is revisited, with comments, in the second excerpt of
	// this function.
	r := &runner{
		enableSubreaper: !context.Bool("no-subreaper"),
		shouldDestroy:   true,
		container:       container,
		listenFDs:       listenFDs,
		notifySocket:    notifySocket,
		consoleSocket:   context.String("console-socket"),
		detach:          context.Bool("detach"),
		pidFile:         context.String("pid-file"),
		preserveFDs:     context.Int("preserve-fds"),
		action:          action,
		criuOpts:        criuOpts,
		init:            true,
		logLevel:        logLevel,
	}
	return r.run(spec.Process)
}

runc/utils_linux.go createContainer

createContainer

// createContainer converts the OCI spec into a libcontainer configuration and
// asks the factory returned by loadFactory to create the container from it.
// Any error from the intermediate steps is returned unchanged.
func createContainer(context *cli.Context, id string, spec *specs.Spec) (libcontainer.Container, error) {
	// Decide whether the rootless cgroup manager should be used.
	rootlessCgroups, err := shouldUseRootlessCgroupManager(context)
	if err != nil {
		return nil, err
	}

	// Collect the conversion options from the CLI flags, the spec and the
	// effective UID of the current process (non-zero EUID means rootless).
	opts := &specconv.CreateOpts{
		CgroupName:       id,
		UseSystemdCgroup: context.GlobalBool("systemd-cgroup"),
		NoPivotRoot:      context.Bool("no-pivot"),
		NoNewKeyring:     context.Bool("no-new-keyring"),
		Spec:             spec,
		RootlessEUID:     os.Geteuid() != 0,
		RootlessCgroups:  rootlessCgroups,
	}

	// Walkthrough step: CreateLibcontainerConfig is examined next.
	cfg, err := specconv.CreateLibcontainerConfig(opts)
	if err != nil {
		return nil, err
	}

	// Walkthrough step: loadFactory builds the factory from the CLI context.
	f, err := loadFactory(context)
	if err != nil {
		return nil, err
	}

	return f.Create(id, cfg)
}

下面将进入 CreateLibcontainerConfig ，看看OCI 规范了容器哪些配置，及它们是如何生效；

runc/libcontainer/specconv/spec_linux.go CreateLibcontainerConfig

CreateLibcontainerConfig 由于这块代码块篇幅有点长想了解详情的可以点击 [code] 进入

// CreateLibcontainerConfig builds a libcontainer configs.Config from the
// options in opts. (Excerpt: lines consisting of "..." are elided in this
// listing, including the definitions of spec and cwd used below.)
func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) {
	// Look up the working directory of the current process.
	rcwd, err := os.Getwd()
	if err != nil {
		return nil, err
	}
	...
	// Take the rootfs path from the spec (root.path in config.json); a relative
	// path is resolved against cwd.
	rootfsPath := spec.Root.Path
	if !filepath.IsAbs(rootfsPath) {
		rootfsPath = filepath.Join(cwd, rootfsPath)
	}
	// Turn every spec annotation into a "key=value" label.
	labels := []string{}
	for k, v := range spec.Annotations {
		labels = append(labels, k+"="+v)
	}
	// Seed the final config with the values already known from opts and spec;
	// a "bundle=<cwd>" label is appended to the annotation labels.
	config := &configs.Config{
		Rootfs:          rootfsPath,
		NoPivotRoot:     opts.NoPivotRoot,
		Readonlyfs:      spec.Root.Readonly,
		Hostname:        spec.Hostname,
		Labels:          append(labels, "bundle="+cwd),
		NoNewKeyring:    opts.NoNewKeyring,
		RootlessEUID:    opts.RootlessEUID,
		RootlessCgroups: opts.RootlessCgroups,
	}
	// Convert each entry of the spec's mounts list (the "mounts" field of
	// config.json, e.g. /proc, /dev, /dev/pts, /dev/shm, /dev/mqueue, /sys,
	// /sys/fs/cgroup) into a libcontainer mount.
	for _, m := range spec.Mounts {
		config.Mounts = append(config.Mounts, createLibcontainerMount(cwd, m))
	}

	// Build the device configuration; the returned default devices are passed
	// on to the cgroup configuration below.
	// AllowedDevices https://github.com/opencontainers/runc/blob/master/libcontainer/specconv/spec_linux.go#L64
	defaultDevs, err := createDevices(spec, config)
	if err != nil {
		return nil, err
	}

	// Build the cgroup resource configuration from opts and the default devices.
	/* Author's reference note (not part of this function): the cgroup
	subsystems that can be controlled.
	legacySubsystems = []subsystem{
		&fs.CpusetGroup{},
		&fs.DevicesGroup{},
		&fs.MemoryGroup{},
		&fs.CpuGroup{},
		&fs.CpuacctGroup{},
		&fs.PidsGroup{},
		&fs.BlkioGroup{},
		&fs.HugetlbGroup{},
		&fs.PerfEventGroup{},
		&fs.FreezerGroup{},
		&fs.NetPrioGroup{},
		&fs.NetClsGroup{},
		&fs.NameGroup{GroupName: "name=systemd"},
	}
	*/
	c, err := CreateCgroupConfig(opts, defaultDevs)
	if err != nil {
		return nil, err
	}

	config.Cgroups = c

	// set linux-specific config
	if spec.Linux != nil {
		...

		// Register every namespace listed in the spec; unknown and duplicated
		// namespace types are rejected.
		for _, ns := range spec.Linux.Namespaces {
			t, exists := namespaceMapping[ns.Type]
			if !exists {
				return nil, fmt.Errorf("namespace %q does not exist", ns)
			}
			if config.Namespaces.Contains(t) {
				return nil, fmt.Errorf("malformed spec file: duplicated ns %q", ns)
			}
			config.Namespaces.Add(t, ns.Path)
		}
		// A network namespace without a path (i.e. a new one rather than an
		// existing one to join) gets a single loopback network entry.
		if config.Namespaces.Contains(configs.NEWNET) && config.Namespaces.PathOf(configs.NEWNET) == "" {
			config.Networks = []*configs.Network{
				{
					Type: "loopback",
				},
			}
		}
		// When a user namespace is requested, let setupUserNamespace fill in
		// the user-namespace related settings.
		if config.Namespaces.Contains(configs.NEWUSER) {
			if err := setupUserNamespace(spec, config); err != nil {
				return nil, err
			}
		}
		...
		// Copy the Intel RDT settings; only non-empty schemas are copied.
		if spec.Linux.IntelRdt != nil {
			config.IntelRdt = &configs.IntelRdt{}
			if spec.Linux.IntelRdt.L3CacheSchema != "" {
				config.IntelRdt.L3CacheSchema = spec.Linux.IntelRdt.L3CacheSchema
			}
			if spec.Linux.IntelRdt.MemBwSchema != "" {
				config.IntelRdt.MemBwSchema = spec.Linux.IntelRdt.MemBwSchema
			}
		}
	}
	if spec.Process != nil {
		// OOM score adjustment
		config.OomScoreAdj = spec.Process.OOMScoreAdj
		// no-new-privileges flag
		config.NoNewPrivileges = spec.Process.NoNewPrivileges
		// umask
		config.Umask = spec.Process.User.Umask
		// SELinux process label
		if spec.Process.SelinuxLabel != "" {
			config.ProcessLabel = spec.Process.SelinuxLabel
		}
		// Copy the five capability sets granted to the container process.
		if spec.Process.Capabilities != nil {
			config.Capabilities = &configs.Capabilities{
				Bounding:    spec.Process.Capabilities.Bounding,
				Effective:   spec.Process.Capabilities.Effective,
				Permitted:   spec.Process.Capabilities.Permitted,
				Inheritable: spec.Process.Capabilities.Inheritable,
				Ambient:     spec.Process.Capabilities.Ambient,
			}
		}
	}
	// Container lifecycle hooks.
	/*
	Author's notes on the hook kinds (timing as observed in the other excerpts
	of this walkthrough; confirm against the OCI runtime spec):
	Prestart        : run by the create process together with CreateRuntime;
	                  according to the upstream comments it is deprecated.
	CreateRuntime   : run by the create process while init waits, before
	                  pivot_root is executed.
	CreateContainer : presumably run after CreateRuntime has finished.
	Poststart       : run by the create process after parent.start() returns,
	                  i.e. once init reported that initialization is done.
	StartContainer  : run by the init process right before it execs the user
	                  process, i.e. after runc start released it.
	Poststop
	*/
	createHooks(spec, config)
	config.Version = specs.Version
	return config, nil
}

runc/utils_linux.go startContainer (2)

回到之前startContainer, 分析剩下的代码

// startContainer, second excerpt: the part after createContainer. It collects
// the listen file descriptors, assembles the runner and runs it. (Excerpt: the
// "..." lines stand for statements elided in this listing.)
func startContainer(context *cli.Context, spec *specs.Spec, action CtAct, criuOpts *libcontainer.CriuOpts) (int, error) {
	...
	// createContainer was covered above; the walkthrough resumes from here.
	container, err := createContainer(context, id, spec)
	if err != nil {
		return -1, err
	}
	...


	// Files handed over for socket activation: they are only collected when
	// the LISTEN_FDS environment variable is set. The runner later appends
	// them to the process's ExtraFiles, i.e. the descriptors beyond the three
	// standard ones (0 stdin, 1 stdout, 2 stderr).
	listenFDs := []*os.File{}
	if os.Getenv("LISTEN_FDS") != "" {
		listenFDs = activation.Files(false)
	}

	...
	// Everything so far assembled and validated configuration; the runner
	// carries that configuration and performs the requested action.
	r := &runner{
		// Subreaper handling is enabled unless --no-subreaper was given.
		enableSubreaper: !context.Bool("no-subreaper"),
		shouldDestroy:   true,
		container:       container,
		listenFDs:       listenFDs,
		notifySocket:    notifySocket,
		consoleSocket:   context.String("console-socket"),
		detach:          context.Bool("detach"),
		pidFile:         context.String("pid-file"),
		preserveFDs:     context.Int("preserve-fds"),
		action:          action,
		// CRIU (checkpoint/restore) options; the create command passes nil.
		criuOpts:        criuOpts,
		// The process started by this runner is the container's init process.
		init:            true,
		logLevel:        logLevel,
	}
	// Walkthrough step: r.run is examined next.
	return r.run(spec.Process)
}

下面进入看看 runner.run 是如何装载配置

runc/utils_linux.go runner.run

// run turns the OCI process description into a libcontainer process, wires up
// its file descriptors and IO, and then performs r.action on the container.
// (Excerpt: "..." lines stand for statements elided in this listing, including
// the definitions of rootuid and rootgid.)
func (r *runner) run(config *specs.Process) (int, error) {
	var err error
	defer func() {
		if err != nil {
			// Only on failure: destroy the container again. According to the
			// article this ends up in the state machine in
			// runc/libcontainer/state_linux.go.
			r.destroy()
		}
	}()
	if err = r.checkTerminal(config); err != nil {
		return -1, err
	}
	// Build the libcontainer process object from the spec's process section.
	process, err := newProcess(*config, r.init, r.logLevel)
	if err != nil {
		return -1, err
	}
	// Pass the listen descriptors on as ExtraFiles and advertise their count
	// through LISTEN_FDS; descriptor numbering for extra files starts at 3.
	if len(r.listenFDs) > 0 {
		process.Env = append(process.Env, "LISTEN_FDS="+strconv.Itoa(len(r.listenFDs)), "LISTEN_PID=1")
		process.ExtraFiles = append(process.ExtraFiles, r.listenFDs...)
	}
	baseFd := 3 + len(process.ExtraFiles)
	...
	}
	...
	var (
		// The create action always behaves as detached.
		detach = r.detach || (r.action == CT_ACT_CREATE)
	)
	// started.
	// The signal handler receives the subreaper flag and the notify socket.
	handler := newSignalHandler(r.enableSubreaper, r.notifySocket)
	// Set up the process IO based on config.Terminal, detach and the console
	// socket. Author's note (details live in setupIO, not shown here): with a
	// terminal the tty is prepared up front; without a terminal and without
	// detach the container IO is copied to the current stdio.
	tty, err := setupIO(process, rootuid, rootgid, config.Terminal, detach, r.consoleSocket)
	if err != nil {
		return -1, err
	}
	defer tty.Close()

	switch r.action {
	case CT_ACT_CREATE:
		// This walkthrough follows the create action.
		// Walkthrough step: container.Start is examined next.
		err = r.container.Start(process)
	case CT_ACT_RESTORE:
		err = r.container.Restore(process, r.criuOpts)
	case CT_ACT_RUN:
		err = r.container.Run(process)
	default:
		panic("Unknown action")
	}
	...
	// Write the pid file when one was requested; terminate the process if
	// that fails.
	if r.pidFile != "" {
		if err = createPidFile(r.pidFile, process); err != nil {
			r.terminate(process)
			return -1, err
		}
	}
	....
}

下面进入 container.Start

runc/libcontainer/container_linux.go linuxContainer.Start

// Start launches process inside the container while holding the container
// lock. Containers configured with SkipDevices are rejected. For an init
// process the exec fifo is created first and removed again if starting fails.
func (c *linuxContainer) Start(process *Process) error {
	c.m.Lock()
	defer c.m.Unlock()

	if c.config.Cgroups.Resources.SkipDevices {
		return newGenericError(errors.New("can't start container with SkipDevices set"), ConfigInvalid)
	}

	// The exec fifo is what a later `runc start` uses to release the init
	// process, so it has to exist before init is launched.
	if process.Init {
		if fifoErr := c.createExecFifo(); fifoErr != nil {
			return fifoErr
		}
	}

	// Walkthrough step: c.start is examined next.
	startErr := c.start(process)
	if startErr == nil {
		return nil
	}

	// Starting failed: the fifo created above is no longer needed.
	if process.Init {
		c.deleteExecFifo()
	}
	return startErr
}

下面进入 c.start 即 linuxContainer.start

runc/libcontainer/container_linux.go linuxContainer.start(1)

该方法有两个方法需要深究
- linuxContainer.newParentProcess
- parent.start

// start, first excerpt: builds the parent-side process object and starts it.
// (Excerpt: the "..." line stands for statements elided in this listing.)
func (c *linuxContainer) start(process *Process) error {
	// Build the parent-side handle for the new process. In the create flow the
	// current process (runc create) is the parent and runc init is the child.
	// Walkthrough step: c.newParentProcess is examined next.
	parent, err := c.newParentProcess(process)
	if err != nil {
		return newSystemErrorWithCause(err, "creating new parent process")
	}
	// Forward the child's log output (read from the log pipe).
	parent.forwardChildLogs()
	// Walkthrough step: parent.start is examined afterwards.
	if err := parent.start(); err != nil {
		return newSystemErrorWithCause(err, "starting container process")
	}
	...
}

下面我们先进入 linuxContainer.newParentProcess

runc/libcontainer/container_linux.go linuxContainer.newParentProcess

// newParentProcess prepares everything the current process needs to spawn and
// talk to the child: the "init" socket pair, the log pipe and the command
// built from the template. For a non-init process it returns a setns process,
// otherwise an init process that additionally carries the exec fifo.
//
// On the error paths of this function the descriptors created here are closed
// again so that a failed attempt does not leak them.
func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) {
	// Socket pair used for parent <-> child synchronisation (init-p / init-c).
	parentInitPipe, childInitPipe, err := utils.NewSockPair("init")
	if err != nil {
		return nil, newSystemErrorWithCause(err, "creating new init pipe")
	}
	messageSockPair := filePair{parentInitPipe, childInitPipe}

	// Pipe through which the child's log output reaches the parent.
	parentLogPipe, childLogPipe, err := os.Pipe()
	if err != nil {
		// Nothing owns the socket pair yet, so release it here.
		parentInitPipe.Close()
		childInitPipe.Close()
		return nil, fmt.Errorf("Unable to create the log pipe:  %s", err)
	}
	logFilePair := filePair{parentLogPipe, childLogPipe}

	// Build the command for the child from the template; the child ends of
	// both channels are handed to it.
	cmd := c.commandTemplate(p, childInitPipe, childLogPipe)
	if !p.Init {
		// Not an init process (the exec case): the process joins the
		// namespaces of the existing container.
		return c.newSetnsProcess(p, cmd, messageSockPair, logFilePair)
	}

	// The init process also needs the exec fifo.
	if err := c.includeExecFifo(cmd); err != nil {
		// No process object took ownership of the descriptors; close them all.
		parentInitPipe.Close()
		childInitPipe.Close()
		parentLogPipe.Close()
		childLogPipe.Close()
		return nil, newSystemErrorWithCause(err, "including execfifo in cmd.Exec setup")
	}

	// Walkthrough note: newInitProcess builds the initProcess object that
	// carries the two file pairs created above.
	return c.newInitProcess(p, cmd, messageSockPair, logFilePair)
}

下面回到 runc/libcontainer/container_linux.go linuxContainer.start

runc/libcontainer/container_linux.go linuxContainer.start(2)

// start, second excerpt: after the parent process object has been started,
// the Poststart hooks are run for an init process. (Excerpt: the "..." line
// stands for statements elided in this listing.)
func (c *linuxContainer) start(process *Process) error {
	...
	parent, err := c.newParentProcess(process)
	if err != nil {
		return newSystemErrorWithCause(err, "creating new parent process")
	}
	// -- the walkthrough resumes here --
	// Forward the child's log output (read from the log pipe).
	parent.forwardChildLogs()
	// Walkthrough step: parent.start is examined next.
	if err := parent.start(); err != nil {
		return newSystemErrorWithCause(err, "starting container process")
	}
	if process.Init {
		if c.config.Hooks != nil {
			s, err := c.currentOCIState()
			if err != nil {
				return err
			}
			// Run the Poststart hooks; if they fail, terminate the process
			// that was just started and return the hook error.
			if err := c.config.Hooks[configs.Poststart].RunHooks(s); err != nil {
				if err := ignoreTerminateErrors(parent.terminate()); err != nil {
					logrus.Warn(errorsf.Wrapf(err, "Running Poststart hook"))
				}
				return err
			}
		}
	}
	return nil
}

下面进入 parent.start 即 initProcess.start 准备启动init 进程了；

runc/libcontainer/process_linux.go initProcess.start

create 的最终目的就是启动init 进程，让init 进程创建容器环境

// start launches the init command (runc init) and drives the parent side of
// the handshake with it: it applies cgroups, sends the bootstrap data and the
// init configuration, and then answers the child's sync messages until the
// child has been told to run. (Excerpt: "..." lines stand for statements
// elided in this listing.)
//
// Fix relative to the excerpted upstream code: a failure of
// p.container.updateState is now wrapped with its own error (uerr). The
// original wrapped the outer err, which is nil at that point, so the real
// cause was lost.
func (p *initProcess) start() (retErr error) {
	defer p.messageSockPair.parent.Close()
	// Start the prepared command, i.e. runc init.
	err := p.cmd.Start()
	...
	if err != nil {
		p.process.ops = nil
		return newSystemErrorWithCause(err, "starting init process command")
	}
	...
	// Put the child into its cgroups right away so that nothing it spawns can
	// escape the limits.
	if err := p.manager.Apply(p.pid()); err != nil {
		return newSystemErrorWithCause(err, "applying cgroup configuration for process")
	}
	if p.intelRdtManager != nil {
		if err := p.intelRdtManager.Apply(p.pid()); err != nil {
			return newSystemErrorWithCause(err, "applying Intel RDT configuration for process")
		}
	}
	// Send the bootstrap data over the init pipe; per the article, runc init
	// uses it to set up its namespaces.
	if _, err := io.Copy(p.messageSockPair.parent, p.bootstrapData); err != nil {
		return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
	}
	// Learn the pid of the final child through the init pipe.
	childPid, err := p.getChildPid()
	if err != nil {
		return newSystemErrorWithCause(err, "getting the final child's pid from pipe")
	}

	// Look up the pipe file descriptors of the child.
	fds, err := getPipeFds(childPid)
	if err != nil {
		return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", childPid)
	}
	// Remember them as the process's external descriptors.
	p.setExternalDescriptors(fds)

	// When a new cgroup namespace is requested (no path to join), tell the
	// child to create it now.
	if p.config.Config.Namespaces.Contains(configs.NEWCGROUP) && p.config.Config.Namespaces.PathOf(configs.NEWCGROUP) == "" {
		if _, err := p.messageSockPair.parent.Write([]byte{createCgroupns}); err != nil {
			return newSystemErrorWithCause(err, "sending synchronization value to init process")
		}
	}

	// Wait for the first child to exit. Author's note: that child is the
	// C-implemented nsexec stage of runc init, which consumes the bootstrap
	// data and sets up the namespaces before the Go part of runc init runs.
	if err := p.waitForChildExit(childPid); err != nil {
		return newSystemErrorWithCause(err, "waiting for our first child to exit")
	}

	...
	// Send the init configuration to the init process.
	if err := p.sendConfig(); err != nil {
		return newSystemErrorWithCause(err, "sending config to init process")
	}
	var (
		sentRun    bool
		sentResume bool
	)

	// Synchronise with the init process: the callback is invoked for every
	// sync message received on the init pipe.
	ierr := parseSync(p.messageSockPair.parent, func(sync *syncT) error {
		switch sync.Type {
		// The init process reports that it is ready.
		case procReady:
			// Apply the configured rlimits to the init process.
			if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
				return newSystemErrorWithCause(err, "setting rlimits for ready process")
			}
			// Without a mount namespace the hooks are run here; with one they
			// are run in the procHooks case below.
			if !p.config.Config.Namespaces.Contains(configs.NEWNS) {
				// Setup cgroup before the hook, so that the prestart and CreateRuntime hook could apply cgroup permissions.
				if err := p.manager.Set(p.config.Config); err != nil {
					return newSystemErrorWithCause(err, "setting cgroup config for ready process")
				}
				...
				if p.config.Config.Hooks != nil {
					s, err := p.container.currentOCIState()
					if err != nil {
						return err
					}
					// Report the pid of the started command and the
					// "creating" status to the hooks.
					s.Pid = p.cmd.Process.Pid
					s.Status = specs.StateCreating
					hooks := p.config.Config.Hooks

					if err := hooks[configs.Prestart].RunHooks(s); err != nil {
						return err
					}
					if err := hooks[configs.CreateRuntime].RunHooks(s); err != nil {
						return err
					}
				}
			}

			// generate a timestamp indicating when the container was started
			p.container.created = time.Now().UTC()
			p.container.state = &createdState{
				c: p.container,
			}

			state, uerr := p.container.updateState(p)
			if uerr != nil {
				// Wrap uerr: the enclosing err is nil here and would hide the cause.
				return newSystemErrorWithCause(uerr, "store init state")
			}
			p.container.initProcessStartTime = state.InitProcessStartTime

			// Tell the child to continue ("run").
			if err := writeSync(p.messageSockPair.parent, procRun); err != nil {
				return newSystemErrorWithCause(err, "writing syncT 'run'")
			}
			sentRun = true
		// The init process asks for the hooks to be run (per the article this
		// happens right before pivot_root).
		case procHooks:
			// Apply the cgroup settings before running the hooks.
			if err := p.manager.Set(p.config.Config); err != nil {
				return newSystemErrorWithCause(err, "setting cgroup config for procHooks process")
			}
			// Apply the Intel RDT settings, if a manager is configured.
			if p.intelRdtManager != nil {
				if err := p.intelRdtManager.Set(p.config.Config); err != nil {
					return newSystemErrorWithCause(err, "setting Intel RDT config for procHooks process")
				}
			}
			// Run the Prestart and CreateRuntime hooks.
			if p.config.Config.Hooks != nil {
				s, err := p.container.currentOCIState()
				if err != nil {
					return err
				}
				s.Pid = p.cmd.Process.Pid
				s.Status = specs.StateCreating
				hooks := p.config.Config.Hooks

				if err := hooks[configs.Prestart].RunHooks(s); err != nil {
					return err
				}
				if err := hooks[configs.CreateRuntime].RunHooks(s); err != nil {
					return err
				}
			}
			// Let the init process resume.
			if err := writeSync(p.messageSockPair.parent, procResume); err != nil {
				return newSystemErrorWithCause(err, "writing syncT 'resume'")
			}
			sentResume = true
		}

		return nil
	})
	// The child never got as far as being told to run: report the sync error.
	if !sentRun {
		return newSystemErrorWithCause(ierr, "container init")
	}
	// With a mount namespace the hooks round-trip (procHooks/procResume) must
	// have happened as well.
	if p.config.Config.Namespaces.Contains(configs.NEWNS) && !sentResume {
		return newSystemError(errors.New("could not synchronise after executing prestart and CreateRuntime hooks with container process"))
	}
	// Shut down the write side of the init pipe.
	if err := unix.Shutdown(int(p.messageSockPair.parent.Fd()), unix.SHUT_WR); err != nil {
		return newSystemErrorWithCause(err, "shutting down init pipe")
	}

	// Must be done after Shutdown so the child will exit and we can wait for it.
	if ierr != nil {
		p.wait()
		return ierr
	}
	return nil
}

至此 create 进程分析完毕了，可以看到 create 进程在等待 init 进程的初始化完成的通知，下面开始分析init 进程，看看 init做了些什么动作, 再坚持一下Fighting @_@;

2.3、runc init

runc/init.go

package main

import (
	...
    // 在这里有一个点是和上面init进程启动时的等待 nsexec 的执行，等就是这个nsenter的执行，这部分呢代码由C语言实现, 它当包被引入的时候就立刻执行，所以无须额外调用； 
	_ "github.com/opencontainers/runc/libcontainer/nsenter"
	...
)
...
...
// initCommand is the internal "init" subcommand: it creates a libcontainer
// factory and runs its StartInitialization. If StartInitialization returns an
// error the process exits with status 1; if it returns without error the
// command panics, because control is not expected to come back here
// (presumably the process image has been replaced by exec at that point).
var initCommand = cli.Command{
	Name:  "init",
	Usage: `initialize the namespaces and launch the process (do not call it outside of runc)`,
	Action: func(context *cli.Context) error {
		// NOTE(review): the error from libcontainer.New is discarded here.
		factory, _ := libcontainer.New("")
		// Walkthrough step: StartInitialization is examined next.
		if err := factory.StartInitialization(); err != nil {
			os.Exit(1)
		}
		panic("libcontainer: container init failed to exec")
	},
}

下面进入 factory.StartInitialization

runc/libcontainer/factory_linux.go LinuxFactory.StartInitialization

// StartInitialization runs inside runc init: it picks up the descriptors that
// the parent passed through _LIBCONTAINER_* environment variables, clears the
// environment and hands over to the init implementation selected by the init
// type. (Excerpt: the "..." line stands for statements elided in this listing,
// including the definition of consoleSocket.)
func (l *LinuxFactory) StartInitialization() (err error) {
	// Get the INITPIPE.
	// The descriptor number of the init pipe (the child end, "init-c" in this
	// walkthrough) is passed in _LIBCONTAINER_INITPIPE.
	envInitPipe := os.Getenv("_LIBCONTAINER_INITPIPE")
	pipefd, err := strconv.Atoi(envInitPipe)
	if err != nil {
		return fmt.Errorf("unable to convert _LIBCONTAINER_INITPIPE=%s to int: %s", envInitPipe, err)
	}
	// Wrap the descriptor in an *os.File to talk to the parent process.
	pipe := os.NewFile(uintptr(pipefd), "pipe")
	defer pipe.Close()

	// Read the init type; only for the standard init type is the exec fifo
	// descriptor taken from _LIBCONTAINER_FIFOFD (otherwise fifofd stays -1).
	fifofd := -1
	envInitType := os.Getenv("_LIBCONTAINER_INITTYPE")
	it := initType(envInitType)
	if it == initStandard {
		envFifoFd := os.Getenv("_LIBCONTAINER_FIFOFD")
		if fifofd, err = strconv.Atoi(envFifoFd); err != nil {
			return fmt.Errorf("unable to convert _LIBCONTAINER_FIFOFD=%s to int: %s", envFifoFd, err)
		}
	}

	// Drop the environment inherited from the parent.
	os.Clearenv()
	...
	// Build the init implementation for the init type (per the article:
	// linuxStandardInit for standard init, linuxSetnsInit for exec).
	i, err := newContainerInit(it, pipe, consoleSocket, fifofd)
	if err != nil {
		return err
	}
	// Walkthrough step: Init is examined next.
	return i.Init()
}

下面进入 linuxStandardInit.Init

runc/libcontainer/standard_init_linux.go linuxStandardInit.Init

篇幅比较长，主要是为了尽可能完整展示，以及回过头看能很好的回想起来；

// Init prepares the container environment inside the init process (network,
// rootfs, hostname, AppArmor, sysctls, seccomp, ...), reports readiness to the
// parent, waits on the exec fifo and finally execs the user command.
// (Excerpt: "..." lines stand for statements elided in this listing.)
func (l *linuxStandardInit) Init() error {
	// Pin this goroutine to its OS thread for the duration of Init.
	runtime.LockOSThread()
	defer runtime.UnlockOSThread()

	// Session keyring setup, skipped when NoNewKeyring is set (body elided).
	if !l.config.Config.NoNewKeyring {
		....
	}
	// Set up the network configuration.
	if err := setupNetwork(l.config); err != nil {
		return err
	}
	// Set up the routes.
	if err := setupRoute(l.config.Config); err != nil {
		return err
	}

	// initialises the labeling system
	selinux.GetEnabled()
	// Prepare the rootfs. Author's note (details live in prepareRootfs, not
	// shown here): this mounts the root and the configured mounts, creates
	// devices, asks runc create over the pipe to run the prestart hooks, and
	// then does pivot_root/chroot — so the hooks run before the root is
	// switched.
	if err := prepareRootfs(l.pipe, l.config); err != nil {
		return err
	}
	...

	// Finalize the rootfs; only done when a mount namespace is in use.
	if l.config.Config.Namespaces.Contains(configs.NEWNS) {
		if err := finalizeRootfs(l.config.Config); err != nil {
			return err
		}
	}
	// Set the hostname when one is configured.
	if hostname := l.config.Config.Hostname; hostname != "" {
		if err := unix.Sethostname([]byte(hostname)); err != nil {
			return errors.Wrap(err, "sethostname")
		}
	}
	// Apply the configured AppArmor profile.
	if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
		return errors.Wrap(err, "apply apparmor profile")
	}

	// Write the sysctls. For e.g. net.ipv4.ip_forward translated to /proc/sys/net/ipv4/ip_forward
	for key, value := range l.config.Config.Sysctl {
		if err := writeSystemProperty(key, value); err != nil {
			return errors.Wrapf(err, "write sysctl key %s", key)
		}
	}
	...
	...
	if err != nil {
		return errors.Wrap(err, "get pdeath signal")
	}
	// Set the no_new_privs bit when requested.
	if l.config.NoNewPrivileges {
		if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
			return errors.Wrap(err, "set nonewprivileges")
		}
	}

	// Tell the parent (runc create) that the basic initialization is done and
	// this process is ready to exec.
	if err := syncParentReady(l.pipe); err != nil {
		return errors.Wrap(err, "sync ready")
	}
	...
	...
	// Without NoNewPrivileges the seccomp filter is installed here; with it,
	// the filter is installed later, just before the exec.
	if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
		if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
			return err
		}
	}
	// Finalize the namespace (per the article: capabilities, user and working
	// directory).
	if err := finalizeNamespace(l.config); err != nil {
		return err
	}
	...
	// If the parent pid is no longer the one recorded earlier, kill ourselves.
	if unix.Getppid() != l.parentPid {
		return unix.Kill(unix.Getpid(), unix.SIGKILL)
	}
	// Resolve the user command (Args[0]) now; at this point the container's
	// rootfs is in place, so the lookup happens inside the container's
	// filesystem.
	name, err := exec.LookPath(l.config.Args[0])
	if err != nil {
		return err
	}
	// Close the init pipe (init-c); the handshake with the parent is over.
	l.pipe.Close()
	// Re-open the exec fifo for writing through /proc/self/fd/<fifoFd>.
	// Presumably this open blocks until the other end is opened for reading
	// (by runc start) — standard FIFO behaviour; confirm against fifo(7).
	fd, err := unix.Open("/proc/self/fd/"+strconv.Itoa(l.fifoFd), unix.O_WRONLY|unix.O_CLOEXEC, 0)
	if err != nil {
		return newSystemErrorWithCause(err, "open exec fifo")
	}
	// Write a single byte into the exec fifo for runc start to read; once
	// that is done this process continues towards the exec below.
	if _, err := unix.Write(fd, []byte("0")); err != nil {
		return newSystemErrorWithCause(err, "write 0 exec fifo")
	}
	// Close the inherited fifo descriptor; fd opened above carries O_CLOEXEC.
	unix.Close(l.fifoFd)
	// With NoNewPrivileges the seccomp filter is installed as late as
	// possible, since the setup above needed system calls the filter may deny.
	if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
		if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
			return newSystemErrorWithCause(err, "init seccomp")
		}
	}

	s := l.config.SpecState
	s.Pid = unix.Getpid()
	// Report the "created" status to the StartContainer hooks.
	s.Status = specs.StateCreated
	if err := l.config.Config.Hooks[configs.StartContainer].RunHooks(s); err != nil {
		return err
	}
	// Exec the user command: the exec system call replaces the image of this
	// init process with the container's process.
	if err := unix.Exec(name, l.config.Args[0:], os.Environ()); err != nil {
		return newSystemErrorWithCause(err, "exec user process")
	}
	return nil
}

至此为止，已经看完create 和 init 的联动了，下面应该是 runc start 登场了；都到这里了，再坚持一下吧，runc start 的代码重点比较少，Fighting！！

2.4、runc start

快速开始, 老套路看入口

runc/start.go

...
	}
		// 获取container 的方式，是通过之前文章提到libcontainer.Factory 容器工厂类加载containerID 及配置
		container, err := getContainer(context)
		if err != nil {
			return err
		}
		...
		switch status {
		// start 前对应的状态就是容器环境已经创建好了，在等待start, 然后切换至running
		case libcontainer.Created:
			...
			// 执行exec 容器进程替代init
            // [进入 container.Exec]
			if err := container.Exec(); err != nil {
				return err
			}
			if notifySocket != nil {
				return notifySocket.waitForContainer(container)
			}
			return nil
		case libcontainer.Stopped:
			return errors.New("cannot start a container that has stopped")
		case libcontainer.Running:
			return errors.New("cannot start an already running container")
		default:
			return fmt.Errorf(
...

下面进入 container.Exec

runc/libcontainer/container_linux.go linuxContainer.exec

// exec releases the init process that is waiting on the container's exec
// fifo: it waits for the fifo to be opened and hands the result to
// handleFifoResult. While waiting it polls the init process every 100ms; if
// that process can no longer be stat'ed or has become a zombie, one last
// fifo open is tried and, if handling that result fails, "container process is
// already dead" is returned.
func (c *linuxContainer) exec() error {
	path := filepath.Join(c.root, execFifoFilename)
	pid := c.initProcess.pid()
	// Open the exec fifo (<container root>/exec.fifo) in the background. The
	// init process is blocked on the write side of this FIFO (see the "open
	// exec fifo" / "write 0 exec fifo" steps in linuxStandardInit.Init);
	// reading from it lets init continue.
	blockingFifoOpenCh := awaitFifoOpen(path)
	// Wait for either the fifo result or the death of the init process.
	for {
		select {
		case result := <-blockingFifoOpenCh:
			// Per the article, handleFifoResult reads the content and then
			// removes exec.fifo.
			return handleFifoResult(result)

		case <-time.After(time.Millisecond * 100):
			stat, err := system.Stat(pid)
			if err != nil || stat.State == system.Zombie {
				if err := handleFifoResult(fifoOpen(path, false)); err != nil {
					return errors.New("container process is already dead")
				}
				return nil
			}
		}
	}
}

start 进程到这也结束了，它主要是打开并读取阻塞着 init 进程的 exec.fifo（这是一个命名管道 FIFO，而不是 socket），让 init 进程继续往下执行，最终 exec 容器的启动命令；exec.fifo 就是 create 与 start 之间的分界点，把容器的启动割裂成两个控制命令；

3、总结

第一次写源码分析，今天总算是完成了之前的一个Flag，本应该上周就要完成；但为了力求真实反映容器启动过程，反复的阅读代码和查阅，才把这篇分析肝完；下周继续进行runc 其他子模块的源码分析；
如果文章对您有帮助，请在左边点赞，收藏

4、参考

github.com/opencontain…

5、系列

【runC】01-runC-介绍与命令
【runC】02-runC-源码分析-[create, init, start]
【runC】03-runC-源码分析-exec
【runC】04-runC-源码分析-[pause, resume]

上一篇：快速掌握 Gin Web框架：轻松入门只需一日的教学指南

下一篇：如何解决Git推送到master分支失败：pre-receive钩子被拒绝 (! [remote rejected]) 的问题与应对方法