syzkaller Source Code Reading Notes: syz-manager
In syzkaller, syz-manager is the component that manages the VMs, monitors them for crashes, and reproduces the crashes it finds.
main
Starting from the main function: it parses the command-line flags, loads the config file, and then calls RunManager.
// syz-manager/manager.go
func main() {
if prog.GitRevision == "" {
log.Fatalf("bad syz-manager build: build with make, run bin/syz-manager")
}
flag.Parse()
log.EnableLogCaching(1000, 1<<20)
cfg, err := mgrconfig.LoadFile(*flagConfig)
if err != nil {
log.Fatalf("%v", err)
}
RunManager(cfg)
}
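For reference, the config that mgrconfig.LoadFile parses is a JSON file along the following lines (a minimal sketch based on the upstream docs; all paths and addresses are placeholders):

{
	"target": "linux/amd64",
	"http": "127.0.0.1:56741",
	"workdir": "/path/to/workdir",
	"kernel_obj": "/path/to/kernel/build",
	"image": "/path/to/image.img",
	"sshkey": "/path/to/image.id_rsa",
	"syzkaller": "/path/to/syzkaller",
	"procs": 8,
	"type": "qemu",
	"vm": {
		"count": 4,
		"kernel": "/path/to/bzImage",
		"cpu": 2,
		"mem": 2048
	}
}

The type and vm fields are what RunManager hands off to vm.Create below.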
RunManager
// syz-manager/manager.go
func RunManager(cfg *mgrconfig.Config) {
var vmPool *vm.Pool
// Type "none" is a special case for debugging/development when manager
// does not start any VMs, but instead you start them manually
// and start syz-fuzzer there.
if cfg.Type != "none" {
var err error
vmPool, err = vm.Create(cfg, *flagDebug) // 「1」
if err != nil {
log.Fatalf("%v", err)
}
}
crashdir := filepath.Join(cfg.Workdir, "crashes")
osutil.MkdirAll(crashdir)
reporter, err := report.NewReporter(cfg)
if err != nil {
log.Fatalf("%v", err)
}
mgr := &Manager{
cfg: cfg,
vmPool: vmPool,
//...
}
mgr.preloadCorpus()
mgr.initStats() // Initializes prometheus variables.
mgr.initHTTP() // Creates HTTP server.
mgr.collectUsedFiles()
// Create RPC server for fuzzers.
mgr.serv, err = startRPCServer(mgr)
if err != nil {
log.Fatalf("failed to create rpc server: %v", err)
}
if cfg.DashboardAddr != "" {
// ...
}
go func() { // 「2」
for lastTime := time.Now(); ; {
time.Sleep(10 * time.Second)
now := time.Now()
diff := now.Sub(lastTime)
lastTime = now
mgr.mu.Lock()
if mgr.firstConnect.IsZero() {
mgr.mu.Unlock()
continue
}
mgr.fuzzingTime += diff * time.Duration(atomic.LoadUint32(&mgr.numFuzzing))
executed := mgr.stats.execTotal.get()
crashes := mgr.stats.crashes.get()
corpusCover := mgr.stats.corpusCover.get()
corpusSignal := mgr.stats.corpusSignal.get()
maxSignal := mgr.stats.maxSignal.get()
mgr.mu.Unlock()
numReproducing := atomic.LoadUint32(&mgr.numReproducing)
numFuzzing := atomic.LoadUint32(&mgr.numFuzzing)
log.Logf(0, "VMs %v, executed %v, cover %v, signal %v/%v, crashes %v, repro %v",
numFuzzing, executed, corpusCover, corpusSignal, maxSignal, crashes, numReproducing)
}
}()
if *flagBench != "" {
// ...
}
if mgr.dash != nil {
go mgr.dashboardReporter()
}
osutil.HandleInterrupts(vm.Shutdown)
if mgr.vmPool == nil {
log.Logf(0, "no VMs started (type=none)")
log.Logf(0, "you are supposed to start syz-fuzzer manually as:")
log.Logf(0, "syz-fuzzer -manager=manager.ip:%v [other flags as necessary]", mgr.serv.port)
<-vm.Shutdown
return
}
mgr.vmLoop() // 「3」
}
「1」 calls vm.Create to build the vmPool. As the name suggests, the vmPool is a pool that manages the VMs; every virtualization backend is created through this single unified interface, described in detail below.
「2」 starts a goroutine that every 10 seconds logs VM status, coverage, crash counts, and other statistics.
「3」 calls mgr.vmLoop, which boots the VMs and starts fuzzing; this one function covers a large amount of the manager's work.
vm.Create
// vm/vm.go
// Create creates a VM pool that can be used to create individual VMs.
func Create(cfg *mgrconfig.Config, debug bool) (*Pool, error) {
typ, ok := vmimpl.Types[cfg.Type] // 「1」
if !ok {
return nil, fmt.Errorf("unknown instance type '%v'", cfg.Type)
}
env := &vmimpl.Env{
Name: cfg.Name,
OS: cfg.TargetOS,
Arch: cfg.TargetVMArch,
Workdir: cfg.Workdir,
Image: cfg.Image,
SSHKey: cfg.SSHKey,
SSHUser: cfg.SSHUser,
Timeouts: cfg.Timeouts,
Debug: debug,
Config: cfg.VM,
}
impl, err := typ.Ctor(env) // 「2」
if err != nil {
return nil, err
}
return &Pool{
impl: impl,
workdir: env.Workdir,
template: cfg.WorkdirTemplate,
timeouts: cfg.Timeouts,
}, nil
}
「1」 uses the config file's type field to look up the corresponding Type object for later use; the type field names the virtualization backend, e.g. qemu or vmware.
// vm/vmimpl/vmimpl.go
type Type struct {
Ctor ctorFunc
Overcommit bool
}
var (
// ...
Types = make(map[string]Type)
)
// Register registers a new VM type within the package.
func Register(typ string, ctor ctorFunc, allowsOvercommit bool) {
Types[typ] = Type{
Ctor: ctor,
Overcommit: allowsOvercommit,
}
}
Each virtualization backend calls the Register function to register its Type object. Taking qemu as an example:
// vm/qemu/qemu.go
func init() {
var _ vmimpl.Infoer = (*instance)(nil)
vmimpl.Register("qemu", ctor, true)
}
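A hypothetical out-of-tree backend would follow the same pattern (a sketch only; the mybox backend and its trivial pool are illustrative and not part of syzkaller):

// vm/mybox/mybox.go (hypothetical)
package mybox

import (
	"fmt"

	"github.com/google/syzkaller/vm/vmimpl"
)

type pool struct {
	env *vmimpl.Env
}

func init() {
	// The last argument reports whether this VM type allows instance
	// overcommit (qemu registers true).
	vmimpl.Register("mybox", ctor, false)
}

func ctor(env *vmimpl.Env) (vmimpl.Pool, error) {
	return &pool{env: env}, nil
}

func (p *pool) Count() int { return 1 }

func (p *pool) Create(workdir string, index int) (vmimpl.Instance, error) {
	// A real backend would boot a VM here and return a handle to it.
	return nil, fmt.Errorf("mybox: not implemented")
}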
「2」 invokes the Type object's Ctor function to construct an object that implements the Pool interface. Again taking qemu as the example, here is its ctor implementation:
// vm/vmimpl/vmimpl.go
type Pool interface {
// Count returns total number of VMs in the pool.
Count() int
// Create creates and boots a new VM instance.
Create(workdir string, index int) (Instance, error)
}
// vm/qemu/qemu.go
type Pool struct {
env *vmimpl.Env
cfg *Config
target *targets.Target
archConfig *archConfig
version string
}
func ctor(env *vmimpl.Env) (vmimpl.Pool, error) {
archConfig := archConfigs[env.OS+"/"+env.Arch]
cfg := &Config{
Count: 1,
CPU: 1,
Mem: 1024,
ImageDevice: "hda",
Qemu: archConfig.Qemu,
QemuArgs: archConfig.QemuArgs,
NetDev: archConfig.NetDev,
Snapshot: true,
}
// ...
pool := &Pool{
env: env,
cfg: cfg,
version: version,
target: targets.Get(env.OS, env.Arch),
archConfig: archConfig,
}
return pool, nil
}
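The elided part of ctor then overlays the user's vm object from the manager config (passed through env.Config as raw JSON) on top of these defaults. Conceptually it amounts to the following (a sketch; the real code goes through syzkaller's config helpers and validates fields such as Count, CPU, and Mem):

// Sketch: merge the user's "vm" JSON over the defaults built above.
func applyUserConfig(raw json.RawMessage, cfg *Config) error {
	if len(raw) == 0 {
		return nil // no "vm" object given, keep the defaults
	}
	return json.Unmarshal(raw, cfg)
}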
mgr.vmLoop
// syz-manager/manager.go
func (mgr *Manager) vmLoop() {
// ...
for shutdown != nil || len(instances) != vmCount {
mgr.mu.Lock()
phase := mgr.phase
mgr.mu.Unlock()
// ...
log.Logf(1, "loop: phase=%v shutdown=%v instances=%v/%v %+v repro: pending=%v reproducing=%v queued=%v",
phase, shutdown == nil, len(instances), vmCount, instances,
len(pendingRepro), len(reproducing), len(reproQueue))
canRepro := func() bool {
return phase >= phaseTriagedHub &&
len(reproQueue) != 0 && reproInstances+instancesPerRepro <= vmCount
}
if shutdown != nil {
for canRepro() && len(instances) >= instancesPerRepro { // 「1」
last := len(reproQueue) - 1
crash := reproQueue[last]
reproQueue[last] = nil
reproQueue = reproQueue[:last]
vmIndexes := append([]int{}, instances[len(instances)-instancesPerRepro:]...)
instances = instances[:len(instances)-instancesPerRepro]
reproInstances += instancesPerRepro
atomic.AddUint32(&mgr.numReproducing, 1)
log.Logf(1, "loop: starting repro of '%v' on instances %+v", crash.Title, vmIndexes)
go func() {
features := mgr.checkResult.Features
res, stats, err := repro.Run(crash.Output, mgr.cfg, features, mgr.reporter, mgr.vmPool, vmIndexes)
reproDone <- &ReproResult{
instances: vmIndexes,
report0: crash.Report,
res: res,
stats: stats,
err: err,
hub: crash.hub,
}
}()
}
for !canRepro() && len(instances) != 0 { // 「2」
last := len(instances) - 1
idx := instances[last]
instances = instances[:last]
log.Logf(1, "loop: starting instance %v", idx)
go func() {
crash, err := mgr.runInstance(idx)
runDone <- &RunResult{idx, crash, err}
}()
}
}
// ...
}
}
「1」 if there are crashes queued for reproduction and enough VMs are free, take instancesPerRepro VMs (the minimum of 4 and vmCount) and hand them to repro.Run to reproduce the crash.
「2」 while nothing can be reproduced and idle VMs remain, take one VM at a time and call mgr.runInstance on it to fuzz.
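The part of the loop elided above collects results from these goroutines over channels; schematically it does something like this (a simplified sketch following the variable names in the excerpt, with crash saving and error handling omitted):

// Simplified sketch of vmLoop's result collection (not the verbatim source).
select {
case res := <-runDone:
	// A fuzzing VM finished: hand its index back to the idle list and,
	// if it crashed, remember the crash so it can be saved and queued
	// for reproduction.
	instances = append(instances, res.idx)
	if res.crash != nil {
		pendingRepro[res.crash] = true
	}
case res := <-reproDone:
	// A reproduction attempt finished: release its VMs and record the result.
	atomic.AddUint32(&mgr.numReproducing, ^uint32(0)) // decrement
	reproInstances -= instancesPerRepro
	instances = append(instances, res.instances...)
case <-vm.Shutdown:
	// Stop scheduling new work; the loop condition lets it drain and exit.
	shutdown = nil
}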
mgr.runInstance
// syz-manager/manager.go
func (mgr *Manager) runInstance(index int) (*Crash, error) {
mgr.checkUsedFiles()
instanceName := fmt.Sprintf("vm-%d", index)
rep, vmInfo, err := mgr.runInstanceInner(index, instanceName)
machineInfo := mgr.serv.shutdownInstance(instanceName)
if len(vmInfo) != 0 {
machineInfo = append(append(vmInfo, '\n'), machineInfo...)
}
// ...
crash := &Crash{
vmIndex: index,
hub: false,
Report: rep,
machineInfo: machineInfo,
}
return crash, nil
}
It calls mgr.runInstanceInner, then assembles the output and returns it.
// syz-manager/manager.go
func (mgr *Manager) runInstanceInner(index int, instanceName string) (*report.Report, []byte, error) {
inst, err := mgr.vmPool.Create(index)
// ...
fuzzerBin, err := inst.Copy(mgr.cfg.FuzzerBin) // 「1」
// ...
// If ExecutorBin is provided, it means that syz-executor is already in the image,
// so no need to copy it.
executorBin := mgr.sysTarget.ExecutorBin
if executorBin == "" {
executorBin, err = inst.Copy(mgr.cfg.ExecutorBin) // 「2」
// ...
}
// ...
cmd := instance.FuzzerCmd(fuzzerBin, executorBin, instanceName,
mgr.cfg.TargetOS, mgr.cfg.TargetArch, fwdAddr, mgr.cfg.Sandbox, procs, fuzzerV,
mgr.cfg.Cover, *flagDebug, false, false, true, mgr.cfg.Timeouts.Slowdown)
outc, errc, err := inst.Run(mgr.cfg.Timeouts.VMRunningTime, mgr.vmStop, cmd) // 「3」
// ...
var vmInfo []byte
rep := inst.MonitorExecution(outc, errc, mgr.reporter, vm.ExitTimeout) // 「4」
if rep == nil {
// ...
} else {
vmInfo, err = inst.Info()
if err != nil {
// ...
}
}
return rep, vmInfo, nil
}
「1」 copies syz-fuzzer into the VM.
「2」 copies syz-executor into the VM.
「3」 runs syz-fuzzer inside the VM over SSH:
// vm/qemu/qemu.go
func (inst *instance) Run(timeout time.Duration, stop <-chan bool, command string) (
<-chan []byte, <-chan error, error) {
// ...
sshArgs := vmimpl.SSHArgsForward(inst.debug, inst.sshkey, inst.port, inst.forwardPort)
// ...
} else {
args = []string{"ssh"}
args = append(args, sshArgs...)
args = append(args, inst.sshuser+"@localhost", "cd "+inst.targetDir()+" && "+command)
}
if inst.debug {
log.Logf(0, "running command: %#v", args)
}
cmd := osutil.Command(args[0], args[1:]...)
// ...
}
「4」 monitors the VM's console output for kernel oops messages, using bytes.Contains to check for signature byte sequences that indicate a crash.
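In miniature, the matching boils down to scanning the output for known oops prefixes (an illustrative sketch; the real matching lives in pkg/report and handles far more patterns per OS, plus report extraction and deduplication):

// oops_sketch.go: signature-based crash detection in miniature.
package main

import (
	"bytes"
	"fmt"
)

// A small illustrative subset of the sequences the kernel prints on a crash.
var oopses = [][]byte{
	[]byte("BUG:"),
	[]byte("WARNING:"),
	[]byte("Kernel panic"),
	[]byte("general protection fault"),
}

func containsCrash(output []byte) bool {
	for _, oops := range oopses {
		if bytes.Contains(output, oops) {
			return true
		}
	}
	return false
}

func main() {
	fmt.Println(containsCrash([]byte("BUG: KASAN: use-after-free in foo")))
}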
repro.Run
Its core is the call to ctx.repro:
// pkg/repro/repro.go
func (ctx *context) repro(entries []*prog.LogEntry, crashStart int) (*Result, error) {
// ...
res, err := ctx.extractProg(entries) // 「1」
// ...
res, err = ctx.minimizeProg(res) // 「2」
// ...
// Try extracting C repro without simplifying options first.
res, err = ctx.extractC(res) // 「3」
// ...
// Simplify options and try extracting C repro.
if !res.CRepro {
res, err = ctx.simplifyProg(res) // 「4」
// ...
}
// Simplify C related options.
if res.CRepro {
res, err = ctx.simplifyC(res) // 「5」
// ...
}
return res, nil
}
「1」 extracts the program(s) that caused the crash:
// pkg/repro/repro.go
func (ctx *context) extractProg(entries []*prog.LogEntry) (*Result, error) {
// ...
// Extract last program on every proc.
procs := make(map[int]int)
for i, ent := range entries {
procs[ent.Proc] = i
}
var indices []int
for _, idx := range procs {
indices = append(indices, idx)
}
sort.Ints(indices)
var lastEntries []*prog.LogEntry
for i := len(indices) - 1; i >= 0; i-- { // collect the executed programs in reverse order
lastEntries = append(lastEntries, entries[indices[i]])
}
for _, timeout := range ctx.testTimeouts {
// Execute each program separately to detect simple crashes caused by a single program.
// Programs are executed in reverse order, usually the last program is the guilty one.
res, err := ctx.extractProgSingle(lastEntries, timeout) // run the programs one at a time until one triggers the crash
if err != nil {
return nil, err
}
if res != nil {
ctx.reproLogf(3, "found reproducer with %d syscalls", len(res.Prog.Calls))
return res, nil
}
// Don't try bisecting if there's only one entry.
if len(entries) == 1 {
continue
}
// Execute all programs and bisect the log to find multiple guilty programs.
res, err = ctx.extractProgBisect(entries, timeout) // no single program triggers the crash; it may take several together
// ...
}
// ...
}
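When it falls through to extractProgBisect, the core idea is plain bisection over the program list (a simplified sketch assuming syzkaller's prog package is imported; the real code also re-runs candidates to cope with flaky crashes and can keep programs from both halves):

// Bisection sketch: keep halving the candidate list while one half still
// reproduces the crash; stop when programs from both halves are needed.
func bisect(progs []*prog.LogEntry, crashes func([]*prog.LogEntry) bool) []*prog.LogEntry {
	for len(progs) > 1 {
		half := len(progs) / 2
		switch {
		case crashes(progs[:half]):
			progs = progs[:half]
		case crashes(progs[half:]):
			progs = progs[half:]
		default:
			return progs // the crash needs programs from both halves
		}
	}
	return progs
}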
「2」 minimizes the reproducer, removing calls and simplifying arguments:
// pkg/repro/repro.go
func (ctx *context) minimizeProg(res *Result) (*Result, error) {
// ...
res.Prog, _ = prog.Minimize(res.Prog, -1, true,
func(p1 *prog.Prog, callIndex int) bool {
crashed, err := ctx.testProg(p1, res.Duration, res.Opts)
if err != nil {
ctx.reproLogf(0, "minimization failed with %v", err)
return false
}
return crashed
})
return res, nil
}
// prog/minimization.go
func Minimize(p0 *Prog, callIndex0 int, crash bool, pred0 func(*Prog, int) bool) (*Prog, int) {
pred := func(p *Prog, callIndex int) bool {
p.sanitizeFix()
p.debugValidate()
return pred0(p, callIndex)
}
// ...
// Try to remove all calls except the last one one-by-one.
p0, callIndex0 = removeCalls(p0, callIndex0, crash, pred)
// Try to reset all call props to their default values.
p0 = resetCallProps(p0, callIndex0, pred)
// Try to minimize individual calls.
for i := 0; i < len(p0.Calls); i++ { // strip from each call the arguments irrelevant to the crash
ctx := &minimizeArgsCtx{
target: p0.Target,
p0: &p0,
callIndex0: callIndex0,
crash: crash,
pred: pred,
triedPaths: make(map[string]bool),
}
again:
ctx.p = p0.Clone()
ctx.call = ctx.p.Calls[i]
for j, field := range ctx.call.Meta.Args {
if ctx.do(ctx.call.Args[j], field.Name, "") {
goto again
}
}
p0 = minimizeCallProps(p0, i, callIndex0, pred)
}
// ...
return p0, callIndex0
}
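Note that prog.Minimize is driven entirely by the caller's predicate: it proposes a smaller program, and minimizeProg's callback decides whether the candidate still crashes by actually running it. The call-removal step reduces to this pattern (a sketch over plain strings rather than syzkaller's types):

// Greedy call removal in miniature: try dropping each call and keep the
// smaller program whenever the predicate still reports a crash.
func removeCallsSketch(calls []string, stillCrashes func([]string) bool) []string {
	for i := len(calls) - 1; i >= 0; i-- {
		candidate := make([]string, 0, len(calls)-1)
		candidate = append(candidate, calls[:i]...)
		candidate = append(candidate, calls[i+1:]...)
		if stillCrashes(candidate) {
			calls = candidate
		}
	}
	return calls
}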
「3」 tries to build a C reproducer from the crashing program without simplifying any options first.
「4」 and 「5」: if C extraction failed, 「4」 simplifies the program options and retries it; once a C reproducer exists, 「5」 simplifies the C-related options.
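Both simplification passes share one shape: flip a single option toward a simpler value, re-test, and keep the change only if the reproducer still crashes. Schematically (a sketch; the real simplifier lists live in pkg/repro and operate on csource.Options):

// One-at-a-time option simplification (sketch). Each simplifier mutates a
// copy of the options toward something simpler and reports whether it
// changed anything.
type options map[string]bool

type simplifier func(opts options) bool

func simplify(opts options, simplifiers []simplifier, stillCrashes func(options) bool) options {
	for _, simp := range simplifiers {
		candidate := make(options, len(opts))
		for k, v := range opts {
			candidate[k] = v
		}
		if !simp(candidate) {
			continue // this aspect is already as simple as it gets
		}
		if stillCrashes(candidate) {
			opts = candidate
		}
	}
	return opts
}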