简单介绍 Linux 内核的 KVM 模块,以 MIPS 体系结构为例,内核源码版本为 5.10.42。

KVM (Kernel-based Virtual Machine)

KVM 是一个完整的虚拟化解决方案,允许用户在特定机器上用虚拟机直接运行未经修改的 Linux 内核。

KVM 通常和 QEMU 结合使用,见 QEMU 简介

KVM 的 API 接口可以参考内核文档

KVM 模块有体系结构相关和无关的两部分,简单起见,体系结构相关的部分只考虑 MIPS 的代码,选用内核源码版本为 5.10.42。

模块初始化

arch/mips/kvm/mips.c

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
/*
 * Module entry point for KVM on MIPS.
 *
 * Refuses to load when the CPU reports MMID support, then runs the
 * MIPS-specific entry setup followed by the generic kvm_init().  Only
 * after both succeed are the Loongson interrupt-priority hook and the
 * die notifier installed.
 *
 * Returns 0 on success, -EOPNOTSUPP when cpu_has_mmid is set, or the
 * error code of whichever setup step failed.
 */
static int __init kvm_mips_init(void)
{
    int err;

    /* MMIDs are not handled by this KVM implementation: bail out early. */
    if (cpu_has_mmid) {
        pr_warn("KVM does not yet support MMIDs. KVM Disabled\n");
        return -EOPNOTSUPP;
    }

    /* kvm_init() is attempted only if the arch entry setup succeeded. */
    err = kvm_mips_entry_setup();
    if (!err)
        err = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
    if (err)
        return err;

    /* Loongson64 CPUs use their own priority-to-IRQ mapping. */
    if (boot_cpu_type() == CPU_LOONGSON64)
        kvm_priority_to_irq = kvm_loongson3_priority_to_irq;

    register_die_notifier(&kvm_mips_csr_die_notifier);

    return 0;
}

module_init(kvm_mips_init);

kvm_mips_init() 函数会调用体系结构无关的初始化函数 kvm_init()(定义在 virt/kvm/kvm_main.c):

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
/*
 * kvm_init - architecture-independent KVM initialisation.
 * @opaque:     passed through unchanged to kvm_arch_init(),
 *              kvm_arch_hardware_setup() and (via struct
 *              kvm_cpu_compat_check) check_processor_compat().
 * @vcpu_size:  object size used for the "kvm_vcpu" slab cache.
 * @vcpu_align: alignment for that cache; 0 selects
 *              __alignof__(struct kvm_vcpu).
 * @module:     stored as .owner of kvm_chardev_ops, kvm_vm_fops and
 *              kvm_vcpu_fops.
 *
 * Runs the setup steps in order and, if one fails, jumps to the label
 * that undoes everything done before it (the labels at the bottom fall
 * through, so cleanup happens in reverse order of setup).
 *
 * Returns 0 on success, otherwise the error code of the failing step.
 * A failure of kvm_vfio_ops_init() only triggers WARN_ON(); the function
 * still returns 0 in that case.
 */
int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
                  struct module *module)
{
    struct kvm_cpu_compat_check c;
    int r;
    int cpu;

    r = kvm_arch_init(opaque);
    if (r)
        goto out_fail;

    /*
     * kvm_arch_init makes sure there's at most one caller
     * for architectures that support multiple implementations,
     * like intel and amd on x86.
     * kvm_arch_init must be called before kvm_irqfd_init to avoid creating
     * conflicts in case kvm is already setup for another implementation.
     */
    r = kvm_irqfd_init();
    if (r)
        goto out_irqfd;

    if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
        r = -ENOMEM;
        goto out_free_0;
    }

    r = kvm_arch_hardware_setup(opaque);
    if (r < 0)
        goto out_free_1;

    /*
     * Run check_processor_compat() on every online CPU.  The callback
     * receives &c, whose .ret points at r; presumably it reports its
     * result through that pointer, which is why r is tested after each
     * call -- confirm against check_processor_compat().
     */
    c.ret = &r;
    c.opaque = opaque;
    for_each_online_cpu(cpu) {
        smp_call_function_single(cpu, check_processor_compat, &c, 1);
        if (r < 0)
            goto out_free_2;
    }

    /* Install kvm_starting_cpu/kvm_dying_cpu for the CPUHP_AP_KVM_STARTING state. */
    r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting",
                                  kvm_starting_cpu, kvm_dying_cpu);
    if (r)
        goto out_free_2;
    register_reboot_notifier(&kvm_reboot_notifier);

    /* A kmem cache lets us meet the alignment requirements of fx_save. */
    if (!vcpu_align)
        vcpu_align = __alignof__(struct kvm_vcpu);
    /* The offset/size pair passed below describes the 'arch' member of struct kvm_vcpu. */
    kvm_vcpu_cache =
        kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
                                   SLAB_ACCOUNT,
                                   offsetof(struct kvm_vcpu, arch),
                                   sizeof_field(struct kvm_vcpu, arch),
                                   NULL);
    if (!kvm_vcpu_cache) {
        r = -ENOMEM;
        goto out_free_3;
    }

    r = kvm_async_pf_init();
    if (r)
        goto out_free;

    /* Record the owning module on all three file_operations tables. */
    kvm_chardev_ops.owner = module;
    kvm_vm_fops.owner = module;
    kvm_vcpu_fops.owner = module;

    /* Register the misc device described by kvm_dev. */
    r = misc_register(&kvm_dev);
    if (r) {
        pr_err("kvm: misc device register failed\n");
        goto out_unreg;
    }

    /* Nothing from here on can make kvm_init() fail. */
    register_syscore_ops(&kvm_syscore_ops);

    kvm_preempt_ops.sched_in = kvm_sched_in;
    kvm_preempt_ops.sched_out = kvm_sched_out;

    kvm_init_debug();

    /* A VFIO ops failure is reported with a warning but is not fatal. */
    r = kvm_vfio_ops_init();
    WARN_ON(r);

    return 0;

    /*
     * Error unwinding.  Each label undoes the step that succeeded just
     * before the failing one and then falls through to the older ones.
     */
out_unreg:
    /* misc_register() failed: undo kvm_async_pf_init(). */
    kvm_async_pf_deinit();
out_free:
    /* kvm_async_pf_init() failed: drop the vcpu cache. */
    kmem_cache_destroy(kvm_vcpu_cache);
out_free_3:
    /* vcpu cache creation failed: undo reboot notifier and hotplug state. */
    unregister_reboot_notifier(&kvm_reboot_notifier);
    cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
out_free_2:
    /* Compat check or hotplug setup failed: undo kvm_arch_hardware_setup(). */
    kvm_arch_hardware_unsetup();
out_free_1:
    /* kvm_arch_hardware_setup() failed: free cpus_hardware_enabled. */
    free_cpumask_var(cpus_hardware_enabled);
out_free_0:
    /* cpumask allocation failed: undo kvm_irqfd_init(). */
    kvm_irqfd_exit();
out_irqfd:
    /* kvm_irqfd_init() failed: undo kvm_arch_init(). */
    kvm_arch_exit();
out_fail:
    return r;
}
EXPORT_SYMBOL_GPL(kvm_init);

可以看到,kvm_init() 会调用 misc_register() 函数来注册杂项(misc)字符设备 /dev/kvm(注意它是字符设备而不是块设备),该设备的文件操作由 kvm_chardev_ops 提供:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
/*
 * ioctl handler installed as .unlocked_ioctl of kvm_chardev_ops.
 *
 * @filp:  file the ioctl was issued on; only forwarded to
 *         kvm_arch_dev_ioctl() for requests not handled here.
 * @ioctl: request code.
 * @arg:   request argument; must be 0 for KVM_GET_API_VERSION and
 *         KVM_GET_VCPU_MMAP_SIZE.
 *
 * Returns the request-specific result, -EINVAL when a request that takes
 * no argument is given a non-zero one, or -EOPNOTSUPP for the
 * KVM_TRACE_* requests.
 */
static long kvm_dev_ioctl(struct file *filp,
                          unsigned int ioctl, unsigned long arg)
{
    long mmap_size;

    switch (ioctl) {
    case KVM_GET_API_VERSION:
        if (arg)
            return -EINVAL;
        return KVM_API_VERSION;

    case KVM_CREATE_VM:
        return kvm_dev_ioctl_create_vm(arg);

    case KVM_CHECK_EXTENSION:
        return kvm_vm_ioctl_check_extension_generic(NULL, arg);

    case KVM_GET_VCPU_MMAP_SIZE:
        if (arg)
            return -EINVAL;
        /* One page for struct kvm_run, plus optional extra pages. */
        mmap_size = PAGE_SIZE;
#ifdef CONFIG_X86
        mmap_size += PAGE_SIZE;    /* pio data page */
#endif
#ifdef CONFIG_KVM_MMIO
        mmap_size += PAGE_SIZE;    /* coalesced mmio ring page */
#endif
        return mmap_size;

    case KVM_TRACE_ENABLE:
    case KVM_TRACE_PAUSE:
    case KVM_TRACE_DISABLE:
        return -EOPNOTSUPP;

    default:
        /* Anything else is left to the architecture code. */
        return kvm_arch_dev_ioctl(filp, ioctl, arg);
    }
}

/*
 * File operations of the KVM control device: ioctl requests are routed
 * to kvm_dev_ioctl() and seeking is a no-op.  KVM_COMPAT() is a macro
 * defined elsewhere; presumably it fills in the compat-ioctl related
 * members -- confirm against its definition.
 */
static struct file_operations kvm_chardev_ops = {
    .llseek         = noop_llseek,
    .unlocked_ioctl = kvm_dev_ioctl,
    KVM_COMPAT(kvm_dev_ioctl),
};

static struct miscdevice kvm_dev = {
    KVM_MINOR,
    "kvm",
    &kvm_chardev_ops,
};

创建虚拟机

TODO