内存管理初始化

Linux 内核学习笔记系列，内存管理部分，简单介绍内存管理初始化。

内存管理初始化简介

体系结构相关的部分只考虑 x86-32 架构。

初始化相关的操作总是从 start_kernel() 开始看起，与内存管理初始化相关的流程如下（图片来源于 PLKA）：

start_kernel()

start_kernel() 函数位于 init/main.c。

`setup_arch()`

setup_arch() 的流程如下（图片来源于 PLKA）：

setup_arch()

setup_arch() 函数位于 arch/x86/kernel/setup.c。

这部分内容在内核中经常被重构，导致不同内核版本间差异较大。事实上，我所查看的源码版本虽然与 PLKA 指示的内核版本很接近，但图中的很多函数已经重构了。

`machine_specific_memory_setup()`

直接搜索找不到 machine_specific_memory_setup() 函数，该函数应该已经被重构，推测功能接近的函数为 setup_memory_map()。

x86-32 架构使用 BIOS 的 e820 功能来探测物理内存。

在 arch/x86/kernel/e820.c 中，定义了 char *__init default_machine_specific_memory_setup(void) 函数，这个函数在 arch/x86/kernel/x86_init.c 中被赋值给了 resources.memory_setup：

/*
 * The platform setup functions are preset with the default functions
 * for standard PC hardware.
 */
struct x86_init_ops x86_init __initdata = {

    .resources = {
        .probe_roms         = x86_init_noop,
        .reserve_resources  = reserve_standard_io_resources,
        .memory_setup       = default_machine_specific_memory_setup,
    },
    ...

之后在 setup_memory_map() 函数中被调用，而 setup_memory_map() 函数在 setup_arch() 中被调用。

arch/x86/kernel/e820.c：

void __init setup_memory_map(void)
{
    char *who;

    who = x86_init.resources.memory_setup();
    memcpy(&e820_saved, &e820, sizeof(struct e820map));
    printk(KERN_INFO "BIOS-provided physical RAM map:\n");
e820_print_map(who);
}

`parse_early_param()`

内核的命令行参数可能有与内存管理相关的选项，因此把 parse_early_param() 也视为内存管理初始化的一部分。

parse_early_param() 函数首先在 setup_arch() 函数中调用，虽然之后 start_kernel() 也会调用它。该函数的定义位于 init/main.c：

/* Arch code calls this early on, or if not, just before other parsing. */
void __init parse_early_param(void)
{
    static __initdata int done = 0;
    static __initdata char tmp_cmdline[COMMAND_LINE_SIZE];

    if (done)
        return;

    /* All fall through to do_early_param. */
    strlcpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE);
    parse_early_options(tmp_cmdline);
    done = 1;
}

该函数调用 init/main.c 中的 parse_early_options()，通过 parse_args() 函数处理（parse_args() 细节不展开了）：

void __init parse_early_options(char *cmdline)
{
    parse_args("early options", cmdline, NULL, 0, do_early_param);
}

`setup_memory()`

该函数在当前版本内核源码里已经找不到了，推测改名为了 initmem_init() 和，UMA 和 NUMA 版本分别放置于 arch/x86/mm/init_32.c 和 arch/x86/mm/numa_32.c。

arch/x86/mm/init_32.c：

#ifndef CONFIG_NEED_MULTIPLE_NODES
void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
                                int acpi, int k8)
{
#ifdef CONFIG_HIGHMEM
    highstart_pfn = highend_pfn = max_pfn;
    if (max_pfn > max_low_pfn)
        highstart_pfn = max_low_pfn;
    e820_register_active_regions(0, 0, highend_pfn);
    sparse_memory_present_with_active_regions(0);
    printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
        pages_to_mb(highend_pfn - highstart_pfn));
    num_physpages = highend_pfn;
    high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
#else
    e820_register_active_regions(0, 0, max_low_pfn);
    sparse_memory_present_with_active_regions(0);
    num_physpages = max_low_pfn;
    high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
#endif
#ifdef CONFIG_FLATMEM
    max_mapnr = num_physpages;
#endif
    __vmalloc_start_set = true;

    printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
            pages_to_mb(max_low_pfn));

    setup_bootmem_allocator();
}
#endif /* !CONFIG_NEED_MULTIPLE_NODES */

arch/x86/mm/numa_32.c：

void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
                                int acpi, int k8)
{
    int nid;
    long kva_target_pfn;

    /*
     * When mapping a NUMA machine we allocate the node_mem_map arrays
     * from node local memory.  They are then mapped directly into KVA
     * between zone normal and vmalloc space.  Calculate the size of
     * this space and use it to adjust the boundary between ZONE_NORMAL
     * and ZONE_HIGHMEM.
     */

    get_memcfg_numa();

    kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE);

    kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
    do {
        kva_start_pfn = find_e820_area(kva_target_pfn<<PAGE_SHIFT,
                    max_low_pfn<<PAGE_SHIFT,
                    kva_pages<<PAGE_SHIFT,
                    PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT;
        kva_target_pfn -= PTRS_PER_PTE;
    } while (kva_start_pfn == -1UL && kva_target_pfn > min_low_pfn);

    if (kva_start_pfn == -1UL)
        panic("Can not get kva space\n");

    printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n",
        kva_start_pfn, max_low_pfn);
    printk(KERN_INFO "max_pfn = %lx\n", max_pfn);

    /* avoid clash with initrd */
    reserve_early(kva_start_pfn<<PAGE_SHIFT,
              (kva_start_pfn + kva_pages)<<PAGE_SHIFT,
             "KVA PG");
#ifdef CONFIG_HIGHMEM
    highstart_pfn = highend_pfn = max_pfn;
    if (max_pfn > max_low_pfn)
        highstart_pfn = max_low_pfn;
    printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
           pages_to_mb(highend_pfn - highstart_pfn));
    num_physpages = highend_pfn;
    high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
#else
    num_physpages = max_low_pfn;
    high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
#endif
    printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
            pages_to_mb(max_low_pfn));
    printk(KERN_DEBUG "max_low_pfn = %lx, highstart_pfn = %lx\n",
            max_low_pfn, highstart_pfn);

    printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n",
            (ulong) pfn_to_kaddr(max_low_pfn));
    for_each_online_node(nid) {
        init_remap_allocator(nid);

        allocate_pgdat(nid);
    }
    remap_numa_kva();

    printk(KERN_DEBUG "High memory starts at vaddr %08lx\n",
            (ulong) pfn_to_kaddr(highstart_pfn));
    for_each_online_node(nid)
        propagate_e820_map_node(nid);

    for_each_online_node(nid) {
        memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
        NODE_DATA(nid)->node_id = nid;
#ifndef CONFIG_NO_BOOTMEM
        NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
#endif
    }

    setup_bootmem_allocator();
}

该函数主要用于确定物理页的数目，然后初始化 bootmem 分配器，bootmem 分配器的具体细节见 bootmem 分配器。

`paging_init()`

paging_init() 函数位于 arch/x86/mm/init_32.c：

/*
 * paging_init() sets up the page tables - note that the first 8MB are
 * already mapped by head.S.
 *
 * This routines also unmaps the page at virtual kernel address 0, so
 * that we can trap those pesky NULL-reference errors in the kernel.
 */
void __init paging_init(void)
{
    pagetable_init();

    __flush_tlb_all();

    kmap_init();

    /*
     * NOTE: at this point the bootmem allocator is fully available.
     */
    sparse_init();
    zone_sizes_init();
}

该函数会调用 pagetable_init 来确保直接映射的地址空间的物理内存被初始化：

#ifdef CONFIG_HIGHMEM
static void __init permanent_kmaps_init(pgd_t *pgd_base)
{
    unsigned long vaddr;
    pgd_t *pgd;
    pud_t *pud;
    pmd_t *pmd;
    pte_t *pte;

    vaddr = PKMAP_BASE;
    page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);

    pgd = swapper_pg_dir + pgd_index(vaddr);
    pud = pud_offset(pgd, vaddr);
    pmd = pmd_offset(pud, vaddr);
    pte = pte_offset_kernel(pmd, vaddr);
    pkmap_page_table = pte;
}
#else
static inline void permanent_kmaps_init(pgd_t *pgd_base)
{
}
#endif /* CONFIG_HIGHMEM */

static void __init pagetable_init(void)
{
    pgd_t *pgd_base = swapper_pg_dir;

    permanent_kmaps_init(pgd_base);
}

这部分的细节太多，我不打算深入看了。

`zone_sizes_init()`

可以看到，在当前版本，该函数的调用者变成了 paging_init() 函数，而非 setup_arch() 函数。

static void __init zone_sizes_init(void)
{
    unsigned long max_zone_pfns[MAX_NR_ZONES];
    memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
    max_zone_pfns[ZONE_DMA] =
        virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
    max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
#ifdef CONFIG_HIGHMEM
    max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
#endif

    free_area_init_nodes(max_zone_pfns);
}

add_active_range() 对可用的物理内存建立一个相对简单的列表。但该函数已经不在 zone_sizes_init() 中被调用了，搜索后发现该函数在 initmem_init() 中已经被调用了，函数调用链是 initmem_init() -> e820_register_active_regions() -> add_active_range()。

free_area_init_nodes() 它使用之前建立的列表来建立完备的内核数据结构。该函数是体系结构无关的，定义于 mm/page_alloc.c，这里不展开。

`setup_per_cpu_areas()`

setup_per_cpu_areas() 函数位于 arch/x86/kernel/setup_percpu.c，用于初始化 per-CPU 高速缓存。

void __init setup_per_cpu_areas(void)
{
    unsigned int cpu;
    unsigned long delta;
    int rc;

    pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
        NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);

    /*
     * Allocate percpu area.  Embedding allocator is our favorite;
     * however, on NUMA configurations, it can result in very
     * sparse unit mapping and vmalloc area isn't spacious enough
     * on 32bit.  Use page in that case.
     */
#ifdef CONFIG_X86_32
    if (pcpu_chosen_fc == PCPU_FC_AUTO && pcpu_need_numa())
        pcpu_chosen_fc = PCPU_FC_PAGE;
#endif
    rc = -EINVAL;
    if (pcpu_chosen_fc != PCPU_FC_PAGE) {
        const size_t atom_size = cpu_has_pse ? PMD_SIZE : PAGE_SIZE;
        const size_t dyn_size = PERCPU_MODULE_RESERVE +
            PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE;

        rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
                        dyn_size, atom_size,
                        pcpu_cpu_distance,
                        pcpu_fc_alloc, pcpu_fc_free);
        if (rc < 0)
            pr_warning("%s allocator failed (%d), falling back to page size\n",
                   pcpu_fc_names[pcpu_chosen_fc], rc);
    }
    if (rc < 0)
        rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
                       pcpu_fc_alloc, pcpu_fc_free,
                       pcpup_populate_pte);
    if (rc < 0)
        panic("cannot initialize percpu area (err=%d)", rc);

    /* alrighty, percpu areas up and running */
    delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
    for_each_possible_cpu(cpu) {
        per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu];
        per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
        per_cpu(cpu_number, cpu) = cpu;
        setup_percpu_segment(cpu);
        setup_stack_canary_segment(cpu);
        /*
         * Copy data used in early init routines from the
         * initial arrays to the per cpu data areas.  These
         * arrays then become expendable and the *_early_ptr's
         * are zeroed indicating that the static arrays are
         * gone.
         */
#ifdef CONFIG_X86_LOCAL_APIC
        per_cpu(x86_cpu_to_apicid, cpu) =
            early_per_cpu_map(x86_cpu_to_apicid, cpu);
        per_cpu(x86_bios_cpu_apicid, cpu) =
            early_per_cpu_map(x86_bios_cpu_apicid, cpu);
#endif
#ifdef CONFIG_X86_64
        per_cpu(irq_stack_ptr, cpu) =
            per_cpu(irq_stack_union.irq_stack, cpu) +
            IRQ_STACK_SIZE - 64;
#ifdef CONFIG_NUMA
        per_cpu(x86_cpu_to_node_map, cpu) =
            early_per_cpu_map(x86_cpu_to_node_map, cpu);
#endif
#endif
        /*
         * Up to this point, the boot CPU has been using .data.init
         * area.  Reload any changed state for the boot CPU.
         */
        if (cpu == boot_cpu_id)
            switch_to_new_gdt(cpu);
    }

    /* indicate the early static arrays will soon be gone */
#ifdef CONFIG_X86_LOCAL_APIC
    early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
    early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
#endif
#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
    early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
#endif

#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
    /*
     * make sure boot cpu node_number is right, when boot cpu is on the
     * node that doesn't have mem installed
     */
    per_cpu(node_number, boot_cpu_id) = cpu_to_node(boot_cpu_id);
#endif

    /* Setup node to cpumask map */
    setup_node_to_cpumask_map();

    /* Setup cpu initialized, callin, callout masks */
    setup_cpu_local_masks();
}

`build_all_zonelists()`

build_all_zonelists() 函数位于 mm/page_alloc.c，用于初始化结点和内存区域。

该函数和其调用的相关函数定义如下：

static void build_zonelists(pg_data_t *pgdat)
{
    int j, node, load;
    enum zone_type i;
    nodemask_t used_mask;
    int local_node, prev_node;
    struct zonelist *zonelist;
    int order = current_zonelist_order;

    /* initialize zonelists */
    for (i = 0; i < MAX_ZONELISTS; i++) {
        zonelist = pgdat->node_zonelists + i;
        zonelist->_zonerefs[0].zone = NULL;
        zonelist->_zonerefs[0].zone_idx = 0;
    }

    /* NUMA-aware ordering of nodes */
    local_node = pgdat->node_id;
    load = nr_online_nodes;
    prev_node = local_node;
    nodes_clear(used_mask);

    memset(node_order, 0, sizeof(node_order));
    j = 0;

    while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
        int distance = node_distance(local_node, node);

        /*
         * If another node is sufficiently far away then it is better
         * to reclaim pages in a zone before going off node.
         */
        if (distance > RECLAIM_DISTANCE)
            zone_reclaim_mode = 1;

        /*
         * We don't want to pressure a particular node.
         * So adding penalty to the first node in same
         * distance group to make it round-robin.
         */
        if (distance != node_distance(local_node, prev_node))
            node_load[node] = load;

        prev_node = node;
        load--;
        if (order == ZONELIST_ORDER_NODE)
            build_zonelists_in_node_order(pgdat, node);
        else
            node_order[j++] = node;     /* remember order */
    }

    if (order == ZONELIST_ORDER_ZONE) {
        /* calculate node order -- i.e., DMA last! */
        build_zonelists_in_zone_order(pgdat, j);
    }

    build_thisnode_zonelists(pgdat);
}

/* return values int ....just for stop_machine() */
static int __build_all_zonelists(void *dummy)
{
    int nid;
    int cpu;

#ifdef CONFIG_NUMA
    memset(node_load, 0, sizeof(node_load));
#endif
    for_each_online_node(nid) {
        pg_data_t *pgdat = NODE_DATA(nid);

        build_zonelists(pgdat);
        build_zonelist_cache(pgdat);
    }

    /*
     * Initialize the boot_pagesets that are going to be used
     * for bootstrapping processors. The real pagesets for
     * each zone will be allocated later when the per cpu
     * allocator is available.
     *
     * boot_pagesets are used also for bootstrapping offline
     * cpus if the system is already booted because the pagesets
     * are needed to initialize allocators on a specific cpu too.
     * F.e. the percpu allocator needs the page allocator which
     * needs the percpu allocator in order to allocate its pagesets
     * (a chicken-egg dilemma).
     */
    for_each_possible_cpu(cpu)
        setup_pageset(&per_cpu(boot_pageset, cpu), 0);

    return 0;
}

void build_all_zonelists(void)
{
    set_zonelist_order();

    if (system_state == SYSTEM_BOOTING) {
        __build_all_zonelists(NULL);
        mminit_verify_zonelist();
        cpuset_init_current_mems_allowed();
    } else {
        /* we have to stop all cpus to guarantee there is no user
           of zonelist */
        stop_machine(__build_all_zonelists, NULL, NULL);
        /* cpuset refresh routine should be here */
    }
    vm_total_pages = nr_free_pagecache_pages();
    /*
     * Disable grouping by mobility if the number of pages in the
     * system is too low to allow the mechanism to work. It would be
     * more accurate, but expensive to check per-zone. This check is
     * made on memory-hotadd so a system can start with mobility
     * disabled and enable it later
     */
    if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
        page_group_by_mobility_disabled = 1;
    else
        page_group_by_mobility_disabled = 0;

    printk("Built %i zonelists in %s order, mobility grouping %s.  "
        "Total pages: %ld\n",
            nr_online_nodes,
            zonelist_order_name[current_zonelist_order],
            page_group_by_mobility_disabled ? "off" : "on",
            vm_total_pages);
#ifdef CONFIG_NUMA
    printk("Policy zone: %s\n", zone_names[policy_zone]);
#endif
}

`mm_init()`

mm_init() 函数位于 init/main.c：

/*
 * Set up kernel memory allocators
 */
static void __init mm_init(void)
{
    /*
     * page_cgroup requires countinous pages as memmap
     * and it's bigger than MAX_ORDER unless SPARSEMEM.
     */
    page_cgroup_init_flatmem();
    mem_init();
    kmem_cache_init();
    pgtable_cache_init();
    vmalloc_init();
}

mem_init() 函数用于停用 bootmem 分配器并迁移到实际的内存管理函数（见 bootmem 分配器），kmem_cache_init() 函数用于初始化 slab/slob/slub 分配器。

`setup_per_cpu_pageset()`

setup_per_cpu_pageset() 函数位于 mm/page_alloc.c，用于分配并初始化 pageset：

/*
 * Allocate per cpu pagesets and initialize them.
 * Before this call only boot pagesets were available.
 * Boot pagesets will no longer be used by this processorr
 * after setup_per_cpu_pageset().
 */
void __init setup_per_cpu_pageset(void)
{
    struct zone *zone;
    int cpu;

    for_each_populated_zone(zone) {
        zone->pageset = alloc_percpu(struct per_cpu_pageset);

        for_each_possible_cpu(cpu) {
            struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);

            setup_pageset(pcp, zone_batchsize(zone));

            if (percpu_pagelist_fraction)
                setup_pagelist_highmark(pcp,
                    (zone->present_pages /
                        percpu_pagelist_fraction));
        }
    }
}

FreeFlyingSheep