Search code examples
Tags: c, malloc, virtual-memory, memory-mapping, pmap

Why malloc doesn't malloc?


Here's a C program to introduce the problem.

#include <stdlib.h>
#include <stdio.h>
#include <math.h>

/*
 * Request argv[1] MiB of heap memory as an array of int and print its
 * contents in an endless loop, so that the memory usage of the running
 * process can be inspected from another terminal.
 *
 * argc/argv: exactly one argument is expected, a positive decimal number
 *            of mebibytes to request.
 *
 * Exit status: 8 if the argument is missing or not a usable positive size,
 *              1 if the allocation fails. Otherwise the program never
 *              returns; it has to be interrupted.
 */
int main(int argc, char *argv[]) {
    if(argc != 2) {
        printf("Provide a number to indicate the number of bytes (in Mega)\n");
        exit(8);
    }

    int num = atoi(argv[1]);

    /* Number of ints that fit in one MiB (2^18 when sizeof(int) == 4).
     * Integer arithmetic replaces the former floating-point pow(2, 18). */
    const size_t ints_per_mib = ((size_t)1 << 20) / sizeof(int);

    /* atoi() yields 0 for non-numeric text and may yield a negative value;
     * neither is a meaningful size. Also refuse requests whose byte count
     * would not fit in size_t, so the multiplications below cannot wrap. */
    if(num <= 0 || (size_t)num > (size_t)-1 / sizeof(int) / ints_per_mib) {
        printf("Provide a number to indicate the number of bytes (in Mega)\n");
        exit(8);
    }

    size_t max = (size_t)num * ints_per_mib;
    printf("declared %zu ints\n", max);   /* %zu is the conversion for size_t */

    int *a = malloc(max * sizeof(int));
    if(a == NULL) {
        perror("malloc");
        exit(1);
    }

    /* The buffer is deliberately left uninitialized and is only ever read:
     * the experiment is about what read-only access does to memory usage.
     * The values printed are therefore indeterminate. */
    while(1) {
        for(size_t i = 0; i < max; i++) {
            printf("%d",a[i]);
        }
    }

    return 0;
}

The program is simple. It reads a number, say n, from the command line and then requests n MB of memory via malloc.

The question is when I start the program and type free in the terminal (in Linux), it turns out that the used memory indicated by free is much smaller than the requested memory (if you give a large n).

Here's the output of free after I type ./a.out 1000

$ free -h
               total        used        free      shared  buff/cache   available
Mem:            12Gi       649Mi        11Gi       0.0Ki       320Mi        11Gi
Swap:          4.0Gi          0B       4.0Gi

And a more detailed output from pmap

$ pmap 18414 -x
18414:   ./a.out 1000
Address           Kbytes     RSS   Dirty Mode  Mapping
00005642164b8000       4       4       0 r---- a.out
00005642164b9000       4       4       0 r-x-- a.out
00005642164ba000       4       4       0 r---- a.out
00005642164bb000       4       4       4 r---- a.out
00005642164bc000       4       4       4 rw--- a.out
0000564218248000     132       4       4 rw---   [ anon ]
00007fa6d1b9a000 1024016      12      12 rw---   [ anon ]
00007fa71039e000     160     160       0 r---- libc.so.6
00007fa7103c6000    1620     852       0 r-x-- libc.so.6
00007fa71055b000     352     148       0 r---- libc.so.6
00007fa7105b3000      16      16      16 r---- libc.so.6
00007fa7105b7000       8       8       8 rw--- libc.so.6
00007fa7105b9000      52      20      20 rw---   [ anon ]
00007fa7105d0000       8       4       4 rw---   [ anon ]
00007fa7105d2000       8       8       0 r---- ld-linux-x86-64.so.2
00007fa7105d4000     168     168       0 r-x-- ld-linux-x86-64.so.2
00007fa7105fe000      44      44       0 r---- ld-linux-x86-64.so.2
00007fa71060a000       8       8       8 r---- ld-linux-x86-64.so.2
00007fa71060c000       8       8       8 rw--- ld-linux-x86-64.so.2
00007ffdc8b06000     136      12      12 rw---   [ stack ]
00007ffdc8b9b000      16       0       0 r----   [ anon ]
00007ffdc8b9f000       4       4       0 r-x--   [ anon ]
---------------- ------- ------- ------- 
total kB         1026776    1496     100

Things become interesting when I change the statement in the for loop to a[i] = 1;. When I write to the memory, free and pmap tell me that 1000MB of physical memory is actually in use.

Why would that happen? Is it that a "read" from the heap will not bring new pages into physical memory, but a "write" does? I suspect this is related to the so-called anonymous file. However, there are few discussions of it, and I failed to find anything useful on the web.

I would appreciate it very much if anyone could give some help.

Updated: For those who are curious whether the compiler optimized the memory reads away, here's the assembly code:

    # Compiler-generated x86-64 assembly in AT&T syntax for main() of
    # memory-user.c (see the .ident line near the end for the compiler).
    # Arguments follow the System V AMD64 convention: %edi/%rdi, %esi/%rsi,
    # %edx carry the first three integer arguments; results come back in %eax/%rax.
    #
    # NOTE(review): this listing calls calloc and getpid, so it was presumably
    # generated from a revision of the program that differs from one calling
    # malloc -- confirm against the source that was actually compiled.
    #
    # Stack slots used by main, relative to %rbp:
    #   argc (from %edi)            -> -36(%rbp)
    #   argv (from %rsi)            -> -48(%rbp)
    #   result of atoi(argv[1])     -> -28(%rbp)
    #   element count ("max")       -> -16(%rbp)
    #   pointer returned by calloc  ->  -8(%rbp)
    #   loop index                  -> -24(%rbp)
    .file   "memory-user.c"
    .text
    .section    .rodata
    .align 8
    # Usage message, printed through puts below (no trailing newline stored).
.LC0:
    .string "Provide a number to indicate the number of bytes (in Mega)"
    # Format string for the element-count message.
.LC3:
    .string "declared %ld ints\n"
    # Format string used for every array element.
.LC4:
    .string "%d"
    .text
    .globl  main
    .type   main, @function
main:
.LFB6:
    .cfi_startproc
    # Entry marker for indirect-branch tracking (CET/IBT).
    endbr64
    # Prologue: set up the frame pointer and reserve 48 bytes of locals.
    pushq   %rbp
    .cfi_def_cfa_offset 16
    .cfi_offset 6, -16
    movq    %rsp, %rbp
    .cfi_def_cfa_register 6
    subq    $48, %rsp
    # Spill argc and argv to the stack.
    movl    %edi, -36(%rbp)
    movq    %rsi, -48(%rbp)
    # if (argc == 2) continue at .L2; otherwise print the usage text and exit(8).
    cmpl    $2, -36(%rbp)
    je  .L2
    leaq    .LC0(%rip), %rax
    movq    %rax, %rdi
    call    puts@PLT
    movl    $8, %edi
    call    exit@PLT
.L2:
    # Load argv[1] (argv + 8 bytes) and convert it with atoi.
    movq    -48(%rbp), %rax
    addq    $8, %rax
    movq    (%rax), %rax
    movq    %rax, %rdi
    call    atoi@PLT
    movl    %eax, -28(%rbp)
    # Convert the int to double and multiply by the constant at .LC1
    # (bit pattern of the double 262144.0, i.e. two to the power 18).
    pxor    %xmm1, %xmm1
    cvtsi2sdl   -28(%rbp), %xmm1
    movsd   .LC1(%rip), %xmm0
    mulsd   %xmm1, %xmm0
    # Convert the double product to an unsigned 64-bit integer.
    # .LC2 holds the double two-to-the-63: values below it convert directly,
    # values at or above it take the path at .L3.
    comisd  .LC2(%rip), %xmm0
    jnb .L3
    cvttsd2siq  %xmm0, %rax
    movq    %rax, -16(%rbp)
    jmp .L4
.L3:
    # Large-value path: subtract two-to-the-63, convert, then flip the top
    # bit of the stored result (the movabsq immediate is 0x8000000000000000).
    movsd   .LC2(%rip), %xmm1
    subsd   %xmm1, %xmm0
    cvttsd2siq  %xmm0, %rax
    movq    %rax, -16(%rbp)
    movabsq $-9223372036854775808, %rax
    xorq    %rax, -16(%rbp)
.L4:
    # Reload the element count and store it back to the same slot (value unchanged).
    movq    -16(%rbp), %rax
    movq    %rax, -16(%rbp)
    # getpid() result is placed in %edx, the third argument register.
    # NOTE(review): the format at .LC3 has a single conversion, so this third
    # argument is not consumed by the format string as shown -- confirm intent.
    call    getpid@PLT
    movl    %eax, %edx
    # printf(.LC3, element count, pid); %eax = 0 means no vector-register
    # arguments are passed to this variadic call.
    movq    -16(%rbp), %rax
    movq    %rax, %rsi
    leaq    .LC3(%rip), %rax
    movq    %rax, %rdi
    movl    $0, %eax
    call    printf@PLT
    # calloc(element count, 4); the returned pointer is kept at -8(%rbp).
    # The return value is not compared against NULL anywhere in this listing.
    movq    -16(%rbp), %rax
    movl    $4, %esi
    movq    %rax, %rdi
    call    calloc@PLT
    movq    %rax, -8(%rbp)
.L7:
    # Outer loop head: reset the index to 0, then go to the bounds check.
    movq    $0, -24(%rbp)
    jmp .L5
.L6:
    # Loop body: compute base + index*4, LOAD the 32-bit element (a read
    # from the buffer, not a store), and print it with printf(.LC4, element).
    movq    -24(%rbp), %rax
    leaq    0(,%rax,4), %rdx
    movq    -8(%rbp), %rax
    addq    %rdx, %rax
    movl    (%rax), %eax
    movl    %eax, %esi
    leaq    .LC4(%rip), %rax
    movq    %rax, %rdi
    movl    $0, %eax
    call    printf@PLT
    # index++
    addq    $1, -24(%rbp)
.L5:
    # Continue with the body while index < element count (jb: unsigned compare).
    movq    -24(%rbp), %rax
    cmpq    -16(%rbp), %rax
    jb  .L6
    # Inner loop finished: start over from index 0. There is no ret in this
    # function; control never leaves this loop.
    jmp .L7
    .cfi_endproc
.LFE6:
    .size   main, .-main
    .section    .rodata
    .align 8
    # Double constant, low word first: high word 1091567616 = 0x41100000,
    # which encodes 262144.0 (two to the power 18).
.LC1:
    .long   0
    .long   1091567616
    .align 8
    # Double constant, low word first: high word 1138753536 = 0x43E00000,
    # which encodes two to the power 63.
.LC2:
    .long   0
    .long   1138753536
    .ident  "GCC: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0"
    .section    .note.GNU-stack,"",@progbits
    # Note section emitted by the compiler: two length words computed from the
    # numeric labels below, the value 5, the owner string "GNU", and one
    # entry consisting of 0xc0000002, a length word and the value 0x3.
    .section    .note.gnu.property,"a"
    .align 8
    .long   1f - 0f
    .long   4f - 1f
    .long   5
0:
    .string "GNU"
1:
    .align 8
    .long   0xc0000002
    .long   3f - 2f
2:
    .long   0x3
3:
    .align 8
4:

Solution

  • Let's start by pointing out that reading malloc'ed memory without initializing it first is unspecified behavior. Maybe you should use calloc() instead.

    In glibc (which is the library I assume you are using), malloc() (and calloc() and others in the family) normally manages the heap using the brk system call. However, for very large allocations like yours, mmap is used (see mallopt() if you wish to change the threshold at which malloc() starts using mmap).

    Both of these system calls invoke the operating system's virtual memory manager, which has to allocate you some pages of memory (on x86 regular pages are 4KiB). However, most operating systems do lazy allocation. The OS will mark those pages as used, but won't allocate any physical memory for them. When your code references the memory, it will fault and that's when the OS will actually map those pages to physical memory so that you can use them.

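    Normally, the OS will actually map all allocated pages to a single physical zero-filled page, so that there's no overhead for reads. A read therefore needs no new physical memory; only the first write to a page triggers a copy-on-write fault that gives the page its own physical frame, which is why the resident size grows when you write but not when you only read. See also: Why malloc+memset is slower than calloc?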

    Now, if you look at man 1 free, you will see that it always displays physical memory usage. It has nothing to do with the quirks of virtual memory.