RDTSC を printf の %llu で表示したら値が 32 bit に丸められた

MinGW と x86_64 で二重にハマった…．

問題

プログラム

#define RDTSC(X) \
    do { \
        __asm__ __volatile__ ("cpuid" : : : "eax", "ebx", "ecx", "edx"); \
        __asm__ __volatile__ ("rdtsc" : "=A" (X)); \
    } while (0)

void test_rdtsc(void)
{
    unsigned long long tsc;

    RDTSC(tsc);

    printf("tsc : %llu\n", tsc);
}

実行結果 (Cygwin -mno-cygwin (MinGW))

$ ./test_rdtsc
tsc : 2735474994
$ ./test_rdtsc
tsc : 3760894466
$ ./test_rdtsc
tsc : 202777325

実行結果 (x86_64 Linux)

$ ./test_rdtsc
tsc : 2412462413
$ ./test_rdtsc
tsc : 3431930822
$ ./test_rdtsc
tsc : 57576939

TSC がラップアラウンドした！？というか，よく見たら 32 bit に切り詰められていた (RDTSC の値は 64 bit であり，4 GHz の CPU でもラップアラウンドに 150 年近く掛かる．しかし 32 bit なら 1 秒強)．

解決 (MinGW)

結論

MinGW (および Cygwin の GCC で -mno-cygwin オプションを付けた場合) の printf は %llu (%lld) フォーマットを受け付けない．代わりに，%I64u (%I64d) フォーマットを使う．

変数 tsc に格納された値がおかしいのか
printf の表示がおかしいのか

問題の切り分けのためにプログラムを少し修正してみた．

void test_rdtsc(void)
{
    unsigned long long tsc;

    RDTSC(tsc);

    printf("tsc : %llu\n", tsc);
    printf("tsc >> 32: %llu\n", tsc >> 32);
}

実行結果

$ ./test_rdtsc
tsc : 581982201
tsc >> 32: 88102

結論．printf の表示がおかしい．tsc >> 32 の結果，上位 32 bit に値が入っていることが確認できたため．

Cygwin の printf は %llu フォーマットに対応していない！？と思ったが，試行錯誤しているうちに，-mno-cygwin オプションを付けなければ表示が 32 bit に切り詰められないことに気づいた (そもそも cygwin1.dll が無い環境で実行させるためにこのオプションを付けた)．

MinGW で64bit整数を printf するときの注意 - Gulfweed

MinGW は Windows の C Library (MSVCR) を使うので普通は %lld を解釈してくれません。Cygwin から開発をしていると、この事を忘れてはまり易いので注意。
...
VC++ と同じように %I64u, %I64d を使えば大丈夫です。

( ゜д゜) ポカーン

気を取り直してプログラムを修正．

    printf("tsc : %llu\n", tsc);
    printf("tsc : %I64u\n", tsc);
    printf("tsc >> 32: %llu\n", tsc >> 32);

コンパイル．%I64u の行で警告が出るが無視．

$ gcc -O3 -W -Wall -mno-cygwin -o test_rdtsc test_rdtsc.c
test_rdtsc.c: In function `test_rdtsc':
test_rdtsc.c:11: warning: unsigned int format, different type arg (arg 2)

実行結果

$ ./test_rdtsc
tsc : 770969165
tsc : 378765346868813
tsc >> 32: 88188

%I64u だと切り詰められない．

　_, ._ 
(;゜ Д゜) …?!

解決 (x86_64)

結論

x86_64 GCC において，インラインアセンブリのオペランド制約 A は edx:eax ではなく rax に展開される．
__asm__ __volatile__ ("rdtsc" : "=A" (X));
しかし，rdtsc 命令は x86_64 でも edx:eax に値を返すため，rax だけを見ると下位 32 bit しか格納されていない．そのため，rdtsc 命令実行後に eax と edx を自前で edx:eax 形式に合成する必要がある．

問題の切り分け．

変数 tsc に格納された値がおかしいのか
printf の表示がおかしいのか

実行結果

$ ./test_rdtsc
tsc : 2382197693
tsc >> 32: 0

結論．変数 tsc に格納された値がおかしい．
アセンブリを確認．

#APP
        cpuid
        rdtsc
#NO_APP
        popq    %rbx
        movq    %rax, %rsi
        movl    $.LC0, %edi
        xorl    %eax, %eax
        jmp     printf

x86_64 GCC の関数呼び出し規約において，rsi は以下の printf の引数 tsc に当たる部分．

    printf("tsc : %llu\n", tsc);

つまり，rdtsc の結果が rax に入っていると仮定している．しかし実際は，

RDTSC - Read Time-Stamp Counter
...
IA-32e モードでの操作
レガシーモードと同じ。RDX[31:0] にはMSR[63:32] がロードされ、RAX[31:0] には
MSR[31:0] がロードされる。

(Intel EM64T のマニュアルより)

AMD 空気嫁．rax が 64 bit レジスタなんだから x86 の edx:eax 形式を維持する必要はないじゃまいか．

しかし info gcc を見ると，

5.35.4 Constraints for Particular Machines
...
Intel 386—i386.h
...
A
Specifies the `a' or `d' registers. This is primarily useful for 64-bit integer values (when in 32-bit mode) intended to be returned with the `d' register holding the most significant bits and the `a' register holding the least significant bits.

GCC も空気嫁．この説明では，オペランド制約 A が x86_64 環境で edx:eax ではなく rax に展開されるなんて読み取れない．

__asm__ __volatile__ ("rdtsc" : "=A" (X));

プログラム修正．RDTSC マクロの中で強引にキャストして edx:eax っぽいことをする．とてつもなくキモイ．

#define RDTSC(X) \
    do { \
        __asm__ __volatile__ ("cpuid" : : : "rax", "rbx", "rcx", "rdx"); \
        __asm__ __volatile__ ("rdtsc" : "=d" (((unsigned int *)&(X))[0]), \
                                        "=a" (((unsigned int *)&(X))[1])); \
    } while (0)

コンパイル．お馴染みの strict-aliasing rules な警告が出るが気にしない．

$ gcc -O3 -W -Wall -o test_rdtsc test_rdtsc.c
test_rdtsc.c: In function 'test_rdtsc':
test_rdtsc.c:8: warning: dereferencing type-punned pointer will break strict-aliasing rules
test_rdtsc.c:8: warning: dereferencing type-punned pointer will break strict-aliasing rules

アセンブリ表示．

#APP
        cpuid
        rdtsc
#NO_APP
        movl    %eax, -4(%rsp)
        movl    %edx, -8(%rsp)
        movl    $.LC0, %edi
        movq    -8(%rsp), %rsi
        xorl    %eax, %eax
        popq    %rbx
        jmp     printf

ちゃんとスタック経由して edx:eax が rsi にコピーされてるし，大丈夫なんじゃね？

        movl    %eax, -4(%rsp)
        movl    %edx, -8(%rsp)
        movq    -8(%rsp), %rsi

実行結果

$ ./test_rdtsc
tsc : 0

あれ？

気を取り直してプログラム修正 2．rdtsc の後，シフトと OR で edx:eax 形式に合成するよ．

#define RDTSC(X) \
    do { \
        __asm__ __volatile__ ("cpuid" : : : "rax", "rbx", "rcx", "rdx"); \
        __asm__ __volatile__ ( \
            "rdtsc\n\t" \
            "shlq       $32, %%rdx\n\t" \
            "orq        %%rdx, %%rax\n\t" \
            "movq       %%rax, %0" : "=g" (X) : : "rax", "rdx"); \
    } while (0)

アセンブリ表示．概ね期待通り．rax 上で edx:eax 形式が完成した後，printf の引数として rsi にコピーされている．

#APP
        cpuid
        rdtsc
        shlq    $32, %rdx
        orq     %rdx, %rax
        movq    %rax, %rsi
#NO_APP
        popq    %rbx
        movl    $.LC0, %edi
        xorl    %eax, %eax
        jmp     printf

実行結果

$ ./test_rdtsc
tsc : 947942173731320
$ ./test_rdtsc
tsc : 947942783190946
$ ./test_rdtsc
tsc : 947943263127632

問題なさそう．

追記 12/11

strict-aliasing rules の件．test_rdtsc 関数自体は上記のように問題のない順番だったが，よく見ると，実際は main 関数に test_rdtsc 関数がインライン展開されており，その main 関数は以下のように順番が狂っており，edx:eax が 8(%rsp) にコピーされる前に 8(%rsp) が rsi にコピーされていた．

#APP
        cpuid
        rdtsc
#NO_APP
        movq    8(%rsp), %rsi
        movl    %eax, 8(%rsp)
        movl    $.LC0, %edi
        xorl    %eax, %eax
        movl    %edx, 12(%rsp)
        call    printf

-O2 以上の最適化フラグを付けると，プログラムが strict-aliasing rules を守っているという前提で最適化が掛かる (-fstrict-aliasing が暗黙的に付加される) ため，順番が狂っていたらしい．

-fno-strict-aliasing を付けると strict-aliasing rules を守っているという前提での最適化が掛からないため，結果が正しく表示された．

$ gcc -O3 -W -Wall -fno-strict-aliasing -o test_rdtsc test_rdtsc.c
$ ./test_rdtsc
tsc : 1156795886920691
$ ./test_rdtsc
tsc : 1156797309530725
$ ./test_rdtsc
tsc : 1156798115809692

参考

Radium Software Development - strict aliasing rule