I've been studying pipeline stalls caused by branch mispredictions, so I changed some of my code to avoid those stalls and run faster. But I can't tell whether this optimization really matters or whether it makes things worse. I don't know much about asm or CPUs.
I've added some disassembly of my code. So, am I optimizing the program correctly? Is it faster than before? And if I optimize code like this, what problems could it cause?
// before
switch (i - '0')
case 0: a.f1(); break;
case 1: a.f2(); break;
case 2: a.f3(); break;
case 3: a.f4(); break;
///asm with 12 cases
switch (i - '0')
00007FF620461434 movsx ecx,byte ptr [rax]
00007FF620461437 add ecx,0FFFFFFD0h
00007FF62046143A cmp ecx,0Bh
00007FF62046143D ja main+185h (07FF6204614D5h)
00007FF620461443 movsxd rcx,ecx
00007FF620461446 mov edx,dword ptr [r11+rcx*4+1614h]
00007FF62046144E add rdx,r11
00007FF620461451 jmp rdx
// asm with 4 cases
64: switch (i - '0')
00007FF6927413A5 movsx eax,byte ptr [rdx]
00007FF6927413A8 sub eax,30h
00007FF6927413AB je main+110h (07FF6927413E0h)
00007FF6927413AD sub eax,1
00007FF6927413B0 je main+104h (07FF6927413D4h)
00007FF6927413B2 sub eax,1
00007FF6927413B5 je main+0F8h (07FF6927413C8h)
00007FF6927413B7 cmp eax,1
00007FF6927413BA jne main+11Ah (07FF6927413EAh)
69: case 3: a.f4(); break;
00007FF6927413BC lea rcx,[a]
00007FF6927413C1 call OBJ::f4 (07FF6927412C0h)
00007FF6927413C6 jmp main+11Ah (07FF6927413EAh)
68: case 2: a.f3(); break;
00007FF6927413C8 lea rcx,[a]
00007FF6927413CD call OBJ::f3 (07FF6927412B0h)
00007FF6927413D2 jmp main+11Ah (07FF6927413EAh)
67: case 1: a.f2(); break;
00007FF6927413D4 lea rcx,[a]
00007FF6927413D9 call OBJ::f2 (07FF6927412A0h)
00007FF6927413DE jmp main+11Ah (07FF6927413EAh)
65: {
66: case 0: a.f1(); break;
00007FF6927413E0 lea rcx,[a]
00007FF6927413E5 call OBJ::f1 (07FF692741290h)
static decltype(&OBJ::f1) func[4] = { &OBJ::f1, &OBJ::f2, &OBJ::f3, &OBJ::f4 };
(a.*func[i - '0'])();
// asm
61: static decltype(&OBJ::f1) func[4] = { &OBJ::f1, &OBJ::f2, &OBJ::f3, &OBJ::f4 };
62: (a.*func[i - '0'])();
00007FF71D7213B9 movsx rax,byte ptr [rbx]
00007FF71D7213BD lea rcx,[a]
00007FF71D7213C2 call qword ptr [r13+rax*8-180h]
I'm using MSVC. This code is in the main loop. Below is my test code; the input is
#include <iostream>
#include <chrono>
using clk = std::chrono::high_resolution_clock;
using namespace std::chrono;
using namespace std::literals::string_view_literals;
namespace timer {
static clk::time_point StopWatch;
inline void start() {
StopWatch = clk::now();
inline void end(const std::string_view mess = ""sv)
auto t = clk::now();
std::cout << mess << " : " << duration_cast<milliseconds>(t - StopWatch) << '\n';
// control //
#define noBranch
#define noInline
// control //
#ifdef noInline
#define INLINE __declspec(noinline)
#define INLINE
class OBJ {
size_t x = 0;
INLINE void f1() {
x += 13;
INLINE void f2() {
x += 23;
INLINE void f3() {
x += 18;
INLINE void f4() {
x += 15;
int main()
size_t sum = 0;
std::string in;
std::cin >> in;
for (size_t q = 0; q < 1'000'000; q++) {
for (const auto i : in) {
OBJ a;
#ifdef noBranch
static decltype(&OBJ::f1) func[4] = { &OBJ::f1, &OBJ::f2, &OBJ::f3, &OBJ::f4 };
(a.*func[i - '0'])();
switch (i - '0')
case 0: a.f1(); break;
case 1: a.f2(); break;
case 2: a.f3(); break;
case 3: a.f4(); break;
sum += a.x;
std::cout << "sum" << sum << std::endl;
A call through a pointer is also a branch (an indirect one), so branch misprediction applies here as well.
A C++ pointer to member function is a complex construct that is hard for the compiler to optimize.
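Truly branchless code using a lookup table would work if you look up data, not code, and make no branches based on that data. For example, each function here only adds a constant to x, so a table of the addends {13, 23, 18, 15} indexed by i - '0' could replace the call entirely.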