
C++ magic squares demystified with Valgrind and disassembly

This is a follow-up on Challenge your performance intuition with C++ magic squares. The original post got some public attention and also raised some intriguing questions. Measurements are all well and good: say we measure code A against code B and B turns out to be faster. But why?

We can get a hint or two with Valgrind's cachegrind and with good old disassembly. So let's do that.

But just before we start, I want to remind you that it is all only a game, and the CPU, the compiler, and the benchmark are the rules. They might not represent “real life” all that well, but then in “real life” knights don't move in a silly “L” pattern either, and that doesn't make chess any less interesting.

However, the proposed benchmark does have a flaw! It produces data in a particular order, and that order masks the importance of branch prediction. Branch misprediction is certainly not something you would want to ignore. If you are interested in more relevant benchmarking, please take a look at Learning the value of good benchmarking technique with C++ magic squares by Veedrac. It's very insightful!
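One simple way to take the ordering out of the equation, if you want to reproduce this at home, is to shuffle the generated squares before feeding them to the checker. This is just my sketch; the container and the way the squares get generated are placeholders, not the benchmark from either post.

#include <algorithm>
#include <random>
#include <string>
#include <vector>

// De-order the input so that branch prediction can't ride on the
// generation pattern.
void shuffle_squares(std::vector<std::string>& squares)
  {
  std::mt19937 gen{std::random_device{}()};
  std::shuffle(squares.begin(), squares.end(), gen);
  }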

Shifts vs. no shifts

ioquatix from Reddit wonders how lookups can be faster than direct shifts. Honestly, for me this was also a mystery.

auto c15 = '5' * 3;
uint_fast64_t ideal_char_map =
    static_cast<uint_fast64_t>(0x1FF) << 49;
uint_fast64_t char_map_one = 1u;

bool check_if_magic(const std::string& sq)
  {
  if ((sq[0] + sq[1] + sq[2] != c15)
    || (sq[3] + sq[4] + sq[5] != c15)
    || (sq[6] + sq[7] + sq[8] != c15)

    || (sq[0] + sq[3] + sq[6] != c15)
    || (sq[1] + sq[4] + sq[7] != c15)
    || (sq[2] + sq[5] + sq[8] != c15)

    || (sq[0] + sq[4] + sq[8] != c15)
    || (sq[2] + sq[4] + sq[6] != c15))
    return false;

  auto char_map = ideal_char_map;
  for(auto i = 0u; i < 9; ++i)
    char_map ^= char_map_one << sq[i];
  if (char_map != 0)
    return false;

  return true;
  }
    
auto magic_number = '5' * 3;
auto not_so_magic_number = '5' * 2;
const std::array<uint16_t, 58> bit_shifts {
  0, 0, 0, 0,   0, 0, 0, 0,
  0, 0, 0, 0,   0, 0, 0, 0,
  0, 0, 0, 0,   0, 0, 0, 0,
  0, 0, 0, 0,   0, 0, 0, 0,

  0, 0, 0, 0,   0, 0, 0, 0,
  0, 0, 0, 0,   0, 0, 0, 0,
  0, 1, 2, 4,   8, 16, 32, 64,
  128, 256
  };

bool check_if_magic(const std::string& sq)
  {
  if ((sq[0] + sq[1] + sq[2] != magic_number)
    || (sq[3] + sq[5] != not_so_magic_number)
    || (sq[6] + sq[7] + sq[8] != magic_number)

    || (sq[0] + sq[3] + sq[6] != magic_number)
    || (sq[1] + sq[7] != not_so_magic_number)
    || (sq[2] + sq[5] + sq[8] != magic_number)

    || (sq[0] + sq[4] + sq[8] != magic_number)
    || (sq[2] + sq[4] + sq[6] != magic_number))
    return false;

  auto char_map = 0u;
  for(auto i = 0u; i < 9; ++i)
    char_map ^= bit_shifts[sq[i]];
  if (char_map != 511)
    return false;

  return true;
  }
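
Both snippets above are just the checkers. To exercise either of them I use a driver along these lines; it is a simplified stand-in, not the actual benchmark from the original post, which also measures the square generator.

// Runs one of the check_if_magic definitions above over every
// permutation of "123456789". Exactly 8 of the 362880 permutations
// are magic squares.
#include <algorithm>
#include <iostream>
#include <string>

int main()
  {
  std::string sq = "123456789";
  auto count = 0u;
  do
    {
    if (check_if_magic(sq))
      ++count;
    }
  while (std::next_permutation(sq.begin(), sq.end()));
  std::cout << count << "\n"; // expected: 8
  }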
    

I think it would be helpful to look at the disassembly.
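The listings below are GCC's assembly output. Something like this reproduces them; the exact flags and the file name are my guess, the original post doesn't spell them out.

g++ -O2 -S check_if_magic.cpp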

_Z14check_if_magic ...
.LFB1711:
	.cfi_startproc
	movq	(%rdi), %rdx
	xorl	%eax, %eax
	movsbl	(%rdx), %esi
	movsbl	1(%rdx), %edi
	movsbl	2(%rdx), %r8d
	leal	(%rsi,%rdi), %ecx
	addl	%r8d, %ecx
	cmpl	magic_number(%rip), %ecx
	je	.L21
	ret
	.p2align 4,,10
	.p2align 3
.L21:
	pushq	%r13
	.cfi_def_cfa_offset 16
	.cfi_offset 13, -16
	pushq	%r12
	.cfi_def_cfa_offset 24
	.cfi_offset 12, -24
	pushq	%rbp
	.cfi_def_cfa_offset 32
	.cfi_offset 6, -32
	pushq	%rbx
	.cfi_def_cfa_offset 40
	.cfi_offset 3, -40
	movsbl	3(%rdx), %r10d
	movsbl	4(%rdx), %r11d
	movsbl	5(%rdx), %ebx
	leal	(%r10,%r11), %r9d
	addl	%ebx, %r9d
	cmpl	%r9d, %ecx
	je	.L22
.L13:
	popq	%rbx
	.cfi_remember_state
	.cfi_restore 3
	.cfi_def_cfa_offset 32
	popq	%rbp
	.cfi_restore 6
	.cfi_def_cfa_offset 24
	popq	%r12
	.cfi_restore 12
	.cfi_def_cfa_offset 16
	popq	%r13
	.cfi_restore 13
	.cfi_def_cfa_offset 8
	ret
	.p2align 4,,10
	.p2align 3
.L22:
	.cfi_restore_state
	movsbl	6(%rdx), %ebp
	movsbl	7(%rdx), %r13d
	movsbl	8(%rdx), %r12d
	leal	0(%rbp,%r13), %r9d
	addl	%r12d, %r9d
	cmpl	%r9d, %ecx
	jne	.L13
	leal	(%rsi,%r10), %r9d
	addl	%ebp, %r9d
	cmpl	%r9d, %ecx
	jne	.L13
	addl	%r11d, %edi
	addl	%r13d, %edi
	cmpl	%edi, %ecx
	jne	.L13
	leal	(%r8,%rbx), %edi
	addl	%r12d, %edi
	cmpl	%edi, %ecx
	jne	.L13
	addl	%r11d, %esi
	addl	%r12d, %esi
	cmpl	%esi, %ecx
	jne	.L13
	addl	%r11d, %r8d
	addl	%ebp, %r8d
	cmpl	%r8d, %ecx
	jne	.L13
	movq	ideal_char_map(%rip), %rsi
	movq	char_map_one(%rip), %r8
	leaq	9(%rdx), %rdi
	movq	%rdx, %rax
	.p2align 4,,10
	.p2align 3
.L4:
	movsbl	(%rax), %ecx
	movq	%r8, %rdx
	addq	$1, %rax
	salq	%cl, %rdx
	xorq	%rdx, %rsi
	cmpq	%rax, %rdi
	jne	.L4
	testq	%rsi, %rsi
	sete	%al
	jmp	.L13
	.cfi_endproc
	
_Z14check_if_magic ...
.LFB1752:
	.cfi_startproc
	movq	(%rdi), %rdx
	xorl	%eax, %eax
	movsbl	(%rdx), %esi
	movsbl	1(%rdx), %edi
	movsbl	2(%rdx), %r8d
	leal	(%rsi,%rdi), %ecx
	addl	%r8d, %ecx
	cmpl	magic_number(%rip), %ecx
	je	.L17
	ret
	.p2align 4,,10
	.p2align 3
.L17:
	pushq	%r13
	.cfi_def_cfa_offset 16
	.cfi_offset 13, -16
	pushq	%r12
	.cfi_def_cfa_offset 24
	.cfi_offset 12, -24
	pushq	%rbp
	.cfi_def_cfa_offset 32
	.cfi_offset 6, -32
	pushq	%rbx
	.cfi_def_cfa_offset 40
	.cfi_offset 3, -40
	movsbl	3(%rdx), %r9d
	movsbl	5(%rdx), %r10d
	leal	(%r9,%r10), %ebp
	cmpl	not_so_magic_number(%rip), %ebp
	je	.L18
.L2:
	popq	%rbx
	.cfi_remember_state
	.cfi_restore 3
	.cfi_def_cfa_offset 32
	popq	%rbp
	.cfi_restore 6
	.cfi_def_cfa_offset 24
	popq	%r12
	.cfi_restore 12
	.cfi_def_cfa_offset 16
	popq	%r13
	.cfi_restore 13
	.cfi_def_cfa_offset 8
	ret
	.p2align 4,,10
	.p2align 3
.L18:
	.cfi_restore_state
	movsbl	6(%rdx), %r11d
	movsbl	7(%rdx), %r13d
	movsbl	8(%rdx), %ebx
	leal	(%r11,%r13), %r12d
	addl	%ebx, %r12d
	cmpl	%r12d, %ecx
	jne	.L2
	addl	%esi, %r9d
	addl	%r11d, %r9d
	cmpl	%r9d, %ecx
	jne	.L2
	addl	%r13d, %edi
	cmpl	%edi, %ebp
	jne	.L2
	leal	(%r8,%r10), %edi
	addl	%ebx, %edi
	cmpl	%edi, %ecx
	jne	.L2
	movsbl	4(%rdx), %edi
	addl	%edi, %esi
	addl	%ebx, %esi
	cmpl	%esi, %ecx
	jne	.L2
	addl	%r8d, %edi
	addl	%edi, %r11d
	cmpl	%r11d, %ecx
	jne	.L2
	movq	%rdx, %rax
	leaq	9(%rdx), %rsi
	xorl	%edx, %edx
	.p2align 4,,10
	.p2align 3
.L3:
	movsbq	(%rax), %rcx
	addq	$1, %rax
	movzwl	_ZL25bit_shifts(%rcx,%rcx), %ecx
	xorl	%ecx, %edx
	cmpq	%rax, %rsi
	jne	.L3
	cmpl	$511, %edx
	sete	%al
	jmp	.L2
	.cfi_endproc
	

And... I was wrong. It is not really helpful. Apparently, with the lookup we read more data but execute fewer instructions; what does that mean in terms of performance? To settle this, we need Valgrind.

Valgrind is a dynamic analysis tool; it can do lots of stuff for us. In this case I'm mostly interested in memory reads and writes vs. instruction reads.
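
All the summaries below come from cachegrind. A typical run looks like this, with the binary name standing in for whatever the benchmark compiles into:

valgrind --tool=cachegrind ./magic_squares

The summary gets printed when the program exits; cg_annotate can break the counters down per function if you need more detail.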

I   refs:      13,099,624,571
I1  misses:             1,846
LLi misses:             1,705
I1  miss rate:           0.00%
LLi miss rate:           0.00%


D   refs:       7,435,891,049
    (4,739,364,171 rd   + 2,696,526,878 wr)

D1  misses:            15,886
    (       13,670 rd   +         2,216 wr)

LLd misses:             9,155
    (        7,758 rd   +         1,397 wr)

D1  miss rate:            0.0%
    (          0.0%     +           0.0%  )

LLd miss rate:            0.0%
    (          0.0%     +           0.0%  )


LL refs:               17,732
    (       15,516 rd   +         2,216 wr)

LL misses:             10,860
    (        9,463 rd   +         1,397 wr)

LL miss rate:             0.0%
    (          0.0%     +           0.0%  )
    
I   refs:      13,041,321,061
I1  misses:             1,843
LLi misses:             1,705
I1  miss rate:           0.00%
LLi miss rate:           0.00%


D   refs:       7,438,562,794
    (4,742,035,917 rd   + 2,696,526,877 wr)

D1  misses:            15,894
    (       13,676 rd   +         2,218 wr)

LLd misses:             9,156
    (        7,761 rd   +         1,395 wr)

D1  miss rate:            0.0%
    (          0.0%     +           0.0%  )

LLd miss rate:            0.0%
    (          0.0%     +           0.0%  )


LL refs:               17,737
    (       15,519 rd   +         2,218 wr)

LL misses:             10,861
    (        9,466 rd   +         1,395 wr)

LL miss rate:             0.0%
    (          0.0%     +           0.0%  )
    

Apparently, with the lookup we get about 3 000 000 more data reads, but also about 60 000 000 fewer instruction reads. All in all, the lookup version comes out more efficient.

But this only works because all the data fits in the L1 cache perfectly; the bit_shifts table alone is just 58 16-bit entries, 116 bytes. That is rarely the case in practice, so it is generally better to abstain from lookups. The advice is sound, it just doesn't help in this particular exercise.

Variables vs. constants

FbF_ from Reddit rightly points out that leaving global variables non-constant is just wrong. Excellent point! It is indeed a mistake: everything that is meant to be constant should be `const`. But does it affect performance all that much?

auto c15 = '5' * 3;
uint_fast64_t ideal_char_map =
    static_cast<uint_fast64_t>(0x1FF) << 49;
uint_fast64_t char_map_one = 1u;

bool check_if_magic(const std::string& sq)
  {
  if ((sq[0] + sq[1] + sq[2] != c15)
    || (sq[3] + sq[4] + sq[5] != c15)
    || (sq[6] + sq[7] + sq[8] != c15)

    || (sq[0] + sq[3] + sq[6] != c15)
    || (sq[1] + sq[4] + sq[7] != c15)
    || (sq[2] + sq[5] + sq[8] != c15)

    || (sq[0] + sq[4] + sq[8] != c15)
    || (sq[2] + sq[4] + sq[6] != c15))
    return false;

  auto char_map = ideal_char_map;
  for(auto i = 0u; i < 9; ++i)
    char_map ^= char_map_one << sq[i];
  if (char_map != 0)
    return false;

  return true;
  }
    
const auto c15 = '5' * 3;
const uint_fast64_t ideal_char_map =
    static_cast<uint_fast64_t>(0x1FF) << 49;
const uint_fast64_t char_map_one = 1u;

bool check_if_magic(const std::string& sq)
  {
  if ((sq[0] + sq[1] + sq[2] != c15)
    || (sq[3] + sq[4] + sq[5] != c15)
    || (sq[6] + sq[7] + sq[8] != c15)

    || (sq[0] + sq[3] + sq[6] != c15)
    || (sq[1] + sq[4] + sq[7] != c15)
    || (sq[2] + sq[5] + sq[8] != c15)

    || (sq[0] + sq[4] + sq[8] != c15)
    || (sq[2] + sq[4] + sq[6] != c15))
    return false;

  auto char_map = ideal_char_map;
  for(auto i = 0u; i < 9; ++i)
    char_map ^= char_map_one << sq[i];
  if (char_map != 0)
    return false;

  return true;
  }
    

Well, the difference in runtime is subtle. Both versions run for about 1.8 seconds along with the generator. But the difference is quite visible in the generated code. Indeed, the constant version does look better.

_Z14check_if_magic ...
.LFB1711:
	.cfi_startproc
	movq	(%rdi), %rdx
	xorl	%eax, %eax
	movsbl	(%rdx), %esi
	movsbl	1(%rdx), %edi
	movsbl	2(%rdx), %r8d
	leal	(%rsi,%rdi), %ecx
	addl	%r8d, %ecx
	cmpl	magic_number(%rip), %ecx
	je	.L21
	ret
	.p2align 4,,10
	.p2align 3
.L21:
	pushq	%r13
	.cfi_def_cfa_offset 16
	.cfi_offset 13, -16
	pushq	%r12
	.cfi_def_cfa_offset 24
	.cfi_offset 12, -24
	pushq	%rbp
	.cfi_def_cfa_offset 32
	.cfi_offset 6, -32
	pushq	%rbx
	.cfi_def_cfa_offset 40
	.cfi_offset 3, -40
	movsbl	3(%rdx), %r10d
	movsbl	4(%rdx), %r11d
	movsbl	5(%rdx), %ebx
	leal	(%r10,%r11), %r9d
	addl	%ebx, %r9d
	cmpl	%r9d, %ecx
	je	.L22
.L13:
	popq	%rbx
	.cfi_remember_state
	.cfi_restore 3
	.cfi_def_cfa_offset 32
	popq	%rbp
	.cfi_restore 6
	.cfi_def_cfa_offset 24
	popq	%r12
	.cfi_restore 12
	.cfi_def_cfa_offset 16
	popq	%r13
	.cfi_restore 13
	.cfi_def_cfa_offset 8
	ret
	.p2align 4,,10
	.p2align 3
.L22:
	.cfi_restore_state
	movsbl	6(%rdx), %ebp
	movsbl	7(%rdx), %r13d
	movsbl	8(%rdx), %r12d
	leal	0(%rbp,%r13), %r9d
	addl	%r12d, %r9d
	cmpl	%r9d, %ecx
	jne	.L13
	leal	(%rsi,%r10), %r9d
	addl	%ebp, %r9d
	cmpl	%r9d, %ecx
	jne	.L13
	addl	%r11d, %edi
	addl	%r13d, %edi
	cmpl	%edi, %ecx
	jne	.L13
	leal	(%r8,%rbx), %edi
	addl	%r12d, %edi
	cmpl	%edi, %ecx
	jne	.L13
	addl	%r11d, %esi
	addl	%r12d, %esi
	cmpl	%esi, %ecx
	jne	.L13
	addl	%r11d, %r8d
	addl	%ebp, %r8d
	cmpl	%r8d, %ecx
	jne	.L13
	movq	ideal_char_map(%rip), %rsi
	movq	char_map_one(%rip), %r8
	leaq	9(%rdx), %rdi
	movq	%rdx, %rax
	.p2align 4,,10
	.p2align 3
.L4:
	movsbl	(%rax), %ecx
	movq	%r8, %rdx
	addq	$1, %rax
	salq	%cl, %rdx
	xorq	%rdx, %rsi
	cmpq	%rax, %rdi
	jne	.L4
	testq	%rsi, %rsi
	sete	%al
	jmp	.L13
	.cfi_endproc
	
_Z14check_if_magic ...
.LFB1711:
	.cfi_startproc
	movq	(%rdi), %rdx
	xorl	%eax, %eax
	movsbl	(%rdx), %esi
	movsbl	1(%rdx), %edi
	movsbl	2(%rdx), %r8d
	leal	(%rsi,%rdi), %ecx
	addl	%r8d, %ecx
	cmpl	$159, %ecx
	je	.L17
.L15:
	ret
	.p2align 4,,10
	.p2align 3
.L17:
	movsbl	3(%rdx), %r9d
	movsbl	4(%rdx), %r10d
	movsbl	5(%rdx), %r11d
	leal	(%r9,%r10), %ecx
	addl	%r11d, %ecx
	cmpl	$159, %ecx
	jne	.L15
	pushq	%r12
	.cfi_def_cfa_offset 16
	.cfi_offset 12, -16
	pushq	%rbp
	.cfi_def_cfa_offset 24
	.cfi_offset 6, -24
	pushq	%rbx
	.cfi_def_cfa_offset 32
	.cfi_offset 3, -32
	movsbl	6(%rdx), %ebx
	movsbl	7(%rdx), %r12d
	movsbl	8(%rdx), %ebp
	leal	(%rbx,%r12), %ecx
	addl	%ebp, %ecx
	cmpl	$159, %ecx
	je	.L18
.L2:
	popq	%rbx
	.cfi_remember_state
	.cfi_restore 3
	.cfi_def_cfa_offset 24
	popq	%rbp
	.cfi_restore 6
	.cfi_def_cfa_offset 16
	popq	%r12
	.cfi_restore 12
	.cfi_def_cfa_offset 8
	ret
.L18:
	.cfi_restore_state
	leal	(%rsi,%r9), %ecx
	addl	%ebx, %ecx
	cmpl	$159, %ecx
	jne	.L2
	addl	%r10d, %edi
	addl	%r12d, %edi
	cmpl	$159, %edi
	jne	.L2
	leal	(%r8,%r11), %ecx
	addl	%ebp, %ecx
	cmpl	$159, %ecx
	jne	.L2
	addl	%r10d, %esi
	addl	%ebp, %esi
	cmpl	$159, %esi
	jne	.L2
	addl	%r10d, %r8d
	addl	%ebx, %r8d
	cmpl	$159, %r8d
	jne	.L2
	movq	%rdx, %rax
	leaq	9(%rdx), %rdi
	movl	$1, %esi
	movabsq	$287667426198290432, %rdx
	.p2align 4,,10
	.p2align 3
.L3:
	movsbl	(%rax), %ecx
	movq	%rsi, %rbx
	addq	$1, %rax
	salq	%cl, %rbx
	xorq	%rbx, %rdx
	cmpq	%rax, %rdi
	jne	.L3
	testq	%rdx, %rdx
	sete	%al
	jmp	.L2
	.cfi_endproc
	

By the way, did you notice that the mnemonics have little tooltips on them? I'm not sure if they help, but this wouldn't be Words and Buttons without any interactive stuff whatsoever.

So how come they don't show any difference in runtime? Let's take a look at what Valgrind has to say.

I   refs:      13,099,624,571
I1  misses:             1,846
LLi misses:             1,705
I1  miss rate:           0.00%
LLi miss rate:           0.00%

D   refs:       7,435,891,049
    (4,739,364,171 rd   + 2,696,526,878 wr)
D1  misses:            15,886
    (       13,670 rd   +         2,216 wr)
LLd misses:             9,155
    (        7,758 rd   +         1,397 wr)
D1  miss rate:            0.0%
    (          0.0%     +           0.0%  )
LLd miss rate:            0.0%
    (          0.0%     +           0.0%  )

LL refs:               17,732
    (       15,516 rd   +         2,216 wr)
LL misses:             10,860
    (        9,463 rd   +         1,397 wr)
LL miss rate:             0.0%
    (          0.0%     +           0.0%  )
    
I   refs:      12,856,556,965
I1  misses:             1,843
LLi misses:             1,703
I1  miss rate:           0.00%
LLi miss rate:           0.00%

D   refs:       6,805,402,908
    (4,230,409,813 rd   + 2,574,993,095 wr)
D1  misses:            15,882
    (       13,665 rd   +         2,217 wr)
LLd misses:             9,154
    (        7,758 rd   +         1,396 wr)
D1  miss rate:            0.0%
    (          0.0%     +           0.0%  )
LLd miss rate:            0.0%
    (          0.0%     +           0.0%  )

LL refs:               17,725
    (       15,508 rd   +         2,217 wr)
LL misses:             10,857
    (        9,461 rd   +         1,396 wr)
LL miss rate:             0.0%
    (          0.0%     +           0.0%  )
    

There is a difference, and it does show up in the counters; it's just that the benchmark is too small to turn it into a visible difference in runtime. Most of the time we don't even get to the char map part, we fail earlier.

So yes, it is best to have your constants constant, even if it doesn't show immediately.
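
In modern C++ I would go one step further and make them `constexpr`, which guarantees compile-time values instead of merely promising not to change them. A minimal sketch, not code from the original post:

#include <cstdint>

constexpr auto c15 = '5' * 3;
constexpr uint_fast64_t ideal_char_map =
    static_cast<uint_fast64_t>(0x1FF) << 49;
constexpr uint_fast64_t char_map_one = 1u;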

Global vs. local

FolwarcznyAdam from Twitter suggests that changing data locality might be a good thing.

const std::array<uint64_t, 8> magic_numbers
{
  3545515123101087289, 3690191062107239479,
  3544956562637535289, 3978984379655991859,
  3689073941180135479, 4123101758198592049,
  3977867258728887859, 4122543197735040049
};
bool check_if_magic(const std::string& sq)
{
  if(sq[4] != '5')
    return false;

  uint64_t magic_number =
    *(reinterpret_cast<const uint32_t*>
      (sq.data()));
  magic_number <<= 32;
  magic_number +=
    *(reinterpret_cast<const uint32_t*>
      (sq.data()+5));

  for(auto i = 0u; i < 8; ++i)
    if(magic_number == magic_numbers[i])
      return true;
  return false;
}
    
bool check_if_magic(const std::string& sq)
{
  const std::array<uint64_t, 8> magic_numbers
  {
    3545515123101087289, 3690191062107239479,
    3544956562637535289, 3978984379655991859,
    3689073941180135479, 4123101758198592049,
    3977867258728887859, 4122543197735040049
  };

  if(sq[4] != '5')
    return false;

  uint64_t magic_number =
    *(reinterpret_cast<const uint32_t*>
      (sq.data()));
  magic_number <<= 32;
  magic_number +=
    *(reinterpret_cast<const uint32_t*>
      (sq.data()+5));

  for(auto i = 0u; i < 8; ++i)
    if(magic_number == magic_numbers[i])
      return true;
  return false;
}
    

It doesn't look all that relevant at first glance, but GCC does produce quite different code for global and local variables.

_Z14check_if_magic ...
.LFB1752:
	.cfi_startproc
	movq	(%rdi), %rdx
	xorl	%eax, %eax
	cmpb	$53, 4(%rdx)
	je	.L8
	rep ret
	.p2align 4,,10
	.p2align 3
.L8:
	movl	(%rdx), %ecx
	movl	5(%rdx), %eax
	movabsq	$3545515123101087289, %rdx
	salq	$32, %rcx
	addq	%rax, %rcx
	movl	$_ZL13magic_numbers+8, %eax
	jmp	.L3
	.p2align 4,,10
	.p2align 3
.L9:
	movq	(%rax), %rdx
	addq	$8, %rax
.L3:
	cmpq	%rdx, %rcx
	je	.L5
	cmpq	$_ZL13magic_numbers+64, %rax
	jne	.L9
	xorl	%eax, %eax
	ret
	.p2align 4,,10
	.p2align 3
.L5:
	movl	$1, %eax
	ret
	.cfi_endproc
	
_Z14check_if_magic ...
.LFB1752:
	.cfi_startproc
	subq	$88, %rsp
	.cfi_def_cfa_offset 96
	movq	(%rdi), %rdx
	movq	%fs:40, %rax
	movq	%rax, 72(%rsp)
	xorl	%eax, %eax
	movabsq	$3545515123101087289, %rax
	movq	%rax, (%rsp)
	movabsq	$3690191062107239479, %rax
	movq	%rax, 8(%rsp)
	movabsq	$3544956562637535289, %rax
	movq	%rax, 16(%rsp)
	movabsq	$3978984379655991859, %rax
	movq	%rax, 24(%rsp)
	movabsq	$3689073941180135479, %rax
	movq	%rax, 32(%rsp)
	movabsq	$4123101758198592049, %rax
	movq	%rax, 40(%rsp)
	movabsq	$3977867258728887859, %rax
	movq	%rax, 48(%rsp)
	movabsq	$4122543197735040049, %rax
	movq	%rax, 56(%rsp)
	xorl	%eax, %eax
	cmpb	$53, 4(%rdx)
	je	.L10
.L2:
	movq	72(%rsp), %rsi
	xorq	%fs:40, %rsi
	jne	.L11
	addq	$88, %rsp
	.cfi_remember_state
	.cfi_def_cfa_offset 8
	ret
	.p2align 4,,10
	.p2align 3
.L10:
	.cfi_restore_state
	movl	(%rdx), %ecx
	movl	5(%rdx), %eax
	leaq	64(%rsp), %rsi
	movabsq	$3545515123101087289, %rdx
	salq	$32, %rcx
	addq	%rax, %rcx
	leaq	8(%rsp), %rax
	jmp	.L3
	.p2align 4,,10
	.p2align 3
.L12:
	movq	(%rax), %rdx
	addq	$8, %rax
.L3:
	cmpq	%rdx, %rcx
	je	.L6
	cmpq	%rsi, %rax
	jne	.L12
	xorl	%eax, %eax
	jmp	.L2
	.p2align 4,,10
	.p2align 3
.L6:
	movl	$1, %eax
	jmp	.L2
.L11:
	call	__stack_chk_fail
	.cfi_endproc
	

Well, you might think that the second one, with the local array, is obviously worse. It fills the stack with our values every time the function is called. And it kind of is worse, but that's only because it's not what Adam had in mind. His version actually looks like this: godbolt.org/g/swssJY and it is, of course, much, much better than mine.

The reason for this difference is that I'm using an outdated compiler. And the moral is: update your compiler often.

Loop vs. binary search

Some folks from Reddit, particularly bschindl, Veedrac, and PhilipTrettner, came up with the idea of doing a binary search. It is a great idea, but once again it doesn't help much in this case, because ideally all 8 64-bit squares fit in a single cache line anyway.

const std::array<uint64_t, 8> magic_numbers
{
  3545515123101087289, 3690191062107239479,
  3544956562637535289, 3978984379655991859,
  3689073941180135479, 4123101758198592049,
  3977867258728887859, 4122543197735040049
};
bool check_if_magic(const std::string& sq)
{
  if(sq[4] != '5')
    return false;

  uint64_t magic_number =
    *(reinterpret_cast<const uint32_t*>
      (sq.data()));
  magic_number <<= 32;
  magic_number +=
    *(reinterpret_cast<const uint32_t*>
      (sq.data()+5));

  for(auto i = 0u; i < 8; ++i)
    if(magic_number == magic_numbers[i])
      return true;
  return false;
}
    
const std::array<uint64_t, 8> magic_numbers
{
  3545515123101087289, 3690191062107239479,
  3544956562637535289, 3978984379655991859,
  3689073941180135479, 4123101758198592049,
  3977867258728887859, 4122543197735040049
};
bool check_if_magic(const std::string& sq)
{
  if(sq[4] != '5')
    return false;

  uint64_t nr =
    *(reinterpret_cast<const uint32_t*>
      (sq.data()));
  nr <<= 32;
  nr +=
    *(reinterpret_cast<const uint32_t*>
      (sq.data()+5));

  if (nr <= 3690191062107239479)
    if (nr <= 3545515123101087289)
      if (nr < 3545515123101087289)
        return nr == 3544956562637535289;
      else
        return nr == 3545515123101087289;
    else
      if (nr < 3690191062107239479)
        return nr == 3689073941180135479;
      else
        return nr == 3690191062107239479;
  else
    if (nr <= 3978984379655991859)
      if (nr < 3978984379655991859)
        return nr == 3977867258728887859;
      else
        return nr == 3978984379655991859;
    else
      if (nr < 4123101758198592049)
        return nr == 4122543197735040049;
      else
        return nr == 4123101758198592049;
}
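
For completeness, the same search can be had from the standard library by keeping the numbers sorted. This is my sketch, it wasn't part of the measurements, and it stands or falls with how well the compiler expands std::binary_search.

#include <algorithm>
#include <array>
#include <cstdint>

// The same eight squares, pre-sorted, searched with the standard library
// instead of a hand-rolled decision tree.
const std::array<uint64_t, 8> sorted_magic_numbers
{
  3544956562637535289, 3545515123101087289,
  3689073941180135479, 3690191062107239479,
  3977867258728887859, 3978984379655991859,
  4122543197735040049, 4123101758198592049
};

bool is_magic_number(uint64_t nr)
  {
  return std::binary_search(sorted_magic_numbers.begin(),
                            sorted_magic_numbers.end(), nr);
  }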
    

Valgrind confirms that sequential reading of 64 bytes is cheap. Not only does the plain loop version have fewer instruction reads, it has fewer data reads as well, which seems almost nonsensical.

I   refs:      10,699,297,900
I1  misses:             1,838
LLi misses:             1,701
I1  miss rate:           0.00%
LLi miss rate:           0.00%

D   refs:       5,521,473,844
    (3,342,039,069 rd   + 2,179,434,775 wr)
D1  misses:            15,883
    (       13,668 rd   +         2,215 wr)
LLd misses:             9,154
    (        7,758 rd   +         1,396 wr)
D1  miss rate:            0.0%
    (          0.0%     +           0.0%  )
LLd miss rate:            0.0%
    (          0.0%     +           0.0%  )

LL refs:               17,721
    (       15,506 rd   +         2,215 wr)
LL misses:             10,855
    (        9,459 rd   +         1,396 wr)
LL miss rate:             0.0%
    (          0.0%     +           0.0%  )
    
I   refs:      11,323,056,252
I1  misses:             1,845
LLi misses:             1,705
I1  miss rate:           0.00%
LLi miss rate:           0.00%

D   refs:       5,994,987,820
    (3,428,132,553 rd   + 2,566,855,267 wr)
D1  misses:            15,883
    (       13,666 rd   +         2,217 wr)
LLd misses:             9,154
    (        7,758 rd   +         1,396 wr)
D1  miss rate:            0.0%
    (          0.0%     +           0.0%  )
LLd miss rate:            0.0%
    (          0.0%     +           0.0%  )

LL refs:               17,728
    (       15,511 rd   +         2,217 wr)
LL misses:             10,859
    (        9,463 rd   +         1,396 wr)
LL miss rate:             0.0%
    (          0.0%     +           0.0%  )
    

So the binary search is a bit worse than the simple loop. It is still extremely effective though; it's just that with the plain loop being so small, there isn't much left to improve.

But the most amazing thing was found by adamf88. It turns out, Clang can do this all by itself: https://godbolt.org/g/cXtdbB! I'm astonished.

Smart lookup

One of the best ideas was proposed by Veedrac. I'm not sure how it works exactly, the indexing thing looks a bit magical to me, but it goes like this:

const char squares[] =
    "   " "   " "   "  // -/1

    "276" "951" "438"  // 1/2
    "294" "753" "618"  // 2/3

    "438" "951" "276"  // 3/4
    "492" "357" "816"  // 4/5

    "618" "753" "294"  // 5/6
    "672" "159" "834"  // 6/7

    "816" "357" "492"  // 7/8
    "834" "159" "672"  // 8/9

    "   " "   " "   "; // 9/-

// UB if sq contains characters [^1-9]
//    or is not at least 9 characters long.
bool check_if_magic(const std::string& square_as_string) {
    const auto* square = square_as_string.data();
    if (square[4] != '5') {
        return false;
    }

    const auto idx = (*square - '1') * 9;
    return
        (!memcmp(square, &squares[idx],   9)) ||
        (!memcmp(square, &squares[idx+9], 9));
}
    

Unfortunately, I had to change the function signature to match the benchmark, and it lost quite a lot of speed because of that. It runs for 1.7 seconds, which is a little bit slower than the 64-bit hack. It does have nicer disassembly though.

_Z14check_if_magic ...
.LFB1779:
	.cfi_startproc
	pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset 6, -16
	pushq	%rbx
	.cfi_def_cfa_offset 24
	.cfi_offset 3, -24
	xorl	%eax, %eax
	subq	$8, %rsp
	.cfi_def_cfa_offset 32
	movq	(%rdi), %rbx
	cmpb	$53, 4(%rbx)
	je	.L8
.L2:
	addq	$8, %rsp
	.cfi_remember_state
	.cfi_def_cfa_offset 24
	popq	%rbx
	.cfi_def_cfa_offset 16
	popq	%rbp
	.cfi_def_cfa_offset 8
	ret
	.p2align 4,,10
	.p2align 3
.L8:
	.cfi_restore_state
	movsbl	(%rbx), %eax
	movl	$9, %edx
	movq	%rbx, %rdi
	leal	-441(%rax,%rax,8), %ebp
	movslq	%ebp, %rsi
	addq	$_ZL7squares, %rsi
	call	memcmp
	movl	%eax, %edx
	movl	$1, %eax
	testl	%edx, %edx
	je	.L2
	leal	9(%rbp), %esi
	movq	%rbx, %rdi
	movl	$9, %edx
	movslq	%esi, %rsi
	addq	$_ZL7squares, %rsi
	call	memcmp
	testl	%eax, %eax
	sete	%al
	addq	$8, %rsp
	.cfi_def_cfa_offset 24
	popq	%rbx
	.cfi_def_cfa_offset 16
	popq	%rbp
	.cfi_def_cfa_offset 8
	ret
	.cfi_endproc
	

In Valgrind it looks like this:

I   refs:      16,045,605,063
I1  misses:             1,849
LLi misses:             1,713
I1  miss rate:           0.00%
LLi miss rate:           0.00%


D   refs:       8,018,184,251
    (4,590,394,457 rd   + 3,427,789,794 wr)

D1  misses:            15,896
    (       13,678 rd   +         2,218 wr)

LLd misses:             9,159
    (        7,764 rd   +         1,395 wr)

D1  miss rate:            0.0%
    (          0.0%     +           0.0%  )

LLd miss rate:            0.0%
    (          0.0%     +           0.0%  )


LL refs:               17,745
    (       15,527 rd   +         2,218 wr)
LL misses:             10,872
    (        9,477 rd   +         1,395 wr)
LL miss rate:             0.0%
    (          0.0%     +           0.0%  )
    

Data-wise it's absolutely splendid! But it is a bit heavy on instruction reads. I would blame `memcmp`. I've heard that with better compilers and a better standard library implementation it might get properly inlined and optimized in context, but with my GCC it just gets called twice, which, in such a tight piece of code, is simply too expensive.
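
If the calls really are the problem, one workaround, mine and untested against this benchmark, is to do the 9-byte comparison by hand: load the first eight characters into a uint64_t with memcpy, which compilers are happy to turn into a single move, and compare the ninth character separately.

#include <cstdint>
#include <cstring>

// Compare two 9-character squares without calling the library memcmp.
static bool equal9(const char* a, const char* b)
  {
  uint64_t x, y;
  std::memcpy(&x, a, sizeof x);
  std::memcpy(&y, b, sizeof y);
  return x == y && a[8] == b[8];
  }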

Smart indexing

But the absolutely magical one is the version from kocsis1david.

static uint64_t magic_numbers[8] = {
    0x3336313832393437,
    0x3134333832373639,
    0x3734393238313633,
    0x3936373238333431,
    0x3332393436313837,
    0x3938333436373231,
    0x3738313634393233,
    0x3132373634333839,
};

static uint64_t magic_number(const char sq[9]) {
    uint32_t a, b;
    memcpy(&a, sq, sizeof(uint32_t));
    memcpy(&b, sq + 5, sizeof(uint32_t));
    return ((uint64_t)a << 32) + b;
}

bool check_if_magic(const std::string& square) {
    const auto* sq = square.data();
    if (sq[4] != '5')
        return false;

    int index = sq[0] & 7 | (sq[1] >> 1) & 1;
    return magic_numbers[index] == magic_number(sq);
}
    

It also exploits the same smart indexing idea, but it uses fewer instructions and no external calls whatsoever.
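
If I read the bit twiddling right, `sq[0] & 7` maps the four possible corner digits '2', '4', '6' and '8' to 2, 4, 6 and 0, and `(sq[1] >> 1) & 1` tells apart the two squares that share a corner digit, so the eight magic squares land on eight distinct indices. Here is a little self-contained check of that claim, my own, not from the thread:

#include <iostream>
#include <set>

int main()
  {
  const char* squares[] = {
    "276951438", "294753618", "438951276", "492357816",
    "618753294", "672159834", "816357492", "834159672"};
  std::set<int> indices;
  for (const char* sq : squares)
    indices.insert((sq[0] & 7) | ((sq[1] >> 1) & 1));
  std::cout << indices.size() << "\n"; // expected: 8
  }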

Its Valgrind report is impressive too!

I   refs:      8,977,429,337
I1  misses:            1,842
LLi misses:            1,704
I1  miss rate:          0.00%
LLi miss rate:          0.00%


D   refs:      5,349,287,013
    (3,169,852,226 rd   + 2,179,434,787 wr)

D1  misses:           15,885
    (       13,668 rd   +         2,217 wr)

LLd misses:            9,155
    (        7,759 rd   +         1,396 wr)

D1  miss rate:           0.0%
    (          0.0%     +           0.0%  )

LLd miss rate:           0.0%
    (          0.0%     +           0.0%  )


LL refs:              17,727
    (       15,510 rd   +         2,217 wr)

LL misses:            10,859
    (        9,463 rd   +         1,396 wr)

LL miss rate:            0.0%
    (          0.0%     +           0.0%  )
    

And it runs for a little less than a second, which, considering that the generator alone takes some 0.8 seconds of it, makes it almost 3 times faster than the best take I had to offer.

Excellent!

Conclusion

I don't know about you, but I learned quite a lot from this idea exchange. Way more than I would have from tinkering with a toy problem all on my own.

We should do this again sometime. Probably with a better benchmark though. And a better problem. And Clang. I'm fascinated to try Clang at this point.