/* 
 * Simulated annealing on image demo - auxiliary functions (x86-64)
 * 
 * Copyright (c) 2017 Project Nayuki
 * All rights reserved. Contact Nayuki for licensing.
 * https://www.nayuki.io/page/simulated-annealing-demo
 */


/* 
 * int32_t horizontal_energy_diff_if_swapped(const uint32_t pixels[static 1],
 *     uint32_t width, uint32_t height, uint32_t x, uint32_t y)
 * 
 * Returns the change in total energy that would result from swapping the
 * pixel at (x, y) with its right neighbor at (x + 1, y). The energy between
 * two pixels is the sum of absolute differences of their 4 bytes (psadbw).
 * For every other neighbor N of the pair, the old term |N - adjacent pixel|
 * is subtracted and the new term |N - other pixel of the pair| is added.
 * 
 * Syntax: GNU as, AT&T operand order. Arguments are taken from
 * rdi, esi, edx, ecx, r8d (System V AMD64 register order).
 * 
 * The pixel at (x, y) and its right neighbor are read unconditionally, so
 * the caller must guarantee x + 1 < width and y < height. All other
 * neighbors are read only after the corresponding bounds test has passed.
 */
.globl horizontal_energy_diff_if_swapped
horizontal_energy_diff_if_swapped:
	/* 
	 * Storage usage:
	 *   Bytes  Location    Description
	 *       8  rdi         Base address of pixels array argument, later becoming pixel address = &pixels[y * width + x]
	 *       4  esi         Width argument (explicitly zero-extended to rsi on entry, then read-only)
	 *       4  edx         Height argument (read-only)
	 *       4  ecx         X argument, temporary for calculations
	 *       4  r8d         Y argument, temporary for calculations
	 *       8  r9          Temporary for calculations, later holding -width
	 *     128  xmm0..xmm7  Neighboring pixel RGBA values (only low 32 bits are used)
	 *      16  xmm8        Temporary for calculations (only low 32 bits are used)
	 *      16  xmm9        Accumulator for return value (only low 32 bits are used)
	 *       4  eax         Final return value
	 */
	
	/* Clear the upper half of rsi, because rsi is used as a 64-bit index below */
	movl    %esi, %esi
	
	/* rdi = &pixels[y * width + x] */
	movl    %r8d, %r9d
	imull   %esi, %r9d
	addl    %ecx, %r9d
	leaq    (%rdi,%r9,4), %rdi
	
	/* r9 = -width */
	movl    %esi, %r9d
	negq    %r9
	
	/* Load the two pixels being swapped */
	movd     0(%rdi), %xmm0             /* This pixel  */
	movd     4(%rdi), %xmm1             /* Right pixel */
	pxor    %xmm9, %xmm9                /* Output accumulator */
	
	/* If x > 0 */
	testl   %ecx, %ecx
	jz      .horz0
	movd    -4(%rdi), %xmm2             /* Left pixel */
	movdqa  %xmm2, %xmm8
	psadbw  %xmm0, %xmm2                /* Old: |left - this|  */
	psadbw  %xmm1, %xmm8                /* New: |left - right| */
	psubd   %xmm2, %xmm9
	paddd   %xmm8, %xmm9
	
	/* If x + 2 < width (tested even when x == 0) */
.horz0:
	addl    $2, %ecx
	cmpl    %esi, %ecx
	jae     .horz1
	movd     8(%rdi), %xmm3             /* Right right pixel */
	movdqa  %xmm3, %xmm8
	psadbw  %xmm1, %xmm3                /* Old: |right right - right| */
	psadbw  %xmm0, %xmm8                /* New: |right right - this|  */
	psubd   %xmm3, %xmm9
	paddd   %xmm8, %xmm9
	
	/* If y > 0 */
.horz1:
	testl   %r8d, %r8d
	jz      .horz2
	movd     0(%rdi,%r9,4), %xmm4       /* Up pixel       */
	movd     4(%rdi,%r9,4), %xmm5       /* Up right pixel */
	movdqa  %xmm4, %xmm8
	psadbw  %xmm0, %xmm4                /* Old: |up - this|  */
	psadbw  %xmm1, %xmm8                /* New: |up - right| */
	psubd   %xmm4, %xmm9
	paddd   %xmm8, %xmm9
	movdqa  %xmm5, %xmm8
	psadbw  %xmm1, %xmm5                /* Old: |up right - right| */
	psadbw  %xmm0, %xmm8                /* New: |up right - this|  */
	psubd   %xmm5, %xmm9
	paddd   %xmm8, %xmm9
	
	/* If y + 1 < height (tested even when y == 0) */
.horz2:
	incl    %r8d
	cmpl    %edx, %r8d
	jae     .horz3
	movd     0(%rdi,%rsi,4), %xmm6      /* Down pixel       */
	movd     4(%rdi,%rsi,4), %xmm7      /* Down right pixel */
	movdqa  %xmm6, %xmm8
	psadbw  %xmm0, %xmm6                /* Old: |down - this|  */
	psadbw  %xmm1, %xmm8                /* New: |down - right| */
	psubd   %xmm6, %xmm9
	paddd   %xmm8, %xmm9
	movdqa  %xmm7, %xmm8
	psadbw  %xmm1, %xmm7                /* Old: |down right - right| */
	psadbw  %xmm0, %xmm8                /* New: |down right - this|  */
	psubd   %xmm7, %xmm9
	paddd   %xmm8, %xmm9
	
.horz3:
	movd    %xmm9, %eax
	retq


/* 
 * int32_t vertical_energy_diff_if_swapped(const uint32_t pixels[static 1],
 *     uint32_t width, uint32_t height, uint32_t x, uint32_t y)
 * 
 * Returns the change in total energy that would result from swapping the
 * pixel at (x, y) with its down neighbor at (x, y + 1). The energy between
 * two pixels is the sum of absolute differences of their 4 bytes (psadbw).
 * For every other neighbor N of the pair, the old term |N - adjacent pixel|
 * is subtracted and the new term |N - other pixel of the pair| is added.
 * 
 * Syntax: GNU as, AT&T operand order. Arguments are taken from
 * rdi, esi, edx, ecx, r8d (System V AMD64 register order).
 * 
 * The pixel at (x, y) and its down neighbor are read unconditionally, so
 * the caller must guarantee x < width and y + 1 < height. All other
 * neighbors are read only after the corresponding bounds test has passed.
 */
.globl vertical_energy_diff_if_swapped
vertical_energy_diff_if_swapped:
	/* 
	 * Storage usage:
	 *   Bytes  Location    Description
	 *       8  rdi         Base address of pixels array argument, later becoming pixel address = &pixels[y * width + x]
	 *       4  esi         Width argument (explicitly zero-extended to rsi on entry, then read-only)
	 *       4  edx         Height argument (read-only)
	 *       4  ecx         X argument, temporary for calculations
	 *       4  r8d         Y argument, temporary for calculations
	 *       8  r9          Temporary for calculations, later holding -width
	 *     128  xmm0..xmm7  Neighboring pixel RGBA values (only low 32 bits are used)
	 *      16  xmm8        Temporary for calculations (only low 32 bits are used)
	 *      16  xmm9        Accumulator for return value (only low 32 bits are used)
	 *       4  eax         Final return value
	 */
	
	/* Clear the upper half of rsi, because rsi is used as a 64-bit index below */
	movl    %esi, %esi
	
	/* rdi = &pixels[y * width + x] */
	movl    %r8d, %r9d
	imull   %esi, %r9d
	addl    %ecx, %r9d
	leaq    (%rdi,%r9,4), %rdi
	
	/* r9 = -width */
	movl    %esi, %r9d
	negq    %r9
	
	/* Load the two pixels being swapped */
	movd     0(%rdi), %xmm0             /* This pixel */
	movd     0(%rdi,%rsi,4), %xmm1      /* Down pixel */
	pxor    %xmm9, %xmm9                /* Output accumulator */
	
	/* If y > 0 */
	testl   %r8d, %r8d
	jz      .vert0
	movd     0(%rdi,%r9,4), %xmm2       /* Up pixel */
	movdqa  %xmm2, %xmm8
	psadbw  %xmm0, %xmm2                /* Old: |up - this| */
	psadbw  %xmm1, %xmm8                /* New: |up - down| */
	psubd   %xmm2, %xmm9
	paddd   %xmm8, %xmm9
	
	/* If y + 2 < height (tested even when y == 0) */
.vert0:
	addl    $2, %r8d
	cmpl    %edx, %r8d
	jae     .vert1
	movd     0(%rdi,%rsi,8), %xmm3      /* Down down pixel */
	movdqa  %xmm3, %xmm8
	psadbw  %xmm1, %xmm3                /* Old: |down down - down| */
	psadbw  %xmm0, %xmm8                /* New: |down down - this| */
	psubd   %xmm3, %xmm9
	paddd   %xmm8, %xmm9
	
	/* If x > 0 */
.vert1:
	testl   %ecx, %ecx
	jz      .vert2
	movd    -4(%rdi), %xmm4             /* Left pixel      */
	movd    -4(%rdi,%rsi,4), %xmm5      /* Left down pixel */
	movdqa  %xmm4, %xmm8
	psadbw  %xmm0, %xmm4                /* Old: |left - this| */
	psadbw  %xmm1, %xmm8                /* New: |left - down| */
	psubd   %xmm4, %xmm9
	paddd   %xmm8, %xmm9
	movdqa  %xmm5, %xmm8
	psadbw  %xmm1, %xmm5                /* Old: |left down - down| */
	psadbw  %xmm0, %xmm8                /* New: |left down - this| */
	psubd   %xmm5, %xmm9
	paddd   %xmm8, %xmm9
	
	/* If x + 1 < width (tested even when x == 0) */
.vert2:
	incl    %ecx
	cmpl    %esi, %ecx
	jae     .vert3
	movd     4(%rdi), %xmm6             /* Right pixel      */
	movd     4(%rdi,%rsi,4), %xmm7      /* Right down pixel */
	movdqa  %xmm6, %xmm8
	psadbw  %xmm0, %xmm6                /* Old: |right - this| */
	psadbw  %xmm1, %xmm8                /* New: |right - down| */
	psubd   %xmm6, %xmm9
	paddd   %xmm8, %xmm9
	movdqa  %xmm7, %xmm8
	psadbw  %xmm1, %xmm7                /* Old: |right down - down| */
	psadbw  %xmm0, %xmm8                /* New: |right down - this| */
	psubd   %xmm7, %xmm9
	paddd   %xmm8, %xmm9
	
.vert3:
	movd    %xmm9, %eax
	retq