; alpha_blend_sse :)
;-----------------------------------------------------------------------
; alph_blend_sse -- blend a foreground pixel over a background pixel
;
; For each 8-bit colour channel c (red, green, blue):
;       out.c = (bg.c * rem + fg.c * av) >> 8
; where av  = foreground alpha (0x00..0xFF)
;       rem = 255 - av
; The ">> 8" stands in for "/ 255", so a channel can come out one lower
; than the exact quotient (e.g. av = 0xFF gives (fg.c * 255) >> 8).
;
; Calling convention: custom, register based (not SysV / Win64 argument
; registers). Only caller-saved registers are modified.
;
; In:    eax = background colour 0x00RRGGBB (top byte is ignored)
;        edx = foreground colour 0xRRGGBBAA (AA = alpha)
; Out:   eax = blended colour    0x00RRGGBB (top byte always zero)
; Clobb: ecx, edx, xmm0-xmm3, flags
; Needs: SSE2
;
; Example:
;       mov     eax, 0x00ABCDEF
;       mov     edx, 0xAAAAAADD
;       call    alph_blend_sse          ; eax = 0x00A9ADB2
;-----------------------------------------------------------------------
alph_blend_sse:
        movzx   ecx, dl                 ; ecx = av (foreground alpha)
        shr     edx, 8                  ; edx = foreground as 0x00RRGGBB

        movd    xmm0, eax               ; xmm0 = background pixel
        movd    xmm1, edx               ; xmm1 = foreground pixel
        pxor    xmm3, xmm3              ; zero, used to widen bytes
        punpcklbw xmm0, xmm3            ; bg bytes -> words [B, G, R, Z]
        punpcklbw xmm1, xmm3            ; fg bytes -> words [B, G, R, 0]

        movd    xmm2, ecx
        pshuflw xmm2, xmm2, 0           ; xmm2 = [av, av, av, av]
        xor     ecx, 0xFF               ; ecx = rem = 255 - av
        movd    xmm3, ecx
        pshuflw xmm3, xmm3, 0           ; xmm3 = [rem, rem, rem, rem]

        ; Every product is <= 255*255 and av + rem = 255, so neither the
        ; 16-bit multiplies nor the 16-bit add can overflow.
        pmullw  xmm0, xmm3              ; bg.c * rem
        pmullw  xmm1, xmm2              ; fg.c * av
        paddw   xmm0, xmm1              ; integer add (no dependence on MXCSR)
        psrlw   xmm0, 8                 ; >> 8, each word now <= 254

        packuswb xmm0, xmm0             ; words -> bytes (no saturation occurs)
        movd    eax, xmm0
        and     eax, 0x00FFFFFF         ; force the unused top byte to zero
        ret