After a frustrating battle with a C symbol library, I decided to just clone the routines I need.
;-----------------------------------------------------------------------
; memset(address, character, amount)
;   C equivalent (approx.): void *memset(void *address, int character,
;                                        size_t amount)
; ABI:   32-bit, arguments passed on the stack, caller removes them
; In:    [ebp+8]   = address   - start of the buffer to fill
;        [ebp+0Ch] = character - fill value, only the low byte is used
;        [ebp+10h] = amount    - number of bytes to fill
; Out:   eax = address
; Clobb: ecx, edx and flags may be modified; edi is saved and restored
; Note:  no CLD is issued here, so the direction flag is assumed to be
;        clear on entry (string stores walk upward through memory).
;-----------------------------------------------------------------------
memset: ;(address,character,amount)
        push    ebp
        mov     ebp,esp
        mov     ecx,[ebp+10h]           ;ecx = amount
        jecxz   memset_done             ;nothing to fill, just return address
        push    edi
        mov     eax,[ebp+0Ch]           ;character (whole argument slot)
        and     eax,0FFh                ;keep only the fill byte
        imul    eax,eax,01010101h       ;copy the byte to all four bytes of EAX
        mov     edi,[ebp+8]             ;edi = target
        mov     edx,edi
        neg     edx
        and     edx,3                   ;edx = bytes needed to reach a 4-byte boundary
        jz      short memset_dwords     ;already aligned
        cmp     ecx,edx
        jb      short memset_remainder  ;too short to reach the boundary: byte fill only
        sub     ecx,edx                 ;ecx = bytes left once aligned
        xchg    edx,ecx                 ;ecx = alignment bytes, edx = bytes left
        rep stosb
        mov     ecx,edx
memset_dwords: ;EDI is now a multiple of 4
        mov     edx,ecx                 ;remember the byte count for the tail
        shr     ecx,2                   ;ecx = whole dwords
        rep stosd
        mov     ecx,edx
        and     ecx,3                   ;ecx = 0..3 trailing bytes
memset_remainder:
        rep stosb
        pop     edi
memset_done:
        mov     eax,[ebp+8]             ;return the original address
        pop     ebp
        ret     ;leaving 12 bytes of input on the stack
A variation:
;-----------------------------------------------------------------------
; memzero(address, amount)
;   C equivalent (approx.): void *memzero(void *address, size_t amount)
; ABI:   32-bit, arguments passed on the stack, caller removes them
; In:    [ebp+8]   = address - start of the buffer to clear
;        [ebp+0Ch] = amount  - number of bytes to clear
; Out:   eax = address
; Clobb: ecx, edx and flags may be modified; edi is saved and restored
; Note:  no CLD is issued here, so the direction flag is assumed to be
;        clear on entry (string stores walk upward through memory).
;-----------------------------------------------------------------------
memzero: ;(address,amount)
        push    ebp
        mov     ebp,esp
        mov     ecx,[ebp+0Ch]           ;ecx = amount
        jecxz   memzero_done            ;nothing to clear, just return address
        push    edi
        xor     eax,eax                 ;fill value: zero in all four bytes
        mov     edi,[ebp+8]             ;edi = target
        mov     edx,edi
        neg     edx
        and     edx,3                   ;edx = bytes needed to reach a 4-byte boundary
        jz      short memzero_dwords    ;already aligned
        cmp     ecx,edx
        jb      short memzero_remainder ;too short to reach the boundary: byte fill only
        sub     ecx,edx                 ;ecx = bytes left once aligned
        xchg    edx,ecx                 ;ecx = alignment bytes, edx = bytes left
        rep stosb
        mov     ecx,edx
memzero_dwords: ;EDI is now a multiple of 4
        mov     edx,ecx                 ;remember the byte count for the tail
        shr     ecx,2                   ;ecx = whole dwords
        rep stosd
        mov     ecx,edx
        and     ecx,3                   ;ecx = 0..3 trailing bytes
memzero_remainder:
        rep stosb
        pop     edi
memzero_done:
        mov     eax,[ebp+8]             ;return the original address
        pop     ebp
        ret     ;leaving 8 bytes of input on the stack
Check this thread for some other fast zeroing algorithms:
http://www.masm32.com/board/index.php?topic=6576.0