News:

MASM32 SDK Description, downloads and other helpful links
MASM32.com New Forum Link
masmforum WebSite

SSE Instruction set

Started by Sundervine, August 31, 2008, 09:55:03 PM

Previous topic - Next topic

Sundervine

I am looking for any documentation on the current sse instructions and their implementation in HLA. Any help or web addresses would be wonderful. Thank you very much for any help you can provide!

Randall Hyde

Quote from: Sundervine on August 31, 2008, 09:55:03 PM
I am looking for any documentation on the current sse instructions and their implementation in HLA. Any help or web addresses would be wonderful. Thank you very much for any help you can provide!

HLA supports all the SSE instructions up to (but not including) the very latest instructions (like blend*).  HLA v1.105 will add support for the brand-new instructions just added.

I believe the HLA documentation is up-to-date on the instructions supported.  Of course, it's only a reference providing the syntax, it does not describe how to actually use the instructions.
hLater,
Randy Hyde


BlackVortex

For the general usage of SSE instructions, I guess the Intel manuals would be helpful (6 free PDF files available for download)

Sundervine

#3
Thank you very much for the information. I will download the Intel information as soon as I find it. Again, thank you both!
I noticed a few other people seem to be looking for this information so I will at least put the HLA Information here.

SSE Instructions
HLA supports the following SSE and SSE/2 instructions found on the Pentium III, IV, and later processors (note that some instructions are only available on Pentium IV and later processors; see the Intel reference manuals for details):
HLA uses the symbols xmm0, xmm1, ..., xmm7 for the SSE register set.

SSE Instrs:

addsd( sseReg/mem64, sseReg );
addpd( sseReg/mem128, sseReg );
addps( sseReg/mem128, sseReg );
addss( sseReg/mem32, sseReg );
andnpd( sseReg/mem128, sseReg );
andnps( sseReg/mem128, sseReg );
andpd( sseReg/mem128, sseReg );
andps( sseReg/mem128, sseReg );

clflush( mem8 );

cmppd( imm8, sseReg/mem128, sseReg );
cmpps( imm8, sseReg/mem128, sseReg );
cmpsdp( imm8, sseReg/mem64, sseReg );
cmpss( imm8, sseReg/mem32, sseReg );
cmpeqss( sseReg, sseReg );
cmpltss( sseReg, sseReg );
cmpless( sseReg, sseReg );
cmpneqss( sseReg, sseReg );
cmpnltss( sseReg, sseReg );
cmpnless( sseReg, sseReg );
cmpordss( sseReg, sseReg );
cmpunordss( sseReg, sseReg );
cmpeqsd( sseReg, sseReg );
cmpltsd( sseReg, sseReg );
cmplesd( sseReg, sseReg );
cmpneqsd( sseReg, sseReg );
cmpnltsd( sseReg, sseReg );
cmpnlesd( sseReg, sseReg );
cmpordsd( sseReg, sseReg );
cmpunordsd( sseReg, sseReg );

cmpeqps( sseReg, sseReg );
cmpltps( sseReg, sseReg );
cmpleps( sseReg, sseReg );
cmpneqps( sseReg, sseReg );
cmpnltps( sseReg, sseReg );
cmpnleps( sseReg, sseReg );
cmpordps( sseReg, sseReg );
cmpunordps( sseReg, sseReg );

cmpeqpd( sseReg, sseReg );
cmpltpd( sseReg, sseReg );
cmplepd( sseReg, sseReg );
cmpneqpd( sseReg, sseReg );
cmpnltpd( sseReg, sseReg );
cmpnlepd( sseReg, sseReg );
cmpordpd( sseReg, sseReg );
cmpunordpd( sseReg, sseReg );

comisd( sseReg/mem64, sseReg );
comiss( sseReg/mem32, sseReg );
cvtdq2pd( sseReg/mem64, sseReg );
cvtdq2pq
cvtdq2ps( sseReg/mem128, sseReg );
cvtpd2dq( sseReg/mem128, sseReg );
cvtpd2pi( sseReg/mem128, mmxReg );
cvtpd2ps( sseReg/mem128, sseReg );
cvtpi2pd( mmxReg/mem64, sseReg );
cvtpi2ps( mmxReg/mem64, sseReg );
cvtpi2ss
cvtps2dq( sseReg/mem128, sseReg );
cvtps2pd( sseReg/mem64, sseReg );
cvtps2pi( sseReg/mem64, mmxReg );
cvtsd2si( sseReg/mem64, Reg32 );
cvtsi2sd( Reg32/mem32, sseReg );
cvtsi2ss( Reg32/mem32, sseReg );
cvtss2sd( sseReg/mem32, sseReg );
cvtsd2ss( sseReg/mem64, sseReg );
cvtss2si( sseReg/mem32, Reg32 );
cvttpd2pi( sseReg/mem128, mmxReg );
cvttpd2dq( sseReg/mem128, sseReg );
cvttps2dq( sseReg/mem128, sseReg );
cvttps2pi( sseReg/mem64, mmxReg );
cvttsd2si( sseReg/mem64, Reg32 );
cvttss2si( sseReg/mem32, Reg32 );

divpd( sseReg/mem128, sseReg );
divps( sseReg/mem128, sseReg );
divsd( sseReg/mem64, sseReg );
divss( sseReg/mem32, sseReg );
fxsave( mem512 );
fxrstor( mem512 );
ldmxcsr( mem32 );
lfence

maskmovdqu( sseReg, sseReg );
maskmovq( mmxReg, mmxReg );
maxpd( sseReg/mem128, sseReg );
maxps( sseReg/mem128, sseReg );
maxsd( sseReg/mem64, sseReg );
maxss( sseReg/mem32, sseReg );

mfence

minpd( sseReg/mem128, sseReg );
minps( sseReg/mem128, sseReg );
minsd( sseReg/mem64, sseReg );
minss( sseReg/mem32, sseReg );

movapd( sseReg/mem128, sseReg );
movapd( sseReg, sseReg/mem128 );
movaps( sseReg/mem128, sseReg );
movaps( sseReg, sseReg/mem128 );
movdqa( sseReg/mem128, sseReg );
movdqa( sseReg, sseReg/mem128 );
movdqu( sseReg/mem128, sseReg );
movdqu( sseReg, sseReg/mem128 );
movdq2q( sseReg, mmxReg );
movhlps( sseReg, sseReg );
movhpd( mem64, sseReg );
movhpd( sseReg, mem64 );
movhps( mem64, sseReg );
movhps( sseReg, mem64 );
movlpd( mem64, sseReg );
movlpd( sseReg, mem64 );
movlps( mem64, sseReg );
movlps( sseReg, mem64 );
movlhps( sseReg, sseReg );
movmskpd( sseReg, Reg32 );
movmskps( sseReg, Reg32 );
movnti( Reg32, mem32 );
movntpd( sseReg, mem128 );
movntps( sseReg, mem128 );
movntq( mmxReg, mem64 );
movntdq( sseReg, mem128 );
movq2dq( mmxReg, sseReg );
movsdp( sseReg, sseReg );
movsdp( mem64, sseReg );
movsdp( sseReg, mem64 );
movss( sseReg, sseReg );
movss( mem32, sseReg );
movss( sseReg, mem32 );
movupd( sseReg, sseReg );
movupd( sseReg, mem128 );
movupd( mem128, sseReg );
movups( sseReg, sseReg );
movups( sseReg, mem128 );
movups( mem128, sseReg );

mulpd( sseReg/mem128, sseReg );
mulps( sseReg/mem128, sseReg );
mulss( sseReg/mem32, sseReg );
mulsd( sseReg/mem64, sseReg );

orpd( sseReg/mem128, sseReg );
orps( sseReg/mem128, sseReg );

pause

pmuludq( mmxReg/mem64, mmxReg );
pmuludq( sseReg/mem128, sseReg );

prefetcht0( mem8 );
prefetcht1( mem8 );
prefetcht2( mem8 );
prefetchnta( mem8 );

pshufd( imm8, sseReg/mem128, sseReg );
pslldq( imm8, sseReg );
psrldq( imm8, sseReg );
punpckhqdq( sseReg/mem128, sseReg );
punpcklqdq( sseReg/mem128, sseReg );

rcpps( sseReg/mem128, sseReg );
rcpss( sseReg/mem32, sseReg );
rsqrtps( sseReg/mem128, sseReg );
rsqrtss( sseReg/mem32, sseReg );

sfence;

shufpd( imm8, sseReg/mem128, sseReg );
shufps( imm8, sseReg/mem128, sseReg );
sqrtpd( sseReg/mem128, sseReg );
sqrtps( sseReg/mem128, sseReg );
sqrtsd( sseReg/mem64, sseReg );
sqrtss( sseReg/mem32, sseReg );

stmxcsr( mem32 );

subps( sseReg/mem128, sseReg );
subpd( sseReg/mem128, sseReg );
subsd( sseReg/mem64, sseReg );
subss( sseReg/mem32, sseReg );

ucomisd( sseReg/mem64, sseReg );
ucomiss( sseReg/mem32, sseReg );

unpckhpd( sseReg/mem128, sseReg );
unpckhps( sseReg/mem128, sseReg );
unpcklpd( sseReg/mem128, sseReg );
unpcklps( sseReg/mem128, sseReg );

xorpd( sseReg/mem128, sseReg );
xorps( sseReg/mem128, sseReg );