The MASM Forum Archive 2004 to 2012

Project Support Forums => HLA Forum => Topic started by: Sundervine on August 31, 2008, 09:55:03 PM

Title: SSE Instruction set
Post by: Sundervine on August 31, 2008, 09:55:03 PM
I am looking for any documentation on the current sse instructions and their implementation in HLA. Any help or web addresses would be wonderful. Thank you very much for any help you can provide!
Title: Re: SSE Instruction set
Post by: Randall Hyde on September 01, 2008, 09:07:55 PM
Quote from: Sundervine on August 31, 2008, 09:55:03 PM
I am looking for any documentation on the current sse instructions and their implementation in HLA. Any help or web addresses would be wonderful. Thank you very much for any help you can provide!

HLA supports all the SSE instructions up to (but not including) the very latest instructions (like blend*).  HLA v1.105 will add support for the brand-new instructions just added.

I believe the HLA documentation is up-to-date on the instructions supported.  Of course, it is only a reference providing the syntax; it does not describe how to actually use the instructions.
hLater,
Randy Hyde

Title: Re: SSE Instruction set
Post by: BlackVortex on September 01, 2008, 10:39:51 PM
For the general usage of SSE instructions, I guess the Intel manuals would be helpful (6 free PDF files available for download).
Title: Re: SSE Instruction set
Post by: Sundervine on September 02, 2008, 02:54:57 AM
Thank you very much for the information. I will download the Intel information as soon as I find it. Again, thank you both!
I noticed a few other people seem to be looking for this information so I will at least put the HLA Information here.

SSE Instructions
HLA supports the following SSE and SSE/2 instructions found on the Pentium III, IV, and later processors (note that some instructions are only available on Pentium IV and later processors; see the Intel reference manuals for details):
HLA uses the symbols xmm0, xmm1, ..., xmm7 for the SSE register set.

SSE Instrs:

addsd( sseReg/mem64, sseReg );
addpd( sseReg/mem128, sseReg );
addps( sseReg/mem128, sseReg );
addss( sseReg/mem32, sseReg );
andnpd( sseReg/mem128, sseReg );
andnps( sseReg/mem128, sseReg );
andpd( sseReg/mem128, sseReg );
andps( sseReg/mem128, sseReg );

clflush( mem8 );

cmppd( imm8, sseReg/mem128, sseReg );
cmpps( imm8, sseReg/mem128, sseReg );
cmpsdp( imm8, sseReg/mem64, sseReg );
cmpss( imm8, sseReg/mem32, sseReg );
cmpeqss( sseReg, sseReg );
cmpltss( sseReg, sseReg );
cmpless( sseReg, sseReg );
cmpneqss( sseReg, sseReg );
cmpnltss( sseReg, sseReg );
cmpnless( sseReg, sseReg );
cmpordss( sseReg, sseReg );
cmpunordss( sseReg, sseReg );
cmpeqsd( sseReg, sseReg );
cmpltsd( sseReg, sseReg );
cmplesd( sseReg, sseReg );
cmpneqsd( sseReg, sseReg );
cmpnltsd( sseReg, sseReg );
cmpnlesd( sseReg, sseReg );
cmpordsd( sseReg, sseReg );
cmpunordsd( sseReg, sseReg );

cmpeqps( sseReg, sseReg );
cmpltps( sseReg, sseReg );
cmpleps( sseReg, sseReg );
cmpneqps( sseReg, sseReg );
cmpnltps( sseReg, sseReg );
cmpnleps( sseReg, sseReg );
cmpordps( sseReg, sseReg );
cmpunordps( sseReg, sseReg );

cmpeqpd( sseReg, sseReg );
cmpltpd( sseReg, sseReg );
cmplepd( sseReg, sseReg );
cmpneqpd( sseReg, sseReg );
cmpnltpd( sseReg, sseReg );
cmpnlepd( sseReg, sseReg );
cmpordpd( sseReg, sseReg );
cmpunordpd( sseReg, sseReg );

comisd( sseReg/mem64, sseReg );
comiss( sseReg/mem32, sseReg );
cvtdq2pd( sseReg/mem64, sseReg );
cvtdq2pq
cvtdq2ps( sseReg/mem128, sseReg );
cvtpd2dq( sseReg/mem128, sseReg );
cvtpd2pi( sseReg/mem128, mmxReg );
cvtpd2ps( sseReg/mem128, sseReg );
cvtpi2pd( mmxReg/mem64, sseReg );
cvtpi2ps( mmxReg/mem64, sseReg );
cvtpi2ss
cvtps2dq( sseReg/mem128, sseReg );
cvtps2pd( sseReg/mem64, sseReg );
cvtps2pi( sseReg/mem64, mmxReg );
cvtsd2si( sseReg/mem64, Reg32 );
cvtsi2sd( Reg32/mem32, sseReg );
cvtsi2ss( Reg32/mem32, sseReg );
cvtss2sd( sseReg/mem32, sseReg );
cvtsd2ss( sseReg/mem64, sseReg );
cvtss2si( sseReg/mem32, Reg32 );
cvttpd2pi( sseReg/mem128, mmxReg );
cvttpd2dq( sseReg/mem128, sseReg );
cvttps2dq( sseReg/mem128, sseReg );
cvttps2pi( sseReg/mem64, mmxReg );
cvttsd2si( sseReg/mem64, Reg32 );
cvttss2si( sseReg/mem32, Reg32 );

divpd( sseReg/mem128, sseReg );
divps( sseReg/mem128, sseReg );
divsd( sseReg/mem64, sseReg );
divss( sseReg/mem32, sseReg );
fxsave( mem512 );
fxrstor( mem512 );
ldmxcsr( mem32 );
lfence;

maskmovdqu( sseReg, sseReg );
maskmovq( mmxReg, mmxReg );
maxpd( sseReg/mem128, sseReg );
maxps( sseReg/mem128, sseReg );
maxsd( sseReg/mem64, sseReg );
maxss( sseReg/mem32, sseReg );

mfence;

minpd( sseReg/mem128, sseReg );
minps( sseReg/mem128, sseReg );
minsd( sseReg/mem64, sseReg );
minss( sseReg/mem32, sseReg );

movapd( sseReg/mem128, sseReg );
movapd( sseReg, sseReg/mem128 );
movaps( sseReg/mem128, sseReg );
movaps( sseReg, sseReg/mem128 );
movdqa( sseReg/mem128, sseReg );
movdqa( sseReg, sseReg/mem128 );
movdqu( sseReg/mem128, sseReg );
movdqu( sseReg, sseReg/mem128 );
movdq2q( sseReg, mmxReg );
movhlps( sseReg, sseReg );
movhpd( mem64, sseReg );
movhpd( sseReg, mem64 );
movhps( mem64, sseReg );
movhps( sseReg, mem64 );
movlpd( mem64, sseReg );
movlpd( sseReg, mem64 );
movlps( mem64, sseReg );
movlps( sseReg, mem64 );
movlhps( sseReg, sseReg );
movmskpd( sseReg, Reg32 );
movmskps( sseReg, Reg32 );
movnti( Reg32, mem32 );
movntpd( sseReg, mem128 );
movntps( sseReg, mem128 );
movntq( mmxReg, mem64 );
movntdq( sseReg, mem128 );
movq2dq( mmxReg, sseReg );
movsdp( sseReg, sseReg );
movsdp( mem64, sseReg );
movsdp( sseReg, mem64 );
movss( sseReg, sseReg );
movss( mem32, sseReg );
movss( sseReg, mem32 );
movupd( sseReg, sseReg );
movupd( sseReg, mem128 );
movupd( mem128, sseReg );
movups( sseReg, sseReg );
movups( sseReg, mem128 );
movups( mem128, sseReg );

mulpd( sseReg/mem128, sseReg );
mulps( sseReg/mem128, sseReg );
mulss( sseReg/mem32, sseReg );
mulsd( sseReg/mem64, sseReg );

orpd( sseReg/mem128, sseReg );
orps( sseReg/mem128, sseReg );

pause;

pmuludq( mmxReg/mem64, mmxReg );
pmuludq( sseReg/mem128, sseReg );

prefetcht0( mem8 );
prefetcht1( mem8 );
prefetcht2( mem8 );
prefetchnta( mem8 );

pshufd( imm8, sseReg/mem128, sseReg );
pslldq( imm8, sseReg );
psrldq( imm8, sseReg );
punpckhqdq( sseReg/mem128, sseReg );
punpcklqdq( sseReg/mem128, sseReg );

rcpps( sseReg/mem128, sseReg );
rcpss( sseReg/mem32, sseReg );
rsqrtps( sseReg/mem128, sseReg );
rsqrtss( sseReg/mem32, sseReg );

sfence;

shufpd( imm8, sseReg/mem128, sseReg );
shufps( imm8, sseReg/mem128, sseReg );
sqrtpd( sseReg/mem128, sseReg );
sqrtps( sseReg/mem128, sseReg );
sqrtsd( sseReg/mem64, sseReg );
sqrtss( sseReg/mem32, sseReg );

stmxcsr( mem32 );

subps( sseReg/mem128, sseReg );
subpd( sseReg/mem128, sseReg );
subsd( sseReg/mem64, sseReg );
subss( sseReg/mem32, sseReg );

ucomisd( sseReg/mem64, sseReg );
ucomiss( sseReg/mem32, sseReg );

unpckhpd( sseReg/mem128, sseReg );
unpckhps( sseReg/mem128, sseReg );
unpcklpd( sseReg/mem128, sseReg );
unpcklps( sseReg/mem128, sseReg );

xorpd( sseReg/mem128, sseReg );
xorps( sseReg/mem128, sseReg );