News:

MASM32 SDK Description, downloads and other helpful links
MASM32.com New Forum Link
masmforum WebSite

Optimize ASM code produced by MSVC 6

Started by Seb, June 08, 2005, 01:21:17 AM

Previous topic - Next topic

Seb

Hello!

I have started to combine ASM with C, and it works great ATM. However, I do not really gain any speed through my ASM code because it's taken straight from the listing files (browse info) that MSVC 6 produces upon request. Could any kind soul give me a hand in optimizing some parts of my code (and also explain how and why it's optimized)? The ASM code is found below.

Note: This is no "oh plz code that for me" thread. I simply ask for help on optimizing my current ASM code (I know that the code can be optimized).



; MASM syntax, 32-bit x86.
; Enable the 486 instruction set and use the flat memory model.
.486
.model flat

; PublicAlias <name>
; Defines <name> as a near code label at the current assembly position and
; exports it. Invoked immediately after a PROC line, it gives that procedure a
; second public name referring to the same address.
PublicAlias MACRO MangledName ; macro for giving a function alias public names
        MangledName label near
        public MangledName
ENDM

.code

; Declare the attributes of the _TEXT code segment once (paragraph aligned,
; 32-bit, public combine type, class 'CODE'); it is reopened later by name.
_TEXT SEGMENT PARA USE32 PUBLIC 'CODE'
_TEXT ENDS

; ---------------------------------------------------------------------------
; predictor_decompress_fir_adapt  (alias: _predictor_decompress_fir_adapt)
;
; Seven dword arguments are read from the stack. Every exit is a plain `ret`,
; so the arguments are left on the stack for the caller to remove.
; ebx, ebp, esi and edi are pushed on entry and popped on every exit path;
; eax, ecx and edx are overwritten.
;
; Stack layout once the prologue has finished (sub esp,16 plus four pushes):
;   [esp+16]  = -16+[esp+32]  byte offset (predictor_coef_num+1)*4
;   [esp+20]  = _i$[esp+32]   outer loop counter i
;   [esp+24]  =  -8+[esp+32]  rounding term 1 << (predictor_quantitization-1)
;   [esp+28]  =  -4+[esp+32]  shift count 32 - readsamplesize
;   [esp+32]  return address
;   [esp+36]+ arguments, addressed as _name$[esp+28]
; ---------------------------------------------------------------------------
_TEXT SEGMENT
; Argument offsets, relative to esp+28 once the prologue has finished.
_error_buffer$ = 8
_buffer_out$ = 12
_output_size$ = 16
_readsamplesize$ = 20
_predictor_coef_table$ = 24
_predictor_coef_num$ = 28
_predictor_quantitization$ = 32
; Local offset, relative to esp+32.
_i$ = -12
; Same value as _readsamplesize$: the running sum is kept in that argument's
; stack slot, which is no longer read once the shift count has been saved.
_sum$1587 = 20
predictor_decompress_fir_adapt PROC NEAR
PUBLIC predictor_decompress_fir_adapt
PublicAlias _predictor_decompress_fir_adapt
mov edx, DWORD PTR _error_buffer$[esp-4] ; edx = error_buffer (esp not yet adjusted)
sub esp, 16 ; 00000010H - room for the four dword locals
mov eax, DWORD PTR [edx] ; eax = error_buffer[0]
push ebx
mov ebx, DWORD PTR _buffer_out$[esp+16] ; ebx = buffer_out
push ebp
push esi
push edi
mov edi, DWORD PTR _predictor_coef_num$[esp+28] ; edi = predictor_coef_num
mov DWORD PTR [ebx], eax ; buffer_out[0] = error_buffer[0]
test edi, edi
jne SHORT $L1567 ; predictor_coef_num != 0

; --- predictor_coef_num == 0: copy the remaining output_size-1 dwords ---
mov eax, DWORD PTR _output_size$[esp+28]
cmp eax, 1
jle $L1585 ; output_size <= 1: nothing more to do
lea ecx, DWORD PTR [eax*4-4] ; ecx = (output_size-1)*4 bytes
lea esi, DWORD PTR [edx+4] ; source = error_buffer+1
mov edx, ecx ; keep the byte count for the tail copy
lea edi, DWORD PTR [ebx+4] ; destination = buffer_out+1
shr ecx, 2 ; dword count
rep movsd
mov ecx, edx
and ecx, 3 ; leftover bytes (always 0 here: the count is a multiple of 4)
rep movsb
pop edi
pop esi
pop ebp
pop ebx
add esp, 16 ; 00000010H
ret
$L1567:
cmp edi, 31 ; 0000001fH
jne SHORT $L1569 ; predictor_coef_num != 31

; --- predictor_coef_num == 31: each output = previous output + error ---
mov eax, DWORD PTR _output_size$[esp+28]
cmp eax, 1
jle $L1585
lea esi, DWORD PTR [eax-1] ; esi = iterations left = output_size-1
test esi, esi
jle $L1585
mov ebp, DWORD PTR _readsamplesize$[esp+28]
mov ecx, 32 ; 00000020H
sub ecx, ebp ; cl = 32 - readsamplesize
lea eax, DWORD PTR [ebx+4] ; eax = &buffer_out[1]
sub edx, ebx ; edx = error_buffer - buffer_out, so [edx+eax] is the matching error_buffer entry
$L1571:
mov edi, DWORD PTR [edx+eax] ; error_buffer[i+1]
mov ebp, DWORD PTR [eax-4] ; buffer_out[i]
add edi, ebp
add eax, 4
shl edi, cl ; shl/sar pair sign-extends the low readsamplesize bits
sar edi, cl
dec esi
mov DWORD PTR [eax-4], edi ; buffer_out[i+1]; mov leaves the flags from dec intact
jne SHORT $L1571
pop edi
pop esi
pop ebp
pop ebx
add esp, 16 ; 00000010H
ret
$L1569:
test edi, edi
jle $L1585 ; predictor_coef_num < 0: nothing more to do

; --- warm-up: the first predictor_coef_num outputs, same formula as above ---
mov eax, DWORD PTR _readsamplesize$[esp+28]
mov ecx, 32 ; 00000020H
sub ecx, eax ; cl = 32 - readsamplesize
mov esi, edx
mov DWORD PTR -4+[esp+32], ecx ; save the shift count for the general loop
lea eax, DWORD PTR [ebx+4] ; eax = &buffer_out[1]
sub esi, ebx ; esi = error_buffer - buffer_out
$L1578:
mov ebp, DWORD PTR [eax+esi] ; error_buffer[i+1]
add ebp, DWORD PTR [eax-4] ; + buffer_out[i]
add eax, 4
shl ebp, cl
sar ebp, cl
dec edi ; edi counts down from predictor_coef_num
mov DWORD PTR [eax-4], ebp ; buffer_out[i+1]
jne SHORT $L1578

; --- general case: i runs from predictor_coef_num+1 up to output_size-1 ---
mov eax, DWORD PTR _predictor_coef_num$[esp+28]
mov ecx, DWORD PTR _output_size$[esp+28]
inc eax ; eax = i = predictor_coef_num+1
cmp eax, ecx
mov DWORD PTR _i$[esp+32], eax
jge $L1585 ; i >= output_size: done
mov ecx, DWORD PTR _predictor_quantitization$[esp+28]
mov esi, 1
dec ecx
shl esi, cl ; esi = 1 << (predictor_quantitization-1)
lea ecx, DWORD PTR [eax*4] ; byte offset of buffer_out[predictor_coef_num+1]
mov DWORD PTR -16+[esp+32], ecx
mov DWORD PTR -8+[esp+32], esi ; save the rounding term
mov esi, DWORD PTR _predictor_coef_table$[esp+28] ; esi = predictor_coef_table for the rest of the routine
jmp SHORT $L1583
$L2190:
; Later iterations re-enter here: edx and ebx were used as scratch, and the
; buffer_out argument slot was advanced at $L1605.
mov edx, DWORD PTR _error_buffer$[esp+28]
mov ebx, DWORD PTR _buffer_out$[esp+28]
$L1583:
mov ecx, DWORD PTR -16+[esp+32]
mov edi, DWORD PTR [edx+eax*4] ; edi = error_val = error_buffer[i]
mov edx, DWORD PTR [ebx] ; edx = buffer_out[0]
mov DWORD PTR _sum$1587[esp+28], 0 ; sum = 0
xor eax, eax ; eax = j = 0
lea ecx, DWORD PTR [ecx+ebx-4] ; ecx = &buffer_out[predictor_coef_num]
$L1590:
; sum += (buffer_out[predictor_coef_num-j] - buffer_out[0]) * predictor_coef_table[j]
movsx ebx, WORD PTR [esi+eax*2] ; coefficient j, sign-extended from 16 bits
mov ebp, DWORD PTR [ecx]
sub ecx, 4 ; step back one sample
sub ebp, edx
imul ebx, ebp
mov ebp, DWORD PTR _sum$1587[esp+28] ; sum is reloaded from and stored to memory every pass
add ebp, ebx
mov ebx, DWORD PTR _predictor_coef_num$[esp+28]
inc eax
mov DWORD PTR _sum$1587[esp+28], ebp
cmp eax, ebx
jl SHORT $L1590

; outval = ((rounding + sum) >> predictor_quantitization) + buffer_out[0] + error_val,
; then sign-extended from readsamplesize bits
mov eax, DWORD PTR -8+[esp+32]
mov ecx, ebp
lea ebp, DWORD PTR [eax+ecx]
mov ecx, DWORD PTR _predictor_quantitization$[esp+28]
sar ebp, cl
mov ecx, DWORD PTR -4+[esp+32] ; cl = 32 - readsamplesize
mov eax, DWORD PTR _buffer_out$[esp+28]
add ebp, edx
mov edx, DWORD PTR -16+[esp+32]
add ebp, edi
shl ebp, cl
sar ebp, cl
test edi, edi ; flags for the sign of error_val; the mov below does not change them
mov DWORD PTR [edx+eax], ebp ; buffer_out[predictor_coef_num+1] = outval
jle SHORT $L2193 ; error_val <= 0

; --- error_val > 0: adjust coefficients from the last one downwards ---
lea edx, DWORD PTR [ebx-1] ; edx = predictor_num = predictor_coef_num-1
test edx, edx
jl $L1605
sub ebx, edx
mov ebp, ebx ; ebp = predictor_coef_num - predictor_num (starts at 1)
lea ebx, DWORD PTR [eax+ebx*4] ; ebx = &buffer_out[predictor_coef_num - predictor_num]
$L1596:
test edi, edi
jle SHORT $L1605 ; stop once error_val <= 0
mov eax, DWORD PTR _buffer_out$[esp+28]
mov ecx, DWORD PTR [eax] ; buffer_out[0]
mov eax, DWORD PTR [ebx]
sub ecx, eax ; ecx = val
jns SHORT $L2174
or eax, -1 ; val < 0: sign = -1
jmp SHORT $L2175
$L2174:
xor eax, eax
test ecx, ecx
setg al ; sign = 1 if val > 0, otherwise 0
$L2175:
sub WORD PTR [esi+edx*2], ax ; predictor_coef_table[predictor_num] -= sign
imul eax, ecx ; val * sign
mov ecx, DWORD PTR _predictor_quantitization$[esp+28]
add ebx, 4
sar eax, cl
imul eax, ebp ; * (predictor_coef_num - predictor_num)
sub edi, eax ; error_val -= ...
dec edx ; predictor_num--
inc ebp
test edx, edx
jge SHORT $L1596
jmp SHORT $L1605
$L2193:
jge SHORT $L1605 ; flags still from `test edi, edi`: error_val == 0, no adjustment

; --- error_val < 0: same walk, with the sign negated ---
lea edx, DWORD PTR [ebx-1] ; edx = predictor_num = predictor_coef_num-1
test edx, edx
jl SHORT $L1605
sub ebx, edx
mov ebp, ebx ; ebp = predictor_coef_num - predictor_num (starts at 1)
lea ebx, DWORD PTR [eax+ebx*4]
$L1604:
test edi, edi
jge SHORT $L1605 ; stop once error_val >= 0
mov ecx, DWORD PTR _buffer_out$[esp+28]
mov eax, DWORD PTR [ebx]
mov ecx, DWORD PTR [ecx] ; buffer_out[0]
sub ecx, eax ; ecx = val
jns SHORT $L2176
or eax, -1
jmp SHORT $L2177
$L2176:
xor eax, eax
test ecx, ecx
setg al
$L2177:
neg eax ; sign = -SIGN_ONLY(val)
sub WORD PTR [esi+edx*2], ax ; predictor_coef_table[predictor_num] -= sign
imul eax, ecx ; val * sign
mov ecx, DWORD PTR _predictor_quantitization$[esp+28]
add ebx, 4
sar eax, cl
imul eax, ebp
sub edi, eax ; error_val -= ...
dec edx
inc ebp
test edx, edx
jge SHORT $L1604
$L1605:
; End of one outer iteration: buffer_out++ (kept in its argument slot), i++.
mov edi, DWORD PTR _buffer_out$[esp+28]
mov eax, DWORD PTR _i$[esp+32]
mov ecx, DWORD PTR _output_size$[esp+28]
add edi, 4
inc eax
mov DWORD PTR _buffer_out$[esp+28], edi
cmp eax, ecx
mov DWORD PTR _i$[esp+32], eax
jl $L2190 ; i < output_size: next sample
$L1585:
; Common exit: restore the saved registers and release the locals.
pop edi
pop esi
pop ebp
pop ebx
add esp, 16
ret
predictor_decompress_fir_adapt ENDP
_TEXT ENDS
END


Regards,
Sebastian Andersson

hutch--

Sebastian,

I moved the topic to the Laboratory as its the right place for an optimisation question.

What I generally do with VC code if I want it in MASM format is to build it with all of the optimisation turned OFF so that you get more registers to work with. It's very crappy code that way but far easier to optimise. You have to remove the redundant loads and stores but with careful work, you end up with better results once you have some practice at this form of optimisation.

The general idea is to set up a test piece so you know if its producing the correct results then progressively GUT the code to get it up to pace.
Download site for MASM32      New MASM Forum
https://masm32.com          https://masm32.com/board/index.php

Vortex

Hi Sebastian,

Which one is the decompression routine you mentioned in your code? Is it possible to get the original C code?

Seb

Oh, yeah, it's posted below.


// Rebuilds output_size samples in buffer_out from the prediction residuals in
// error_buffer, using an adaptive FIR predictor.
//
//   error_buffer             residuals; error_buffer[0] is the first sample itself
//   buffer_out               receives the reconstructed samples
//   output_size              number of samples to produce
//   readsamplesize           width in bits to which each result is sign-extended
//   predictor_coef_table     predictor_coef_num 16-bit coefficients; updated in
//                            place while decoding
//   predictor_coef_num       0 = residuals are copied unchanged, 0x1f = plain
//                            "previous sample + residual", otherwise the FIR order
//   predictor_quantitization right shift applied to the weighted sum
//
// buffer_out[0] and error_buffer[0] are always accessed, whatever output_size is.
static void predictor_decompress_fir_adapt(int32_t *error_buffer,
                                           int32_t *buffer_out,
                                           int output_size,
                                           int readsamplesize,
                                           int16_t *predictor_coef_table,
                                           int predictor_coef_num,
                                           int predictor_quantitization)
{
    int i;

    // first sample always copies
    *buffer_out = *error_buffer;

    if (!predictor_coef_num)
    {
        // no prediction at all: the residuals are the samples
        if (output_size <= 1) return;
        memcpy(buffer_out+1, error_buffer+1,
               (output_size-1) * sizeof(*buffer_out));
        return;
    }

    if (predictor_coef_num == 0x1f) // 11111 - max value of predictor_coef_num
    { // second-best case scenario for fir decompression,
       // error describes a small difference from the previous sample only
        for (i = 0; i < output_size - 1; i++)
        {
            int32_t val;

            val = buffer_out[i] + error_buffer[i+1];
            val = SIGN_EXTENDED32(val, readsamplesize);

            buffer_out[i+1] = val;
        }
        return;
    }

    // a negative coefficient count selects no further processing
    if (predictor_coef_num < 0) return;

    // read warm-up samples; bounded by output_size so that a frame shorter
    // than the predictor order cannot run past the end of either buffer
    for (i = 0; i < predictor_coef_num && i + 1 < output_size; i++)
    {
        int32_t val;

        val = buffer_out[i] + error_buffer[i+1];
        val = SIGN_EXTENDED32(val, readsamplesize);

        buffer_out[i+1] = val;
    }

#if 0
    // 4 and 8 are very common cases (the only ones i've seen). these
    // should be unrolled and optimised
    //
    if (predictor_coef_num == 4)
        // FIXME: optimised general case
        return;

    if (predictor_coef_num == 8)
        // FIXME: optimised general case
        return;
#endif


    // general case
    // buffer_out is advanced once per sample, so inside the loop
    // buffer_out[0] .. buffer_out[predictor_coef_num] are the most recent
    // outputs and buffer_out[predictor_coef_num+1] is the one being produced
    for (i = predictor_coef_num + 1;
         i < output_size;
         i++)
    {
        int j;
        int sum = 0;
        int outval;
        int error_val = error_buffer[i];

        for (j = 0; j < predictor_coef_num; j++)
            sum += (buffer_out[predictor_coef_num-j] - buffer_out[0]) *
                   predictor_coef_table[j];

        // round, scale, then add the base sample and the residual
        outval = (1 << (predictor_quantitization-1)) + sum;
        outval = outval >> predictor_quantitization;
        outval = outval + buffer_out[0] + error_val;
        outval = SIGN_EXTENDED32(outval, readsamplesize);

        buffer_out[predictor_coef_num+1] = outval;

        // adapt the coefficients, last one first, until the residual has
        // been used up or changes sign
        if (error_val > 0)
        {
            int predictor_num = predictor_coef_num - 1;

            while (predictor_num >= 0 && error_val > 0)
            {
                int val = buffer_out[0] - buffer_out[predictor_coef_num - predictor_num];
                int sign = SIGN_ONLY(val);

                predictor_coef_table[predictor_num] -= sign;

                val *= sign; // absolute value

                error_val -= ((val >> predictor_quantitization) *
                              (predictor_coef_num - predictor_num));

                predictor_num--;
            }
        }
        else if (error_val < 0)
        {
            int predictor_num = predictor_coef_num - 1;

            while (predictor_num >= 0 && error_val < 0)
            {
                int val = buffer_out[0] - buffer_out[predictor_coef_num - predictor_num];
                int sign = - SIGN_ONLY(val);

                predictor_coef_table[predictor_num] -= sign;

                val *= sign; // neg value

                error_val -= ((val >> predictor_quantitization) *
                              (predictor_coef_num - predictor_num));

                predictor_num--;
            }
        }

        buffer_out++;
    }
}

hutch--

Seb,

I just plugged it into a file and tried to build it as a plain C extension module with vctoolkit CL and it will not build.


test.c
H:\asm\C_test\test.c(3) : error C2143: syntax error : missing ')' before '*'
H:\asm\C_test\test.c(3) : error C2143: syntax error : missing '{' before '*'
H:\asm\C_test\test.c(4) : error C2143: syntax error : missing ';' before '*'
H:\asm\C_test\test.c(5) : error C2059: syntax error : 'type'
H:\asm\C_test\test.c(9) : error C2059: syntax error : ')'
Download site for MASM32      New MASM Forum
https://masm32.com          https://masm32.com/board/index.php

tenkey

Of course not. Without the appropriate include file, you'll need to define int32_t and int16_t.

Unfortunately, SIGN_EXTENDED32 and SIGN_ONLY appear to be macros, so without the include files, you won't get the same code.
A programming language is low level when its programs require attention to the irrelevant.
Alan Perlis, Epigram #8

hutch--

 :bg

I agree but that does not help me with the include that defines the two data types/sizes. Being an assembler barbarian I would assume DWORD and WORD but it would be better to know the data sizes and types in C.
Download site for MASM32      New MASM Forum
https://masm32.com          https://masm32.com/board/index.php

Seb

Oops, my bad. The data types are;


// Signed integer aliases used by the decoder. On the compiler discussed here
// `int` is 32 bits wide and `short` is 16 bits wide.
typedef int   int32_t;
typedef short int16_t;


And the macros are;


// Sign-extends the low `bits` bits of `val` to a full 32-bit int.
// Both arguments are parenthesized so that expressions such as `a | b` or
// `n + 8` expand as the caller intends. The left shift is carried out on an
// unsigned value so that it cannot overflow a signed int; the right shift is
// then done on the signed value to copy the sign bit back down.
// `bits` is evaluated twice. Assumes `int` is 32 bits wide and 1 <= bits <= 32.
#define SIGN_EXTENDED32(val, bits) \
    ((int)((unsigned int)(val) << (32 - (bits))) >> (32 - (bits)))

// Yields -1, 0 or 1 according to the sign of `v`.
// `v` is parenthesized everywhere so that an argument such as `a & b` is
// compared as a whole. `v` may be evaluated twice.
#define SIGN_ONLY(v) \
                     (((v) < 0) ? (-1) : \
                                  ((v) ? (1) : \
                                         (0)))

AeroASM

Surely it would be easier to write the routine in MASM from scratch? Then you don't have to attempt to understand what the compiler is doing and it is possible to think more clearly about how to go about it.

Mark_Larson

Quote from: AeroASM on June 09, 2005, 12:46:09 PM
Surely it would be easier to write the routine in MASM from scratch? Then you don't have to attempt to understand what the compiler is doing and it is possible to think more clearly about how to go about it.

  When I convert a routine from C to assembler, I never look at the code that the C compiler is generating.  I write code from scratch in assembler.  However since he's new to optimizing, it could be valuable to have him look at the code generated by VC to get an idea of how it optimizes.
BIOS programmers do it fastest, hehe.  ;)

My Optimization webpage
http://www.website.masmforum.com/mark/index.htm

hutch--

With the typedefs and equates I ran it through VC but I would suggest tidying the C up a lot more before trying to convert it to MASM as its not very nice looking code as it is.

I would try for simplification and redundancy removal if it can be done as the asm output from VC will be a lot better if it can be done.
Download site for MASM32      New MASM Forum
https://masm32.com          https://masm32.com/board/index.php

Seb

Quote from: hutch-- on June 09, 2005, 04:12:56 PM
With the typedefs and equates I ran it through VC but I would suggest tidying the C up a lot more before trying to convert it to MASM as its not very nice looking code as it is.

I would try for simplification and redundancy removal if it can be done as the asm output from VC will be a lot better if it can be done.

Well, I have, but there's not a lot to tidy up, is there?