News:

MASM32 SDK Description, downloads and other helpful links
MASM32.com New Forum Link
masmforum WebSite

Don't be overawed by a C compiler.

Started by hutch--, August 29, 2005, 12:54:11 PM

Previous topic - Next topic

hutch--

Here is the replacement version that has two quick sort algos as test pieces, a slow one and a much better design by Robert Sedgewick and John Bentley. The test piece has built in correctness tesing and will create a VERY LARGE log file if an error occurs while optimising the last algo called "optimise_me". It demonstrates two thing about C compilers, one is that they produce average code reasonably well but are off the pace by assembler programming standards. The other is displayed between the timings of the two algos which indicates a weakness on medium level code complexity that does not properly show te speed difference of the much better Sedgewick / Bentley design.

The two optimised assembler versions do give a far more accurate idea of the algo design differences. The last version is the direct unoptimised C assembler output which has been tidied up and set up ready to optimise. It is the Sedgewick / Bentley design and there is no reason why it cannot be made faster than the existing manually optimised assembler version of the same algo.

Here are the timings on the two boxes I have handy.


Intel PIV 2.8gig
Qsort C = 828 MS
Qsort A = 797 MS
SBsort C = 828 MS
SBsort A = 703 MS
Unoptimised = 1422 MS

AMD Sempron 2.4
Qsort C = 1328 MS
Qsort A = 1234 MS
SBsort C = 1203 MS
SBsort A = 953 MS
Unoptimised = 2203 MS


The Sedgewick Bentley version in MASM (SBsort A)  is also the most stable in its timings.

[attachment deleted by admin]
Download site for MASM32      New MASM Forum
https://masm32.com          https://masm32.com/board/index.php

PBrennick

Hutch,

This program employs Robert Sedgewick's sort algorythm.  Maybe you can convert it to your needs?


{ At last! A couple of spline routines that CONNECT the data points.}
{they are set up in a simple, but probably useless, manner for you to
  configure to suit. The reconfigurations are simple, it's the core
  routines that seem to be hard to get.}

uses crt,graph;
const maxnodes=4;
      sigma:real=1.0;  {tension factor}
      total=100;
type data=array[0..maxnodes]of real;
var xmin,xmax,sum,xx,yy:real;
    i,n,eger:integer;
    xc,yc,xp,yp,temp,path:data;
    GraphDriver,GraphMode,ErrorCode:Integer;
    ch:char;
    alternate:boolean;


Procedure Spline1(color:integer);
{ Fits a smooth curve through a given set of points. Small, simple, fast.
  No tension factor.
  From Algorithms, Robert Sedgewick, sort of...
  simple, but limited
  Beat beyond recognition by Ron Nossaman June 1996 }
Var i,j,oldy,oldx,x,y:integer;
    part,t,xx,yy:real;
    d,wy,wx:data;

  Function f(g:real):real;
  begin
     f:=g*g*g-g;
  end;

Begin
   setcolor(color);
   oldx:=999;
   x:=round(xc[1]);
   y:=round(yc[1]);

        {calculate matrix for x & y}
   for i:=1 to maxnodes-1 do d[i]:=4; {fake ascending sequence}
   for i:=1 to maxnodes-1 do
   begin
      wy[i]:=6*((yc[i+1]-yc[i])-(yc[i]-yc[i-1]));
      wx[i]:=6*((xc[i+1]-xc[i])-(xc[i]-xc[i-1]));
   end;
   yp[0]:=0.0; xp[0]:=0.0;
   yp[maxnodes]:=0.0; xp[maxnodes]:=0.0;
   for i:=1 to maxnodes-2 do
   begin
      wy[i+1]:=wy[i+1]-wy[i]*0.25;
      wx[i+1]:=wx[i+1]-wx[i]*0.25;
      d[i+1]:=d[i+1]-0.25;
   end;
   for i:=maxnodes-1 downto 1 do
   begin
      yp[i]:=(wy[i]-yp[i+1])/d[i];
      xp[i]:=(wx[i]-xp[i+1])/d[i];
   end;

        { Draw spline  }
   for i:=0 to maxnodes-1 do
   begin
      for j:=0 to 30 do {arbitrary number of steps between points}
      begin
         t:=j/30;
         part:=t*yc[i+1]+(1-t)*yc[i]+(f(t)*yp[i+1]+f(1-t)*yp[i])/6.0;
         y:=round(part);
         part:=t*xc[i+1]+(1-t)*xc[i]+(f(t)*xp[i+1]+f(1-t)*xp[i])/6.0;
         x:=round(part);
         if oldx<>999 then line(oldx,oldy,x,y);
         oldx:=x;
         oldy:=y;
       end;
    end;
end;


Procedure Spline2(color:integer);
{ Fits a smooth curve through a given set of points. Small, simple, fast.
  No tension factor. Nicer curve than above Spline1 routine.
  From Algorithms, Robert Sedgewick, sort of...
  Beat beyond recognition by Ron Nossaman June 1996 }
Var i,oldy,oldx,x,y,j:integer;
    part,t,xx,yy:real;
    zc,dx,dy,u,wx,wy,px,py:data;

  Function f(g:real):real;
  begin
     f:=g*g*g-g;
  end;

Begin
   setcolor(color);
   oldx:=999;
   x:=round(xc[0]);
   y:=round(yc[0]);
   zc[0]:=0.0;
   for i:=1 to maxnodes do
   begin
      xx:=xc[i-1]-xc[i]; yy:=yc[i-1]-yc[i];
      t:=sqrt(xx*xx+yy*yy);
      zc[i]:=zc[i-1]+t;     {establish a proportional linear progression}
   end;

{calculate x & y matrix stuff}
   for i:=1 to maxnodes-1 do
   begin
      dx[i]:=2*(zc[i+1]-zc[i-1]);
      dy[i]:=2*(zc[i+1]-zc[i-1]);
   end;
   for i:=0 to maxnodes-1 do
   begin
      u[i]:=zc[i+1]-zc[i];
   end;
   for i:=1 to maxnodes-1 do
   begin
      wy[i]:=6*((yc[i+1]-yc[i])/u[i]-(yc[i]-yc[i-1])/u[i-1]);
      wx[i]:=6*((xc[i+1]-xc[i])/u[i]-(xc[i]-xc[i-1])/u[i-1]);
   end;
   py[0]:=0.0; px[0]:=0.0;    px[1]:=0; py[1]:=0;
   py[maxnodes]:=0.0; px[maxnodes]:=0.0;
   for i:=1 to maxnodes-2 do
   begin
      wy[i+1]:=wy[i+1]-wy[i]*u[i]/dy[i];
      dy[i+1]:=dy[i+1]-u[i]*u[i]/dy[i];
      wx[i+1]:=wx[i+1]-wx[i]*u[i]/dx[i];
      dx[i+1]:=dx[i+1]-u[i]*u[i]/dx[i];
   end;
   for i:=maxnodes-1 downto 1 do
   begin
      py[i]:=(wy[i]-u[i]*py[i+1])/dy[i];
      px[i]:=(wx[i]-u[i]*px[i+1])/dx[i];
   end;

{ Draw spline  }
   for i:=0 to maxnodes-1 do
   begin
      for j:=0 to 30 do
      begin
         part:=zc[i]-(((zc[i]-zc[i+1])/30)*j);
         t:=(part-zc[i])/u[i];
         part:=t*yc[i+1]+(1-t)*yc[i]+u[i]*u[i]*(f(t)*py[i+1]+f(1-t)*py[i])/6.0;
         y:=round(part);
         part:=zc[i]-(((zc[i]-zc[i+1])/30)*j);
         t:=(part-zc[i])/u[i];
         part:=t*xc[i+1]+(1-t)*xc[i]+u[i]*u[i]*(f(t)*px[i+1]+f(1-t)*px[i])/6.0;
         x:=round(part);
         if oldx<>999 then line(oldx,oldy,x,y);
         oldx:=x;
         oldy:=y;
       end;
    end;
end;




(* -----------------------------------------------------------------*)
                { Spline under tension}
{  Original Author -- Copyright(c)1985 James R .Van Zandt
   Converted to Turbo Pascal, simplified, altered
    - Ron Nossaman June 1996 nossaman@southwind.net
   Based on algorithms by A. K. Cline      }

procedure curv1(var x:data; var y:data; var p:data; n:integer);
var i:Integer;
    c1,c2,c3,deln,delnm1,delnn,dels,delx1,delx2,delx12,
    diag1,diag2,diagin,dx1,dx2,exps,
    sigmap,sinhs,sinhin,slpp1,slppn,spdiag:real;
begin
   delx1:=x[1]-x[0];
   dx1:=(y[1]-y[0])/delx1;
   if sigma>=0.then
   begin
      slpp1:=0;
      slppn:=0;
   end else
       begin
          if n<>1 then
          begin
             delx2:=x[2]-x[1];
             delx12:=x[2]-x[0];
             c1:=-(delx12+delx1)/delx12/delx1;
             c2:=delx12/delx1/delx2;
             c3:=-delx1/delx12/delx2;
             slpp1:=c1* y[0]+c2* y[1]+c3* y[2];
             deln:=x[n-1]-x[n-2];
             delnm1:=x[n-2]-x[n-3];
             delnn:=x[n-1]-x[n-3];
             c1:=(delnn+deln)/delnn/deln;
             c2:=-delnn/deln/delnm1;
             c3:=deln/delnn/delnm1;
             slppn:=c3* y[n-3]+c2* y[n-2]+c1* y[n-1];
          end else
             begin
                p[0]:=0.;
                p[1]:=0.;
             end;
       end;
   (* denormalize tension factor *)
   sigmap:=abs(sigma)*(n-1)/(x[n-1]-x[0]);
   (* set up right hand side and tridiagonal system for
         yp and perform forward elimination *)
   dels:=sigmap* delx1;
   exps:=exp(dels);
   sinhs:=0.5*(exps-1./exps);
   sinhin:=1./(delx1* sinhs);
   diag1:=sinhin*(dels* 0.5*(exps+1./exps)-sinhs);
   diagin:=1./diag1;
   p[0]:=diagin*(dx1-slpp1);
   spdiag:=sinhin*(sinhs-dels);
   temp[0]:=diagin* spdiag;
   if(n <> 1) then
   begin
      for i:=1 to n-2 do
      begin
         delx2:=x[i+1]-x[i];
         dx2:=(y[i+1]-y[i])/delx2;
         dels:=sigmap*delx2;
         exps:=exp(dels);
         sinhs:=0.5*(exps-1./exps);
         sinhin:=1./(delx2*sinhs);
         diag2:=sinhin*(dels*(0.5*(exps+1./exps))-sinhs);
         diagin:=1./(diag1+diag2-spdiag*temp[i-1]);
         p[i]:=diagin*(dx2-dx1-spdiag*p[i-1]);
         spdiag:=sinhin*(sinhs-dels);
         temp[i]:=diagin*spdiag;
         dx1:=dx2;
         diag1:=diag2;
      end;
   end;
   diagin:=1./(diag1-spdiag*temp[n-2]);
   p[n-1]:=diagin*(slppn-dx2-spdiag*p[n-2]);
              (* perform back substitution *)
   for i:=n-2 downto 0 do p[i]:=p[i]-(temp[i]*p[i+1]);
end;



function curv2(var x:data; var y:data; var p:data; t:real; n:integer):real;
var i,j,i1:integer;
    del1,del2,dels,exps,exps1,s,sigmap,sinhd1,sinhd2,sinhs:real;
begin
   i1:=1;
   s:=x[n-1]-x[0];
   sigmap:=abs(sigma)*(n-1)/s;
   i:=i1;
   while(i<n) and(t>= x[i])do inc(i);
   while(i>1) and(x[i-1]>t)do dec(i);
   i1:=i;
   del1:=t-x[i-1];
   del2:=x[i]-t;
   dels:=x[i]-x[i-1];
   exps1:=exp(sigmap*del1); sinhd1:=0.5*(exps1-1./exps1);
   exps:=exp(sigmap*del2); sinhd2:=0.5*(exps-1./exps);
   exps:=exps1*exps; sinhs:=0.5*(exps-1./exps);
   curv2:=((p[i]*sinhd1+p[i-1]*sinhd2)/sinhs+
        ((y[i]-p[i])*del1+(y[i-1]-p[i-1])*del2)/dels);
end;


procedure curv0(n,requested:integer);
var  i,j,each,stop,seg,regressing:integer;
     s,ds,xx,yy,oldx,oldy:real;
begin
   oldx:=999;
   curv1(path,xc,xp,n);
   curv1(path,yc,yp,n);
   s:=path[0];
   seg:=0;
   for j:=0 to n-2 do
   begin
      stop:=100;
      ds:=(path[j+1]-path[j])/stop;
      for i:=0 to stop-1 do
      begin
         xx:=round(curv2(path,xc,xp,s,n));
         yy:=round(curv2(path,yc,yp,s,n));
         if oldx<>999 then
           line(round(oldx),round(oldy),round(xx),round(yy));
         oldx:=xx; oldy:=yy;
         s:=s+ds;
      end;
   end;
   xx:=xc[n-1];
   yy:=yc[n-1];
   line(round(oldx),round(oldy),round(xx),round(yy));
end;


procedure tspline(color:integer);
{  Fits a smooth curve through a given set of points, using the splines
   under tension introduced by J. Schweikert. They are similar to cubic
   splines,but are less likely to introduce spurious inflection points.

   Original Author -- Copyright(c)1985James R .Van Zandt
   Converted to Turbo Pascal, butchered, simplified, altered --
         Ron Nossaman June 1996 nossaman@southwind.net
   Based on algorithms by A. K. Cline      }
var nn,origin:integer;
begin
   sum:=0;
   path[0]:=0;
   for i:=1 to maxnodes do
   begin
      xx:=xc[i]-xc[i-1];
      yy:=yc[i]-yc[i-1];
      sum:=sum+sqrt((xx*xx)+(yy*yy));
      path[i]:=sum;
      if alternate then path[i]:=i; {my addition, another alternative }
   end;
   setcolor(color);   {display color for spline curve}
   curv0(maxnodes+1,100);
end;


procedure loadarrays;
var i:integer;
begin
   setcolor(red);
   for i:=0 to maxnodes do
   begin
      xc[i]:=random(540)+50;   {try to keep it on the screen for demo}
      yc[i]:=random(380)+50;
{show the path, if you want}
      if i>0 then line(round(xc[i-1]),round(yc[i-1]),round(xc[i]),round(yc[i]));
   end;
   setcolor(white);
   for i:=0 to maxnodes do circle(round(xc[i]),round(yc[i]),3);
end;


begin
   clrscr;
   randomize;
   ch:=#0;
   GraphDriver:=VGA;
   Graphmode:=2;
   InitGraph(GraphDriver,GraphMode,'');
   ErrorCode:=GraphResult;
   If ErrorCode<>grOK then Halt(1);
   repeat
      setfillstyle(solidfill,black);
      bar(0,0,639,479);
      loadarrays;

    {try whatever combinations strike you}
    {there are a lot of different ways to get there!}
      alternate:=false; sigma:=-2; tspline(green);
      alternate:=false; sigma:=0.01; tspline(cyan);
      alternate:=true; sigma:=1; tspline(white);
      spline1(lightred);
      spline2(yellow);
      repeat until keypressed;
      while keypressed do ch:=readkey;
   until ch=#27;
   closegraph;
end.


Robert Sedgewick's quicksort algo has been around for many, many years.  I think the first time I read about them was circa 1975.

hth,
Paul
The GeneSys Project is available from:
The Repository or My crappy website

PBrennick

Hutch,

This example of Sedgewick's quicksort algo is written in JAVA and is probably closer to what you need than the PASCAL version.  It can be difficult to dig this stuff out of my archives but I know I have the original somewhere!


/**
* A Shellsort demonstration algorithm
* ShellAlgorithm.java, 19.04.97
*
* @author   Lars Marius Garshol
* @version  1.00 - 19.04.97
*/

class ShellAlgorithm extends SortAlgorithm {
  void sort(int a[]) throws Exception {
    int h[]={109,41,19,5,1}; //Best Sedgewick sort increment sequence
    int tmp,j;
    int incno; //Increment number

    //Find right start increment
    for (tmp=0; h[tmp]>a.length; tmp++)
      ;

    //Loop through increment sequence
    for (incno=tmp; incno<=h.length-1; incno++) {

      for (int i=h[incno]; i<=a.length-1; i++) {
        // Invariant: a[start..i-h[incno]] h[incno]-sorted
   
        tmp=a[i];   
        for (j=i-h[incno]; j>=0 && a[j]>tmp; j=j-h[incno]) {
          if (stopRequested) {
            return;
          }

          a[j+h[incno]]=a[j];        

          pause(j,i);
        }

        //Now we've found a[i]'s place
        a[j+h[incno]]=tmp;

      } //for i

    } //for incno

  } //end of sort
}


Paul
The GeneSys Project is available from:
The Repository or My crappy website

hutch--

Paul,

Thanks for making the effort, I think that is very close to the original Pascal I ported years ago but an internet search today yielded the original Sedgewick algo in ANSI C which I have built, tested and produced an optimised assembler version of already. The new version has proof of correctness for the sort and a logging capacity if an error occurs. I was just starting on another Sedgewick quick sort of the tri-median variety but its getting too late here and I need some sleep. I will post the new example tomorrow after I finish all of the testing.

This is the Sedgewick / Bentley version which had te characteristic I was after. I rewrote the swaps inline to save stack overhead, converted it to STDCALL and renamed some of the variables so they did not clash with masm reserve words.


void __stdcall swqsort(int arr[], int lft, int rgt)

    /* -----------------------------------
       A quick sort algorithm published by
       Robert Sedgewick and John Bentley
       -------------------------------- */
    {
        int t;
        int i = lft-1;
        int j = rgt;
        int v = arr[rgt];
        if (rgt <= lft) return;
        for (;;)
        {
            while (arr[++i] < v);
            while (v < arr[--j]) if (j == lft) break;
            if (i >= j) break;

            t = arr[i];
            arr[i] = arr[j];
            arr[j] = t;
        }
        t = arr[i];
        arr[i] = arr[rgt];
        arr[rgt] = t;

        swqsort(arr, lft, i-1);
        swqsort(arr, i+1, rgt);
    }
Download site for MASM32      New MASM Forum
https://masm32.com          https://masm32.com/board/index.php

James Ladd

Hutch,

Why is the timing for AMD so different to the PIV ?

Eóin

Hi Hutch--, the following is java code for a Tir-Median check. If you add something similar to your C code just prior to the inner loop it'll will really speed things up.


i = (rgh+lft)/2;
if (arr[lft]>arr[i]) swap(arr,lft,i);     // Tri-Median Methode!
if (arr[lft]>arr[rgt]) swap(arr,lft,rgt);
if (arr[i]>arr[rgt]) swap(arr,i,rgt);

j = rgt-1;
swap(arr,i,j);
i = lft;
v = arr[j];
for(;;)
...


Looking at your code I suspect rgt and lft are swapped compared to the bit I've here.

P.S. I realise this isn't a competition to see who can write the best QuickSort :bg but since I have it lying around I'm attaching (should anyone be interested) a FASM file containing an implementation of a Tri-Median QuickSort which performs an Insertion Sort to speed things up. Included is an OBJ file containing the cassembled procedure. Its exported simply under the name QuickSort and should be called as a stdcall function taking two parameters, the first being a pointer to a DWORD array, the second being the size of the array (in bytes). Also the origion algo in java is include as comments in the source.

[attachment deleted by admin]

hutch--

#6
James,

Most of the time the PIV I have is faster than the Sempron 2.4 but to be fair, the Sempron is a very low cost machine that I use to drive my printer and scanner. It does appear to have similar characteristics to most of the later AMD hardware so its useful for tesing of the same algo in test pieces.

LATER : Note that I have posted the replacement version in the first topic posting.
Download site for MASM32      New MASM Forum
https://masm32.com          https://masm32.com/board/index.php

Farabi

That C++ source must be using FPU. What are they doing with FPU. I saw an qsort function on delphi 3 years ago, and on my p2 400 it is less than a second. How many byte this qsort sorting the words?
Those who had universe knowledges can control the world by a micro processor.
http://www.wix.com/farabio/firstpage

"Etos siperi elegi"

hutch--

Code optimisation is a form of adiction. Here is the optimised SBsort with a couple of tweaks done to it. It averages the hi and lo to set the partition so it slightly isolates te algo from some unusual data sets and is slightly faster for having done so. I have shifted some of the code to the "pre" label to remove one jump at the collectors after conditional jumps and the algo does time slightly faster from both mods.


; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««

align 16

sbsort proc arr:DWORD,lft:DWORD,rgt:DWORD

comment * ---------------------------------
        A quick sort algorithm published by
        Robert Sedgewick and John Bentley
        --------------------------------- *

    mov eax, rgt
    cmp eax, lft
    jg @F
    ret

  @@:
    push ebx
    push esi
    push edi

    mov ebx, arr
    mov edi, lft
    sub edi, 1
    mov esi, rgt

    mov eax, esi            ;
    add eax, edi            ; average median calculation
    shr eax, 1              ;
    mov edx, [ebx+eax*4]    ;
    ;; mov edx, [ebx+esi*4]
    jmp entry

  align 4
  pre:
    cmp edi, esi
    jge last
    mov ecx, [ebx+edi*4]
    mov eax, [ebx+esi*4]
    mov [ebx+esi*4], ecx
    mov [ebx+edi*4], eax

  entry:
  REPEAT 7
    add edi, 1
    cmp [ebx+edi*4], edx
    jge first
  ENDM
    add edi, 1
    cmp [ebx+edi*4], edx
    jl entry

  align 4
  first:
    sub esi, 1
    cmp edx, [ebx+esi*4]
    jge pre
    cmp esi, lft
    je pre

    sub esi, 1
    cmp edx, [ebx+esi*4]
    jge pre
    cmp esi, lft
    jne entry
    jmp pre

  align 4
  last:
    mov esi, rgt
    mov ecx, [ebx+edi*4]
    mov eax, [ebx+esi*4]
    mov [ebx+esi*4], ecx
    mov [ebx+edi*4], eax

    sub edi, 1
    invoke sbsort,ebx,lft,edi
    add edi, 2
    invoke sbsort,ebx,edi,esi

    pop edi
    pop esi
    pop ebx

    ret

sbsort endp

; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
Download site for MASM32      New MASM Forum
https://masm32.com          https://masm32.com/board/index.php