Here is the replacement version that has two quick sort algos as test pieces, a slow one and a much better design by Robert Sedgewick and John Bentley. The test piece has built in correctness tesing and will create a VERY LARGE log file if an error occurs while optimising the last algo called "optimise_me". It demonstrates two thing about C compilers, one is that they produce average code reasonably well but are off the pace by assembler programming standards. The other is displayed between the timings of the two algos which indicates a weakness on medium level code complexity that does not properly show te speed difference of the much better Sedgewick / Bentley design.
The two optimised assembler versions do give a far more accurate idea of the algo design differences. The last version is the direct unoptimised C assembler output which has been tidied up and set up ready to optimise. It is the Sedgewick / Bentley design and there is no reason why it cannot be made faster than the existing manually optimised assembler version of the same algo.
Here are the timings on the two boxes I have handy.
Intel PIV 2.8gig
Qsort C = 828 MS
Qsort A = 797 MS
SBsort C = 828 MS
SBsort A = 703 MS
Unoptimised = 1422 MS
AMD Sempron 2.4
Qsort C = 1328 MS
Qsort A = 1234 MS
SBsort C = 1203 MS
SBsort A = 953 MS
Unoptimised = 2203 MS
The Sedgewick Bentley version in MASM (SBsort A) is also the most stable in its timings.
[attachment deleted by admin]
Hutch,
This program employs Robert Sedgewick's sort algorythm. Maybe you can convert it to your needs?
{ At last! A couple of spline routines that CONNECT the data points.}
{they are set up in a simple, but probably useless, manner for you to
configure to suit. The reconfigurations are simple, it's the core
routines that seem to be hard to get.}
uses crt,graph;
const maxnodes=4;
sigma:real=1.0; {tension factor}
total=100;
type data=array[0..maxnodes]of real;
var xmin,xmax,sum,xx,yy:real;
i,n,eger:integer;
xc,yc,xp,yp,temp,path:data;
GraphDriver,GraphMode,ErrorCode:Integer;
ch:char;
alternate:boolean;
Procedure Spline1(color:integer);
{ Fits a smooth curve through a given set of points. Small, simple, fast.
No tension factor.
From Algorithms, Robert Sedgewick, sort of...
simple, but limited
Beat beyond recognition by Ron Nossaman June 1996 }
Var i,j,oldy,oldx,x,y:integer;
part,t,xx,yy:real;
d,wy,wx:data;
Function f(g:real):real;
begin
f:=g*g*g-g;
end;
Begin
setcolor(color);
oldx:=999;
x:=round(xc[1]);
y:=round(yc[1]);
{calculate matrix for x & y}
for i:=1 to maxnodes-1 do d[i]:=4; {fake ascending sequence}
for i:=1 to maxnodes-1 do
begin
wy[i]:=6*((yc[i+1]-yc[i])-(yc[i]-yc[i-1]));
wx[i]:=6*((xc[i+1]-xc[i])-(xc[i]-xc[i-1]));
end;
yp[0]:=0.0; xp[0]:=0.0;
yp[maxnodes]:=0.0; xp[maxnodes]:=0.0;
for i:=1 to maxnodes-2 do
begin
wy[i+1]:=wy[i+1]-wy[i]*0.25;
wx[i+1]:=wx[i+1]-wx[i]*0.25;
d[i+1]:=d[i+1]-0.25;
end;
for i:=maxnodes-1 downto 1 do
begin
yp[i]:=(wy[i]-yp[i+1])/d[i];
xp[i]:=(wx[i]-xp[i+1])/d[i];
end;
{ Draw spline }
for i:=0 to maxnodes-1 do
begin
for j:=0 to 30 do {arbitrary number of steps between points}
begin
t:=j/30;
part:=t*yc[i+1]+(1-t)*yc[i]+(f(t)*yp[i+1]+f(1-t)*yp[i])/6.0;
y:=round(part);
part:=t*xc[i+1]+(1-t)*xc[i]+(f(t)*xp[i+1]+f(1-t)*xp[i])/6.0;
x:=round(part);
if oldx<>999 then line(oldx,oldy,x,y);
oldx:=x;
oldy:=y;
end;
end;
end;
Procedure Spline2(color:integer);
{ Fits a smooth curve through a given set of points. Small, simple, fast.
No tension factor. Nicer curve than above Spline1 routine.
From Algorithms, Robert Sedgewick, sort of...
Beat beyond recognition by Ron Nossaman June 1996 }
Var i,oldy,oldx,x,y,j:integer;
part,t,xx,yy:real;
zc,dx,dy,u,wx,wy,px,py:data;
Function f(g:real):real;
begin
f:=g*g*g-g;
end;
Begin
setcolor(color);
oldx:=999;
x:=round(xc[0]);
y:=round(yc[0]);
zc[0]:=0.0;
for i:=1 to maxnodes do
begin
xx:=xc[i-1]-xc[i]; yy:=yc[i-1]-yc[i];
t:=sqrt(xx*xx+yy*yy);
zc[i]:=zc[i-1]+t; {establish a proportional linear progression}
end;
{calculate x & y matrix stuff}
for i:=1 to maxnodes-1 do
begin
dx[i]:=2*(zc[i+1]-zc[i-1]);
dy[i]:=2*(zc[i+1]-zc[i-1]);
end;
for i:=0 to maxnodes-1 do
begin
u[i]:=zc[i+1]-zc[i];
end;
for i:=1 to maxnodes-1 do
begin
wy[i]:=6*((yc[i+1]-yc[i])/u[i]-(yc[i]-yc[i-1])/u[i-1]);
wx[i]:=6*((xc[i+1]-xc[i])/u[i]-(xc[i]-xc[i-1])/u[i-1]);
end;
py[0]:=0.0; px[0]:=0.0; px[1]:=0; py[1]:=0;
py[maxnodes]:=0.0; px[maxnodes]:=0.0;
for i:=1 to maxnodes-2 do
begin
wy[i+1]:=wy[i+1]-wy[i]*u[i]/dy[i];
dy[i+1]:=dy[i+1]-u[i]*u[i]/dy[i];
wx[i+1]:=wx[i+1]-wx[i]*u[i]/dx[i];
dx[i+1]:=dx[i+1]-u[i]*u[i]/dx[i];
end;
for i:=maxnodes-1 downto 1 do
begin
py[i]:=(wy[i]-u[i]*py[i+1])/dy[i];
px[i]:=(wx[i]-u[i]*px[i+1])/dx[i];
end;
{ Draw spline }
for i:=0 to maxnodes-1 do
begin
for j:=0 to 30 do
begin
part:=zc[i]-(((zc[i]-zc[i+1])/30)*j);
t:=(part-zc[i])/u[i];
part:=t*yc[i+1]+(1-t)*yc[i]+u[i]*u[i]*(f(t)*py[i+1]+f(1-t)*py[i])/6.0;
y:=round(part);
part:=zc[i]-(((zc[i]-zc[i+1])/30)*j);
t:=(part-zc[i])/u[i];
part:=t*xc[i+1]+(1-t)*xc[i]+u[i]*u[i]*(f(t)*px[i+1]+f(1-t)*px[i])/6.0;
x:=round(part);
if oldx<>999 then line(oldx,oldy,x,y);
oldx:=x;
oldy:=y;
end;
end;
end;
(* -----------------------------------------------------------------*)
{ Spline under tension}
{ Original Author -- Copyright(c)1985 James R .Van Zandt
Converted to Turbo Pascal, simplified, altered
- Ron Nossaman June 1996 nossaman@southwind.net
Based on algorithms by A. K. Cline }
procedure curv1(var x:data; var y:data; var p:data; n:integer);
var i:Integer;
c1,c2,c3,deln,delnm1,delnn,dels,delx1,delx2,delx12,
diag1,diag2,diagin,dx1,dx2,exps,
sigmap,sinhs,sinhin,slpp1,slppn,spdiag:real;
begin
delx1:=x[1]-x[0];
dx1:=(y[1]-y[0])/delx1;
if sigma>=0.then
begin
slpp1:=0;
slppn:=0;
end else
begin
if n<>1 then
begin
delx2:=x[2]-x[1];
delx12:=x[2]-x[0];
c1:=-(delx12+delx1)/delx12/delx1;
c2:=delx12/delx1/delx2;
c3:=-delx1/delx12/delx2;
slpp1:=c1* y[0]+c2* y[1]+c3* y[2];
deln:=x[n-1]-x[n-2];
delnm1:=x[n-2]-x[n-3];
delnn:=x[n-1]-x[n-3];
c1:=(delnn+deln)/delnn/deln;
c2:=-delnn/deln/delnm1;
c3:=deln/delnn/delnm1;
slppn:=c3* y[n-3]+c2* y[n-2]+c1* y[n-1];
end else
begin
p[0]:=0.;
p[1]:=0.;
end;
end;
(* denormalize tension factor *)
sigmap:=abs(sigma)*(n-1)/(x[n-1]-x[0]);
(* set up right hand side and tridiagonal system for
yp and perform forward elimination *)
dels:=sigmap* delx1;
exps:=exp(dels);
sinhs:=0.5*(exps-1./exps);
sinhin:=1./(delx1* sinhs);
diag1:=sinhin*(dels* 0.5*(exps+1./exps)-sinhs);
diagin:=1./diag1;
p[0]:=diagin*(dx1-slpp1);
spdiag:=sinhin*(sinhs-dels);
temp[0]:=diagin* spdiag;
if(n <> 1) then
begin
for i:=1 to n-2 do
begin
delx2:=x[i+1]-x[i];
dx2:=(y[i+1]-y[i])/delx2;
dels:=sigmap*delx2;
exps:=exp(dels);
sinhs:=0.5*(exps-1./exps);
sinhin:=1./(delx2*sinhs);
diag2:=sinhin*(dels*(0.5*(exps+1./exps))-sinhs);
diagin:=1./(diag1+diag2-spdiag*temp[i-1]);
p[i]:=diagin*(dx2-dx1-spdiag*p[i-1]);
spdiag:=sinhin*(sinhs-dels);
temp[i]:=diagin*spdiag;
dx1:=dx2;
diag1:=diag2;
end;
end;
diagin:=1./(diag1-spdiag*temp[n-2]);
p[n-1]:=diagin*(slppn-dx2-spdiag*p[n-2]);
(* perform back substitution *)
for i:=n-2 downto 0 do p[i]:=p[i]-(temp[i]*p[i+1]);
end;
function curv2(var x:data; var y:data; var p:data; t:real; n:integer):real;
var i,j,i1:integer;
del1,del2,dels,exps,exps1,s,sigmap,sinhd1,sinhd2,sinhs:real;
begin
i1:=1;
s:=x[n-1]-x[0];
sigmap:=abs(sigma)*(n-1)/s;
i:=i1;
while(i<n) and(t>= x[i])do inc(i);
while(i>1) and(x[i-1]>t)do dec(i);
i1:=i;
del1:=t-x[i-1];
del2:=x[i]-t;
dels:=x[i]-x[i-1];
exps1:=exp(sigmap*del1); sinhd1:=0.5*(exps1-1./exps1);
exps:=exp(sigmap*del2); sinhd2:=0.5*(exps-1./exps);
exps:=exps1*exps; sinhs:=0.5*(exps-1./exps);
curv2:=((p[i]*sinhd1+p[i-1]*sinhd2)/sinhs+
((y[i]-p[i])*del1+(y[i-1]-p[i-1])*del2)/dels);
end;
procedure curv0(n,requested:integer);
var i,j,each,stop,seg,regressing:integer;
s,ds,xx,yy,oldx,oldy:real;
begin
oldx:=999;
curv1(path,xc,xp,n);
curv1(path,yc,yp,n);
s:=path[0];
seg:=0;
for j:=0 to n-2 do
begin
stop:=100;
ds:=(path[j+1]-path[j])/stop;
for i:=0 to stop-1 do
begin
xx:=round(curv2(path,xc,xp,s,n));
yy:=round(curv2(path,yc,yp,s,n));
if oldx<>999 then
line(round(oldx),round(oldy),round(xx),round(yy));
oldx:=xx; oldy:=yy;
s:=s+ds;
end;
end;
xx:=xc[n-1];
yy:=yc[n-1];
line(round(oldx),round(oldy),round(xx),round(yy));
end;
procedure tspline(color:integer);
{ Fits a smooth curve through a given set of points, using the splines
under tension introduced by J. Schweikert. They are similar to cubic
splines,but are less likely to introduce spurious inflection points.
Original Author -- Copyright(c)1985James R .Van Zandt
Converted to Turbo Pascal, butchered, simplified, altered --
Ron Nossaman June 1996 nossaman@southwind.net
Based on algorithms by A. K. Cline }
var nn,origin:integer;
begin
sum:=0;
path[0]:=0;
for i:=1 to maxnodes do
begin
xx:=xc[i]-xc[i-1];
yy:=yc[i]-yc[i-1];
sum:=sum+sqrt((xx*xx)+(yy*yy));
path[i]:=sum;
if alternate then path[i]:=i; {my addition, another alternative }
end;
setcolor(color); {display color for spline curve}
curv0(maxnodes+1,100);
end;
procedure loadarrays;
var i:integer;
begin
setcolor(red);
for i:=0 to maxnodes do
begin
xc[i]:=random(540)+50; {try to keep it on the screen for demo}
yc[i]:=random(380)+50;
{show the path, if you want}
if i>0 then line(round(xc[i-1]),round(yc[i-1]),round(xc[i]),round(yc[i]));
end;
setcolor(white);
for i:=0 to maxnodes do circle(round(xc[i]),round(yc[i]),3);
end;
begin
clrscr;
randomize;
ch:=#0;
GraphDriver:=VGA;
Graphmode:=2;
InitGraph(GraphDriver,GraphMode,'');
ErrorCode:=GraphResult;
If ErrorCode<>grOK then Halt(1);
repeat
setfillstyle(solidfill,black);
bar(0,0,639,479);
loadarrays;
{try whatever combinations strike you}
{there are a lot of different ways to get there!}
alternate:=false; sigma:=-2; tspline(green);
alternate:=false; sigma:=0.01; tspline(cyan);
alternate:=true; sigma:=1; tspline(white);
spline1(lightred);
spline2(yellow);
repeat until keypressed;
while keypressed do ch:=readkey;
until ch=#27;
closegraph;
end.
Robert Sedgewick's quicksort algo has been around for many, many years. I think the first time I read about them was circa 1975.
hth,
Paul
Hutch,
This example of Sedgewick's quicksort algo is written in JAVA and is probably closer to what you need than the PASCAL version. It can be difficult to dig this stuff out of my archives but I know I have the original somewhere!
/**
* A Shellsort demonstration algorithm
* ShellAlgorithm.java, 19.04.97
*
* @author Lars Marius Garshol
* @version 1.00 - 19.04.97
*/
class ShellAlgorithm extends SortAlgorithm {
void sort(int a[]) throws Exception {
int h[]={109,41,19,5,1}; //Best Sedgewick sort increment sequence
int tmp,j;
int incno; //Increment number
//Find right start increment
for (tmp=0; h[tmp]>a.length; tmp++)
;
//Loop through increment sequence
for (incno=tmp; incno<=h.length-1; incno++) {
for (int i=h[incno]; i<=a.length-1; i++) {
// Invariant: a[start..i-h[incno]] h[incno]-sorted
tmp=a[i];
for (j=i-h[incno]; j>=0 && a[j]>tmp; j=j-h[incno]) {
if (stopRequested) {
return;
}
a[j+h[incno]]=a[j];
pause(j,i);
}
//Now we've found a[i]'s place
a[j+h[incno]]=tmp;
} //for i
} //for incno
} //end of sort
}
Paul
Paul,
Thanks for making the effort, I think that is very close to the original Pascal I ported years ago but an internet search today yielded the original Sedgewick algo in ANSI C which I have built, tested and produced an optimised assembler version of already. The new version has proof of correctness for the sort and a logging capacity if an error occurs. I was just starting on another Sedgewick quick sort of the tri-median variety but its getting too late here and I need some sleep. I will post the new example tomorrow after I finish all of the testing.
This is the Sedgewick / Bentley version which had te characteristic I was after. I rewrote the swaps inline to save stack overhead, converted it to STDCALL and renamed some of the variables so they did not clash with masm reserve words.
void __stdcall swqsort(int arr[], int lft, int rgt)
/* -----------------------------------
A quick sort algorithm published by
Robert Sedgewick and John Bentley
-------------------------------- */
{
int t;
int i = lft-1;
int j = rgt;
int v = arr[rgt];
if (rgt <= lft) return;
for (;;)
{
while (arr[++i] < v);
while (v < arr[--j]) if (j == lft) break;
if (i >= j) break;
t = arr[i];
arr[i] = arr[j];
arr[j] = t;
}
t = arr[i];
arr[i] = arr[rgt];
arr[rgt] = t;
swqsort(arr, lft, i-1);
swqsort(arr, i+1, rgt);
}
Hutch,
Why is the timing for AMD so different to the PIV ?
Hi Hutch--, the following is java code for a Tir-Median check. If you add something similar to your C code just prior to the inner loop it'll will really speed things up.
i = (rgh+lft)/2;
if (arr[lft]>arr[i]) swap(arr,lft,i); // Tri-Median Methode!
if (arr[lft]>arr[rgt]) swap(arr,lft,rgt);
if (arr[i]>arr[rgt]) swap(arr,i,rgt);
j = rgt-1;
swap(arr,i,j);
i = lft;
v = arr[j];
for(;;)
...
Looking at your code I suspect rgt and lft are swapped compared to the bit I've here.
P.S. I realise this isn't a competition to see who can write the best QuickSort :bg but since I have it lying around I'm attaching (should anyone be interested) a FASM file containing an implementation of a Tri-Median QuickSort which performs an Insertion Sort to speed things up. Included is an OBJ file containing the cassembled procedure. Its exported simply under the name QuickSort and should be called as a stdcall function taking two parameters, the first being a pointer to a DWORD array, the second being the size of the array (in bytes). Also the origion algo in java is include as comments in the source.
[attachment deleted by admin]
James,
Most of the time the PIV I have is faster than the Sempron 2.4 but to be fair, the Sempron is a very low cost machine that I use to drive my printer and scanner. It does appear to have similar characteristics to most of the later AMD hardware so its useful for tesing of the same algo in test pieces.
LATER : Note that I have posted the replacement version in the first topic posting.
That C++ source must be using FPU. What are they doing with FPU. I saw an qsort function on delphi 3 years ago, and on my p2 400 it is less than a second. How many byte this qsort sorting the words?
Code optimisation is a form of adiction. Here is the optimised SBsort with a couple of tweaks done to it. It averages the hi and lo to set the partition so it slightly isolates te algo from some unusual data sets and is slightly faster for having done so. I have shifted some of the code to the "pre" label to remove one jump at the collectors after conditional jumps and the algo does time slightly faster from both mods.
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««
align 16
sbsort proc arr:DWORD,lft:DWORD,rgt:DWORD
comment * ---------------------------------
A quick sort algorithm published by
Robert Sedgewick and John Bentley
--------------------------------- *
mov eax, rgt
cmp eax, lft
jg @F
ret
@@:
push ebx
push esi
push edi
mov ebx, arr
mov edi, lft
sub edi, 1
mov esi, rgt
mov eax, esi ;
add eax, edi ; average median calculation
shr eax, 1 ;
mov edx, [ebx+eax*4] ;
;; mov edx, [ebx+esi*4]
jmp entry
align 4
pre:
cmp edi, esi
jge last
mov ecx, [ebx+edi*4]
mov eax, [ebx+esi*4]
mov [ebx+esi*4], ecx
mov [ebx+edi*4], eax
entry:
REPEAT 7
add edi, 1
cmp [ebx+edi*4], edx
jge first
ENDM
add edi, 1
cmp [ebx+edi*4], edx
jl entry
align 4
first:
sub esi, 1
cmp edx, [ebx+esi*4]
jge pre
cmp esi, lft
je pre
sub esi, 1
cmp edx, [ebx+esi*4]
jge pre
cmp esi, lft
jne entry
jmp pre
align 4
last:
mov esi, rgt
mov ecx, [ebx+edi*4]
mov eax, [ebx+esi*4]
mov [ebx+esi*4], ecx
mov [ebx+edi*4], eax
sub edi, 1
invoke sbsort,ebx,lft,edi
add edi, 2
invoke sbsort,ebx,edi,esi
pop edi
pop esi
pop ebx
ret
sbsort endp
; «««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««««