View Single Post
  #9  
Old August 1st 11, 11:34 AM posted to alt.comp.lang.borland-delphi,alt.comp.periphs.videocards.nvidia,alt.lang.asm,comp.arch,rec.games.corewar
Skybuck Flying[_7_]
external usenet poster
 
Posts: 460
Default An idea how to speed up computer programs and avoid waiting. ("event driven memory system")

Interesting theory; the question is whether it can be put into practice.

A problem might be that the other registers might already be in use.

Here is my Delphi/Pascal test code to test CPU random access memory (cpu
cache) performance.

(I already optimized it to use integers only instead of "dynamic indexes";
this alone makes it twice as fast as the cuda test program):

Below I shall post the Delphi/Pascal code, and below that the assembler code
generated from it, to show what the Delphi compiler makes of it. Perhaps it
will help you see what's going on and where the potential problems might be,
and maybe serve as inspiration to try and do a better job at it; maybe you
can solve it. I shall also try to write an assembler routine myself to see
if this trick of yours can actually work in practice.
You are welcome to try as well.

// *** Begin of Delphi/Pascal Code ***

// version 0.02: try multiple pipe trick to see if delphi compiler can use
// it.
// Times a pointer-chasing walk through mMemory for every block and stores
// the elapsed wall-clock time in mCPUExecutionTimeInSeconds.
//
// Each block is a region of vElementCount integers inside mMemory; every
// element holds the index (relative to the block start) of the next element
// to visit. Starting at element 0, the chain is followed mLoopCount times
// and the final index is written to mBlockResult[ block ].
//
// Three blocks are walked per inner-loop iteration so that three independent
// load chains are in flight at once ("multiple pipe trick").
//
// NOTE(review): assumes mMemory holds mBlockCount * mElementCount entries and
// mBlockResult holds mBlockCount entries - confirm against the allocation code.
procedure TCPUMemoryTest.ExecuteCPU;
var
  vStart : int64;
  vStop : int64;
  vFrequency : int64;

  vBlockIndex : integer;
  vLoopIndex : integer;

  vElementIndexA : integer;
  vElementIndexB : integer;
  vElementIndexC : integer;

  vElementCount : integer;
  vTripleCount : integer;
begin
  QueryPerformanceCounter( vStart );

  vElementCount := mElementCount;

  // Number of complete groups of three blocks. A Pascal for-loop includes its
  // upper bound, so the loop must stop at vTripleCount-1; running up to
  // (mBlockCount div 3) would process one extra group and index past the
  // last block of mMemory and mBlockResult.
  vTripleCount := mBlockCount div 3;

  for vBlockIndex := 0 to vTripleCount - 1 do
  begin
    vElementIndexA := 0;
    vElementIndexB := 0;
    vElementIndexC := 0;

    for vLoopIndex := 0 to mLoopCount-1 do
    begin
      // Three independent chains: none of these loads depends on another.
      vElementIndexA := mMemory[ vElementIndexA + (vBlockIndex*3+0) * vElementCount ];
      vElementIndexB := mMemory[ vElementIndexB + (vBlockIndex*3+1) * vElementCount ];
      vElementIndexC := mMemory[ vElementIndexC + (vBlockIndex*3+2) * vElementCount ];
    end;

    mBlockResult[ vBlockIndex*3+0 ] := vElementIndexA;
    mBlockResult[ vBlockIndex*3+1 ] := vElementIndexB;
    mBlockResult[ vBlockIndex*3+2 ] := vElementIndexC;
  end;

  // Tail: when mBlockCount is not a multiple of 3, the remaining one or two
  // blocks are walked one at a time so every block gets a result.
  for vBlockIndex := vTripleCount * 3 to mBlockCount - 1 do
  begin
    vElementIndexA := 0;

    for vLoopIndex := 0 to mLoopCount-1 do
    begin
      vElementIndexA := mMemory[ vElementIndexA + vBlockIndex * vElementCount ];
    end;

    mBlockResult[ vBlockIndex ] := vElementIndexA;
  end;

  QueryPerformanceCounter( vStop );
  QueryPerformanceFrequency( vFrequency );

  // Counter ticks divided by ticks-per-second gives elapsed seconds.
  mCPUExecutionTimeInSeconds := (vStop - vStart) / vFrequency;
end;

// *** End of Delphi/Pascal Code ***

So far from what I can tell from the assembler output, Delphi does not seem
to apply the "don't use register until much later trick".

It seems to introduce "register dependencies" which probably make everything
stall.

This was just an early version/try so perhaps a hand-written assembler
routine would perform better.

// *** Begin of assembler output ***

unit_TCPUMemoryTest_version_001.pas.184: begin
0040FC5C 53 push ebx
0040FC5D 56 push esi
0040FC5E 57 push edi
0040FC5F 55 push ebp
0040FC60 83C4D0 add esp,-$30
0040FC63 8BD8 mov ebx,eax
unit_TCPUMemoryTest_version_001.pas.185: QueryPerformanceCounter( vStart );
0040FC65 54 push esp
0040FC66 E82195FFFF call QueryPerformanceCounter
unit_TCPUMemoryTest_version_001.pas.187: vElementCount := mElementCount;
0040FC6B 8B4B0C mov ecx,[ebx+$0c]
unit_TCPUMemoryTest_version_001.pas.188: for vBlockIndex := 0 to
(mBlockCount div 3) do
0040FC6E 8B4310 mov eax,[ebx+$10]
0040FC71 BE03000000 mov esi,$00000003
0040FC76 99 cdq
0040FC77 F7FE idiv esi
0040FC79 85C0 test eax,eax
0040FC7B 0F8C87000000 jl $0040fd08
0040FC81 40 inc eax
0040FC82 89442420 mov [esp+$20],eax
0040FC86 33C0 xor eax,eax
unit_TCPUMemoryTest_version_001.pas.190: vElementIndexA := 0;
0040FC88 33F6 xor esi,esi
unit_TCPUMemoryTest_version_001.pas.191: vElementIndexB := 0;
0040FC8A 33D2 xor edx,edx
0040FC8C 89542418 mov [esp+$18],edx
unit_TCPUMemoryTest_version_001.pas.192: vElementIndexC := 0;
0040FC90 33D2 xor edx,edx
0040FC92 8954241C mov [esp+$1c],edx
unit_TCPUMemoryTest_version_001.pas.194: for vLoopIndex := 0 to mLoopCount-1
do
0040FC96 8B5314 mov edx,[ebx+$14]
0040FC99 4A dec edx
0040FC9A 85D2 test edx,edx
0040FC9C 7C44 jl $0040fce2
0040FC9E 42 inc edx
0040FC9F 89542424 mov [esp+$24],edx
unit_TCPUMemoryTest_version_001.pas.196: vElementIndexA := mMemory[
vElementIndexA + (vBlockIndex*3+0) * vElementCount ];
0040FCA3 8D1440 lea edx,[eax+eax*2]
0040FCA6 8BFA mov edi,edx
0040FCA8 0FAFF9 imul edi,ecx
0040FCAB 03F7 add esi,edi
0040FCAD 8B7B04 mov edi,[ebx+$04]
0040FCB0 8B34B7 mov esi,[edi+esi*4]
unit_TCPUMemoryTest_version_001.pas.197: vElementIndexB := mMemory[
vElementIndexB + (vBlockIndex*3+1) * vElementCount ];
0040FCB3 8BFA mov edi,edx
0040FCB5 47 inc edi
0040FCB6 0FAFF9 imul edi,ecx
0040FCB9 037C2418 add edi,[esp+$18]
0040FCBD 8B6B04 mov ebp,[ebx+$04]
0040FCC0 8B7CBD00 mov edi,[ebp+edi*4+$00]
0040FCC4 897C2418 mov [esp+$18],edi
unit_TCPUMemoryTest_version_001.pas.198: vElementIndexC := mMemory[
vElementIndexC + (vBlockIndex*3+2) * vElementCount ];
0040FCC8 83C202 add edx,$02
0040FCCB 0FAFD1 imul edx,ecx
0040FCCE 0354241C add edx,[esp+$1c]
0040FCD2 8B7B04 mov edi,[ebx+$04]
0040FCD5 8B1497 mov edx,[edi+edx*4]
0040FCD8 8954241C mov [esp+$1c],edx
unit_TCPUMemoryTest_version_001.pas.194: for vLoopIndex := 0 to mLoopCount-1
do
0040FCDC FF4C2424 dec dword ptr [esp+$24]
0040FCE0 75C1 jnz $0040fca3
unit_TCPUMemoryTest_version_001.pas.201: mBlockResult[ vBlockIndex*3+0 ] :=
vElementIndexA;
0040FCE2 8D1440 lea edx,[eax+eax*2]
0040FCE5 8B7B08 mov edi,[ebx+$08]
0040FCE8 893497 mov [edi+edx*4],esi
unit_TCPUMemoryTest_version_001.pas.202: mBlockResult[ vBlockIndex*3+1 ] :=
vElementIndexB;
0040FCEB 8B7308 mov esi,[ebx+$08]
0040FCEE 8B7C2418 mov edi,[esp+$18]
0040FCF2 897C9604 mov [esi+edx*4+$04],edi
unit_TCPUMemoryTest_version_001.pas.203: mBlockResult[ vBlockIndex*3+2 ] :=
vElementIndexC;
0040FCF6 8B7308 mov esi,[ebx+$08]
0040FCF9 8B7C241C mov edi,[esp+$1c]
0040FCFD 897C9608 mov [esi+edx*4+$08],edi
unit_TCPUMemoryTest_version_001.pas.204: end;
0040FD01 40 inc eax
unit_TCPUMemoryTest_version_001.pas.188: for vBlockIndex := 0 to
(mBlockCount div 3) do
0040FD02 FF4C2420 dec dword ptr [esp+$20]
0040FD06 7580 jnz $0040fc88
unit_TCPUMemoryTest_version_001.pas.206: QueryPerformanceCounter( vStop );
0040FD08 8D442408 lea eax,[esp+$08]
0040FD0C 50 push eax
0040FD0D E87A94FFFF call QueryPerformanceCounter
unit_TCPUMemoryTest_version_001.pas.207: QueryPerformanceFrequency(
vFrequency );
0040FD12 8D442410 lea eax,[esp+$10]
0040FD16 50 push eax
0040FD17 E87894FFFF call QueryPerformanceFrequency
unit_TCPUMemoryTest_version_001.pas.209: mCPUExecutionTimeInSeconds :=
(vStop - vStart) / vFrequency;
0040FD1C 8B442408 mov eax,[esp+$08]
0040FD20 8B54240C mov edx,[esp+$0c]
0040FD24 2B0424 sub eax,[esp]
0040FD27 1B542404 sbb edx,[esp+$04]
0040FD2B 89442428 mov [esp+$28],eax
0040FD2F 8954242C mov [esp+$2c],edx
0040FD33 DF6C2428 fild qword ptr [esp+$28]
0040FD37 DF6C2410 fild qword ptr [esp+$10]
0040FD3B DEF9 fdivp st(1)
0040FD3D DD5B28 fstp qword ptr [ebx+$28]
0040FD40 9B wait
unit_TCPUMemoryTest_version_001.pas.210: end;
0040FD41 83C430 add esp,$30
0040FD44 5D pop ebp
0040FD45 5F pop edi
0040FD46 5E pop esi
0040FD47 5B pop ebx
0040FD48 C3 ret

// *** End of assembler output ***

Bye,
Skybuck.