/* // MULTOPP5 - by Paul Hsieh (c) 1997, 1998, All rights reserved. // // This program is public domain, subject to the conditions that any use made // of it that results in a derivative work or product must credit the author, // Paul Hsieh and that this source never be distributed without these comments // appearing intact at the top of the program. // // MULTOPP5 searches for optimal sequences of adds, subs, shls, leas, to // simulate a constant multiply on a Pentium. It is assumed that no more // than two registers are used (likely a somewhat faulty, but necessary // assumption) and that the intermediate values are bounded by limits of // 0..n, where n is the maximum multiply factor simulated. The results tend // to justify the assumptions. It is easy to show that using more than 2 // registers only affects sequences of at least 5 instructions/3 clocks; that // is to say no sequences given here in 3 clocks or less can be improved for // total clock count. For numbers up to 2048, there is never a need to go // beyond 6 clocks; nearly 90% of them running in 5 clocks or fewer. // // The ordering used is a lexical one: // // (number of clocks, whether or not the V pipe is free, total size) // // The first two optimize clock usage, taking into account that its potential // use inside other code is best served by additional pairing after it has // completed. There is no heuristic for waiting before writing to the ebx // register. The code size optimization is good because otherwise the code // has a tendency to add in extra meaningless instructions that does not // affect the total clock count, but which is not how any reasonable human // would code. // // Terje Mathisen made the good point that assuming eax is not ready for // address generation on the first clock would also be a good thing to add. I // have not looked into this. // // The algorithm is a departure from my original one which used an "OpenList" // and simply grew it incrementally as required. The present algorithm is // simply a depth first search through the code space, pruning code sequences // that are less efficient than code sequences already encountered. The // search is iteratively deepened by maximum (clock,V pipe state) allowed by // minimal granular steps so that nothing is missed along the search, and so // the search path can find suboptimal paths before wasting unpredictable // resources on them. A table of "best encounters so far" is maintained. // The size of this table, so far, appears to be the biggest limiting factor // as I can produce results up to 2048 (up from 512; my 1993 result) easily, // but 4096 is completely undoable with 32 MB of RAM. Terje Mathisen was able // to reach 4096 on a 64MB equipped DEC alpha, and indeed pushed it up to // 5040. However, searches up to non-power of 2s are suspect since the // corresponding left shift over that number is not present in the search // space of partial results. // // Besides leading to optimal p5 multiplies (something of diminishing value // as the x86 series gets faster in subsequent iterations) I am hoping this // example of "finding optimal code sequences" germinates discussion on // optimal code generation in general. // // Thanks to Jeremy McDonald, Robert Prins and Bjorn Stromvall for pointing // out a couple obvious flaws with the first version of this program (some of // the instructions I was using had obvious faster or shorter equivalents.) */ #include typedef unsigned char byte; typedef struct { int CodeLength; int Weight[4]; byte CodeFrag[8]; byte cost; byte p5flags; char * Name; } StepType; /* Pentium pairing bitflags */ #define UV 0xC0 #define PU 0x80 #define PV 0x40 #define NP 0x00 #define AB 0x03 #define Ax 0x02 #define xB 0x01 #define xx 0x00 /* ... including potential AGI stalls */ #define ab (0x0C|AB) #define ax (0x08|Ax) #define xb (0x04|xB) #define MBS 2 #define P5Attr(pairing,readreg,writereg) ((pairing)|((readreg)<clocks=0; pss->freepipe = UFREE; pss->contentionmask = 0; } /* Model Pentium scheduling as best as I know it, to count clocks. Also adds in tiny fractional fudge clocks for total code length. */ PipeStateStructType * AddInstruction(PipeStateStructType * pss, int inst) { static PipeStateStructType new_pss; char imask; ClockCountType iclocks; int RAW,AGI,i,j; imask = Step[inst].p5flags; iclocks = Step[inst].cost; /* Contentions on current instruction, work out first clock */ new_pss = *pss; /* Is there a RAW, WAW or AGI contention with the previous state? */ j = ((((imask&AB)<contentionmask)&(~(UV|AB)); /* ?AW */ i = (imask & pss->contentionmask)&(~(UV|AB)); /* RAW */ /* Is there an AGI stall? */ if( (i & (AB<<(2*(MBS))))!=0 ) { /* RAW on address? => AGI stall */ new_pss.clocks+=1; i &= ~(AB<<(2*(MBS))); /* Doesn't affect issue pairing */ } /* Does this instruction pair? And if so is it a free clock or not? */ if( (0!=j) || ((imask & (pss->freepipe))==0) ) { /* WAW/RAW upariable */ new_pss.clocks+=1; new_pss.freepipe = UFREE; if( imask & PU ) new_pss.freepipe = VFREE; } else { new_pss.freepipe ^= (UFREE ^ VFREE); if( new_pss.freepipe==VFREE ) new_pss.clocks+=1; } /* The potential extra clocks on this instruction */ new_pss.clocks+=(iclocks-1); /* Work out end of clock conditions; Available for next pair? Extra shadowed delay? */ RAW=0; /* No RAW on next inst? */ if( new_pss.clocks!=pss->clocks ) RAW=imask & AB; /* V pipe dependencies on next inst */ AGI = 0; /* No stall after multiple clocks? */ if( new_pss.clocks==pss->clocks ) AGI = ((pss->contentionmask)>>MBS); /* AGI on previous writes */ else if( iclocks<2 ) { AGI = (pss->contentionmask)>>(2*(MBS)); /* AGI from pairing */ } AGI = (AGI|imask) & AB; /* AGI on current writes as well */ /* These are the contention flags */ new_pss.contentionmask = ((AGI<Limit ) a = Invalid; if( b<0 || b>Limit ) b = Invalid; /* No sense in turning a good value to a bad value */ if( (a==Invalid) && ((*A)!=Invalid) ) return 0; if( (b==Invalid) && ((*B)!=Invalid) ) return 0; *A = a; *B = b; return 1; } void GiveResult(int i) { int j; printf("Mul%04d:\t; // %d clocks\n",i,(int)(Best[i])/2); for(j=0;jclocks); cost = cost * 2; if( ps->freepipe==UFREE ) cost = cost+1; // Penalty for occupied V slot. if( d!=0 ) { if( SpaceCost[A][B] < cost ) return; // Slower solution? if( SpaceCost[A][B] > cost ) { SpaceCost[A][B] = cost; if( costcmax ) { PotentialImprovements++; return; } if( d>=MaxPathLength ) return; i=0; do { Path[d]=i; p = *AddInstruction(ps,i); aa=A; bb=B; if( 1==ApplyInst( &aa, &bb, i ) ) SearchInstructionSpace(aa,bb,&p,d+1,cmax); i++; } while(Step[i].cost>0); if( picount==PotentialImprovements ) { if( B==Invalid ) { B = Limit+1; if( A==1 ) return; } SpaceCost[A][B]=USELESS_PATH; // Prune node, nothing more to search. } } /* Reports on searches after each iteration. Just search for filled entries in the Best[] array that have not been already marked in the Done[] array. */ void GetReports(int c) { int i,j; for(i=0;i<=Limit;i++) { if( Done[i]==0 && Best[i]<=c ) { printf("Mul%04d:\t; // %d clocks\n",i,(int)(Best[i])/2); for(j=0;j