Main Page   Data Structures   File List   Data Fields   Globals  

multiplication_scs.c

Go to the documentation of this file.
00001 /** Functions for SCS multiplication operations 
00002 @file multiplication_scs.c
00003 
00004 @author Defour David David.Defour@ens-lyon.fr
00005 @author Florent de Dinechin Florent.de.Dinechin@ens-lyon.fr 
00006 
00007  This file is part of the SCS library.
00008 */
00009 
00010 /*
00011 Copyright (C) 2002  David Defour and Florent de Dinechin
00012 
00013     This library is free software; you can redistribute it and/or
00014     modify it under the terms of the GNU Lesser General Public
00015     License as published by the Free Software Foundation; either
00016     version 2.1 of the License, or (at your option) any later version.
00017 
00018     This library is distributed in the hope that it will be useful,
00019     but WITHOUT ANY WARRANTY; without even the implied warranty of
00020     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00021     Lesser General Public License for more details.
00022 
00023     You should have received a copy of the GNU Lesser General Public
00024     License along with this library; if not, write to the Free Software
00025     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
00026 
00027 */
00028 
00029 #include "scs.h"
00030 #include "scs_private.h"
00031 
00032 #if 0 /* used to help debugging */
00033 void pr(char* s,double d) {
00034   scs_db_number x;
00035   x.d=d;
00036   printf(s);printf("   ");
00037   printf("%8x%8x . 2^%d   (%8f %8x %8x)   \n",
00038          (x.i[HI_ENDIAN]&0x000FFFFF)+0x00100000,
00039          x.i[LO_ENDIAN],
00040          (x.i[HI_ENDIAN]>>20)-1023,
00041          x.d,
00042          x.i[HI_ENDIAN],
00043          x.i[LO_ENDIAN]);
00044 }
00045 #endif
00046 
00047 
00048 /***************************************************************/
00049 /* First a few #defines selecting between architectures where integer
00050    multiplication is faster than FP multiplication (the normal case)
00051    and those where it is the opposite (some Sparcs). The latter
00052    doesn't work but might at some point, so the code remains */
00053 
00054 #ifdef SCS_USE_FLT_MULT
00055 /* Compute the carry of r1, remove it from r1, and add it to r0. This
00056  doesn't work: how to get a value truncated using only
00057  round-to-nearest FP ? Besides it doesn't work on x86 for at least two
00058  reasons : extended precision FP arithmetic, and a #define below. */
00059 #define SCS_CARRY_PROPAGATE(r1,r0,tmp) \
00060      {tmp=((r1-SCS_FLT_TRUNC_CST)+SCS_FLT_SHIFT_CST)-SCS_FLT_SHIFT_CST; r1-= tmp; r0+=tmp*SCS_RADIX_MONE_DOUBLE;}      
00061 
00062 typedef double SCS_CONVERSION_MUL;
00063  
00064 #else /* SCS_USE_FLT_MULT*/
00065 
00066 /*  Compute the carry of r1, remove it from r1, and add it to r0 */
00067 #define SCS_CARRY_PROPAGATE(r1,r0,tmp) \
00068       {tmp = r1>>SCS_NB_BITS; r0 += tmp; r1 -= (tmp<<SCS_NB_BITS);}     
00069 typedef unsigned long long int SCS_CONVERSION_MUL;
00070 #endif  /* SCS_USE_FLT_MULT */
00071 
00072 
00073 
00074 
00075 
00076 
00077 
00078 /************************************************************/
00079 /* We have unrolled the loops for SCS_NB_WORDS==4 or 8 
00080 
00081    We just wish gcc would do it for us ! There are option switches,
00082    but they don't lead to any performance improvement. When they do,
00083    this part of the source code will be removed.
00084 
00085    In the meantime, feel free to unroll for other values. */
00086 
00087 
00088 /***************************/
00089 #if (SCS_NB_WORDS==4)
00090 /***************************/
00091 void scs_mul(scs_ptr result, scs_ptr x, scs_ptr y){
00092   SCS_CONVERSION_MUL     val, tmp;
00093   SCS_CONVERSION_MUL     r0,r1,r2,r3,r4;
00094   SCS_CONVERSION_MUL     x0,x1,x2,x3;
00095   int                    y0,y1,y2,y3;
00096     
00097   R_EXP = X_EXP * Y_EXP;
00098   R_SGN = X_SGN * Y_SGN;
00099   R_IND = X_IND + Y_IND;
00100 
00101   /* Partial products computation */   
00102   x3=X_HW[3];  y3=Y_HW[3];  x2=X_HW[2];  y2=Y_HW[2];
00103   x1=X_HW[1];  y1=Y_HW[1];  x0=X_HW[0];  y0=Y_HW[0];
00104 
00105   r4 = x3*y1 + x2*y2 + x1*y3;
00106   r3 = x3*y0 + x2*y1 + x1*y2 + x0*y3;
00107   r2 = x2*y0 + x1*y1 + x0*y2;
00108   r1 = x1*y0 + x0*y1 ;
00109   r0 = x0*y0;
00110 
00111   val= 0;
00112   /* Carry Propagate */
00113   SCS_CARRY_PROPAGATE(r4,r3,tmp)
00114   SCS_CARRY_PROPAGATE(r3,r2,tmp)
00115   SCS_CARRY_PROPAGATE(r2,r1,tmp)
00116   SCS_CARRY_PROPAGATE(r1,r0,tmp)      
00117   SCS_CARRY_PROPAGATE(r0,val,tmp)      
00118  
00119   if(val != 0){
00120     /* shift all the digits ! */
00121     R_HW[0] = val; R_HW[1] = r0; R_HW[2] = r1;  R_HW[3] = r2;
00122     R_IND += 1;
00123   }
00124   else {
00125     R_HW[0] = r0; R_HW[1] = r1; R_HW[2] = r2; R_HW[3] = r3;
00126   }
00127 
00128 }
00129 
00130 
00131 void scs_square(scs_ptr result, scs_ptr x){
00132   SCS_CONVERSION_MUL  r0,r1,r2,r3,r4;
00133   SCS_CONVERSION_MUL  x0,x1,x2,x3;
00134   SCS_CONVERSION_MUL  val, tmp;
00135 
00136 
00137   R_EXP = X_EXP * X_EXP;
00138   R_IND = X_IND + X_IND;
00139   R_SGN = 1;
00140     
00141   /*
00142    * Calcul des PP
00143    */   
00144   x3=X_HW[3];  x2=X_HW[2];  x1=X_HW[1];  x0=X_HW[0];
00145 
00146   r0 =  x0*x0;
00147   r1 = (x0*x1)* 2 ;
00148   r2 =  x1*x1 + (x0*x2*2);
00149   r3 = (x1*x2 +  x0*x3)* 2;
00150   r4 =  x2*x2 + (x1*x3)* 2;
00151 
00152   val= 0;
00153   /* Propagation des retenues */
00154   SCS_CARRY_PROPAGATE(r4,r3,tmp)
00155   SCS_CARRY_PROPAGATE(r3,r2,tmp)
00156   SCS_CARRY_PROPAGATE(r2,r1,tmp)
00157   SCS_CARRY_PROPAGATE(r1,r0,tmp)      
00158   SCS_CARRY_PROPAGATE(r0,val,tmp)      
00159  
00160   if(val != 0){
00161     /* shift all the digits ! */
00162     R_HW[0] = val; R_HW[1] = r0; R_HW[2] = r1;  R_HW[3] = r2;
00163     R_IND += 1;
00164   }
00165   else {
00166     R_HW[0] = r0; R_HW[1] = r1; R_HW[2] = r2; R_HW[3] = r3;
00167   }
00168   
00169 }
00170 
00171 
00172 
00173 
00174 
00175 
00176 
00177 /***************************/
00178 #elif (SCS_NB_WORDS==8)
00179 /***************************/
00180 void scs_mul(scs_ptr result, scs_ptr x, scs_ptr y){
00181   SCS_CONVERSION_MUL     val, tmp;
00182   SCS_CONVERSION_MUL     r0,r1,r2,r3,r4,r5,r6,r7,r8;
00183   SCS_CONVERSION_MUL     x0,x1,x2,x3,x4,x5,x6,x7;
00184   int                    y0,y1,y2,y3,y4,y5,y6,y7;
00185     
00186   R_EXP = X_EXP * Y_EXP;
00187   R_SGN = X_SGN * Y_SGN;
00188   R_IND = X_IND + Y_IND;
00189 
00190   /* Partial products computation */   
00191   x7=X_HW[7];  y7=Y_HW[7];  x6=X_HW[6];  y6=Y_HW[6];
00192   x5=X_HW[5];  y5=Y_HW[5];  x4=X_HW[4];  y4=Y_HW[4];
00193   x3=X_HW[3];  y3=Y_HW[3];  x2=X_HW[2];  y2=Y_HW[2];
00194   x1=X_HW[1];  y1=Y_HW[1];  x0=X_HW[0];  y0=Y_HW[0];
00195 
00196   r8 = x7*y1 + x6*y2 + x5*y3 + x4*y4 + x3*y5 + x2*y6 + x1*y7;
00197   r7 = x7*y0 + x6*y1 + x5*y2 + x4*y3 + x3*y4 + x2*y5 + x1*y6 + x0*y7;
00198   r6 = x6*y0 + x5*y1 + x4*y2 + x3*y3 + x2*y4 + x1*y5 + x0*y6;
00199   r5 = x5*y0 + x4*y1 + x3*y2 + x2*y3 + x1*y4 + x0*y5;
00200   r4 = x4*y0 + x3*y1 + x2*y2 + x1*y3 + x0*y4 ;
00201   r3 = x3*y0 + x2*y1 + x1*y2 + x0*y3;
00202   r2 = x2*y0 + x1*y1 + x0*y2;
00203   r1 = x1*y0 + x0*y1 ;
00204   r0 = x0*y0 ;
00205  
00206   val= 0;
00207   /* Carry Propagate */
00208   SCS_CARRY_PROPAGATE(r8,r7,tmp)
00209   SCS_CARRY_PROPAGATE(r7,r6,tmp)
00210   SCS_CARRY_PROPAGATE(r6,r5,tmp)
00211   SCS_CARRY_PROPAGATE(r5,r4,tmp)
00212   SCS_CARRY_PROPAGATE(r4,r3,tmp)
00213   SCS_CARRY_PROPAGATE(r3,r2,tmp)
00214   SCS_CARRY_PROPAGATE(r2,r1,tmp)
00215   SCS_CARRY_PROPAGATE(r1,r0,tmp)      
00216   SCS_CARRY_PROPAGATE(r0,val,tmp)      
00217  
00218   if(val != 0){
00219     /* shift all the digits ! */
00220     R_HW[0] = val; R_HW[1] = r0; R_HW[2] = r1;  R_HW[3] = r2;
00221     R_HW[4] = r3;  R_HW[5] = r4; R_HW[6] = r5;  R_HW[7] = r6;
00222     R_IND += 1;
00223   }
00224   else {
00225     R_HW[0] = r0; R_HW[1] = r1; R_HW[2] = r2; R_HW[3] = r3;
00226     R_HW[4] = r4; R_HW[5] = r5; R_HW[6] = r6; R_HW[7] = r7;
00227   }
00228 
00229 }
00230 
00231 
00232 void scs_square(scs_ptr result, scs_ptr x){
00233   SCS_CONVERSION_MUL  r0,r1,r2,r3,r4,r5,r6,r7,r8;
00234   SCS_CONVERSION_MUL  x0,x1,x2,x3,x4,x5,x6,x7;
00235   SCS_CONVERSION_MUL  val, tmp;
00236 
00237 
00238   R_EXP = X_EXP * X_EXP;
00239   R_IND = X_IND + X_IND;
00240   R_SGN = 1;
00241     
00242   /*
00243    * Partial products
00244    */   
00245   x7=X_HW[7];  x6=X_HW[6];  x5=X_HW[5];  x4=X_HW[4];
00246   x3=X_HW[3];  x2=X_HW[2];  x1=X_HW[1];  x0=X_HW[0];
00247 
00248   r0 =  x0*x0;
00249   r1 = (x0*x1)* 2 ;
00250   r2 =  x1*x1 + (x0*x2*2);
00251   r3 = (x1*x2 +  x0*x3)* 2;
00252   r4 =  x2*x2 + (x1*x3 + x0*x4)* 2;
00253   r5 = (x2*x3 +  x1*x4 + x0*x5)* 2;
00254   r6 =  x3*x3 + (x2*x4 + x1*x5 + x0*x6)* 2;
00255   r7 = (x3*x4 +  x2*x5 + x1*x6 + x0*x7)* 2;
00256   r8 =  x4*x4 + (x3*x5 + x2*x6 + x1*x7)* 2;
00257 
00258   val= 0;
00259   /* Carry propagation */
00260   SCS_CARRY_PROPAGATE(r8,r7,tmp)
00261   SCS_CARRY_PROPAGATE(r7,r6,tmp)
00262   SCS_CARRY_PROPAGATE(r6,r5,tmp)
00263   SCS_CARRY_PROPAGATE(r5,r4,tmp)
00264   SCS_CARRY_PROPAGATE(r4,r3,tmp)
00265   SCS_CARRY_PROPAGATE(r3,r2,tmp)
00266   SCS_CARRY_PROPAGATE(r2,r1,tmp)
00267   SCS_CARRY_PROPAGATE(r1,r0,tmp)      
00268   SCS_CARRY_PROPAGATE(r0,val,tmp)      
00269  
00270   if(val != 0){
00271     /* shift all the digits ! */
00272     R_HW[0] = val; R_HW[1] = r0; R_HW[2] = r1;  R_HW[3] = r2;
00273     R_HW[4] = r3;  R_HW[5] = r4; R_HW[6] = r5;  R_HW[7] = r6;
00274     R_IND += 1;
00275   }
00276   else {
00277     R_HW[0] = r0; R_HW[1] = r1; R_HW[2] = r2; R_HW[3] = r3;
00278     R_HW[4] = r4; R_HW[5] = r5; R_HW[6] = r6; R_HW[7] = r7;
00279   }
00280   
00281 }
00282 
00283 
00284 
00285 /***************************/
00286 #else
00287 /***************************/
00288 /* From there on, the normal, unrolled case */
00289 
00290 
00291 void scs_mul(scs_ptr result, scs_ptr x, scs_ptr y){
00292   SCS_CONVERSION_MUL RES[SCS_NB_WORDS+1];
00293   SCS_CONVERSION_MUL val, tmp;
00294   int i, j;    
00295 
00296   R_EXP = X_EXP * Y_EXP;
00297   R_SGN = X_SGN * Y_SGN;
00298   R_IND = X_IND + Y_IND;
00299 
00300   for(i=0; i<=SCS_NB_WORDS; i++)
00301     RES[i]=0;
00302 
00303   /* Compute only the first half of the partial product. See the
00304      unrolled code for an example of what we compute */
00305 
00306 #ifdef SCS_TYPECPU_X86
00307   /* This is the only place where there is assembly code to force 64-bit
00308      arithmetic. Someday gcc will catch up here, too.
00309   */
00310   {
00311     scs_db_number t;
00312     /* i=0 */
00313     for(j=0; j<(SCS_NB_WORDS); j++) {
00314       __asm__ volatile("mull %3" 
00315                        : "=a" (t.i[LO_ENDIAN]), "=d" (t.i[HI_ENDIAN])
00316                        : "a" (X_HW[0]) , "g" (Y_HW[j]));
00317       RES[j] += t.l;
00318     }
00319     /* i = 1..SCS_NB_WORDS-1 */
00320     for(i=1 ; i<SCS_NB_WORDS; i++){
00321       for(j=0; j<(SCS_NB_WORDS-i); j++){
00322         __asm__ volatile("mull %3" 
00323                          : "=a" (t.i[LO_ENDIAN]), "=d" (t.i[HI_ENDIAN])
00324                          : "a" (X_HW[i]) , "g" (Y_HW[j]));
00325         RES[i+j] += t.l;
00326       }
00327       __asm__ volatile("mull %3" 
00328                        : "=a" (t.i[LO_ENDIAN]), "=d" (t.i[HI_ENDIAN])
00329                        : "a" (X_HW[i]) , "g" (Y_HW[j])); 
00330       /* here j==SCS_NB_WORDS-i */
00331       RES[SCS_NB_WORDS] += t.l;
00332     }
00333  }
00334 
00335 #else /* other architectures */
00336 
00337  /* i=0 */
00338  tmp = X_HW[0];
00339  for(j=0; j<(SCS_NB_WORDS); j++)
00340    RES[j] += tmp * Y_HW[j];
00341  /* i = 1..SCS_NB_WORDS-1 */
00342  for(i=1 ; i<SCS_NB_WORDS; i++){
00343       tmp = X_HW[i];
00344       for(j=0; j<(SCS_NB_WORDS-i); j++)
00345         RES[i+j] += tmp * Y_HW[j];
00346       RES[SCS_NB_WORDS] += tmp * Y_HW[j]; /* here j==SCS_NB_WORDS-i */
00347   }
00348 #endif/* SCS_TYPECPU_X86 */
00349 
00350   val = 0;
00351 
00352   /* Carry propagate */
00353   for(i=SCS_NB_WORDS; i>0; i--)
00354     SCS_CARRY_PROPAGATE(RES[i],RES[i-1],tmp)
00355   SCS_CARRY_PROPAGATE(RES[0],val,tmp)
00356 
00357 
00358   /* Store the result */
00359   if(val != 0){
00360     /* shift all the digits ! */     
00361     R_HW[0] = val;
00362     for(i=1; i<SCS_NB_WORDS; i++)
00363       R_HW[i] = RES[i-1];
00364   
00365     R_IND += 1;
00366   }else {
00367     for(i=0; i<SCS_NB_WORDS; i++)
00368       R_HW[i] = RES[i];
00369    }
00370 }
00371 
00372 
00373 
00374 
00375 
00376 
00377 
00378 void scs_square(scs_ptr result, scs_ptr x){
00379   SCS_CONVERSION_MUL RES[SCS_NB_WORDS+1];
00380   SCS_CONVERSION_MUL val, tmp;
00381   int i, j;
00382   
00383 
00384   R_EXP = X_EXP * X_EXP;
00385   R_SGN = 1;
00386   R_IND = X_IND + X_IND;
00387 
00388   /* Set to 0 intermediate register     */
00389   for(i=0; i<=SCS_NB_WORDS; i++)
00390     RES[i] = 0;
00391 
00392   /* Compute all the double partial products: 2 x_i * x_j, i!=j */
00393   tmp = (SCS_CONVERSION_MUL)X_HW[0];
00394   for(j=1; j<SCS_NB_WORDS; j++)
00395     RES[j] += tmp * X_HW[j];
00396   for(i=1 ; i<(SCS_NB_WORDS+1)/2; i++){
00397     tmp = (SCS_CONVERSION_MUL)X_HW[i];
00398     for(j=i+1; j<(SCS_NB_WORDS-i); j++)
00399       RES[i+j] += tmp * X_HW[j];
00400     RES[SCS_NB_WORDS] += tmp * X_HW[SCS_NB_WORDS-i];
00401   }
00402 
00403   /* All these partial products are double */
00404   for(i=0; i<=SCS_NB_WORDS; i++)
00405     RES[i] *=2;
00406 
00407   /* Add partial product of the form x_i^2 */
00408   for(i=0, j=0; i<=SCS_NB_WORDS; i+=2, j++){
00409     RES[i]  += (SCS_CONVERSION_MUL)X_HW[j] * X_HW[j];
00410   }  
00411 
00412   val = 0;
00413   /* Carry propagate */
00414   for(i=SCS_NB_WORDS; i>0; i--)
00415       SCS_CARRY_PROPAGATE(RES[i],RES[i-1],tmp)
00416  
00417   SCS_CARRY_PROPAGATE(RES[0],val,tmp)
00418 
00419 
00420   /* Store the result */
00421   if(val != 0){
00422     /* shift all the digits ! */     
00423     R_HW[0] = val;
00424     for(i=1; i<SCS_NB_WORDS; i++)
00425       R_HW[i] = RES[i-1];
00426   
00427     R_IND += 1;
00428   }else {
00429     for(i=0; i<SCS_NB_WORDS; i++)
00430       R_HW[i] = RES[i];
00431    }
00432 
00433 }
00434 
00435 
00436 /* 
00437  * #endif corresponding to the test #if (SCS_NB_WORDS==8)
00438  */
00439 #endif
00440 
00441 
00442 
00443 
00444 
00445 /*
00446  Multiply x by an integer val; result is returned in x.
00447  */
00448 void scs_mul_ui(scs_ptr x, unsigned int val_int){
00449   SCS_CONVERSION_MUL val, tmp, vald, rr;
00450   int i;
00451 
00452   if (val_int == 0)
00453     X_EXP = 0;
00454   
00455   vald = val_int;
00456 
00457   val = 0; 
00458   rr = 0;
00459   for(i=(SCS_NB_WORDS-1); i>=0; i--){
00460     val    += vald * X_HW[i];
00461     SCS_CARRY_PROPAGATE(val, rr, tmp)
00462     X_HW[i] = val;
00463     val     = rr;
00464     rr      = 0; 
00465   }
00466 
00467   if(val != 0){
00468     /* shift all the digits ! */ 
00469     for(i=(SCS_NB_WORDS-1); i>0; i--)
00470       X_HW[i] = X_HW[i-1];
00471 
00472     X_HW[0] = val;
00473     X_IND  += 1;
00474   }
00475   
00476   return;
00477 }

Generated on Tue Jun 17 10:15:51 2003 for SCSLib by doxygen1.2.15