Object-orientation sometimes sacrifies efficiency for readibility and ease of code maintenance:
Array a, b, c, d, e; a = b + c + d + e;
This will generate the following pseudo-code:
// pseudocode for a = b + c + d + e double* _t1 = new double[N]; for ( int i=0; i<N; ++i) _t1[i] = b[i] + c[i]; double* _t2 = new double[N*M]; for ( int i=0; i<N; ++i) _t2[i] = _t1[i] + d[i]; double* _t3 = new double[N*M]; for ( int i=0; i<N; ++i) _t3[i] = _t2[i] + e[i]; for ( int i=0; i<N; ++i) a[i] = _t3[i]; delete [] _t3; delete [] _t2; delete [] _t1;
Performace problems
- For small arrays new and delete result poor performance: 1/10 of C.
- For medium arrays, overhead of extra loops and memory access add +50%
- For large arrays, the cost of the temporaries are the limitations: by Veldhuisen this could be 1/7 to 1/27 of the C or Fortran versions.
template <class Left, class Right> class X { }; X<A, X<B, X<C, X<D, END> > > > a; // list X< X<A,B>, X<C,D> > a; // tree
Using recursive templates we are able to build a parser-tree:
Array A, B, C, D; D = A + B +C ; A + B + C X< Array, plus, X<Array, plus, Array> > struct plus { }; // represent addition class Array { }; // represent a node in parse tree template <typename Left, typename Op, typename Right> class X { }; template <class T> X<T, plus, Array> operator+( T, Array) { return X< T, plus, Array>(); } Array A, B, C, D; D = A + B + C; = X<Array,plus,Array>() + C; = X< X<Array,plus,Array>, plus, Array>();
#include <iostream> using namespace std; // this class encapsulates the "+" operation. struct plus { static double apply( double a, double b) { return a+b; } }; // the node in the parse tree. template <typename Left, typename Op, typename Right> struct X { Left left; Right right; X( Left t1, Right t2) : left(t1), right(t2) { } double operator[](int i) { return Op::apply( left[i], right[i] ); } }; struct Array { // constructor Array( double *data, int N) : data_(data), N_(N) { } // assign an expression to the array template <typename Left, typename Op, typename Right> void operator=( X<Left,Op,Right> expr) { for ( int i = 0; i < N_; ++i) data_[i] = expr[i]; } double operator[](int i) { return data_[i]; } double *data_; int N_; }; template <typename Left> X<Left, plus, Array> operator+( Left a, Array b) { return X<Left, plus, Array>(a,b); } int main() { double a_data[] = { 2, 3, 5, 9 }; double b_data[] = { 1, 0, 0, 1 }; double c_data[] = { 3, 0, 2, 5 }; double d_data[4]; Array A(a_data,4); Array B(b_data,4); Array C(c_data,4); Array D(d_data,4); D = A + B + C; for ( int i = 0; i < 4; ++i ) cout << D[i] << " "; cout << endl; }
What happens in compilation-time?
D = A + B + C; = X<Array,plus,Array>(A,B) + C; = X< X<Array,plus,Array>, plus, Array>( X<Array,plus,Array>(A,B), C);
then it matches template Array::operator=
D.operator=(X<X<Array,plus,Array>,plus,Array>(X<Array,plus,Array>(A,B),C) expr) { for ( int i = 0; i < N_; ++i) data_[i] = expr[i]; }
The expr[i] is expanded by inlining operator[] from each node from parse tree:
data_[i] = plus::apply( X<Array,plus,Array>(A,B)[i], C[i]); = plus::apply( A[i] + B[i] + C[i]); = A[i] + B[i] + C[i];
So the final result of D = A + B + C is:
for ( int i = 0; i < N_; ++i) D.data_[i] = A.data_[i] + B.data_[i] + C.data_[i];
No temporaries, and a single loop!
Do not do an expression templates implementation yourself, except for fun. There are several good implementations like:
- Blitz++ http://www.oonumerics.org/blitz
- PETE http://www.acl.lanl.gov/pete
Just when you thought your little language was safe: Expression Templates in Java - Todd Veldhuizen, Erfurt 2000
W = X + Y * Z public class DoArrayStuff { public static apply( float[] w, float[] x, float[] y, float[] z) { for ( int i = 0; i < w.length; ++i) w[i] = x[i] + y[i] * z[i]; } } // or: public static apply( Array w, Array x, Array y, Array z) { w = x + y * z; }
JavaTran == Java+Fortan
int n = 1000; Array w = new Array(n); Array x = new Array(n); Array y = new Array(n); Array z = new Array(n); w = x + y * z; // since we do not have operator overloading: w.assign(x.plus(y.times(z))); /* Expr BinaryOperator / \ / \ / \ / \ Array BinaryOpExpr Plus Times */ // y * z Expr e = new BinaryOpExpr( y, z, new Times()); public abstract class Expr { public abstract float eval(int i); public Expr plus(Expr b) { BinaryOperator plus = new Plus(); return new BinaryOpExpr(this,b,plus); } public Expr times(Expr b) { BinaryOperator times = new Times(); return new BinaryOpExpr(this,b,times); } } public class BinaryOpExpr extends Expr { Expr a, b; BinaryOperator op; public BinaryOpExpr( Expr a_, Expr b_, BinaryOperator op_) { a = a_; b = b_; op = op_; } public float eval(int i) { return op.apply(a.eval(i),b.eval(i)); } } public abstract class BinaryOperator { public abstract float apply(float a, float b);} public class Plus extends BinaryOperator { public float apply(float a, float b) { return a+b; } } public class Times extends BinaryOperator { public float apply(float a, float b) { return a*b; } } public class Array extends Expr { float data[]; int length; public Array(int n) { data = new float[n]; length = n; } public float eval(int i) { return data[i]; } public void set(int i, float val) { data[i] = val; } public void assign(Expr e) { for ( int i = 0; i < length; ++i) data[i] = e.eval(i); } } public class Test { public static void main(java.lang.String[] args) { int n = 10000; Array w = new Array(n); Array x = new Array(n); Array y = new Array(n); Array z = new Array(n); // initialize x, y, z // w = x + y + z w.assign(x.plus(y.times(z))); }
The problem is: there is no partial evaluation in Java compilers
w = x + y * z
To evaluate each e.eval(i)
- 6 virtual function calls
- 3 bound checks
- numerous pointer indirections
dead slow
Lunar compiler: Java -> intermediate form -> partial evaluation -> C code JVM JavaTran ExpressionTemplates ================== ======== =================== Lunar 33.1 2.4 Sun Hotspot 1.3 (JIT) 1.4 0.4 Transvirtual Kaffe 4.3 0.7 Mflops/s n=1000
Here we present an other expression template: bubble sort. Bubble sort is very inefficient for large N, but quite reasonable for small N.
inline void swap( int& a, int& b) { int temp = a; a = b; b = temp; } // // bubble sort is very inefficient for large N, // but quite reasonable for small N // void bubbleSort( int *data int N) { for ( int i = N-1; i > 0; --i) for ( int j = 0; j < i; ++j) if ( data[j] > data[j+1] ) swap( data[j], data[j+1]); } // // the inline version for N=3 // inline void bubbleSort3( int *data) { int temp; if ( data[0] > data[1] ) { temp = data[0]; data[0] = data[1]; data[1] = temp; } if ( data[1] > data[2] ) { temp = data[1]; data[1] = data[2]; data[2] = temp; } if ( data[0] > data[1] ) { temp = data[0]; data[0] = data[1]; data[1] = temp; } } // // we had two loops. To reduce the loops define bubble sort recursively // void bubbleSort( int *data, int N) { for ( int j = 0; j < N-1; ++j) if ( data[j] > data[j+1] ) swap( data[j], data[j+1]); if ( N > 2 ) bubbleSort( data, N-1); } // // Now the sort consists of a loop and a recursive call to itself // this is simple to implement with recursive templates // template <int N> struct IntBubbleSort { static inline void sort(int *data) { IntBubbleSortLoop<N-1,0>::loop(data); IntBubbleSort<N-1>::sort(data); } } template <> struct IntBubbleSort<1> { static inline void sort(int *data) { } } // // IntBubbleSortLoop<N-1,0>::loop(data) will replace the for loop in j // and then makes a recursive call to itself // // For N=4 this will be the effect: // static inline void IntBubbleSort<4>::sort(int *data) { IntBubbleSortLoop<3,0>::loop(data); IntBubbleSortLoop<2,0>::loop(data); IntBubbleSortLoop<1,0>::loop(data); } // // the first template argument 3, 2, 1 plays the role of i in the original version // the second is equivalent j // template <int I, int J> class IntBubbleSortLoop { private: enum { go = ( J <= I-2 ) }; public: static inline void loop(int *data) { IntSwap<J, J+1>::compareAndSwap(data); IntBubbleSortLoop< go ? I : 0, go ? (J+1) : 0 >::loop(data); } }; template <> class IntBubbleSortLoop<0,0> { public: static inline void loop(int *) { } }; // // writing the terminal case of the recursion is a bit more difficault, // because we have two template parameters. The solution: both should // revert to 0, when the base case is reached. // We use a loop flag (go), and when go is false, we set parameters to 0 // // For N=4 here is the expansion: // static inline void IntBubbleSort<4>::sort(int *data) { IntSwap<0,1>::compareAndSwap(data); IntSwap<1,2>::compareAndSwap(data); IntSwap<2,3>::compareAndSwap(data); IntSwap<0,1>::compareAndSwap(data); IntSwap<1,2>::compareAndSwap(data); IntSwap<0,1>::compareAndSwap(data); } // // the last remaining definition is IntSwap<I,J>::compareAndSwap(..) // template <int I, int J> class IntSwap { public: static inline void compareAndSwap(int *data) { if ( data[I] > data[J] ) swap( data[I], data[J]); } } // // the swap() is the original // // // For N=4 here is the expansion: // static inline void IntBubbleSort<4>::sort(int *data) { if ( data[0] > data[1]) swap( data[0], data[1]); if ( data[1] > data[2]) swap( data[1], data[2]); if ( data[2] > data[3]) swap( data[2], data[3]); if ( data[0] > data[1]) swap( data[0], data[1]); if ( data[1] > data[2]) swap( data[1], data[2]); if ( data[0] > data[1]) swap( data[0], data[1]); }