I. Introduction into GPU programming.
 II. Exception safe dynamic memory handling in Cuda project.
 III. Calculation of partial sums in parallel.
 IV. Manipulation of piecewise polynomial functions in parallel.
 1 History of changes (PiecewisePoly).
 2 Calculus behind the PiecewisePoly project.
 3 Code structure (PiecewisePoly project).
 4 Python scripting for PiecewisePoly project.
 V. Manipulation of localized piecewise polynomial functions in parallel.

## Code structure (PiecewisePoly project).

he following is a prototype for the main class.

class Poly {

public:

typedef CoeffType::type real;

private:

class Impl;

boost::shared_ptr<Impl> theImpl;

public:

Poly();

Poly( const Poly& x );

Poly& operator=( const Poly& x );

Poly( const boost::python::object& params, const boost::python::object& coeffs );

Poly( const Piece& spt, int degree );

Poly( int d, int a, int b, int degree );

Poly( const Piece& spt, int degree, ots::cuda::memory::Host2D<real> coeffs );

Poly( int d, int a, int b, int degree, ots::cuda::memory::Host2D<real> coeffs );

real operator() ( real x ) const;

boost::python::object batch( const boost::python::object& x ) const;

std::string toString() const;

Piece support() const;

int degree() const;

Poly upscale( unsigned deltaD ) const;

Poly segment( const Piece& p ) const;

std::size_t len() const;

Poly mult( real x ) const;

Poly mult( const Poly& x ) const;

Poly add( const Poly& x ) const;

Poly sub( const Poly& x ) const;

real integral() const;

real integral( const Piece& p ) const;

Poly pow( unsigned n ) const;

Poly T( int k ) const;

Poly S( unsigned d ) const;

Poly Op( unsigned d, int k ) const;

Poly convolution( const Poly& x ) const;

Poly D() const;

boost::python::object description() const;

Poly Cr() const;

};

This class is functional in the Regular compiler scope and was exported in Python shell. The function "batch" performs parallel calculation of several values given by a Python list. The function "upscale" returns the same function but on a finer mesh. The function "segment" restricts to a given interval. The T,S are operations from the section ( Wavelet analysis ) and the Op is the combination of the two. The D is the differentiation. The Cr is the crudification operator, see the definition ( Crudification operator ). The rest is self-explanatory.

Implementation of every function is similar. We consider the function "add" as an example. The following is the part of the poly.cpp (Regular compiler scope).

class Poly::Impl : boost::noncopyable

{

ots::cuda::memory::Device<int> theParams;

ots::cuda::memory::Host<int> theParams2; //kept synchronized with theParams

ots::cuda::memory::Device2D<real> theCoeffs;

static const int theParamSize=5;

int getD() const { return theParams2.get(0); }

int getA() const { return theParams2.get(1); }

int getB() const { return theParams2.get(2); }

int getDegree() const { return theParams2.get(3); }

int getScale() const { return theParams2.get(4); }

...

Poly add( const Poly& x ) const

{

Piece spt=support();

Piece spt2=x.support();

Poly xx;

if( spt2.d()<spt.d() ) xx=x.upscale(spt.d()-spt2.d()); else xx=x;

Poly r(support().Union(xx.support()),std::max(getDegree(),x.degree()));

theParams.h(),

theCoeffs.h(),

xx.theImpl->theParams.h(),

xx.theImpl->theCoeffs.h(),

r.theImpl->theParams.h(),

r.theImpl->theCoeffs.h()

);

return r; }

...

};

The function impl::add::act is implemented in the Host compiler scope as follows.

void act(

indexes params1, reals2 coeffs1,

indexes params2, reals2 coeffs2,

indexes paramsR, reals2 coeffsR

)

{

if( coeffsR.rows()>0 )

{

checkPolyData::act(params1,coeffs1);

checkPolyData::act(params2,coeffs2);

checkPolyData::act(paramsR,coeffsR);

check( de::isOk(), "prior device error" );

sy::deviceWait();

act_<<<

kernel::first(coeffsR.rows()),

kernel::second(),

kernel::third()

>>>

(params1,coeffs1,params2,coeffs2,paramsR,coeffsR);

sy::deviceWait();

check( de::isOk(), "device error generated" );

}

}

Finally, the function impl::add::act_ does the actual work (Device compiler scope).

__global__ void act_(

indexes params1_, reals2 coeffs1_,

indexes params2_, reals2 coeffs2_,

indexes paramsR_, reals2 coeffsR_

)

{

Indexes params1(params1_);

Reals2 coeffs1(coeffs1_);

Indexes params2(params2_);

Reals2 coeffs2(coeffs2_);

Indexes paramsR(paramsR_);

Reals2 coeffsR(coeffsR_);

if( !coeffsR.isValidRowIndex(th) ) return;

Reals _R=coeffsR.getRow(th);

_R.assign(0);

index a1=polyParams::a(params1);

index a2=polyParams::a(params2);

index aR=polyParams::a(paramsR);

index loc=aR+th;

index th1=loc-a1;

index th2=loc-a2;

if( coeffs1.isValidRowIndex(th1) )

{

Reals src=coeffs1.getRow(th1);

index sz=min(_R.size(),src.size());

for( index p=0; p<sz; ++p )

_R.put(p,_R.get(p)+src.get(p));

}

if( coeffs2.isValidRowIndex(th2) )

{

Reals src=coeffs2.getRow(th2);

index sz=min(_R.size(),src.size());

for( index p=0; p<sz; ++p )

_R.put(p,_R.get(p)+src.get(p));

}

}