# Traitement parallèle de données
## Cuda
### Pattern d'entrelacement:
**device**
`addArray_device.cu`
```cpp=
/**
 * Element-wise vector addition: ptrDevW[i] = ptrDevV1[i] + ptrDevV2[i].
 *
 * Interleaved pattern: each thread processes the indices
 * TID, TID + NB_THREAD, TID + 2*NB_THREAD, ... until n.
 *
 * Returns void: a kernel launch is asynchronous; results are read
 * back on the host via a device-to-host memcpy.
 */
__global__ void addArray(float* ptrDevV1, float* ptrDevV2, float* ptrDevW, int n)
{
    const int NB_THREAD = Thread2D::nbThread();
    const int TID = Thread2D::tid();

    // Debug trace (optional). Only visible if the host calls
    // Kernel::synchronize() after the kernel launch.
    if (TID == 0)
    {
        printf("Coucou from device tid = %d", TID);
    }

    // Stride loop: thread TID handles every NB_THREAD-th element.
    for (int i = TID; i < n; i += NB_THREAD)
    {
        ptrDevW[i] = ptrDevV1[i] + ptrDevV2[i];
    }
}
```
**host**
`addArray.cu`
constructeur:
```cpp=
/**
 * Captures the host-side pointers and the launch configuration,
 * then allocates the three device buffers (n floats each).
 *
 * @param grid  launch configuration (grid dim dg, block dim db)
 * @param ptrV1 host input vector #1 (n floats)
 * @param ptrV2 host input vector #2 (n floats)
 * @param ptrW  host output vector (n floats), filled by run()
 * @param n     number of elements in each vector
 */
AddArray::AddArray(const Grid& grid, float* ptrV1, float* ptrV2, float* ptrW, int n) :
    ptrV1(ptrV1),
    ptrV2(ptrV2),
    ptrW(ptrW),
    n(n),
    dg(grid.dg),
    db(grid.db)
{
    // Byte count shared by all memcpy/malloc calls below.
    this->sizeVector = n * sizeof(float);

    // Device-side allocations; released in the destructor.
    GM::malloc(&ptrDevV1, sizeVector);
    GM::malloc(&ptrDevV2, sizeVector);
    GM::malloc(&ptrDevW, sizeVector);
}
```
destructeur:
```cpp=
/**
 * Releases the three device buffers allocated in the constructor.
 */
AddArray::~AddArray(void)
{
    GM::free(ptrDevV1);
    GM::free(ptrDevV2);
    GM::free(ptrDevW);
}
```
```cpp=
/**
 * Uploads both input vectors to the device, launches the addArray
 * kernel, then downloads the result into ptrW.
 */
void AddArray::run()
{
    // Host -> Device: upload the two input vectors.
    GM::memcpyHToD(ptrDevV1, ptrV1, sizeVector);
    GM::memcpyHToD(ptrDevV2, ptrV2, sizeVector);

    // Asynchronous launch with the configuration captured at construction.
    addArray<<<dg,db>>>(ptrDevV1, ptrDevV2, ptrDevW, n);

    // Optional for correctness here, but required to see the
    // device-side printf output.
    Kernel::synchronize();

    // Device -> Host: per the original notes, this copy acts as an
    // implicit synchronization barrier. Note the last argument is a
    // byte count (sizeVector), not an element count.
    GM::memcpyDToH(ptrW, ptrDevW, sizeVector);
}
```
### Pattern 1 à 1:
`addArray11_device.cu`
```cpp=
/**
 * Element-wise vector addition, one-to-one pattern: thread TID
 * handles exactly element TID.
 *
 * Fix: guard TID against n. The original accepted n but never used
 * it, so any launch configuration with more threads than elements
 * wrote out of bounds. With the guard, surplus threads simply do
 * nothing, and exact-fit launches behave as before.
 */
__global__ void addArray11(float* ptrDevV1, float* ptrDevV2, float* ptrDevW, int n)
{
    const int TID = Thread2D::tid();

    if (TID < n)
    {
        ptrDevW[TID] = ptrDevV1[TID] + ptrDevV2[TID];
    }
}
```
`addArray11.cu`
```cpp=
// Forward declaration of the kernel defined in addArray11_device.cu,
// so the host-side code can launch it. One thread per element: the
// caller's launch configuration must cover all n elements.
extern __global__ void addArray11(
float* ptrDevV1,
float* ptrDevV2,
float* ptrDevW,
int n);
```