I want to copy the int m_CellParticleNumber and m_aCellParticleID[CELLMAXPARTICLENUM] value from host to device.For this i use constant memory. But i am not being able to copy the value to the constant memory. can you please help me in this code for copying the value.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <memory>
#include<iostream>
#define m_CellNum 100
#define CELLMAXPARTICLENUM 100
int numPartilces = 10;
extern void Initialize();
struct CCell
{
int m_CellParticleNumber ;
int m_aCellParticleID[CELLMAXPARTICLENUM];
} ;
CCell* hvalue;
CCell* dvalue;
int* dCellParticleID;
__constant__ CCell* c_value;
__global__ void Sum_constant(CCell* value, int N)
{
int index;
index = blockIdx.x * blockDim.x + threadIdx.x;
if (index<N)
for(int idx=0;idx<N ;++idx)
value[index].m_aCellParticleID[idx]= value[index].m_aCellParticleID[idx]+ c_value[index].m_aCellParticleID[idx] ;
//return;
}
int main()
{
hvalue = new CCell[m_CellNum];
cudaMalloc((void**)&dvalue,m_CellNum * sizeof(CCell));
//calling function to initialize the value
Initialize();
//initializing the device momory
cudaMemcpy(dvalue, hvalue, sizeof(CCell)*m_CellNum,cudaMemcpyHostToDevice);
//copying value to constant memory
cudaMemcpyToSymbol(c_value->m_aCellParticleID, &dvalue->m_aCellParticleID, sizeof(int)*m_CellNum);
//dividing bolcks and grid
int block_size = 4;
int n_blocks = numPartilces/block_size + (numPartilces%block_size == 0 ? 0:1);
//invocking kernel function
Sum_constant <<< n_blocks, block_size >>> (c_value,numPartilces);
//copying value from host to device
cudaMemcpy(hvalue, dvalue,numPartilces * sizeof(int),cudaMemcpyDeviceToHost);
//showing result
for(int i = 0; i < 2; ++i)
{
for(int j = 0; j < numPartilces; ++j)
{
std::cout<<hvalue[i].m_aCellParticleID[j]<<"\n";
}
}
free(hvalue);
cudaFree(dvalue);
return 0;
}
void Initialize()
{
cudaMalloc((void**)&dCellParticleID,m_CellNum * sizeof(int));
for(int i = 0; i < numPartilces; ++i)
{
hvalue[i].m_CellParticleNumber = 0;
for(int j = 0; j < numPartilces; ++j)
{
hvalue[i].m_aCellParticleID[j] = j+2;
}
hvalue[i].m_CellParticleNumber++;
}
}
This is what i tried according to the suggestion given but still it dont works. can you please help me. #include "cuda_runtime.h" #include "device_launch_parameters.h"
#include <stdio.h>
#include <memory>
#include<iostream>
#define m_CellNum 100
#define CELLMAXPARTICLENUM 100
int numPartilces = 10;
extern void Initialize();
struct CCell
{
int m_CellParticleNumber ;
int m_aCellParticleID[CELLMAXPARTICLENUM];
} ;
CCell* hvalue;
CCell* dvalue;
int* dCellParticleID;
__constant__ CCell c_value[m_CellNum];
__global__ void Sum_constant(CCell* value, int N)
{
int index;
index = blockIdx.x * blockDim.x + threadIdx.x;
if (index<N)
for(int idx=0;idx<N ;++idx)
value[index].m_aCellParticleID[idx]= value[index].m_aCellParticleID[idx]+ c_value[index].m_aCellParticleID[idx] ;
//return;
}
int main()
{
int numPartilces = 10;
hvalue = new CCell[m_CellNum];
cudaMalloc((void**)&dvalue,m_CellNum * sizeof(CCell));
//calling function to initialize the value
Initialize();
//initializing the device momory
cudaMemcpy(dvalue, hvalue, sizeof(CCell)*m_CellNum,cudaMemcpyHostToDevice);
//copying value to constant memory
cudaMemcpyToSymbol(c_value, &dvalue, sizeof(dvalue));
//dividing bolcks and grid
int block_size = 4;
int n_blocks = numPartilces/block_size + (numPartilces%block_size == 0 ? 0:1);
//invocking kernel function
Sum_constant <<< n_blocks, block_size >>> (dvalue,numPartilces);
//copying value from host to device
cudaMemcpy(hvalue, dvalue,2*m_CellNum * sizeof(int),cudaMemcpyDeviceToHost);
//showing result
for(int i = 0; i < numPartilces; ++i)
{
for(int j = 0; j < numPartilces; ++j)
{
std::cout<<hvalue[i].m_aCellParticleID[j]<<"\n";
}
}
free(hvalue);
cudaFree(dvalue);
return 0;
}
void Initialize()
{
cudaMalloc((void**)&dCellParticleID,m_CellNum * sizeof(int));
for(int i = 0; i < numPartilces; ++i)
{
hvalue[i].m_CellParticleNumber = 0;
for(int j = 0; j < numPartilces; ++j)
{
hvalue[i].m_aCellParticleID[j] = j+2;
}
hvalue[i].m_CellParticleNumber++;
}
}
__constant__ CCell* c_value;to something like__constant__ CCell c_value[N];in whichNis your anticipated supremum of occupied space by symbol. Look for documents on the internet about using constant memory in CUDA. I can tell if you use constant memory for the problem above, it will hurt the performance.cudaMemcpyToSymbol(c_value->m_aCellParticleID, &dvalue->m_aCellParticleID, sizeof(int)*m_CellNum);can't work! First problem is thatcudaMemcpyToSymbolcopy by default from host to device, if no other copy direction is set - see cudaMemcpyToSymbol. Next misstake is that on the host you want to copy fromdvalue->m_aCellParticleID, but host isn't able to resolve this pointer, because it's a device pointer.Sum_constant <<< n_blocks, block_size >>> (c_value,numPartilces);with c_value as input. That's not right. You have to pass dvalue as pointer. When copy back the results you only copynumPartilces * sizeof(int)bytes from dvalue to hvalue. But when output the results you want to print overall 2 *numPartilces * sizeof(int). What exactly you want to do? It seems that there several fundamental misstakes in the way you allocate your memory and how you want to use that arrays.