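Doing exactly that, with only the first dimension (the outermost one, i.e. the first square-bracket entry) known at compile time, is not possible: the compiler needs the size of every inner dimension to compute element addresses, so only the outermost dimension of an array may be left unspecified.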
However, it is possible when only the second dimension is fixed at compile time and the first one is left unspecified. Here is an example:
// Dynamically sized 2D shared-memory array: the first dimension is left open and is
// determined by the dynamic shared-memory size passed at kernel launch, while every
// row has a fixed width of 17 ints.
extern __shared__ int shared2Darray[][17] ;
// Demonstrates a dynamically sized 2D shared-memory array: every thread stores a value
// in shared2Darray at [threadIdx.y][threadIdx.x] and then copies it to `output`.
//
// Launch requirements:
//   - blockDim.x must not exceed 17, the fixed row width of shared2Darray.
//   - The dynamic shared-memory size must cover at least blockDim.y rows,
//     i.e. blockDim.y * 17 * sizeof(int) bytes.
//   - `output` must be a device pointer to at least blockDim.x * blockDim.y ints.
//   - Indexing uses only threadIdx, so every block of a multi-block grid would write
//     the same output range.
__global__ void kernel(int* output)
{
    // Each thread fills its own [row][column] slot of the shared array.
    shared2Darray[threadIdx.y][threadIdx.x] = threadIdx.x + 2 * threadIdx.y;

    // Barrier between the shared-memory writes and the reads that follow.
    __syncthreads();

    // Row-major copy of the shared tile into global memory.
    output[threadIdx.y * blockDim.x + threadIdx.x] = shared2Darray[threadIdx.y][threadIdx.x];

    // No barrier is needed here: nothing touches shared memory after this point.
}
// Prints a diagnostic on stderr when a CUDA runtime call failed.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaSucceeded(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "CUDA failure (%s): %s\n", what, cudaGetErrorString(status));
    return false;
}

// Launches `kernel` on a single 16x16 block with a 16x17 int dynamic shared-memory
// array, copies the result back and verifies that element (x, y) equals x + 2*y.
// Returns 0 when everything matched, 1 on any CUDA failure or mismatch.
int main()
{
    const int side = 16;         // threads per block along x and along y
    const int paddedWidth = 17;  // must equal the fixed second dimension of shared2Darray
    const int elementCount = side * side;
    const size_t outputBytes = elementCount * sizeof(int);
    // The byte count given at launch determines how many rows of paddedWidth ints exist.
    const size_t sharedBytes = side * paddedWidth * sizeof(int);

    int* d_output = 0;
    if (!cudaSucceeded(cudaMalloc(&d_output, outputBytes), "cudaMalloc"))
        return 1;

    kernel<<<1, dim3(side, side, 1), sharedBytes>>>(d_output);
    // A launch does not return a status itself; query it explicitly.
    bool cudaOk = cudaSucceeded(cudaGetLastError(), "kernel launch");

    int* h_output = new int[elementCount];
    // The blocking copy waits for the kernel and also reports errors raised while it ran.
    cudaOk = cudaOk &&
             cudaSucceeded(cudaMemcpy(h_output, d_output, outputBytes, cudaMemcpyDeviceToHost),
                           "cudaMemcpy");

    // Release device resources on every path before looking at the results.
    cudaOk = cudaSucceeded(cudaFree(d_output), "cudaFree") && cudaOk;
    cudaOk = cudaSucceeded(cudaDeviceReset(), "cudaDeviceReset") && cudaOk;

    if (!cudaOk)
    {
        delete[] h_output;
        return 1;
    }

    bool mismatch = false;
    for (int x = 0; x < side; ++x)
    {
        for (int y = 0; y < side; ++y)
        {
            if (h_output[y * side + x] != x + 2 * y)
            {
                printf("ERROR\n");
                mismatch = true;
            }
        }
    }
    printf("DONE\n");

    delete[] h_output;
    return mismatch ? 1 : 0;
}
The size of the array is defined by the shared memory parameter in the triple angle bracket notation. Hence, the size of the first dimension is deduced by dividing the shared memory size in bytes by the size in bytes of a single row (here 17 * sizeof(int)); the second dimension is the one fixed at compile time.