#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
// Define matrix dimensions (N x N)
#define N 1024
#define BLOCK_SIZE 16
// Element-wise matrix addition kernel: C = A + B for row-major n x n matrices.
//
// Launch layout: 2D grid of 2D blocks, where the x dimension indexes columns
// and the y dimension indexes rows. The grid must supply at least n threads in
// each dimension; threads that land outside the n x n range do nothing.
//
// A, B : input matrices (read only), n * n floats each.
// C    : output matrix, n * n floats.
// n    : side length of the square matrices.
__global__ void matrixAddKernel(const float* A, const float* B, float* C, int n) {
    // Global column and row handled by this thread.
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard the grid tail: the grid is rounded up to whole blocks, so some
    // threads may fall past the last row or column.
    if (row < n && col < n) {
        // Flatten the 2D coordinate in size_t: a 32-bit `row * n` overflows
        // once n exceeds 46340.
        const size_t index = (size_t)row * (size_t)n + (size_t)col;
        C[index] = A[index] + B[index];
    }
}
// Reports a failed CUDA runtime call on stderr.
// Returns 1 when `err` is cudaSuccess, 0 otherwise.
static int cudaOk(cudaError_t err, const char* what) {
    if (err == cudaSuccess) {
        return 1;
    }
    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
    return 0;
}

// Adds two N x N matrices on the GPU and validates the result on the host.
//
// Returns EXIT_SUCCESS only when every CUDA call succeeds and every element of
// the result equals 3.0f; returns EXIT_FAILURE otherwise. All host and device
// buffers are released on every path.
int main() {
    const size_t numElements = (size_t)N * N;
    const size_t size = numElements * sizeof(float);
    int exitCode = EXIT_FAILURE;

    // 1. Allocate memory on the Host (CPU)
    float *h_A = (float*)malloc(size);
    float *h_B = (float*)malloc(size);
    float *h_C = (float*)malloc(size);

    // Device pointers start as NULL so the cleanup below is safe even when an
    // early step fails (cudaFree(NULL) and free(NULL) are no-ops).
    float *d_A = NULL;
    float *d_B = NULL;
    float *d_C = NULL;

    // Single-pass block: `break` jumps straight to the shared cleanup code.
    do {
        if (h_A == NULL || h_B == NULL || h_C == NULL) {
            fprintf(stderr, "Host memory allocation failed.\n");
            break;
        }

        // 2. Initialize host matrices with known data so the sum is exactly 3.0
        for (size_t i = 0; i < numElements; i++) {
            h_A[i] = 1.0f;  // Fill A with 1.0
            h_B[i] = 2.0f;  // Fill B with 2.0
        }

        // 3. Allocate memory on the Device (GPU)
        if (!cudaOk(cudaMalloc((void**)&d_A, size), "cudaMalloc d_A")) break;
        if (!cudaOk(cudaMalloc((void**)&d_B, size), "cudaMalloc d_B")) break;
        if (!cudaOk(cudaMalloc((void**)&d_C, size), "cudaMalloc d_C")) break;

        // 4. Copy data from Host to Device memory
        if (!cudaOk(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice),
                    "cudaMemcpy A host-to-device")) break;
        if (!cudaOk(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice),
                    "cudaMemcpy B host-to-device")) break;

        // 5. Configure Thread Blocks and Grid Dimensions.
        // The grid size is rounded up so that it covers all N rows and columns.
        dim3 threadsPerBlock(BLOCK_SIZE, BLOCK_SIZE);
        dim3 numBlocks((N + BLOCK_SIZE - 1) / BLOCK_SIZE,
                       (N + BLOCK_SIZE - 1) / BLOCK_SIZE);

        // 6. Launch the CUDA Kernel on the GPU
        matrixAddKernel<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, N);
        // A launch returns no status itself; configuration errors are
        // reported through cudaGetLastError().
        if (!cudaOk(cudaGetLastError(), "matrixAddKernel launch")) break;
        // Wait for the GPU to finish; errors raised while the kernel runs
        // surface here.
        if (!cudaOk(cudaDeviceSynchronize(), "matrixAddKernel execution")) break;

        // 7. Copy the final result from Device back to Host memory
        if (!cudaOk(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost),
                    "cudaMemcpy C device-to-host")) break;

        // 8. Verify the result: every element is checked, stopping at the
        // first mismatch. Exact comparison is fine because 1.0f + 2.0f is
        // exactly representable.
        int success = 1;
        for (size_t i = 0; i < numElements && success; i++) {
            if (h_C[i] != 3.0f) {
                success = 0;
            }
        }

        if (success) {
            printf("Success! Matrix addition completed correctly on the GPU.\n");
            printf("Sample Element C[0]: %f (Expected: 3.000000)\n", h_C[0]);
            exitCode = EXIT_SUCCESS;
        } else {
            printf("Error! Matrix addition validation failed.\n");
        }
    } while (0);

    // 9. Free Device and Host memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(h_A);
    free(h_B);
    free(h_C);

    return exitCode;
}
/*
 * Console output from a sample run:
 *
 *   Running in FUNCTIONAL mode...
 *   Compiling...
 *   Executing...
 *   Success! Matrix addition completed correctly on the GPU.
 *   Sample Element C[0]: 3.000000 (Expected: 3.000000)
 *   Exit status: 0
 */
// Blog page footer (translated from Telugu):
//   "No comments:" / "Post a comment"