AssocProf RAM.A.DAYINABOYINA, C.S.E, JUnivEth, MTUnivEth, RAISONY UNIV, KL UNIV AP — Mr Dr — CUDA (Compute Unified Device Architecture) is a proprietary parallel computing platform and programming model created by Nvidia. It allows software developers to use a compatible graphics processing unit (GPU) for general-purpose processing, vastly accelerating compute-intensive tasks such as artificial intelligence, scientific simulations, and video rendering.

3, జులై 2026, శుక్రవారం

Mr Dr — CUDA (Compute Unified Device Architecture) is a proprietary parallel computing platform and programming model created by Nvidia. It allows software developers to use a compatible graphics processing unit (GPU) for general-purpose processing, vastly accelerating compute-intensive tasks such as artificial intelligence, scientific simulations, and video rendering.

#include <stdio.h>

#include <stdlib.h>

#include <cuda_runtime.h>

// Define matrix dimensions (N x N)

#define N 1024

#define BLOCK_SIZE 16

// CUDA Kernel for Element-wise Matrix Addition

// Element-wise addition of two n x n matrices: C = A + B.
//
// Launch layout: a 2D grid of 2D blocks. The x dimension indexes columns and
// the y dimension indexes rows; each thread produces at most one element of C.
// The grid must cover at least n threads in both x and y; surplus threads are
// masked off by the bounds check, so n need not be a multiple of the block size.
//
// Parameters:
//   A, B  input matrices, row-major, n * n floats each (must be readable
//         from device code)
//   C     output matrix, row-major, n * n floats (must be writable from
//         device code)
//   n     number of rows and columns
//
// No shared memory is used and no synchronization is required.
__global__ void matrixAddKernel(const float* A, const float* B, float* C, int n) {
    // Global column/row handled by this thread.
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads in the grid tail fall outside the matrix and must do nothing.
    if (row < n && col < n) {
        // Flatten (row, col) to a row-major offset. Computed in size_t so the
        // product row * n cannot overflow int for large n.
        size_t index = (size_t)row * (size_t)n + (size_t)col;
        C[index] = A[index] + B[index];
    }
}

// Abort the program with a diagnostic if a CUDA runtime call did not succeed.
// `what` names the operation for the error message.
static void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Host driver: fills two N x N matrices with constants, adds them on the GPU
// with matrixAddKernel, copies the result back and verifies every element.
//
// Returns 0 when the result is correct, EXIT_FAILURE when host allocation or
// validation fails. Any CUDA runtime error terminates the process through
// checkCuda with a message on stderr.
int main() {
    const size_t numElements = (size_t)N * (size_t)N;
    const size_t size = numElements * sizeof(float);

    // 1. Allocate memory on the Host (CPU)
    float* h_A = (float*)malloc(size);
    float* h_B = (float*)malloc(size);
    float* h_C = (float*)malloc(size);
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed.\n", size);
        free(h_A);  // free(NULL) is a no-op, so this is safe for partial failure
        free(h_B);
        free(h_C);
        return EXIT_FAILURE;
    }

    // 2. Initialize host matrices with known constants so the expected sum is 3.0
    for (size_t i = 0; i < numElements; i++) {
        h_A[i] = 1.0f;  // Fill A with 1.0
        h_B[i] = 2.0f;  // Fill B with 2.0
    }

    // 3. Allocate memory on the Device (GPU)
    float* d_A = NULL;
    float* d_B = NULL;
    float* d_C = NULL;
    checkCuda(cudaMalloc((void**)&d_A, size), "cudaMalloc(d_A)");
    checkCuda(cudaMalloc((void**)&d_B, size), "cudaMalloc(d_B)");
    checkCuda(cudaMalloc((void**)&d_C, size), "cudaMalloc(d_C)");

    // 4. Copy data from Host to Device memory
    checkCuda(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice), "copy A to device");
    checkCuda(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice), "copy B to device");

    // 5. Configure Thread Blocks and Grid Dimensions
    // Ceil-division so the grid still covers the matrix when N is not a
    // multiple of BLOCK_SIZE; the kernel bounds-checks the surplus threads.
    dim3 threadsPerBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 numBlocks((N + BLOCK_SIZE - 1) / BLOCK_SIZE, (N + BLOCK_SIZE - 1) / BLOCK_SIZE);

    // 6. Launch the CUDA Kernel on the GPU
    matrixAddKernel<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, N);
    // A launch does not return a status; configuration errors are only
    // visible through cudaGetLastError().
    checkCuda(cudaGetLastError(), "kernel launch");

    // Wait for the GPU to finish; faults raised while the kernel runs are
    // reported by this call.
    checkCuda(cudaDeviceSynchronize(), "kernel execution");

    // 7. Copy the final result from Device back to Host memory
    checkCuda(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost), "copy C to host");

    // 8. Verify the result: every element must equal 1.0 + 2.0.
    // Exact comparison is valid here because 1.0f + 2.0f is exactly 3.0f.
    int success = 1;
    for (size_t i = 0; i < numElements; i++) {
        if (h_C[i] != 3.0f) {
            success = 0;
            break;
        }
    }

    if (success) {
        printf("Success! Matrix addition completed correctly on the GPU.\n");
        printf("Sample Element C[0]: %f (Expected: 3.000000)\n", h_C[0]);
    } else {
        printf("Error! Matrix addition validation failed.\n");
    }

    // 9. Free Device and Host memory
    checkCuda(cudaFree(d_A), "cudaFree(d_A)");
    checkCuda(cudaFree(d_B), "cudaFree(d_B)");
    checkCuda(cudaFree(d_C), "cudaFree(d_C)");
    free(h_A);
    free(h_B);
    free(h_C);

    // Report validation failure through the exit status as well as stdout.
    return success ? 0 : EXIT_FAILURE;
}

Console output:

Running in FUNCTIONAL mode...

Compiling...

Executing...

Success! Matrix addition completed correctly on the GPU.

Sample Element C[0]: 3.000000 (Expected: 3.000000)

Exit status: 0

AssocProf RAM.A.DAYINABOYINA, C.S.E, JUnivEth, MTUnivEth, RAISONY UNIV,KL UNIV AP.......

3, జులై 2026, శుక్రవారం

కామెంట్‌లు లేవు:

కామెంట్‌ను పోస్ట్ చేయండి