3, జులై 2026, శుక్రవారం

Mr Dr: CUDA (Compute Unified Device Architecture) is a proprietary parallel computing platform and programming model created by Nvidia. It allows software developers to use a compatible graphics processing unit (GPU) for general-purpose processing, vastly accelerating compute-intensive tasks such as artificial intelligence, scientific simulations, and video rendering.

 #include <stdio.h>

#include <stdlib.h>

#include <cuda_runtime.h>


// Matrix edge length: main() works on N x N matrices stored as N * N floats.

#define N 1024

// Thread-block edge length: main() launches BLOCK_SIZE x BLOCK_SIZE threads per block.

#define BLOCK_SIZE 16


// CUDA kernel for element-wise matrix addition: C[i] = A[i] + B[i].
//
// The matrices are n x n and flattened row by row (element (row, col) lives
// at offset row * n + col).
//
// Launch layout: a 2D grid of 2D blocks, where the x dimension spans columns
// and the y dimension spans rows. The grid must supply at least n threads in
// each dimension; threads that fall outside the matrix do nothing, so n does
// not have to be a multiple of the block size.
//
// Parameters:
//   A, B - input matrices, read only; each must hold n * n floats.
//   C    - output matrix; must hold n * n floats.
//   n    - matrix edge length.
//
// No shared memory is used.
__global__ void matrixAddKernel(const float* A, const float* B, float* C, int n) {
    // Global column / row handled by this thread.
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads in the grid tail have no element to process.
    if (row >= n || col >= n) {
        return;
    }

    // Compute the flat offset in size_t: row * n + col overflows a 32-bit int
    // as soon as n exceeds 46340.
    const size_t index = (size_t)row * (size_t)n + (size_t)col;

    C[index] = A[index] + B[index];
}


// Print a diagnostic and terminate the process if a CUDA runtime call failed.
// `what` names the operation for the error message. Outstanding allocations
// are left for the operating system to reclaim on exit.
static void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Host driver: fills two N x N matrices with 1.0 and 2.0, adds them on the
// GPU with matrixAddKernel, copies the result back and checks that every
// element equals 3.0.
//
// Returns EXIT_SUCCESS when the result validates, EXIT_FAILURE when host
// allocation fails or validation fails. Any failing CUDA call terminates the
// process with EXIT_FAILURE via checkCuda.
int main() {
    const size_t numElements = (size_t)N * (size_t)N;
    const size_t size = numElements * sizeof(float);

    // 1. Allocate memory on the Host (CPU)
    float *h_A = (float*)malloc(size);
    float *h_B = (float*)malloc(size);
    float *h_C = (float*)malloc(size);
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "Error! Host memory allocation failed.\n");
        free(h_A);  // free(NULL) is a no-op, so partial success is handled
        free(h_B);
        free(h_C);
        return EXIT_FAILURE;
    }

    // 2. Initialize host matrices with known data so the sum is predictable
    for (size_t i = 0; i < numElements; i++) {
        h_A[i] = 1.0f; // Fill A with 1.0
        h_B[i] = 2.0f; // Fill B with 2.0
    }

    // 3. Allocate memory on the Device (GPU)
    float *d_A = NULL;
    float *d_B = NULL;
    float *d_C = NULL;
    checkCuda(cudaMalloc((void**)&d_A, size), "cudaMalloc(d_A)");
    checkCuda(cudaMalloc((void**)&d_B, size), "cudaMalloc(d_B)");
    checkCuda(cudaMalloc((void**)&d_C, size), "cudaMalloc(d_C)");

    // 4. Copy data from Host to Device memory
    checkCuda(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice), "cudaMemcpy(h_A -> d_A)");
    checkCuda(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice), "cudaMemcpy(h_B -> d_B)");

    // 5. Configure Thread Blocks and Grid Dimensions
    // dim3 elements define 2D shapes for blocks and grids; the grid size is
    // rounded up so the whole matrix is covered even if N is not a multiple
    // of BLOCK_SIZE.
    dim3 threadsPerBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 numBlocks((N + BLOCK_SIZE - 1) / BLOCK_SIZE, (N + BLOCK_SIZE - 1) / BLOCK_SIZE);

    // 6. Launch the CUDA Kernel on the GPU
    matrixAddKernel<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, N);

    // A launch does not return a status; an invalid launch configuration is
    // only reported through cudaGetLastError().
    checkCuda(cudaGetLastError(), "matrixAddKernel launch");

    // Wait for the GPU to finish; errors raised while the kernel was running
    // are reported here.
    checkCuda(cudaDeviceSynchronize(), "matrixAddKernel execution");

    // 7. Copy the final result from Device back to Host memory
    checkCuda(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost), "cudaMemcpy(d_C -> h_C)");

    // 8. Verify the result: every element must be exactly 3.0 (1.0 + 2.0 is
    // exactly representable in float, so an exact comparison is valid here).
    int success = 1;
    for (size_t i = 0; i < numElements; i++) {
        if (h_C[i] != 3.0f) {
            success = 0;
            break;
        }
    }

    if (success) {
        printf("Success! Matrix addition completed correctly on the GPU.\n");
        printf("Sample Element C[0]: %f (Expected: 3.000000)\n", h_C[0]);
    } else {
        printf("Error! Matrix addition validation failed.\n");
    }

    // 9. Free Device and Host memory
    checkCuda(cudaFree(d_A), "cudaFree(d_A)");
    checkCuda(cudaFree(d_B), "cudaFree(d_B)");
    checkCuda(cudaFree(d_C), "cudaFree(d_C)");
    free(h_A);
    free(h_B);
    free(h_C);

    // Report validation failure through the exit status as well.
    return success ? EXIT_SUCCESS : EXIT_FAILURE;
}


Console output:

Running in FUNCTIONAL mode...

Compiling...

Executing...

Success! Matrix addition completed correctly on the GPU.

Sample Element C[0]: 3.000000 (Expected: 3.000000)

Exit status: 0

కామెంట్‌లు లేవు:

కామెంట్‌ను పోస్ట్ చేయండి