2, జులై 2026, గురువారం
sample CUDA Program GUP HPC NVIDIA
#include <stdio.h>
__global__ void vecAdd(float *a, float *b, float *c, int n) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n) {
c[i] = a[i] + b[i];
}
}
int main() {
int n = 1024;
size_t bytes = n * sizeof(float);
float *h_a = (float*)malloc(bytes);
float *h_b = (float*)malloc(bytes);
float *h_c = (float*)malloc(bytes);
for (int i = 0; i < n; i++) {
h_a[i] = i * 1.0f;
h_b[i] = i * 2.0f;
}
float *d_a, *d_b, *d_c;
cudaMalloc(&d_a, bytes);
cudaMalloc(&d_b, bytes);
cudaMalloc(&d_c, bytes);
cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice);
int threadsPerBlock = 256;
int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;
vecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, n);
cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost);
printf("c[0] = %f\n", h_c[0]);
printf("c[1023] = %f\n", h_c[1023]);
cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
free(h_a); free(h_b); free(h_c);
return 0;
}
OUTPUT...Compute Unified Device Architecture