// NOTE(review): the two original #include targets were missing from the source
// text. <iostream> is required by std::cout below; <cuda_runtime.h> is assumed
// for the second one -- confirm against the original file.
#include <cstdlib>
#include <iostream>
#include <vector>

#include <cuda_runtime.h>

// Evaluates a CUDA runtime call and terminates the program with a diagnostic
// (file, line, and the runtime's error string) if it did not return
// cudaSuccess. Ignored error codes otherwise surface later as unrelated,
// hard-to-diagnose failures or as garbage results.
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t cuda_check_err_ = (call);                                \
        if (cuda_check_err_ != cudaSuccess) {                                \
            std::cerr << "CUDA error " << __FILE__ << ":" << __LINE__        \
                      << ": " << cudaGetErrorString(cuda_check_err_)         \
                      << std::endl;                                          \
            std::exit(EXIT_FAILURE);                                         \
        }                                                                    \
    } while (0)

// Element-wise integer vector addition: c[i] = a[i] + b[i] for 0 <= i < n.
//
// Launch layout: 1D grid of 1D blocks; each thread handles at most one
// element, so gridDim.x * blockDim.x must be >= n to cover the whole input.
// Threads whose global index falls past n do nothing.
// Uses no shared memory. a, b and c must be device-accessible pointers to at
// least n ints each.
__global__ void add(int *a, int *b, int *c, int n) {
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < n) {  // guard the grid tail: the grid rarely divides n evenly
        c[i] = a[i] + b[i];
    }
}

// Host driver: fills two N-element vectors (a[i] = i, b[i] = 2 * i), adds
// them on the GPU, and prints the first and last element of the result.
// Returns 0 on success; any CUDA failure terminates via CUDA_CHECK with
// EXIT_FAILURE.
int main() {
    const int N = 1000;
    const size_t bytes = N * sizeof(int);

    // Host buffers; std::vector releases them automatically on every path.
    std::vector<int> h_a(N), h_b(N), h_c(N);
    for (int i = 0; i < N; ++i) {
        h_a[i] = i;
        h_b[i] = 2 * i;
    }

    int *d_a = nullptr, *d_b = nullptr, *d_c = nullptr;
    CUDA_CHECK(cudaMalloc(&d_a, bytes));
    CUDA_CHECK(cudaMalloc(&d_b, bytes));
    CUDA_CHECK(cudaMalloc(&d_c, bytes));

    CUDA_CHECK(cudaMemcpy(d_a, h_a.data(), bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, h_b.data(), bytes, cudaMemcpyHostToDevice));

    // Ceil-div so the last, partially filled block is still launched.
    const int threads = 256;
    const int blocks = (N + threads - 1) / threads;
    add<<<blocks, threads>>>(d_a, d_b, d_c, N);
    // A kernel launch returns no status itself; launch-configuration errors
    // are only visible through cudaGetLastError().
    CUDA_CHECK(cudaGetLastError());

    // Checking this copy's return code is what reports a failure of the
    // kernel itself, if one occurred during execution.
    CUDA_CHECK(cudaMemcpy(h_c.data(), d_c, bytes, cudaMemcpyDeviceToHost));

    std::cout << "Result: " << h_c[0] << ", " << h_c[N - 1] << std::endl;

    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));
    return 0;
}