// Launch kernel int threadsPerBlock = 256; int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock; vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, n);
all: $(TARGET)
run: $(TARGET) ./$(TARGET)
// Allocate host memory float *h_a = new float[n]; float *h_b = new float[n]; float *h_c = new float[n]; cuda toolkit
// Launch kernel int threadsPerBlock = 256; int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock; vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, n);
all: $(TARGET)
run: $(TARGET) ./$(TARGET)
// Allocate host memory float *h_a = new float[n]; float *h_b = new float[n]; float *h_c = new float[n];