program fortest
  ! Builds two length-N vectors on the host, passes them to the C-linkage
  ! routine kernel_wrapper_ (which adds b to a in place on the GPU), and
  ! prints a before and after the call.
  use, intrinsic :: iso_c_binding, only: c_float, c_int
  implicit none

  integer(c_int), parameter :: N = 8
  integer(c_int) :: i
  real(c_float), dimension(N) :: a, b

  ! Explicit interface to the C wrapper. bind(C, name=...) pins the linker
  ! symbol instead of relying on the compiler's trailing-underscore name
  ! mangling, and the iso_c_binding kinds guarantee the arguments match the
  ! C types float* / float* / int*.
  interface
     subroutine kernel_wrapper(a, b, n) bind(C, name='kernel_wrapper_')
       import :: c_float, c_int
       real(c_float), intent(inout) :: a(*)   ! overwritten with a + b
       real(c_float), intent(inout) :: b(*)   ! the C side may write it back
       integer(c_int), intent(in)   :: n      ! element count, passed by reference
     end subroutine kernel_wrapper
  end interface

  ! a = 1, 2, ..., N and b = 2 everywhere
  do i = 1, N
     a(i) = real(i, c_float)
     b(i) = 2.0_c_float
  end do

  print *, 'a = ', (a(i), i=1,N)
  call kernel_wrapper(a, b, N)
  print *, 'a + 2 = ', (a(i), i=1,N)
end program fortest
// File: cudatest.cu
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>
#include <cuda_runtime.h>
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is an error, false when it is cudaSuccess.
static bool cuda_call_failed(cudaError_t status, const char *what)
{
    if (status == cudaSuccess) return false;
    fprintf(stderr, "kernel_wrapper_: %s failed: %s\n", what, cudaGetErrorString(status));
    return true;
}

// Element-wise in-place vector add: a[i] = a[i] + b[i] for 0 <= i < N.
//   a, b : device pointers to at least N floats (a is overwritten)
//   N    : number of elements; nothing is done when N <= 0
// Correct for any 1-D grid/block configuration: each thread starts at its
// global index and advances by the total number of launched threads, so the
// result no longer depends on N fitting into a single block.
// Uses no shared memory.
__global__ void vect_add(float *a, float *b, int N)
{
    // 64-bit arithmetic so blockIdx.x * blockDim.x and idx + stride cannot overflow.
    const long long stride = (long long)gridDim.x * blockDim.x;
    long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;
    for (; idx < N; idx += stride)
        a[idx] = a[idx] + b[idx];
}

// Function called from the main Fortran program (all arguments by reference).
//   a  : host vector of *Np floats; on success overwritten with a + b
//   b  : host vector of *Np floats; read only
//   Np : pointer to the element count
// Does nothing when *Np <= 0. On any CUDA failure a diagnostic is printed to
// stderr, device memory is released and the function returns without a valid
// result in a (the interface has no way to report the error to the caller).
extern "C" void kernel_wrapper_(float *a, float *b, int *Np)
{
    const int N = *Np;
    if (N <= 0) return;                     // a zero-thread launch would be invalid

    const int threads = 256;                // threads per block
    const int blocks = (N - 1) / threads + 1;   // ceil(N / threads) without overflow, N >= 1
    const size_t bytes = sizeof(float) * (size_t)N;

    float *a_d = NULL;                      // GPU copies of the vectors
    float *b_d = NULL;

    // Allocate memory on the GPU and copy the vectors from CPU to GPU.
    // The && chain stops at the first failing call.
    const bool staged =
        !cuda_call_failed(cudaMalloc((void **)&a_d, bytes), "cudaMalloc for a") &&
        !cuda_call_failed(cudaMalloc((void **)&b_d, bytes), "cudaMalloc for b") &&
        !cuda_call_failed(cudaMemcpy(a_d, a, bytes, cudaMemcpyHostToDevice),
                          "host-to-device copy of a") &&
        !cuda_call_failed(cudaMemcpy(b_d, b, bytes, cudaMemcpyHostToDevice),
                          "host-to-device copy of b");

    if (staged) {
        // Run the addition on the GPU.
        vect_add<<< blocks, threads >>>(a_d, b_d, N);

        // A launch does not return a status itself; fetch it explicitly.
        if (!cuda_call_failed(cudaGetLastError(), "vect_add launch")) {
            // Copy the result back. This blocking copy also waits for the
            // kernel and reports any error raised while it was running.
            // b is not copied back: the kernel never writes to it.
            cuda_call_failed(cudaMemcpy(a, a_d, bytes, cudaMemcpyDeviceToHost),
                             "device-to-host copy of a");
        }
    }

    // Free GPU memory; cudaFree(NULL) is a no-op, so this is safe after a
    // partial allocation.
    cudaFree(a_d);
    cudaFree(b_d);
}