I just renamed the program saxpy.f so that it can be modified to introduce some new instructions.
Now I want to know how, after compilation, saxpy can
be assessed using nvprof.
! nvfortran gks2.cuf
!> Device-side vector operations callable from host code as CUDA Fortran kernels.
module mathOps
  implicit none
contains

  !> SAXPY kernel: y(i) = y(i) + a*x(i), one element per thread.
  !!
  !! Arguments:
  !!   x : input vector, only read
  !!   y : vector updated in place
  !!   a : scalar multiplier, passed by value
  !!
  !! Threads whose global index falls outside either array do nothing, so the
  !! launch grid may be rounded up to a whole number of blocks.
  attributes(global) subroutine saxpy(x, y, a)
    real, intent(in)    :: x(:)
    real, intent(inout) :: y(:)
    real, value         :: a
    integer :: i, n

    ! Bound by the shorter array so neither x nor y is indexed out of range.
    n = min(size(x), size(y))

    ! 1-based global element index built from the block and thread indices.
    i = blockDim%x * (blockIdx%x - 1) + threadIdx%x

    if (i <= n) y(i) = y(i) + a*x(i)
  end subroutine saxpy

end module mathOps
!> Host driver for the saxpy kernel.
!!
!! Fills x with 1.0 and y with 2.0, copies both to the device, runs
!! y = y + a*x with a = 2.0 on the GPU, copies y back and prints the largest
!! deviation from the expected value 4.0.
program testSaxpy
  use mathOps
  use cudafor
  use, intrinsic :: iso_fortran_env, only: error_unit
  implicit none

  integer, parameter :: N = 40000
  integer, parameter :: threads_per_block = 256

  real :: x(N), y(N), a
  real, device :: x_d(N), y_d(N)
  type(dim3) :: grid, tBlock
  integer :: istat

  ! Launch configuration: enough blocks to cover all N elements.
  ! Integer ceiling division avoids any floating-point rounding.
  tBlock = dim3(threads_per_block, 1, 1)
  grid = dim3((N + threads_per_block - 1) / threads_per_block, 1, 1)

  x = 1.0
  y = 2.0
  a = 2.0

  ! Host-to-device copies via assignment to device arrays.
  x_d = x
  y_d = y

  call saxpy<<<grid, tBlock>>>(x_d, y_d, a)

  ! Kernel launches do not return a status; query it explicitly so an
  ! invalid configuration is reported instead of silently yielding stale data.
  istat = cudaGetLastError()
  if (istat /= cudaSuccess) then
    write(error_unit, *) 'saxpy launch failed: ', trim(cudaGetErrorString(istat))
    error stop 1
  end if

  ! Wait for the kernel so that errors raised during execution are reported too.
  istat = cudaDeviceSynchronize()
  if (istat /= cudaSuccess) then
    write(error_unit, *) 'saxpy execution failed: ', trim(cudaGetErrorString(istat))
    error stop 1
  end if

  ! Device-to-host copy of the result.
  y = y_d

  ! Expected result for every element: 2.0 + 2.0*1.0 = 4.0.
  write(*,*) 'Max error: ', maxval(abs(y-4.0))
end program testSaxpy
sudo /opt/nvidia/hpc_sdk/Linux_x86_64/25.1/compilers/bin/nvfortran \
-cuda -cudalibs -o g2 gks2.cuf \
-L/opt/nvidia/hpc_sdk/Linux_x86_64/25.1/math_libs/11.8/targets/x86_64-linux/lib
./g2
Max error: 0.000000
nvprof --unified-memory-profiling off ./g2
==11436== NVPROF is profiling process 11436, command: ./g2
Max error: 0.000000
==11436== Profiling application: ./g2
==11436== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 62.38% 31.520us 4 7.8800us 992ns 14.848us [CUDA memcpy HtoD]
26.28% 13.280us 1 13.280us 13.280us 13.280us [CUDA memcpy DtoH]
11.34% 5.7280us 1 5.7280us 5.7280us 5.7280us mathops_saxpy_
API calls: 79.63% 146.81ms 1 146.81ms 146.81ms 146.81ms cuDevicePrimaryCtxRetain
19.50% 35.959ms 5 7.1918ms 3.6340us 35.877ms cudaMemcpy
0.25% 458.66us 1 458.66us 458.66us 458.66us cuMemAllocHost
0.22% 404.19us 1 404.19us 404.19us 404.19us cuDeviceTotalMem
0.15% 281.90us 106 2.6590us 262ns 118.81us cuDeviceGetAttribute
0.10% 180.63us 384 470ns 243ns 5.0580us cuGetProcAddress
0.06% 107.70us 5 21.540us 1.6150us 95.645us cuMemAlloc
0.05% 99.009us 1 99.009us 99.009us 99.009us cuDeviceGetName
0.01% 19.257us 1 19.257us 19.257us 19.257us cudaLaunchKernel
0.01% 17.423us 1 17.423us 17.423us 17.423us cuDeviceGetPCIBusId
0.01% 16.629us 1 16.629us 16.629us 16.629us cuInit
0.01% 12.104us 1 12.104us 12.104us 12.104us cudaGetDevice
0.00% 3.8130us 3 1.2710us 317ns 3.1710us cuDeviceGetCount
0.00% 1.4840us 3 494ns 267ns 926ns cuDeviceGet
0.00% 1.3230us 2 661ns 518ns 805ns cuCtxGetCurrent
0.00% 1.1320us 4 283ns 86ns 516ns cuCtxSetCurrent