79466303

Date: 2025-02-25 11:04:57
Score: 1.5
Natty:
Report link

I just renamed the program saxpy.f (to gks2.cuf) so that I can modify it to introduce some new instructions.

Now I want to know how, after compilation, the saxpy kernel
can be profiled using nvprof.

! nvfortran gks2.cuf

module mathOps
  implicit none

contains

  !> SAXPY device kernel: updates y in place as y = y + a*x.
  !!
  !! Each thread computes one global index from its block and thread
  !! position and updates at most one element.
  !!
  !! x : input vector (read only); its size defines the element count
  !! y : vector updated in place for indices 1..size(x)
  !! a : scalar multiplier, passed by value
  !!
  !! The launch configuration must supply at least size(x) threads along
  !! the x dimension; threads whose index exceeds size(x) do nothing.
  attributes(global) subroutine saxpy(x, y, a)
    implicit none
    real, intent(in)    :: x(:)
    real, intent(inout) :: y(:)
    real, value         :: a
    integer :: i, n

    n = size(x)

    ! Block and thread indices start at 1 here, hence the "- 1" on blockIdx.
    i = blockDim%x * (blockIdx%x - 1) + threadIdx%x

    ! Guard against threads beyond the end of the vector.
    if (i <= n) y(i) = y(i) + a*x(i)
  end subroutine saxpy

end module mathOps

!> Host driver for the saxpy kernel.
!!
!! Fills x with 1.0 and y with 2.0 on the host, copies both to the device,
!! runs y = y + 2.0*x on the GPU, copies y back and prints the maximum
!! absolute deviation from the expected value 4.0.
program testSaxpy
  use mathOps
  use cudafor
  implicit none
  integer, parameter :: N = 40000
  integer, parameter :: threadsPerBlock = 256
  real :: x(N), y(N), a
  real, device :: x_d(N), y_d(N)
  type(dim3) :: grid, tBlock
  integer :: ierrSync, ierrAsync

  tBlock = dim3(threadsPerBlock, 1, 1)
  ! Integer ceiling division: enough blocks to cover all N elements without
  ! going through a default-real intermediate.
  grid = dim3((N + threadsPerBlock - 1) / threadsPerBlock, 1, 1)

  x = 1.0
  y = 2.0
  a = 2.0

  ! Assignment between host and device arrays performs the transfer.
  x_d = x
  y_d = y

  call saxpy<<<grid, tBlock>>>(x_d, y_d, a)

  ! A kernel launch does not report failures by itself: query the launch
  ! status, then wait for completion to pick up errors raised while running.
  ierrSync = cudaGetLastError()
  ierrAsync = cudaDeviceSynchronize()
  if (ierrSync /= cudaSuccess) then
    write(*,*) 'Kernel launch error: ', trim(cudaGetErrorString(ierrSync))
    error stop 1
  end if
  if (ierrAsync /= cudaSuccess) then
    write(*,*) 'Kernel execution error: ', trim(cudaGetErrorString(ierrAsync))
    error stop 1
  end if

  y = y_d
  write(*,*) 'Max error: ', maxval(abs(y-4.0))
end program testSaxpy



 sudo  /opt/nvidia/hpc_sdk/Linux_x86_64/25.1/compilers/bin/nvfortran 
       -cuda -cudalibs -o g2 gks2.cuf 
       -L/opt/nvidia/hpc_sdk/Linux_x86_64/25.1/math_libs/11.8/targets/x86_64-linux/lib
       
        ./g2
        Max error:     0.000000    
        



    nvprof --unified-memory-profiling off  ./g2
            ==11436== NVPROF is profiling process 11436, command: ./g2
 Max error:     0.000000    
==11436== Profiling application: ./g2
==11436== Profiling result:
   Type      Time(%)  Time        Calls       Avg       Min       Max  Name
 GPU 
 activities: 
         62.38%  31.520us 4   7.8800us 992ns     14.848us  [CUDA memcpy HtoD]
         26.28%  13.280us 1  13.280us  13.280us  13.280us  [CUDA memcpy DtoH]
         11.34%  5.7280us 1   5.7280us  5.7280us  5.7280us  mathops_saxpy_
 API 
calls: 
      79.63%  146.81ms 1 146.81ms  146.81ms  146.81ms cuDevicePrimaryCtxRetain
      19.50%  35.959ms 5  7.1918ms  3.6340us  35.877ms  cudaMemcpy
       0.25%  458.66us 1  458.66us  458.66us  458.66us  cuMemAllocHost
       0.22%  404.19us 1  404.19us  404.19us  404.19us  cuDeviceTotalMem
       0.15%  281.90us 106  2.6590us 262ns  118.81us  cuDeviceGetAttribute
       0.10%  180.63us 384    470ns 243ns  5.0580us      P
       0.06%  107.70us 5   21.540us  1.6150us  95.645us  cuMemAlloc
       0.05%  99.009us 1   99.009us  99.009us  99.009us  cuDeviceGetName
       0.01%  19.257us 1  19.257us  19.257us  19.257us  cudaLaunchKernel
       0.01%  17.423us 1  17.423us  17.423us  17.423us  cuDeviceGetPCIBusId
       0.01%  16.629us 1  16.629us  16.629us  16.629us  cuInit
       0.01%  12.104us 1  12.104us  12.104us  12.104us  cudaGetDevice
       0.00%  3.8130us 3  1.2710us     317ns  3.1710us  cuDeviceGetCount
       0.00%  1.4840us 3     494ns     267ns     926ns  cuDeviceGet
       0.00%  1.3230us 2     661ns     518ns     805ns  cuCtxGetCurrent
       0.00%  1.1320us 4     283ns      86ns     516ns  cuCtxSetCurrent
    
Reasons:
  • Blacklisted phrase (1): I want to know
  • RegEx Blacklisted phrase (1): I want
  • Long answer (-1):
  • Has code block (-0.5):
  • Low reputation (1):
Posted by: Goutam Sen