Support for mixed precision builtins #1786
In more detail, the current (psykal_lite) implementation of these is:

! Perform innerproduct of a r_solver precision field in r_double precision
subroutine invoke_rdouble_X_innerproduct_X(field_norm, field)
use constants_mod, only: r_def, r_double, i_def
use scalar_mod, only: scalar_type
use omp_lib, only: omp_get_thread_num
use omp_lib, only: omp_get_max_threads
use mesh_mod, only: mesh_type
use r_solver_field_mod, only: r_solver_field_type, r_solver_field_proxy_type
implicit none
real(kind=r_def), intent(out) :: field_norm
type(r_solver_field_type), intent(in) :: field
type(scalar_type) :: global_sum
integer(kind=i_def) :: df
real(kind=r_double), allocatable, dimension(:) :: l_field_norm
integer(kind=i_def) :: th_idx
integer(kind=i_def) :: loop0_start, loop0_stop
integer(kind=i_def) :: nthreads
type(r_solver_field_proxy_type) :: field_proxy
integer(kind=i_def) :: max_halo_depth_mesh
type(mesh_type), pointer :: mesh => null()
!
! Determine the number of OpenMP threads
!
nthreads = omp_get_max_threads()
!
! Initialise field and/or operator proxies
!
field_proxy = field%get_proxy()
!
! Create a mesh object
!
mesh => field_proxy%vspace%get_mesh()
max_halo_depth_mesh = mesh%get_halo_depth()
!
! Set-up all of the loop bounds
!
loop0_start = 1
loop0_stop = field_proxy%vspace%get_last_dof_owned()
!
! Call kernels and communication routines
!
!
! Zero summation variables
!
field_norm = 0.0_r_def
ALLOCATE (l_field_norm(nthreads))
l_field_norm = 0.0_r_double
!
!$omp parallel default(shared), private(df,th_idx)
th_idx = omp_get_thread_num()+1
!$omp do schedule(static)
DO df=loop0_start,loop0_stop
l_field_norm(th_idx) = l_field_norm(th_idx) + real(field_proxy%data(df),r_double)**2
END DO
!$omp end do
!$omp end parallel
!
! sum the partial results sequentially
!
DO th_idx=1,nthreads
field_norm = field_norm+real(l_field_norm(th_idx),r_def)
END DO
DEALLOCATE (l_field_norm)
global_sum%value = field_norm
field_norm = global_sum%get_sum()
!
end subroutine invoke_rdouble_X_innerproduct_X
! Perform innerproduct of two r_solver precision fields in r_double precision
subroutine invoke_rdouble_X_innerproduct_Y(field_norm, field1, field2)
use constants_mod, only: r_def, r_double, i_def
use scalar_mod, only: scalar_type
use omp_lib, only: omp_get_thread_num
use omp_lib, only: omp_get_max_threads
use mesh_mod, only: mesh_type
use r_solver_field_mod, only: r_solver_field_type, r_solver_field_proxy_type
implicit none
real(kind=r_def), intent(out) :: field_norm
type(r_solver_field_type), intent(in) :: field1, field2
type(scalar_type) :: global_sum
integer(kind=i_def) :: df
real(kind=r_double), allocatable, dimension(:) :: l_field_norm
integer(kind=i_def) :: th_idx
integer(kind=i_def) :: loop0_start, loop0_stop
integer(kind=i_def) :: nthreads
type(r_solver_field_proxy_type) :: field1_proxy, field2_proxy
integer(kind=i_def) :: max_halo_depth_mesh
type(mesh_type), pointer :: mesh => null()
!
! Determine the number of OpenMP threads
!
nthreads = omp_get_max_threads()
!
! Initialise field and/or operator proxies
!
field1_proxy = field1%get_proxy()
field2_proxy = field2%get_proxy()
!
! Create a mesh object
!
mesh => field1_proxy%vspace%get_mesh()
max_halo_depth_mesh = mesh%get_halo_depth()
!
! Set-up all of the loop bounds
!
loop0_start = 1
loop0_stop = field1_proxy%vspace%get_last_dof_owned()
!
! Call kernels and communication routines
!
!
! Zero summation variables
!
field_norm = 0.0_r_def
ALLOCATE (l_field_norm(nthreads))
l_field_norm = 0.0_r_double
!
!$omp parallel default(shared), private(df,th_idx)
th_idx = omp_get_thread_num()+1
!$omp do schedule(static)
DO df=loop0_start,loop0_stop
l_field_norm(th_idx) = l_field_norm(th_idx) + real(field1_proxy%data(df),r_double)*real(field2_proxy%data(df),r_double)
END DO
!$omp end do
!$omp end parallel
!
! sum the partial results sequentially
!
DO th_idx=1,nthreads
field_norm = field_norm+real(l_field_norm(th_idx),r_def)
END DO
DEALLOCATE (l_field_norm)
global_sum%value = field_norm
field_norm = global_sum%get_sum()
!
end subroutine invoke_rdouble_X_innerproduct_Y
! Copy a field_type to a r_solver_field_type
subroutine invoke_copy_to_rsolver(rsolver_field, field)
use constants_mod, only: r_solver, i_def
use omp_lib, only: omp_get_thread_num
use omp_lib, only: omp_get_max_threads
use mesh_mod, only: mesh_type
use r_solver_field_mod, only: r_solver_field_type, r_solver_field_proxy_type
use field_mod, only: field_type, field_proxy_type
implicit none
type(r_solver_field_type), intent(inout) :: rsolver_field
type(field_type), intent(in) :: field
integer(kind=i_def) :: df
integer(kind=i_def) :: loop0_start, loop0_stop
type(r_solver_field_proxy_type) :: rsolver_field_proxy
type(field_proxy_type) :: field_proxy
integer(kind=i_def) :: max_halo_depth_mesh
type(mesh_type), pointer :: mesh => null()
!
! Initialise field and/or operator proxies
!
rsolver_field_proxy = rsolver_field%get_proxy()
field_proxy = field%get_proxy()
!
! Create a mesh object
!
mesh => rsolver_field_proxy%vspace%get_mesh()
max_halo_depth_mesh = mesh%get_halo_depth()
!
! Set-up all of the loop bounds
!
loop0_start = 1
loop0_stop = rsolver_field_proxy%vspace%get_last_dof_halo(1)
!
! Call kernels and communication routines
!
IF (field_proxy%is_dirty(depth=1)) THEN
CALL field_proxy%halo_exchange(depth=1)
END IF
!
!$omp parallel default(shared), private(df)
!$omp do schedule(static)
DO df=loop0_start,loop0_stop
rsolver_field_proxy%data(df) = real(field_proxy%data(df), r_solver)
END DO
!$omp end do
!
! Set halos dirty/clean for fields modified in the above loop
!
!$omp master
CALL rsolver_field_proxy%set_dirty()
CALL rsolver_field_proxy%set_clean(1)
!$omp end master
!
!$omp end parallel
!
end subroutine invoke_copy_to_rsolver
! Copy a r_solver_field_type to a field_type
subroutine invoke_copy_to_rdef(rdef_field, field)
use constants_mod, only: r_def, i_def
use omp_lib, only: omp_get_thread_num
use omp_lib, only: omp_get_max_threads
use mesh_mod, only: mesh_type
use r_solver_field_mod, only: r_solver_field_type, r_solver_field_proxy_type
use field_mod, only: field_type, field_proxy_type
implicit none
type(field_type), intent(inout) :: rdef_field
type(r_solver_field_type), intent(in) :: field
integer(kind=i_def) :: df
integer(kind=i_def) :: loop0_start, loop0_stop
type(r_solver_field_proxy_type) :: field_proxy
type(field_proxy_type) :: rdef_field_proxy
integer(kind=i_def) :: max_halo_depth_mesh
type(mesh_type), pointer :: mesh => null()
!
! Initialise field and/or operator proxies
!
rdef_field_proxy = rdef_field%get_proxy()
field_proxy = field%get_proxy()
!
! Create a mesh object
!
mesh => rdef_field_proxy%vspace%get_mesh()
max_halo_depth_mesh = mesh%get_halo_depth()
!
! Set-up all of the loop bounds
!
loop0_start = 1
loop0_stop = rdef_field_proxy%vspace%get_last_dof_halo(1)
!
! Call kernels and communication routines
!
IF (field_proxy%is_dirty(depth=1)) THEN
CALL field_proxy%halo_exchange(depth=1)
END IF
!
!$omp parallel default(shared), private(df)
!$omp do schedule(static)
DO df=loop0_start,loop0_stop
rdef_field_proxy%data(df) = real(field_proxy%data(df), r_def)
END DO
!$omp end do
!
! Set halos dirty/clean for fields modified in the above loop
!
!$omp master
CALL rdef_field_proxy%set_dirty()
CALL rdef_field_proxy%set_clean(1)
!$omp end master
!
!$omp end parallel
!
end subroutine invoke_copy_to_rdef
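As background to why the partial sums above are held in r_double: a small, self-contained demonstration (hypothetical kind parameters standing in for r_solver and r_double) of how single-precision accumulation of many small squared terms drifts, while a double-precision accumulator does not:

program accumulator_demo
  implicit none
  integer, parameter :: sp = selected_real_kind(6)   ! stand-in for r_solver
  integer, parameter :: dp = selected_real_kind(15)  ! stand-in for r_double
  real(kind=sp) :: x, sum_sp
  real(kind=dp) :: sum_dp
  integer :: i
  x = 1.0e-4_sp
  sum_sp = 0.0_sp
  sum_dp = 0.0_dp
  DO i=1,10000000
    sum_sp = sum_sp + x**2              ! accumulate in single precision
    sum_dp = sum_dp + real(x, dp)**2    ! promote each term first, as above
  END DO
  print *, 'single-precision accumulator: ', sum_sp
  print *, 'double-precision accumulator: ', sum_dp
end program accumulator_demo

The two printed sums agree only in the leading digits; the single-precision accumulator loses bits as the running sum grows.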
I was just notified by LFRicians that it would be good to have this in a release (early next year) so that we can remove some LFRic infrastructure code that relies on PSyKAl-lite.
Yes. Specifically,
Regarding the above:

When creating a builtin for this, is it better to combine these into one line rather than creating two loops over the array? For example:

Does nesting like this even work?
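(The commenter's snippet did not survive extraction; the following is a hypothetical sketch, reusing names from the psykal-lite code above, of the kind of single-statement nested cast being asked about. Nested real() casts are legal Fortran:)

! Promote to r_double for the multiply-accumulate, then demote the
! result back to r_def, all in one statement (hypothetical sketch).
DO df=loop0_start,loop0_stop
  field_norm = field_norm + real(real(field_proxy%data(df),r_double)**2, r_def)
END DO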
Hi @oakleybrunt, the rule for proper Kernels (as opposed to their PSyKAl-lite implementation) is that they must make no reference to parallelism (which, in this case, means no references to the thread index). In this case, it will be PSyclone's job to do the reduction. (PSyclone has support for generating the type of code that is used here, which guarantees matching run-for-run results on the same number of OMP threads.) I think all you need to do is copy the implementation of the other innerproduct builtin in PSyclone but include the casting.
Hey @arporter, I included those lines for context, really. Though I guess it isn't hard for anyone to work out what it does. Okay, so will I need to cast twice, like in the example I gave?
In the Builtin itself you will need to ensure that the LHS of the Assignment has the correct precision and that the RHS has a cast. The code you want to look at is PSyclone/src/psyclone/domain/lfric/lfric_builtins.py, lines 2374 to 2387 (commit d4091c3).
However, you are also going to have to update the bit of PSyclone that generates the reduction result and that's not going to be so easy because it doesn't yet use the PSyIR properly: PSyclone/src/psyclone/psyGen.py, lines 1145 to 1298 (commit d4091c3).
In fact, it may be that someone at STFC (myself, @sergisiso or @rupertford) needs to do that bit. Take a look and see what you think.
The code generated by PSyclone (and that which is present in the PSyKAl-lite you are looking at) is specifically designed to ensure that if you run the same code on the same number of OpenMP threads then you get the same answer. See https://psyclone.readthedocs.io/en/stable/transformations.html#openmp-reductions for a bit more info. This is optional functionality - by default PSyclone will just use OpenMP's default support for reductions, but that doesn't give reproducible results.
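For contrast, a minimal sketch (reusing names from the psykal-lite listing above) of the default, non-reproducible form:

! OpenMP's built-in reduction clause: the order in which per-thread
! partial sums are combined is unspecified, so results need not be
! bit-for-bit repeatable from run to run.
field_norm = 0.0_r_def
!$omp parallel do schedule(static), private(df), reduction(+:field_norm)
DO df=loop0_start,loop0_stop
  field_norm = field_norm + real(field_proxy%data(df),r_double)**2
END DO
!$omp end parallel do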
See also
Okay, I understand more... but also have more questions! Questions: Why can't the
So, in PSyclone there's an old way of doing the code generation and a new way. We are gradually migrating to the new way, and both the GOcean and NEMO APIs use it. LFRic does not yet use it. The new way is to construct everything in terms of Nodes in a PSyIR tree and then give that tree to a 'backend' that generates the (Fortran) code. The old way uses functionality from the f2pygen module. I can't quite remember why REPRODUCIBLE_REDUCTIONS is set to False. Probably it harms performance, and also it's good to be able to keep things as simple as possible sometimes.
You're going to have to help me here - is that the one you're working on? If you're creating a new builtin then it needs to support the option for reproducible reductions - we can't have exceptions. I'm guessing that the answer to your question is: "because the code that generates the reduction (using f2pygen) doesn't know about precision".
Ahhh this is what I was missing, thank you!!
Okay, I see. It's all linking together now. I had read this in the User or Developer guide today but didn't realise that the old way does not recognise precisions. Does the new way know about precision?
Well, it's not so much that it doesn't know about precision (I think it can be supplied), it's more that the code that currently uses it will be assuming/hardwired to use r_def.
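Concretely, the change amounts to something like this (an assumption about the generated declarations, inferred from the psykal-lite listing above rather than from actual PSyclone output):

! Old, precision-unaware generation is assumed to hardwire r_def:
! real(kind=r_def), allocatable, dimension(:) :: l_field_norm
! The mixed-precision builtin instead needs (cf. the code above):
real(kind=r_double), allocatable, dimension(:) :: l_field_norm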
Where possible, the current version of PSyclone extracts the precision of variables from the algorithm layer in LFRic. This information is required for declaring variables appropriately in the PSy-layer. Whether that information ends up in PSyIR or still uses f2pygen I don't know.
Note, after a discussion with Chris we have decided to revisit the approach to global reductions in #1570, as there is ongoing work on using 32- and 64-bit definitions of global sums in LFRic instead.

@oakleybrunt added support for real-to-real conversion built-ins (PR #2420) and that will make it into the upcoming PSyclone release 2.5.0 (issue #2447).
LFRic ticket #3055 introduces four new builtins to support mixed-precision work. These will need implementing in PSyclone. Briefly, they are:

invoke_rdouble_X_innerproduct_X: Compute x.x where the individual sums are promoted to double precision and the output to r_def precision
invoke_rdouble_X_innerproduct_Y: Compute x.y where the individual sums are promoted to double precision and the output to r_def precision
invoke_copy_to_rsolver: Effectively setval_X(x, y) where x is r_solver and y is r_def
invoke_copy_to_rdef: Effectively setval_X(x, y) where x is r_def and y is r_solver

The final two could be achieved by a generalisation of the setval_X builtin using generic interfaces (as used elsewhere in the LFRic code); a sketch of such an interface follows below.
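A minimal sketch of that generalisation (module and procedure names are hypothetical, not LFRic's; it assumes r_def and r_solver are distinct kinds, since equal kinds would make the two specifics ambiguous):

module setval_x_generic_mod
  use constants_mod, only: r_def, r_solver
  implicit none
  ! One generic name; the compiler dispatches on the argument kinds.
  interface setval_X
    module procedure setval_X_rdef_to_rsolver
    module procedure setval_X_rsolver_to_rdef
  end interface setval_X
contains
  ! x is r_solver, y is r_def (the invoke_copy_to_rsolver case)
  subroutine setval_X_rdef_to_rsolver(x, y)
    real(kind=r_solver), intent(out) :: x(:)
    real(kind=r_def),    intent(in)  :: y(:)
    x(:) = real(y(:), r_solver)
  end subroutine setval_X_rdef_to_rsolver
  ! x is r_def, y is r_solver (the invoke_copy_to_rdef case)
  subroutine setval_X_rsolver_to_rdef(x, y)
    real(kind=r_def),    intent(out) :: x(:)
    real(kind=r_solver), intent(in)  :: y(:)
    x(:) = real(y(:), r_def)
  end subroutine setval_X_rsolver_to_rdef
end module setval_x_generic_mod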