#
#     Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto.  Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
#

# C++ parallel algorithms and Fortran do-concurrent support (-stdpar)

# -stdpar={gpu|gpu,multicore|multicore}
#         gpu                     C++/Fortran parallelism compiled for GPU execution only
#         gpu,multicore           (default) C++/Fortran parallelism compiled for GPU (default) or multicore CPU execution
#         multicore                         C++/Fortran parallelism compiled for multicore CPU execution only

# Fortran + -stdpar: set when DRIVERLANG equals Fortran and LNGSPA is set.
# (LNGSPA is set to 1 by the -stdpar switch.)  Used to gate the
# "Fortran Multicore" settings in that switch.
variable TGTSPAFTN is default($land($equal($DRIVERLANG,Fortran),$LNGSPA));

# Non-Fortran + -stdpar: set when DRIVERLANG is not Fortran and LNGSPA is set.
# Remove this line and all references to TGTSPACXX once `-stdpar` supports
# multiple compute capabilities.
# NOTE(review): this uses $and where the Fortran counterpart uses $land --
# confirm the two are interchangeable for these operands.
variable TGTSPACXX is default($and($notequal($DRIVERLANG,Fortran),$LNGSPA));

# Select only CC compatible with OpenMP GPU Offload.
# When (DRIVERLANG is CPP and TGTCUDA is set) or LNGSPA is set, keep each
# entry of COMPUTECAP whose value is >= 60; otherwise the result is empty.
# NOTE(review): this iterates $COMPUTECAP, while the -stdpar switch iterates
# $COMPUTECAPS when emitting --cudacap -- confirm both names are intended.
variable SPACOMPUTECAPS is default($if($lor($land($equal($DRIVERLANG,CPP),$TGTCUDA),$LNGSPA),$foreach(cc,$COMPUTECAP,$if($expr($cc >= 60),$cc ))));

# -stdpar: enable standard-language parallelism -- C++17 parallel algorithms,
# or Fortran do-concurrent when $ISFTN is set (see the help text below).
# The gpu / multicore keywords record the requested targets in TGTSPA; most
# settings in this switch are gated on $expr($TGTSPA & $TGTGPU) or
# $expr($TGTSPA & $TGTCPU).
switch -stdpar is
    help($if($ISFTN,Enable (ISO Fortran 2018) do-concurrent,Enable (ISO C++17) Parallel Algorithms behavior))
    helpgroup(target)
    # Mark stdpar as requested; TGTSPAFTN / TGTSPACXX / SPACOMPUTECAPS read this
    set(LNGSPA=1)

    # Common: append include-stdpar to COMPINCDIR when TGTSPA overlaps TGTALL
    append(COMPINCDIR=$if($expr($TGTSPA & $TGTALL),include-stdpar))
    # Common (end)

    # GPU
    # 1/0 flags recording which stdpar targets are selected; STDPARMCXX is the
    # CPU target restricted to DRIVERLANG == CPP.
    set(STDPARGPU=$if($expr($TGTSPA & $TGTGPU),1,0))
    set(STDPARMC=$if($expr($TGTSPA & $TGTCPU),1,0))
    set(STDPARMCXX=$if($land($expr($TGTSPA & $TGTCPU),$equal($DRIVERLANG,CPP)),1,0))
    # -nostdpar sets NOSTDPAR to 1
    set(NOSTDPAR=0)
    # CUDARPATHNEEDED and CUDARTNEEDED use the same condition: any CUF, or
    # ISCUDARTNEEDED for non-Fortran, or Fortran with TGTCUDA.
    set(CUDARPATHNEEDED=$if($lor($ANYCUF,$land($ISCUDARTNEEDED,$not($ISFTN)),$land($ISFTN,$TGTCUDA)),1,0))
    set(CUDARTNEEDED=$if($lor($ANYCUF,$land($ISCUDARTNEEDED,$not($ISFTN)),$land($ISFTN,$TGTCUDA)),1,0))
    set(PSTL=$if($expr($TGTSPA & $TGTGPU),1))
    set(CNEEDCUDA=$ifn($ISFTN,$if($lor($expr($TGTSPA & $TGTGPU),$ANYCU),1,0),0))
    set(NEEDLOCSCRIPT=$if($expr($TGTSPA & $TGTGPU),1))
    # Front-end arguments for GPU stdpar; one --cudacap per entry of COMPUTECAPS
    append(CPP1ARGS=$if($expr($TGTSPA & $TGTGPU),$if($STDPARGPU,--cuda --gpustdpar -D__NV_NO_HOST_COMPILER_CHECK $foreach(c,$COMPUTECAPS,--cudacap=$c ))))
    append(CPPPREINC=$if($expr($TGTSPA & $TGTGPU),$if($STDPARGPU,--preinclude _cuda_preinclude.h)))
    append(CPP2ARGS=$if($expr($TGTSPA & $TGTGPU),$if($STDPARGPU,-x 137 1 -x 137 0x200000)))
    append(F901ARGS=$if($expr($TGTSPA & $TGTGPU),$if($STDPARGPU,-x 240 1 )))
    append(F902ARGS=$if($expr($TGTSPA & $TGTGPU),$if($STDPARGPU,-x 240 1 )))
    # Define __NVCOMPILER_CUDA_ARCH__.  TODO - This needs to be improved to
    # select the highest of the default compute capabilities, or the smallest
    # of the user-specified compute capabilities.  Or maybe report an error if
    # the user specifies multiple compute capabilities.
    # Daniel: The first cc is the default device. If the user doesn't specify
    # a device id in their application, the application runs on the default
    # device.
    # Both the legacy __PGI_CUDA_ARCH__ and __NVCOMPILER_CUDA_ARCH__ macros are
    # defined to $MOSTCAPABLECOMPILEDCC.
    append(USRDDEF=$if($expr($TGTSPA & $TGTGPU),$if($STDPARGPU,-D__PGI_CUDA_ARCH__=$MOSTCAPABLECOMPILEDCC -D__NVCOMPILER_CUDA_ARCH__=$MOSTCAPABLECOMPILEDCC)))
    # TODO - Everything past here was simply copied from the given section.
    # The RC files need to be refactored so that all this stuff is shared
    # rather than copied.
    # GPU compilation
    set(ACCTESLAONLY=$DEFTESLAONLY)
    # NOTE(review): these $if calls have an empty value and no else branch, so
    # they appear to clear DEFACC / DEF901ACC regardless of target -- confirm.
    set(DEFACC=$if($expr($TGTSPA & $TGTGPU),))
    set(DEF901ACC=$if($expr($TGTSPA & $TGTGPU),))
    append(ACCCGFLAGS=$if($expr($TGTSPA & $TGTGPU),$if($STDPARGPU,-x 180 0x4000400 -x 121 0xc00)))
    append(ACCCGFLAGS=$if($expr($TGTSPA & $TGTGPU),$if($STDPARGPU,$PADDFLAG)))
    append(ACCFEFLAGS=$if($expr($TGTSPA & $TGTGPU),$if($STDPARGPU,$PADDFLAG)))
    append(ACCCGFLAGS=$if($expr($TGTSPA & $TGTGPU),$if($STDPARGPU,$DEFAULTCAPFLAG)))
    # NOTE(review): the fallback is $DEFDEFDEF4ACCRELOC (with a "4") -- confirm
    # this is a distinct variable and not a typo for DEFDEFDEFACCRELOC.
    set(DEFDEFDEFACCRELOC=$if($expr($TGTSPA & $TGTGPU),1,$DEFDEFDEF4ACCRELOC))
    # nkey is reset to 0 in the Fortran Multicore section below
    add(nkey=1)
    max(OPTLEVELINITDEF1=2)
    append(OPTLEVELMINLIMIT=$if($expr($TGTSPA & $TGTGPU),$if($STDPARGPU," -ta=tesla")))
    append(ACCFEFLAGS=$if($expr($TGTSPA & $TGTGPU),$if($STDPARGPU,-x 163 1 -x 186 0x80000 -x 180 0x400 $TOOLKITFLAG)))
    append(ACCCGFLAGS=$if($expr($TGTSPA & $TGTGPU),$if($STDPARGPU,-x 163 0x1 -x 186 0x80000 $TOOLKITFLAG $ACCCACHE -x 194 0x40000)))
    append(ACCELS=$if($expr($TGTSPA & $TGTGPU),$if($STDPARGPU,tesla)))
    set(NEEDACCLIB=$if($ISACCLIBNEEDED,1,0))
    # ACCDEF / ACCDEFDEF are assigned again, without the GPU gate, in the
    # Fortran Multicore section below.
    set(ACCDEF=$if($expr($TGTSPA & $TGTGPU),$foreach(f,$ACCDEFINES, -D$f)))
    set(ACCDEFDEF=$if($expr($TGTSPA & $TGTGPU),$foreach(f,$ACCDEFINES, -def $f)))
    # Pass -cudaroot only when USECUDAROOT is non-empty
    append(ACCCGFLAGS=$if($expr($TGTSPA & $TGTGPU),$if($STDPARGPU,$if($notequal($USECUDAROOT,),-cudaroot $USECUDAROOT))))

    # C++ Multicore
    # Most settings here are gated on STDPARMCXX or LNGOMP; CONCUR, CONCURSW,
    # DEFNOSGIMP, DEFNOOPENMP and OMPLIB are assigned unconditionally.
    set(CONCUR=YES)
    set(CONCURSW=YES)
    append(MPSETDEFCPP=$if($STDPARMCXX, --mpstdpar))
    set(CONCURCOMPILE=$if($or($STDPARMCXX,$LNGOMP),YES))
    set(MPSETDEF=$if($or($STDPARMCXX,$LNGOMP),-mp -x 69 0x200)) # MPSET gets this value, unless -Mpfi
    set(DEFNOSGIMP=)
    set(DEFNOOPENMP=)
    set(NOSGIMP=$if($or($STDPARMCXX,$LNGOMP),$if($index($CG,llvm),-x 69 2)))
    # For each COMPLIBSUBDIR entry, use its mp/ subdirectory if it exists:
    # as a library directory (COMPLIBMP) and as an -rpath (STDRPATHMP).
    set(COMPLIBMP=$if($or($STDPARMCXX,$LNGOMP),$foreach(dd,$COMPLIBSUBDIR,$if($isdir($COMPBASE/$COMPSYS/$COMPVER/$dd/mp),$COMPBASE/$COMPSYS/$COMPVER/$dd/mp))))
    set(STDRPATHMP=$if($or($STDPARMCXX,$LNGOMP),$foreach(dd,$COMPLIBSUBDIR,$if($isdir($COMPBASE/$COMPSYS/$COMPVER/$dd/mp),-rpath $COMPBASE/$COMPSYS/$COMPVER/$dd/mp))))
    set(MPFAIR=$if($or($STDPARMCXX,$LNGOMP),-x 69 0x400))
    append(CPP2ARGS=$if($or($STDPARMCXX,$LNGOMP),-x 180 0x4000000))
    # NOTE(review): OMPLIB is assigned again after the keyword block with a
    # different condition ($PGLLVMTARGET / $USEOTHEROMPLIB) -- confirm which
    # assignment is meant to take effect.
    set(OMPLIB=$if($index($CG,llvm),$NVOMPLIBS))
    # C++ Multicore (end)

    # Fortran Multicore
    # Most settings here are gated on TGTSPAFTN and the CPU target.
    set(nkey=0)
    set(DEFACC=)
    set(DEF901ACC=)
    # NOTE(review): appended unconditionally, unlike the GPU-gated append of
    # the same flags in the GPU section -- confirm this is intended.
    append(ACCCGFLAGS=-x 180 0x4000400 -x 121 0xc00)
    append(ACCELS=$if($land($TGTSPAFTN,$expr($TGTSPA & $TGTCPU)),multicore))
    set(ACCMULTIONLY=$if($land($TGTSPAFTN,$expr($TGTSPA & $TGTCPU)),$DEFMULTIONLY))

    # ACCDEPRECATE PGI Accelerator Directives
    append(ACCCGFLAGS=$if($land($TGTSPAFTN,$expr($TGTSPA & $TGTCPU)),$PADDFLAG))
    append(ACCFEFLAGS=$if($land($TGTSPAFTN,$expr($TGTSPA & $TGTCPU)),$PADDFLAG))

    # Unconditional re-assignment of ACCDEF / ACCDEFDEF (the GPU section sets
    # them only for the GPU target).
    set(ACCDEF=$foreach(f,$ACCDEFINES, -D$f))
    set(ACCDEFDEF=$foreach(f,$ACCDEFINES, -def $f))
    append(CPP1ARGS=$if($land($TGTSPAFTN,$expr($TGTSPA & $TGTCPU)),--accel --preinclude openacc_predef.h))

    append(OPTLEVELMINLIMIT=$if($land($TGTSPAFTN,$expr($TGTSPA & $TGTCPU))," -ta=multicore"))
    append(ACCFEFLAGS=$if($land($TGTSPAFTN,$expr($TGTSPA & $TGTCPU)),-x 163 1 -x 186 0x80000 -x 180 0x400))
    append(ACCCGFLAGS=$if($land($TGTSPAFTN,$expr($TGTSPA & $TGTCPU)),-x 163 1 -x 186 0x80000 -x 180 0x400 -x 121 0xc00))
    # Stdpar specific
    append(ACCFEFLAGS=$if($land($TGTSPAFTN,$expr($TGTSPA & $TGTCPU)),-x 240 0x1))
    append(ACCCGFLAGS=$if($land($TGTSPAFTN,$expr($TGTSPA & $TGTCPU)),-x 240 0x1))
    # Fortran Multicore (end)

    # Sub-options: -stdpar=gpu and -stdpar=multicore.  Each keyword sets
    # TGLSPA=1, records its own target bit (TGLSPAGPU or TGLSPACPU) and
    # recomputes TGTSPA as the OR of the two.
    keyword(
        gpu(
            help($if($ISFTN,Enable Fortran do-concurrent acceleration on the GPU (default); please refer to -gpu for target specific options,Execute C++ parallel algorithms on the GPU (default); please refer to -gpu for target specific options))
            set(TGLSPA=1)
            set(TGLSPAGPU=$TGTGPU)
            set(TGTSPA=$expr($TGLSPACPU | $TGLSPAGPU))
            keyword(
                # Nested sub-option "acc"; guarded by $not($ISFTN) (non-Fortran only)
                acc(
                    if($not($ISFTN))
                    help(Execute C++ parallel algorithms on the GPU via OpenACC; please refer to -gpu for target specific options)
                    append(USRDDEF=-D__NVCOMPILER_STDPAR_OPENACC_GPU)
                )
            )
        )
        multicore(
            help($if($ISFTN,Enable Fortran do-concurrent acceleration on multicore; please refer to -gpu for target specific options,Execute C++ parallel algorithms in parallel on the CPU))
            set(TGLSPA=1)
            set(TGLSPACPU=$TGTCPU)
            set(TGTSPA=$expr($TGLSPACPU | $TGLSPAGPU))
        )
    )
    # Fortran Multicore
    # Emits "-x 210 8" when Fortran stdpar targets the CPU and MULTICORETRACE
    # is set, otherwise "-y 210 8".
    append(ACCCGFLAGS=$if($land($TGTSPAFTN,$expr($TGTSPA & $TGTCPU),$MULTICORETRACE), -x 210 8, -y 210 8))
    set(ACCMULTI=$if($land($TGTSPAFTN,$expr($TGTSPA & $TGTCPU)),1))
    set(OMPLIB=$if($equal($PGLLVMTARGET,yes), $if($USEOTHEROMPLIB,$LIBOMP,$NVOMPLIBS)))
    # Fortran Multicore (end)

    # C++ Multicore: extra code-generator, front-end and C arguments, only
    # when STDPARMCXX is set.
    append(CGARGS=$if($STDPARMCXX,$MPSET $MPFAIR $NOSGIMP $NOOPENMP))
    append(CPP1ARGS=$if($STDPARMCXX,$MPSETCPP $MPDEFCPP))
    append(CARGS=$if($STDPARMCXX,$if($index($CG,llvm),$OMPATMFLAGS)))
    # C++ Multicore (end)

    set(LRTLIB=$LRTLIBNAME)

    nokeyword();

# -nostdpar: per the help text, execute C++ parallel algorithms sequentially.
switch -nostdpar is
    help(Execute C++ parallel algorithms sequentially)
    helpgroup(target)
    # Reverse the flags that -stdpar sets.
    # NOTE(review): -stdpar also sets STDPARMCXX, which is not reset here --
    # confirm that is intended.
    set(NOSTDPAR=1)
    set(STDPARGPU=0)
    set(STDPARMC=0)

    # Disable both host and device
    # NOTE(review): the -stdpar keywords set TGLSPAGPU / TGLSPACPU (TGL, not
    # TGT) and derive TGTSPA from them; confirm TGTSPAGPU / TGTSPACPU are the
    # intended variable names here.
    set(TGLSPA=1)
    set(TGTSPAGPU=0)
    set(TGTSPACPU=0);

# Keep the old `-Mpstl` flag for internal use as a hidden shorthand for
# -stdpar (tests, runtime builds, etc. need to be updated before it can be
# removed).
switch -Mpstl is hide shorthand(-stdpar);
