sfantao created this revision.
sfantao added reviewers: ABataev, jlebar, tra, echristo, hfinkel.
sfantao added subscribers: caomhin, carlo.bertolli, arpith-jacob, cfe-commits.
This patch proposes a new class to generate and record action dependences related to offloading. The builder provides three main functionalities:

- Add device dependences to host actions.
- Add a host dependence to device actions.
- Register device top-level actions.

The constructor of the builder detects the programming models that should be supported and generates a specialized builder for each. If a new programming model is to be added in the future, only a new specialized builder has to be implemented (see the illustrative sketch after the diff below). When a specialized builder is generated, it produces programming-model-specific diagnostics.

A CUDA specialized builder is proposed in this patch; it mostly consists of partitioning the current `buildCudaActions` into the three functionalities above.

http://reviews.llvm.org/D18172

Files:
  lib/Driver/Driver.cpp
Index: lib/Driver/Driver.cpp
===================================================================
--- lib/Driver/Driver.cpp
+++ lib/Driver/Driver.cpp
@@ -1342,128 +1342,425 @@
   }
 }
 
-// For each unique --cuda-gpu-arch= argument creates a TY_CUDA_DEVICE
-// input action and then wraps each in CudaDeviceAction paired with
-// appropriate GPU arch name. In case of partial (i.e preprocessing
-// only) or device-only compilation, each device action is added to /p
-// Actions and /p Current is released. Otherwise the function creates
-// and returns a new CudaHostAction which wraps /p Current and device
-// side actions.
-static Action *buildCudaActions(Compilation &C, DerivedArgList &Args,
-                                const Arg *InputArg, Action *HostAction,
-                                ActionList &Actions) {
-  Arg *PartialCompilationArg = Args.getLastArg(options::OPT_cuda_host_only,
-                                               options::OPT_cuda_device_only);
-  // Host-only compilation case.
-  if (PartialCompilationArg &&
-      PartialCompilationArg->getOption().matches(options::OPT_cuda_host_only)) {
-    OffloadAction::HostDependence HDep(
-        HostAction, C.getOffloadingHostToolChain(), /*BoundArch=*/nullptr,
-        Action::OFFLOAD_CUDA);
-    return C.MakeAction<OffloadAction>(HDep);
-  }
-
-  // Collect all cuda_gpu_arch parameters, removing duplicates.
-  SmallVector<const char *, 4> GpuArchList;
-  llvm::StringSet<> GpuArchNames;
-  for (Arg *A : Args) {
-    if (!A->getOption().matches(options::OPT_cuda_gpu_arch_EQ))
-      continue;
-    A->claim();
+namespace {
+/// \brief Provides a convenient interface for different programming models to
+/// generate the required device actions.
+class OffloadingActionBuilder {
+  /// \brief Flag used to trace errors in the builder.
+  bool IsValid;
+
+  /// \brief The compilation that is using this builder.
+  Compilation &C;
+
+  /// \brief The derived arguments associated with this builder.
+  DerivedArgList &Args;
+
+  /// \brief Builder interface. It doesn't build anything or keep any state.
+  class DeviceActionBuilder {
+  public:
+    typedef llvm::SmallVector<phases::ID, phases::MaxNumberOfPhases> PhasesTy;
+
+  protected:
+    /// \brief Compilation associated with this builder.
+    Compilation &C;
+
+    /// \brief Tool chains associated with this builder. The same programming
+    /// model may have one or more associated tool chains.
+    SmallVector<const ToolChain *, 2> ToolChains;
+
+    /// \brief The derived arguments associated with this builder.
+    DerivedArgList &Args;
+
+    /// \brief The inputs associated with this builder.
+    const Driver::InputList &Inputs;
+
+  public:
+    DeviceActionBuilder(Compilation &C, DerivedArgList &Args,
+                        const Driver::InputList &Inputs)
+        : C(C), Args(Args), Inputs(Inputs) {}
+    virtual ~DeviceActionBuilder() {}
+
+    // \brief Fill up the array \a DA with all the device dependences that
+    // should be added to the provided host action \a HostAction. If the host
+    // action is to be ignored or there is any error, return true.
+    virtual bool getDeviceDepences(OffloadAction::DeviceDependences &DA,
+                                   Action *HostAction, types::ID InputType,
+                                   const Arg *InputArg, phases::ID CurPhase,
+                                   phases::ID FinalPhase, PhasesTy &Phases) {
+      return false;
+    }
 
-    const auto& Arch = A->getValue();
-    if (!toolchains::CudaToolChain::GpuArchToComputeName(Arch))
-      C.getDriver().Diag(clang::diag::err_drv_cuda_bad_gpu_arch) << Arch;
-    else if (GpuArchNames.insert(Arch).second)
-      GpuArchList.push_back(Arch);
-  }
-
-  // Default to sm_20 which is the lowest common denominator for supported GPUs.
-  // sm_20 code should work correctly, if suboptimally, on all newer GPUs.
-  if (GpuArchList.empty())
-    GpuArchList.push_back("sm_20");
-
-  // Replicate inputs for each GPU architecture.
-  Driver::InputList CudaDeviceInputs;
-  for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I)
-    CudaDeviceInputs.push_back(std::make_pair(types::TY_CUDA_DEVICE, InputArg));
-
-  const ToolChain *CudaTC =
-      C.getSingleOffloadDeviceToolChain<Action::OFFLOAD_CUDA>();
-
-  // Build actions for all device inputs.
-  assert(CudaTC && "Missing toolchain for device-side compilation.");
-  ActionList CudaDeviceActions;
-  C.getDriver().BuildActions(C, Args, CudaDeviceInputs, CudaDeviceActions);
-  assert(GpuArchList.size() == CudaDeviceActions.size() &&
-         "Failed to create actions for all devices");
-
-  // Check whether any of device actions stopped before they could generate PTX.
-  bool PartialCompilation =
-      llvm::any_of(CudaDeviceActions, [](const Action *a) {
-        return a->getKind() != Action::AssembleJobClass;
-      });
-
-  // Figure out what to do with device actions -- pass them as inputs to the
-  // host action or run each of them independently.
-  bool DeviceOnlyCompilation = PartialCompilationArg != nullptr;
-  if (PartialCompilation || DeviceOnlyCompilation) {
-    // In case of partial or device-only compilation results of device actions
-    // are not consumed by the host action device actions have to be added to
-    // top-level actions list with AtTopLevel=true and run independently.
-
-    // -o is ambiguous if we have more than one top-level action.
-    if (Args.hasArg(options::OPT_o) &&
-        (!DeviceOnlyCompilation || GpuArchList.size() > 1)) {
-      C.getDriver().Diag(
-          clang::diag::err_drv_output_argument_with_multiple_files);
-      return nullptr;
+    // \brief Update the state to include the provided host action
+    // \a HostAction as a dependency of the current device action. Return
+    // true if an error was found.
+    virtual bool addDeviceDepences(Action *HostAction, types::ID InputType,
+                                   const Arg *InputArg) {
+      return false;
     }
 
-    for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) {
-      OffloadAction::DeviceDependences DDep;
-      DDep.add(CudaDeviceActions[I], CudaTC, GpuArchList[I],
+    // \brief Append top level actions generated by the builder. Return true
+    // if errors were found.
+    virtual bool appendTopLevelActions(ActionList &AL) { return false; }
+
+    // \brief Initialize the builder. Return true if any initialization
+    // errors are found.
+    virtual bool initialize() { return false; }
+
+    // \brief Return true if this builder is valid. We have a valid builder
+    // if we have associated device tool chains.
+    bool isValid() { return !ToolChains.empty(); }
+  };
+
+  /// \brief CUDA action builder. It injects device code in the host backend
+  /// action.
+  class CudaActionBuilder : public DeviceActionBuilder {
+    /// \brief Flags to signal if the user requested host-only or device-only
+    /// compilation.
+    bool IsHostOnlyCompilation = false;
+    bool IsDeviceOnlyCompilation = false;
+
+    /// \brief List of GPU architectures to use in this compilation.
+    SmallVector<const char *, 4> GpuArchList;
+
+    /// \brief The CUDA actions for the current input.
+    ActionList CudaDeviceActions;
+
+    /// \brief The CUDA fat binary if it was generated for the current input.
+    Action *CudaFatBinary = nullptr;
+
+  public:
+    CudaActionBuilder(Compilation &C, DerivedArgList &Args,
+                      const Driver::InputList &Inputs)
+        : DeviceActionBuilder(C, Args, Inputs) {}
+
+    bool getDeviceDepences(OffloadAction::DeviceDependences &DA,
+                           Action *HostAction, types::ID InputType,
+                           const Arg *InputArg, phases::ID CurPhase,
+                           phases::ID FinalPhase, PhasesTy &Phases) override {
+      // If we are in host-only mode and dealing with a CUDA input, we
+      // generate an empty dependency to mark the compile host action with
+      // the CUDA kind.
+      if (IsHostOnlyCompilation && InputType == types::TY_CUDA &&
+          CurPhase == phases::Backend) {
+        DA.add(/*A=*/nullptr, ToolChains.front(), /*BoundArch=*/nullptr,
                Action::OFFLOAD_CUDA);
-        Actions.push_back(
-            C.MakeAction<OffloadAction>(DDep, CudaDeviceActions[I]->getType()));
+        return false;
+      }
+
+      // If we don't have any CUDA actions, we don't have any dependences to
+      // create for the host.
+      if (CudaDeviceActions.empty())
+        return false;
+
+      assert(CudaDeviceActions.size() == GpuArchList.size() &&
+             "Expecting one action per GPU architecture.");
+      assert(!IsHostOnlyCompilation &&
+             "Not expecting CUDA actions in host-only compilation.");
+
+      // If we are generating code for the device or we are in a backend
+      // phase, we attempt to generate the fat binary. We compile each arch
+      // to ptx and assemble to cubin, then feed the cubin *and* the ptx into
+      // a device "link" action, which uses fatbinary to combine these cubins
+      // into one fatbin. The fatbin is then an input to the host action if
+      // not in device-only mode.
+      if (IsDeviceOnlyCompilation || CurPhase == phases::Backend) {
+        ActionList DeviceActions;
+        for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) {
+          // Produce the device action from the current phase up to the
+          // assemble phase.
+          for (auto Ph : Phases) {
+            // Skip the phases that were already dealt with.
+            if (Ph < CurPhase)
+              continue;
+            // We have to be consistent with the host final phase.
+            if (Ph > FinalPhase)
+              break;
+
+            CudaDeviceActions[I] = C.getDriver().ConstructPhaseAction(
+                C, Args, Ph, CudaDeviceActions[I]);
+
+            if (Ph == phases::Assemble)
+              break;
+          }
+
+          // If we didn't reach the assemble phase, we can't generate the fat
+          // binary.
+          if (!isa<AssembleJobAction>(CudaDeviceActions[I]))
+            continue;
+
+          Action *AssembleAction = CudaDeviceActions[I];
+          assert(AssembleAction->getType() == types::TY_Object);
+          assert(AssembleAction->getInputs().size() == 1);
+
+          Action *BackendAction = AssembleAction->getInputs()[0];
+          assert(BackendAction->getType() == types::TY_PP_Asm);
+
+          for (auto &A : {AssembleAction, BackendAction}) {
+            OffloadAction::DeviceDependences DDep;
+            DDep.add(A, ToolChains.front(), GpuArchList[I],
+                     Action::OFFLOAD_CUDA);
+            DeviceActions.push_back(
+                C.MakeAction<OffloadAction>(DDep, A->getType()));
+          }
+        }
+
+        // We generate the fat binary if we have input device actions.
+        if (!DeviceActions.empty()) {
+          CudaFatBinary =
+              C.MakeAction<LinkJobAction>(DeviceActions, types::TY_CUDA_FATBIN);
+
+          if (!IsDeviceOnlyCompilation) {
+            DA.add(CudaFatBinary, ToolChains.front(), /*BoundArch=*/nullptr,
+                   Action::OFFLOAD_CUDA);
+            // Clear the fat binary; it is already a dependence of a host
+            // action.
+            CudaFatBinary = nullptr;
+          }
+
+          // Remove the CUDA actions as they are already connected to a host
+          // action or fat binary.
+          CudaDeviceActions.clear();
+        }
+
+        // We return IsDeviceOnlyCompilation to avoid creating a host action
+        // in device-only mode.
+        return IsDeviceOnlyCompilation;
+      }
+
+      assert(CurPhase < phases::Backend && "Generating single CUDA "
+                                           "instructions should only occur "
+                                           "before the backend phase!");
+
+      // Produce an action for each device arch.
+      for (Action *&A : CudaDeviceActions)
+        A = C.getDriver().ConstructPhaseAction(C, Args, CurPhase, A);
+
+      return false;
+    }
+
+    bool addDeviceDepences(Action *HostAction, types::ID InputType,
+                           const Arg *InputArg) override {
+      // While generating code for CUDA, we only depend on the host input
+      // action to trigger the creation of all the CUDA device actions.
+
+      // If we are dealing with an input action, replicate it for each GPU
+      // architecture, unless we are in host-only mode.
+      if (isa<InputAction>(HostAction) && !IsHostOnlyCompilation) {
+        assert(!GpuArchList.empty() &&
+               "We should have at least one GPU architecture.");
+
+        // If the host input is not CUDA, we don't need to bother about this
+        // input.
+        if (InputType != types::TY_CUDA)
+          return false;
+
+        // Replicate inputs for each GPU architecture.
+        for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I)
+          CudaDeviceActions.push_back(
+              C.MakeAction<InputAction>(*InputArg, types::TY_CUDA_DEVICE));
+      }
+      return false;
+    }
+
+    bool appendTopLevelActions(ActionList &AL) override {
+      // Utility to append actions to the top level list.
+      auto AddTopLevel = [&](Action *A, const char *BoundArch) {
+        OffloadAction::DeviceDependences Dep;
+        Dep.add(A, ToolChains.front(), BoundArch, Action::OFFLOAD_CUDA);
+        AL.push_back(C.MakeAction<OffloadAction>(Dep, A->getType()));
+      };
+
+      // If we have a fat binary, add it to the list.
+      if (CudaFatBinary) {
+        AddTopLevel(CudaFatBinary, /*BoundArch=*/nullptr);
+        CudaDeviceActions.clear();
+        CudaFatBinary = nullptr;
+        return false;
+      }
+
+      if (CudaDeviceActions.empty())
+        return false;
+
+      // If we have CUDA actions at this point, that's because we have a
+      // partial compilation, so we should have an action for each GPU
+      // architecture.
+      assert(CudaDeviceActions.size() == GpuArchList.size() &&
+             "Expecting one action per GPU architecture.");
+      assert(ToolChains.size() == 1 &&
+             "Expecting to have a single CUDA toolchain.");
+      for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I)
+        AddTopLevel(CudaDeviceActions[I], GpuArchList[I]);
+
+      CudaDeviceActions.clear();
+      return false;
+    }
+
+    bool initialize() override {
+      bool Error = false;
+
+      const auto *CudaTC =
+          C.getSingleOffloadDeviceToolChain<Action::OFFLOAD_CUDA>();
+
+      // We don't need to support CUDA.
+      if (!CudaTC)
+        return Error;
+
+      ToolChains.push_back(CudaTC);
+
+      Arg *PartialCompilationArg = Args.getLastArg(
+          options::OPT_cuda_host_only, options::OPT_cuda_device_only);
+
+      IsHostOnlyCompilation = PartialCompilationArg &&
+                              PartialCompilationArg->getOption().matches(
+                                  options::OPT_cuda_host_only);
+
+      IsDeviceOnlyCompilation = PartialCompilationArg && !IsHostOnlyCompilation;
+
+      // Collect all cuda_gpu_arch parameters, removing duplicates.
+      llvm::StringSet<> GpuArchNames;
+      for (Arg *A : Args) {
+        if (!A->getOption().matches(options::OPT_cuda_gpu_arch_EQ))
+          continue;
+        A->claim();
+
+        const auto &Arch = A->getValue();
+        if (!toolchains::CudaToolChain::GpuArchToComputeName(Arch)) {
+          C.getDriver().Diag(clang::diag::err_drv_cuda_bad_gpu_arch) << Arch;
+          Error = true;
+        } else if (GpuArchNames.insert(Arch).second)
+          GpuArchList.push_back(Arch);
+      }
+
+      // Default to sm_20, which is the lowest common denominator for
+      // supported GPUs. sm_20 code should work correctly, if suboptimally,
+      // on all newer GPUs.
+      if (GpuArchList.empty())
+        GpuArchList.push_back("sm_20");
+
+      return Error;
     }
-  // Kill host action in case of device-only compilation.
-  if (DeviceOnlyCompilation)
+  };
+
+  /// Add the implementation for other specialized builders here.
+
+  /// \brief Specialized builders being used by this offloading action builder.
+  SmallVector<DeviceActionBuilder *, 4> SpecializedBuilders;
+
+public:
+  OffloadingActionBuilder(Compilation &C, DerivedArgList &Args,
+                          const Driver::InputList &Inputs)
+      : C(C), Args(Args) {
+    // Create a specialized builder for each device toolchain.
+
+    IsValid = true;
+
+    // Create a specialized builder for CUDA.
+    SpecializedBuilders.push_back(new CudaActionBuilder(C, Args, Inputs));
+
+    //
+    // Build other specialized builders here.
+    //
+
+    // Initialize all the builders, keeping track of errors.
+    for (auto *SB : SpecializedBuilders)
+      IsValid = IsValid && !SB->initialize();
+  }
+
+  ~OffloadingActionBuilder() {
+    for (auto *SB : SpecializedBuilders)
+      delete SB;
+  }
+
+  /// \brief Generate an action that adds device dependences (if any) to a
+  /// host action. If no dependence actions exist, just return the host
+  /// action \a HostAction. If an error is found or if no builder requires
+  /// the host toolchain, return nullptr.
+  Action *
+  addDeviceDependencesToHostAction(Action *HostAction, types::ID InputType,
+                                   const Arg *InputArg, phases::ID CurPhase,
+                                   phases::ID FinalPhase,
+                                   DeviceActionBuilder::PhasesTy &Phases) {
+    if (!IsValid)
       return nullptr;
-    return HostAction;
-  }
-
-  // If we're not a partial or device-only compilation, we compile each arch to
-  // ptx and assemble to cubin, then feed the cubin *and* the ptx into a device
-  // "link" action, which uses fatbinary to combine these cubins into one
-  // fatbin. The fatbin is then an input to the host compilation.
-  ActionList DeviceActions;
-  for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) {
-    Action* AssembleAction = CudaDeviceActions[I];
-    assert(AssembleAction->getType() == types::TY_Object);
-    assert(AssembleAction->getInputs().size() == 1);
-
-    Action* BackendAction = AssembleAction->getInputs()[0];
-    assert(BackendAction->getType() == types::TY_PP_Asm);
-
-    for (auto &A : {AssembleAction, BackendAction}) {
-      OffloadAction::DeviceDependences DDep;
-      DDep.add(A, CudaTC, GpuArchList[I], Action::OFFLOAD_CUDA);
-      DeviceActions.push_back(C.MakeAction<OffloadAction>(DDep, A->getType()));
-    }
-  }
-  auto FatbinAction =
-      C.MakeAction<LinkJobAction>(DeviceActions, types::TY_CUDA_FATBIN);
-
-  // Return a new host action that incorporates original host action and all
-  // device actions.
-  OffloadAction::HostDependence HDep(HostAction, C.getOffloadingHostToolChain(),
-                                     /*BoundArch=*/nullptr,
-                                     Action::OFFLOAD_CUDA);
-  OffloadAction::DeviceDependences DDep;
-  DDep.add(FatbinAction, CudaTC, /*BoundArch=*/nullptr, Action::OFFLOAD_CUDA);
-  return C.MakeAction<OffloadAction>(HDep, DDep);
-}
+
+    if (SpecializedBuilders.empty())
+      return HostAction;
+
+    assert(HostAction && "Invalid host action!");
+
+    OffloadAction::DeviceDependences DDeps;
+    // Check if all the programming models agree we should not emit the host
+    // action.
+    bool DoesNotRequireHostAction = true;
+    for (auto *SB : SpecializedBuilders) {
+      if (!SB->isValid()) {
+        DoesNotRequireHostAction = false;
+        continue;
+      }
+      DoesNotRequireHostAction =
+          DoesNotRequireHostAction &&
+          SB->getDeviceDepences(DDeps, HostAction, InputType, InputArg,
+                                CurPhase, FinalPhase, Phases);
+    }
+
+    if (DoesNotRequireHostAction)
+      return nullptr;
+
+    if (DDeps.getActions().empty())
+      return HostAction;
+
+    // We have dependences we need to bundle together. We use an offload
+    // action for that.
+    OffloadAction::HostDependence HDep(HostAction,
+                                       C.getOffloadingHostToolChain(),
+                                       /*BoundArch=*/nullptr, DDeps);
+    return C.MakeAction<OffloadAction>(HDep, DDeps);
+  }
+
+  /// \brief Generate an action that adds a host dependence to a device
+  /// action. The results will be kept in this action builder. Return true
+  /// if an error was found.
+  bool addHostDependenceToDeviceActions(Action *HostAction, types::ID InputType,
+                                        const Arg *InputArg) {
+    if (!IsValid)
+      return true;
+
+    assert(HostAction && "Invalid host action!");
+
+    bool Error = false;
+    for (auto *SB : SpecializedBuilders) {
+      if (!SB->isValid())
+        continue;
+      Error = Error || SB->addDeviceDepences(HostAction, InputType, InputArg);
+    }
+    return Error;
+  }
+
+  /// \brief Add the offloading top level actions to the provided action list.
+  bool appendTopLevelActions(ActionList &AL) {
+    bool Error = false;
+
+    auto NumActions = AL.size();
+
+    for (auto *SB : SpecializedBuilders) {
+      if (!SB->isValid())
+        continue;
+      Error = Error || SB->appendTopLevelActions(AL);
+    }
+
+    assert(NumActions <= AL.size() && "Expecting more actions, not fewer!");
+
+    // If any action was added by the builders, -o is ambiguous if we have
+    // more than one top-level action.
+    if (NumActions < AL.size() && Args.hasArg(options::OPT_o) &&
+        AL.size() > 1) {
+      C.getDriver().Diag(
+          clang::diag::err_drv_output_argument_with_multiple_files);
+      return true;
+    }
+
+    return Error;
+  }
+};
+} // anonymous namespace.
 
 void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
                           const InputList &Inputs, ActionList &Actions) const {
@@ -1571,6 +1868,9 @@
     YcArg = YuArg = nullptr;
   }
 
+  // Builder to be used to build offloading actions.
+  OffloadingActionBuilder OffloadBuilder(C, Args, Inputs);
+
   // Construct the actions to perform.
   ActionList LinkerInputs;
 
@@ -1633,52 +1933,63 @@
       continue;
     }
 
-    phases::ID CudaInjectionPhase =
-        (phases::Compile < FinalPhase &&
-         llvm::find(PL, phases::Compile) != PL.end())
-            ? phases::Compile
-            : FinalPhase;
-
     // Build the pipeline for this file.
     Action *Current = C.MakeAction<InputAction>(*InputArg, InputType);
+
+    // Use the current host action in any of the offloading actions, if
+    // required.
+    if (OffloadBuilder.addHostDependenceToDeviceActions(Current, InputType,
+                                                        InputArg))
+      break;
+
     for (SmallVectorImpl<phases::ID>::iterator i = PL.begin(), e = PL.end();
          i != e; ++i) {
       phases::ID Phase = *i;
 
       // We are done if this step is past what the user requested.
       if (Phase > FinalPhase)
         break;
 
+      // Add any offload action the host action depends on.
+      Current = OffloadBuilder.addDeviceDependencesToHostAction(
+          Current, InputType, InputArg, Phase, FinalPhase, PL);
+      if (!Current)
+        break;
+
       // Queue linker inputs.
       if (Phase == phases::Link) {
        assert((i + 1) == e && "linking must be final compilation step.");
         LinkerInputs.push_back(Current);
         Current = nullptr;
         break;
       }
 
-      // Some types skip the assembler phase (e.g., llvm-bc), but we can't
-      // encode this in the steps because the intermediate type depends on
-      // arguments. Just special case here.
-      if (Phase == phases::Assemble && Current->getType() != types::TY_PP_Asm)
+      // Otherwise construct the appropriate action.
+      auto *NewCurrent = ConstructPhaseAction(C, Args, Phase, Current);
+
+      // We didn't create a new action, so we will just move to the next
+      // phase.
+      if (NewCurrent == Current)
         continue;
 
-      // Otherwise construct the appropriate action.
-      Current = ConstructPhaseAction(C, Args, Phase, Current);
+      Current = NewCurrent;
 
-      if (InputType == types::TY_CUDA && Phase == CudaInjectionPhase) {
-        Current = buildCudaActions(C, Args, InputArg, Current, Actions);
-        if (!Current)
-          break;
-      }
+      // Use the current host action in any of the offloading actions, if
+      // required.
+      if (OffloadBuilder.addHostDependenceToDeviceActions(Current, InputType,
+                                                          InputArg))
+        break;
 
       if (Current->getType() == types::TY_Nothing)
         break;
     }
 
     // If we ended with something, add to the output list.
    if (Current)
       Actions.push_back(Current);
+
+    // Add any top level actions generated for offloading.
+    OffloadBuilder.appendTopLevelActions(Actions);
   }
 
   // Add a link action if necessary.
@@ -1704,6 +2015,13 @@
 Action *Driver::ConstructPhaseAction(Compilation &C, const ArgList &Args,
                                      phases::ID Phase, Action *Input) const {
   llvm::PrettyStackTraceString CrashInfo("Constructing phase actions");
+
+  // Some types skip the assembler phase (e.g., llvm-bc), but we can't
+  // encode this in the steps because the intermediate type depends on
+  // arguments. Just special case here.
+  if (Phase == phases::Assemble && Input->getType() != types::TY_PP_Asm)
+    return Input;
+
   // Build the appropriate action.
   switch (Phase) {
   case phases::Link:
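To make the extension point concrete, here is a minimal sketch of how a hypothetical additional programming model would plug into the builder. The OpenMPActionBuilder name and its body are purely illustrative and not part of this patch; only the DeviceActionBuilder hooks it overrides are the ones declared above:

  /// Hypothetical specialized builder for another offloading model. It only
  /// needs to subclass DeviceActionBuilder and override the hooks it cares
  /// about; the remaining hooks keep their no-op default implementations.
  class OpenMPActionBuilder : public DeviceActionBuilder {
  public:
    OpenMPActionBuilder(Compilation &C, DerivedArgList &Args,
                        const Driver::InputList &Inputs)
        : DeviceActionBuilder(C, Args, Inputs) {}

    bool initialize() override {
      // Push the device tool chains this model was configured with onto
      // ToolChains and emit any model-specific diagnostics here. Leaving
      // ToolChains empty marks the builder invalid (see isValid()), and
      // returning true signals an initialization error.
      return false;
    }
  };

The only other change needed would be a single registration line in the OffloadingActionBuilder constructor, next to the CUDA one:

    SpecializedBuilders.push_back(new OpenMPActionBuilder(C, Args, Inputs));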