jothidam.tv - c99shell

!C99Shell v. 2.0 [PHP 7 Update] [25.02.2019]!
Software: Apache. PHP/7.3.33 uname -a: Linux web25.us.cloudlogin.co 5.10.237-xeon-hst #1 SMP Mon May 5 15:10:04 UTC 2025 x86_64 uid=233359(alpastrology) gid=888(tty) groups=888(tty),33(tape) Safe-mode: OFF (not secure) /usr/include/llvm/Frontend/OpenMP/ drwxr-xr-x Free 6181.94 GB of 6263.14 GB (98.7%) Encoder Tools Proc. FTP brute Sec. SQL PHP-code Update Feedback Self remove Logout

//===- IR/OpenMPIRBuilder.h - OpenMP encoding builder for LLVM IR - C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the OpenMPIRBuilder class and helpers used as a convenient
// way to create LLVM instructions for OpenMP directives.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H
#define LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H

#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Allocator.h"
#include "llvm/TargetParser/Triple.h"
#include <forward_list>
#include <map>
#include <optional>

namespace llvm {
class CanonicalLoopInfo;
struct TargetRegionEntryInfo;
class OffloadEntriesInfoManager;
class OpenMPIRBuilder;

/// Move the instruction after an InsertPoint to the beginning of another
/// BasicBlock.
///
/// The instructions after \p IP are moved to the beginning of \p New which must
/// not have any PHINodes. If \p CreateBranch is true, a branch instruction to
/// \p New will be added such that there is no semantic change. Otherwise, the
/// \p IP insert block remains degenerate and it is up to the caller to insert a
/// terminator.
void spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New,
              bool CreateBranch);

/// Splice a BasicBlock at an IRBuilder's current insertion point. Its new
/// insert location will stick to after the instruction before the insertion
/// point (instead of moving with the instruction the InsertPoint stores
/// internally).
void spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch);

/// Split a BasicBlock at an InsertPoint, even if the block is degenerate
/// (missing the terminator).
///
/// llvm::SplitBasicBlock and BasicBlock::splitBasicBlock require a well-formed
/// BasicBlock. \p Name is used for the new successor block. If \p CreateBranch
/// is true, a branch to the new successor will new created such that
/// semantically there is no change; otherwise the block of the insertion point
/// remains degenerate and it is the caller's responsibility to insert a
/// terminator. Returns the new successor block.
BasicBlock *splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch,
                    llvm::Twine Name = {});

/// Split a BasicBlock at \p Builder's insertion point, even if the block is
/// degenerate (missing the terminator).  Its new insert location will stick to
/// after the instruction before the insertion point (instead of moving with the
/// instruction the InsertPoint stores internally).
BasicBlock *splitBB(IRBuilderBase &Builder, bool CreateBranch,
                    llvm::Twine Name = {});

/// Split a BasicBlock at \p Builder's insertion point, even if the block is
/// degenerate (missing the terminator).  Its new insert location will stick to
/// after the instruction before the insertion point (instead of moving with the
/// instruction the InsertPoint stores internally).
BasicBlock *splitBB(IRBuilder<> &Builder, bool CreateBranch, llvm::Twine Name);

/// Like splitBB, but reuses the current block's name for the new name.
BasicBlock *splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch,
                              llvm::Twine Suffix = ".split");

/// Captures attributes that affect generating LLVM-IR using the
/// OpenMPIRBuilder and related classes. Note that not all attributes are
/// required for all classes or functions. In some use cases the configuration
/// is not necessary at all, because because the only functions that are called
/// are ones that are not dependent on the configuration.
class OpenMPIRBuilderConfig {
public:
  /// Flag to define whether to generate code for the role of the OpenMP host
  /// (if set to false) or device (if set to true) in an offloading context. It
  /// is set when the -fopenmp-is-target-device compiler frontend option is
  /// specified.
  std::optional<bool> IsTargetDevice;

  /// Flag for specifying if the compilation is done for an accelerator. It is
  /// set according to the architecture of the target triple and currently only
  /// true when targeting AMDGPU or NVPTX. Today, these targets can only perform
  /// the role of an OpenMP target device, so `IsTargetDevice` must also be true
  /// if `IsGPU` is true. This restriction might be lifted if an accelerator-
  /// like target with the ability to work as the OpenMP host is added, or if
  /// the capabilities of the currently supported GPU architectures are
  /// expanded.
  std::optional<bool> IsGPU;

  /// Flag for specifying if LLVMUsed information should be emitted.
  std::optional<bool> EmitLLVMUsedMetaInfo;

  /// Flag for specifying if offloading is mandatory.
  std::optional<bool> OpenMPOffloadMandatory;

  /// First separator used between the initial two parts of a name.
  std::optional<StringRef> FirstSeparator;
  /// Separator used between all of the rest consecutive parts of s name
  std::optional<StringRef> Separator;

  // Grid Value for the GPU target
  std::optional<omp::GV> GridValue;

  OpenMPIRBuilderConfig();
  OpenMPIRBuilderConfig(bool IsTargetDevice, bool IsGPU,
                        bool OpenMPOffloadMandatory,
                        bool HasRequiresReverseOffload,
                        bool HasRequiresUnifiedAddress,
                        bool HasRequiresUnifiedSharedMemory,
                        bool HasRequiresDynamicAllocators);

  // Getters functions that assert if the required values are not present.
  bool isTargetDevice() const {
    assert(IsTargetDevice.has_value() && "IsTargetDevice is not set");
    return *IsTargetDevice;
  }

  bool isGPU() const {
    assert(IsGPU.has_value() && "IsGPU is not set");
    return *IsGPU;
  }

  bool openMPOffloadMandatory() const {
    assert(OpenMPOffloadMandatory.has_value() &&
           "OpenMPOffloadMandatory is not set");
    return *OpenMPOffloadMandatory;
  }

  omp::GV getGridValue() const {
    assert(GridValue.has_value() && "GridValue is not set");
    return *GridValue;
  }

  bool hasRequiresFlags() const { return RequiresFlags; }
  bool hasRequiresReverseOffload() const;
  bool hasRequiresUnifiedAddress() const;
  bool hasRequiresUnifiedSharedMemory() const;
  bool hasRequiresDynamicAllocators() const;

  /// Returns requires directive clauses as flags compatible with those expected
  /// by libomptarget.
  int64_t getRequiresFlags() const;

  // Returns the FirstSeparator if set, otherwise use the default separator
  // depending on isGPU
  StringRef firstSeparator() const {
    if (FirstSeparator.has_value())
      return *FirstSeparator;
    if (isGPU())
      return "_";
    return ".";
  }

  // Returns the Separator if set, otherwise use the default separator depending
  // on isGPU
  StringRef separator() const {
    if (Separator.has_value())
      return *Separator;
    if (isGPU())
      return "$";
    return ".";
  }

  void setIsTargetDevice(bool Value) { IsTargetDevice = Value; }
  void setIsGPU(bool Value) { IsGPU = Value; }
  void setEmitLLVMUsed(bool Value = true) { EmitLLVMUsedMetaInfo = Value; }
  void setOpenMPOffloadMandatory(bool Value) { OpenMPOffloadMandatory = Value; }
  void setFirstSeparator(StringRef FS) { FirstSeparator = FS; }
  void setSeparator(StringRef S) { Separator = S; }
  void setGridValue(omp::GV G) { GridValue = G; }

  void setHasRequiresReverseOffload(bool Value);
  void setHasRequiresUnifiedAddress(bool Value);
  void setHasRequiresUnifiedSharedMemory(bool Value);
  void setHasRequiresDynamicAllocators(bool Value);

private:
  /// Flags for specifying which requires directive clauses are present.
  int64_t RequiresFlags;
};

/// Data structure to contain the information needed to uniquely identify
/// a target entry.
struct TargetRegionEntryInfo {
  std::string ParentName;
  unsigned DeviceID;
  unsigned FileID;
  unsigned Line;
  unsigned Count;

  TargetRegionEntryInfo() : DeviceID(0), FileID(0), Line(0), Count(0) {}
  TargetRegionEntryInfo(StringRef ParentName, unsigned DeviceID,
                        unsigned FileID, unsigned Line, unsigned Count = 0)
      : ParentName(ParentName), DeviceID(DeviceID), FileID(FileID), Line(Line),
        Count(Count) {}

  static void getTargetRegionEntryFnName(SmallVectorImpl<char> &Name,
                                         StringRef ParentName,
                                         unsigned DeviceID, unsigned FileID,
                                         unsigned Line, unsigned Count);

  bool operator<(const TargetRegionEntryInfo &RHS) const {
    return std::make_tuple(ParentName, DeviceID, FileID, Line, Count) <
           std::make_tuple(RHS.ParentName, RHS.DeviceID, RHS.FileID, RHS.Line,
                           RHS.Count);
  }
};

/// Class that manages information about offload code regions and data
class OffloadEntriesInfoManager {
  /// Number of entries registered so far.
  OpenMPIRBuilder *OMPBuilder;
  unsigned OffloadingEntriesNum = 0;

public:
  /// Base class of the entries info.
  class OffloadEntryInfo {
  public:
    /// Kind of a given entry.
    enum OffloadingEntryInfoKinds : unsigned {
      /// Entry is a target region.
      OffloadingEntryInfoTargetRegion = 0,
      /// Entry is a declare target variable.
      OffloadingEntryInfoDeviceGlobalVar = 1,
      /// Invalid entry info.
      OffloadingEntryInfoInvalid = ~0u
    };

  protected:
    OffloadEntryInfo() = delete;
    explicit OffloadEntryInfo(OffloadingEntryInfoKinds Kind) : Kind(Kind) {}
    explicit OffloadEntryInfo(OffloadingEntryInfoKinds Kind, unsigned Order,
                              uint32_t Flags)
        : Flags(Flags), Order(Order), Kind(Kind) {}
    ~OffloadEntryInfo() = default;

  public:
    bool isValid() const { return Order != ~0u; }
    unsigned getOrder() const { return Order; }
    OffloadingEntryInfoKinds getKind() const { return Kind; }
    uint32_t getFlags() const { return Flags; }
    void setFlags(uint32_t NewFlags) { Flags = NewFlags; }
    Constant *getAddress() const { return cast_or_null<Constant>(Addr); }
    void setAddress(Constant *V) {
      assert(!Addr.pointsToAliveValue() && "Address has been set before!");
      Addr = V;
    }
    static bool classof(const OffloadEntryInfo *Info) { return true; }

  private:
    /// Address of the entity that has to be mapped for offloading.
    WeakTrackingVH Addr;

    /// Flags associated with the device global.
    uint32_t Flags = 0u;

    /// Order this entry was emitted.
    unsigned Order = ~0u;

    OffloadingEntryInfoKinds Kind = OffloadingEntryInfoInvalid;
  };

  /// Return true if a there are no entries defined.
  bool empty() const;
  /// Return number of entries defined so far.
  unsigned size() const { return OffloadingEntriesNum; }

  OffloadEntriesInfoManager(OpenMPIRBuilder *builder) : OMPBuilder(builder) {}

  //
  // Target region entries related.
  //

  /// Kind of the target registry entry.
  enum OMPTargetRegionEntryKind : uint32_t {
    /// Mark the entry as target region.
    OMPTargetRegionEntryTargetRegion = 0x0,
  };

  /// Target region entries info.
  class OffloadEntryInfoTargetRegion final : public OffloadEntryInfo {
    /// Address that can be used as the ID of the entry.
    Constant *ID = nullptr;

  public:
    OffloadEntryInfoTargetRegion()
        : OffloadEntryInfo(OffloadingEntryInfoTargetRegion) {}
    explicit OffloadEntryInfoTargetRegion(unsigned Order, Constant *Addr,
                                          Constant *ID,
                                          OMPTargetRegionEntryKind Flags)
        : OffloadEntryInfo(OffloadingEntryInfoTargetRegion, Order, Flags),
          ID(ID) {
      setAddress(Addr);
    }

    Constant *getID() const { return ID; }
    void setID(Constant *V) {
      assert(!ID && "ID has been set before!");
      ID = V;
    }
    static bool classof(const OffloadEntryInfo *Info) {
      return Info->getKind() == OffloadingEntryInfoTargetRegion;
    }
  };

  /// Initialize target region entry.
  /// This is ONLY needed for DEVICE compilation.
  void initializeTargetRegionEntryInfo(const TargetRegionEntryInfo &EntryInfo,
                                       unsigned Order);
  /// Register target region entry.
  void registerTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo,
                                     Constant *Addr, Constant *ID,
                                     OMPTargetRegionEntryKind Flags);
  /// Return true if a target region entry with the provided information
  /// exists.
  bool hasTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo,
                                bool IgnoreAddressId = false) const;

  // Return the Name based on \a EntryInfo using the next available Count.
  void getTargetRegionEntryFnName(SmallVectorImpl<char> &Name,
                                  const TargetRegionEntryInfo &EntryInfo);

  /// brief Applies action \a Action on all registered entries.
  typedef function_ref<void(const TargetRegionEntryInfo &EntryInfo,
                            const OffloadEntryInfoTargetRegion &)>
      OffloadTargetRegionEntryInfoActTy;
  void
  actOnTargetRegionEntriesInfo(const OffloadTargetRegionEntryInfoActTy &Action);

  //
  // Device global variable entries related.
  //

  /// Kind of the global variable entry..
  enum OMPTargetGlobalVarEntryKind : uint32_t {
    /// Mark the entry as a to declare target.
    OMPTargetGlobalVarEntryTo = 0x0,
    /// Mark the entry as a to declare target link.
    OMPTargetGlobalVarEntryLink = 0x1,
    /// Mark the entry as a declare target enter.
    OMPTargetGlobalVarEntryEnter = 0x2,
    /// Mark the entry as having no declare target entry kind.
    OMPTargetGlobalVarEntryNone = 0x3,
    /// Mark the entry as a declare target indirect global.
    OMPTargetGlobalVarEntryIndirect = 0x8,
    /// Mark the entry as a register requires global.
    OMPTargetGlobalRegisterRequires = 0x10,
  };

  /// Kind of device clause for declare target variables
  /// and functions
  /// NOTE: Currently not used as a part of a variable entry
  /// used for Flang and Clang to interface with the variable
  /// related registration functions
  enum OMPTargetDeviceClauseKind : uint32_t {
    /// The target is marked for all devices
    OMPTargetDeviceClauseAny = 0x0,
    /// The target is marked for non-host devices
    OMPTargetDeviceClauseNoHost = 0x1,
    /// The target is marked for host devices
    OMPTargetDeviceClauseHost = 0x2,
    /// The target is marked as having no clause
    OMPTargetDeviceClauseNone = 0x3
  };

  /// Device global variable entries info.
  class OffloadEntryInfoDeviceGlobalVar final : public OffloadEntryInfo {
    /// Type of the global variable.
    int64_t VarSize;
    GlobalValue::LinkageTypes Linkage;
    const std::string VarName;

  public:
    OffloadEntryInfoDeviceGlobalVar()
        : OffloadEntryInfo(OffloadingEntryInfoDeviceGlobalVar) {}
    explicit OffloadEntryInfoDeviceGlobalVar(unsigned Order,
                                             OMPTargetGlobalVarEntryKind Flags)
        : OffloadEntryInfo(OffloadingEntryInfoDeviceGlobalVar, Order, Flags) {}
    explicit OffloadEntryInfoDeviceGlobalVar(unsigned Order, Constant *Addr,
                                             int64_t VarSize,
                                             OMPTargetGlobalVarEntryKind Flags,
                                             GlobalValue::LinkageTypes Linkage,
                                             const std::string &VarName)
        : OffloadEntryInfo(OffloadingEntryInfoDeviceGlobalVar, Order, Flags),
          VarSize(VarSize), Linkage(Linkage), VarName(VarName) {
      setAddress(Addr);
    }

    int64_t getVarSize() const { return VarSize; }
    StringRef getVarName() const { return VarName; }
    void setVarSize(int64_t Size) { VarSize = Size; }
    GlobalValue::LinkageTypes getLinkage() const { return Linkage; }
    void setLinkage(GlobalValue::LinkageTypes LT) { Linkage = LT; }
    static bool classof(const OffloadEntryInfo *Info) {
      return Info->getKind() == OffloadingEntryInfoDeviceGlobalVar;
    }
  };

  /// Initialize device global variable entry.
  /// This is ONLY used for DEVICE compilation.
  void initializeDeviceGlobalVarEntryInfo(StringRef Name,
                                          OMPTargetGlobalVarEntryKind Flags,
                                          unsigned Order);

  /// Register device global variable entry.
  void registerDeviceGlobalVarEntryInfo(StringRef VarName, Constant *Addr,
                                        int64_t VarSize,
                                        OMPTargetGlobalVarEntryKind Flags,
                                        GlobalValue::LinkageTypes Linkage);
  /// Checks if the variable with the given name has been registered already.
  bool hasDeviceGlobalVarEntryInfo(StringRef VarName) const {
    return OffloadEntriesDeviceGlobalVar.count(VarName) > 0;
  }
  /// Applies action \a Action on all registered entries.
  typedef function_ref<void(StringRef, const OffloadEntryInfoDeviceGlobalVar &)>
      OffloadDeviceGlobalVarEntryInfoActTy;
  void actOnDeviceGlobalVarEntriesInfo(
      const OffloadDeviceGlobalVarEntryInfoActTy &Action);

private:
  /// Return the count of entries at a particular source location.
  unsigned
  getTargetRegionEntryInfoCount(const TargetRegionEntryInfo &EntryInfo) const;

  /// Update the count of entries at a particular source location.
  void
  incrementTargetRegionEntryInfoCount(const TargetRegionEntryInfo &EntryInfo);

  static TargetRegionEntryInfo
  getTargetRegionEntryCountKey(const TargetRegionEntryInfo &EntryInfo) {
    return TargetRegionEntryInfo(EntryInfo.ParentName, EntryInfo.DeviceID,
                                 EntryInfo.FileID, EntryInfo.Line, 0);
  }

  // Count of entries at a location.
  std::map<TargetRegionEntryInfo, unsigned> OffloadEntriesTargetRegionCount;

  // Storage for target region entries kind.
  typedef std::map<TargetRegionEntryInfo, OffloadEntryInfoTargetRegion>
      OffloadEntriesTargetRegionTy;
  OffloadEntriesTargetRegionTy OffloadEntriesTargetRegion;
  /// Storage for device global variable entries kind. The storage is to be
  /// indexed by mangled name.
  typedef StringMap<OffloadEntryInfoDeviceGlobalVar>
      OffloadEntriesDeviceGlobalVarTy;
  OffloadEntriesDeviceGlobalVarTy OffloadEntriesDeviceGlobalVar;
};

/// An interface to create LLVM-IR for OpenMP directives.
///
/// Each OpenMP directive has a corresponding public generator method.
class OpenMPIRBuilder {
public:
  /// Create a new OpenMPIRBuilder operating on the given module \p M. This will
  /// not have an effect on \p M (see initialize)
  OpenMPIRBuilder(Module &M)
      : M(M), Builder(M.getContext()), OffloadInfoManager(this),
        T(Triple(M.getTargetTriple())) {}
  ~OpenMPIRBuilder();

  /// Initialize the internal state, this will put structures types and
  /// potentially other helpers into the underlying module. Must be called
  /// before any other method and only once! This internal state includes types
  /// used in the OpenMPIRBuilder generated from OMPKinds.def.
  void initialize();

  void setConfig(OpenMPIRBuilderConfig C) { Config = C; }

  /// Finalize the underlying module, e.g., by outlining regions.
  /// \param Fn                    The function to be finalized. If not used,
  ///                              all functions are finalized.
  void finalize(Function *Fn = nullptr);

  /// Add attributes known for \p FnID to \p Fn.
  void addAttributes(omp::RuntimeFunction FnID, Function &Fn);

  /// Type used throughout for insertion points.
  using InsertPointTy = IRBuilder<>::InsertPoint;

  /// Get the create a name using the platform specific separators.
  /// \param Parts parts of the final name that needs separation
  /// The created name has a first separator between the first and second part
  /// and a second separator between all other parts.
  /// E.g. with FirstSeparator "$" and Separator "." and
  /// parts: "p1", "p2", "p3", "p4"
  /// The resulting name is "p1$p2.p3.p4"
  /// The separators are retrieved from the OpenMPIRBuilderConfig.
  std::string createPlatformSpecificName(ArrayRef<StringRef> Parts) const;

  /// Callback type for variable finalization (think destructors).
  ///
  /// \param CodeGenIP is the insertion point at which the finalization code
  ///                  should be placed.
  ///
  /// A finalize callback knows about all objects that need finalization, e.g.
  /// destruction, when the scope of the currently generated construct is left
  /// at the time, and location, the callback is invoked.
  using FinalizeCallbackTy = std::function<void(InsertPointTy CodeGenIP)>;

  struct FinalizationInfo {
    /// The finalization callback provided by the last in-flight invocation of
    /// createXXXX for the directive of kind DK.
    FinalizeCallbackTy FiniCB;

    /// The directive kind of the innermost directive that has an associated
    /// region which might require finalization when it is left.
    omp::Directive DK;

    /// Flag to indicate if the directive is cancellable.
    bool IsCancellable;
  };

  /// Push a finalization callback on the finalization stack.
  ///
  /// NOTE: Temporary solution until Clang CG is gone.
  void pushFinalizationCB(const FinalizationInfo &FI) {
    FinalizationStack.push_back(FI);
  }

  /// Pop the last finalization callback from the finalization stack.
  ///
  /// NOTE: Temporary solution until Clang CG is gone.
  void popFinalizationCB() { FinalizationStack.pop_back(); }

  /// Callback type for body (=inner region) code generation
  ///
  /// The callback takes code locations as arguments, each describing a
  /// location where additional instructions can be inserted.
  ///
  /// The CodeGenIP may be in the middle of a basic block or point to the end of
  /// it. The basic block may have a terminator or be degenerate. The callback
  /// function may just insert instructions at that position, but also split the
  /// block (without the Before argument of BasicBlock::splitBasicBlock such
  /// that the identify of the split predecessor block is preserved) and insert
  /// additional control flow, including branches that do not lead back to what
  /// follows the CodeGenIP. Note that since the callback is allowed to split
  /// the block, callers must assume that InsertPoints to positions in the
  /// BasicBlock after CodeGenIP including CodeGenIP itself are invalidated. If
  /// such InsertPoints need to be preserved, it can split the block itself
  /// before calling the callback.
  ///
  /// AllocaIP and CodeGenIP must not point to the same position.
  ///
  /// \param AllocaIP is the insertion point at which new alloca instructions
  ///                 should be placed. The BasicBlock it is pointing to must
  ///                 not be split.
  /// \param CodeGenIP is the insertion point at which the body code should be
  ///                  placed.
  using BodyGenCallbackTy =
      function_ref<void(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>;

  // This is created primarily for sections construct as llvm::function_ref
  // (BodyGenCallbackTy) is not storable (as described in the comments of
  // function_ref class - function_ref contains non-ownable reference
  // to the callable.
  using StorableBodyGenCallbackTy =
      std::function<void(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>;

  /// Callback type for loop body code generation.
  ///
  /// \param CodeGenIP is the insertion point where the loop's body code must be
  ///                  placed. This will be a dedicated BasicBlock with a
  ///                  conditional branch from the loop condition check and
  ///                  terminated with an unconditional branch to the loop
  ///                  latch.
  /// \param IndVar    is the induction variable usable at the insertion point.
  using LoopBodyGenCallbackTy =
      function_ref<void(InsertPointTy CodeGenIP, Value *IndVar)>;

  /// Callback type for variable privatization (think copy & default
  /// constructor).
  ///
  /// \param AllocaIP is the insertion point at which new alloca instructions
  ///                 should be placed.
  /// \param CodeGenIP is the insertion point at which the privatization code
  ///                  should be placed.
  /// \param Original The value being copied/created, should not be used in the
  ///                 generated IR.
  /// \param Inner The equivalent of \p Original that should be used in the
  ///              generated IR; this is equal to \p Original if the value is
  ///              a pointer and can thus be passed directly, otherwise it is
  ///              an equivalent but different value.
  /// \param ReplVal The replacement value, thus a copy or new created version
  ///                of \p Inner.
  ///
  /// \returns The new insertion point where code generation continues and
  ///          \p ReplVal the replacement value.
  using PrivatizeCallbackTy = function_ref<InsertPointTy(
      InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value &Original,
      Value &Inner, Value *&ReplVal)>;

  /// Description of a LLVM-IR insertion point (IP) and a debug/source location
  /// (filename, line, column, ...).
  struct LocationDescription {
    LocationDescription(const IRBuilderBase &IRB)
        : IP(IRB.saveIP()), DL(IRB.getCurrentDebugLocation()) {}
    LocationDescription(const InsertPointTy &IP) : IP(IP) {}
    LocationDescription(const InsertPointTy &IP, const DebugLoc &DL)
        : IP(IP), DL(DL) {}
    InsertPointTy IP;
    DebugLoc DL;
  };

  /// Emitter methods for OpenMP directives.
  ///
  ///{

  /// Generator for '#omp barrier'
  ///
  /// \param Loc The location where the barrier directive was encountered.
  /// \param Kind The kind of directive that caused the barrier.
  /// \param ForceSimpleCall Flag to force a simple (=non-cancellation) barrier.
  /// \param CheckCancelFlag Flag to indicate a cancel barrier return value
  ///                        should be checked and acted upon.
  /// \param ThreadID Optional parameter to pass in any existing ThreadID value.
  ///
  /// \returns The insertion point after the barrier.
  InsertPointTy createBarrier(const LocationDescription &Loc,
                              omp::Directive Kind, bool ForceSimpleCall = false,
                              bool CheckCancelFlag = true);

  /// Generator for '#omp cancel'
  ///
  /// \param Loc The location where the directive was encountered.
  /// \param IfCondition The evaluated 'if' clause expression, if any.
  /// \param CanceledDirective The kind of directive that is cancled.
  ///
  /// \returns The insertion point after the barrier.
  InsertPointTy createCancel(const LocationDescription &Loc, Value *IfCondition,
                             omp::Directive CanceledDirective);

  /// Generator for '#omp parallel'
  ///
  /// \param Loc The insert and source location description.
  /// \param AllocaIP The insertion points to be used for alloca instructions.
  /// \param BodyGenCB Callback that will generate the region code.
  /// \param PrivCB Callback to copy a given variable (think copy constructor).
  /// \param FiniCB Callback to finalize variable copies.
  /// \param IfCondition The evaluated 'if' clause expression, if any.
  /// \param NumThreads The evaluated 'num_threads' clause expression, if any.
  /// \param ProcBind The value of the 'proc_bind' clause (see ProcBindKind).
  /// \param IsCancellable Flag to indicate a cancellable parallel region.
  ///
  /// \returns The insertion position *after* the parallel.
  IRBuilder<>::InsertPoint
  createParallel(const LocationDescription &Loc, InsertPointTy AllocaIP,
                 BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
                 FinalizeCallbackTy FiniCB, Value *IfCondition,
                 Value *NumThreads, omp::ProcBindKind ProcBind,
                 bool IsCancellable);

  /// Generator for the control flow structure of an OpenMP canonical loop.
  ///
  /// This generator operates on the logical iteration space of the loop, i.e.
  /// the caller only has to provide a loop trip count of the loop as defined by
  /// base language semantics. The trip count is interpreted as an unsigned
  /// integer. The induction variable passed to \p BodyGenCB will be of the same
  /// type and run from 0 to \p TripCount - 1. It is up to the callback to
  /// convert the logical iteration variable to the loop counter variable in the
  /// loop body.
  ///
  /// \param Loc       The insert and source location description. The insert
  ///                  location can be between two instructions or the end of a
  ///                  degenerate block (e.g. a BB under construction).
  /// \param BodyGenCB Callback that will generate the loop body code.
  /// \param TripCount Number of iterations the loop body is executed.
  /// \param Name      Base name used to derive BB and instruction names.
  ///
  /// \returns An object representing the created control flow structure which
  ///          can be used for loop-associated directives.
  CanonicalLoopInfo *createCanonicalLoop(const LocationDescription &Loc,
                                         LoopBodyGenCallbackTy BodyGenCB,
                                         Value *TripCount,
                                         const Twine &Name = "loop");

  /// Generator for the control flow structure of an OpenMP canonical loop.
  ///
  /// Instead of a logical iteration space, this allows specifying user-defined
  /// loop counter values using increment, upper- and lower bounds. To
  /// disambiguate the terminology when counting downwards, instead of lower
  /// bounds we use \p Start for the loop counter value in the first body
  /// iteration.
  ///
  /// Consider the following limitations:
  ///
  ///  * A loop counter space over all integer values of its bit-width cannot be
  ///    represented. E.g using uint8_t, its loop trip count of 256 cannot be
  ///    stored into an 8 bit integer):
  ///
  ///      DO I = 0, 255, 1
  ///
  ///  * Unsigned wrapping is only supported when wrapping only "once"; E.g.
  ///    effectively counting downwards:
  ///
  ///      for (uint8_t i = 100u; i > 0; i += 127u)
  ///
  ///
  /// TODO: May need to add additional parameters to represent:
  ///
  ///  * Allow representing downcounting with unsigned integers.
  ///
  ///  * Sign of the step and the comparison operator might disagree:
  ///
  ///      for (int i = 0; i < 42; i -= 1u)
  ///
  //
  /// \param Loc       The insert and source location description.
  /// \param BodyGenCB Callback that will generate the loop body code.
  /// \param Start     Value of the loop counter for the first iterations.
  /// \param Stop      Loop counter values past this will stop the loop.
  /// \param Step      Loop counter increment after each iteration; negative
  ///                  means counting down.
  /// \param IsSigned  Whether Start, Stop and Step are signed integers.
  /// \param InclusiveStop Whether \p Stop itself is a valid value for the loop
  ///                      counter.
  /// \param ComputeIP Insertion point for instructions computing the trip
  ///                  count. Can be used to ensure the trip count is available
  ///                  at the outermost loop of a loop nest. If not set,
  ///                  defaults to the preheader of the generated loop.
  /// \param Name      Base name used to derive BB and instruction names.
  ///
  /// \returns An object representing the created control flow structure which
  ///          can be used for loop-associated directives.
  CanonicalLoopInfo *createCanonicalLoop(const LocationDescription &Loc,
                                         LoopBodyGenCallbackTy BodyGenCB,
                                         Value *Start, Value *Stop, Value *Step,
                                         bool IsSigned, bool InclusiveStop,
                                         InsertPointTy ComputeIP = {},
                                         const Twine &Name = "loop");

  /// Collapse a loop nest into a single loop.
  ///
  /// Merges loops of a loop nest into a single CanonicalLoopNest representation
  /// that has the same number of innermost loop iterations as the origin loop
  /// nest. The induction variables of the input loops are derived from the
  /// collapsed loop's induction variable. This is intended to be used to
  /// implement OpenMP's collapse clause. Before applying a directive,
  /// collapseLoops normalizes a loop nest to contain only a single loop and the
  /// directive's implementation does not need to handle multiple loops itself.
  /// This does not remove the need to handle all loop nest handling by
  /// directives, such as the ordered(<n>) clause or the simd schedule-clause
  /// modifier of the worksharing-loop directive.
  ///
  /// Example:
  /// \code
  ///   for (int i = 0; i < 7; ++i) // Canonical loop "i"
  ///     for (int j = 0; j < 9; ++j) // Canonical loop "j"
  ///       body(i, j);
  /// \endcode
  ///
  /// After collapsing with Loops={i,j}, the loop is changed to
  /// \code
  ///   for (int ij = 0; ij < 63; ++ij) {
  ///     int i = ij / 9;
  ///     int j = ij % 9;
  ///     body(i, j);
  ///   }
  /// \endcode
  ///
  /// In the current implementation, the following limitations apply:
  ///
  ///  * All input loops have an induction variable of the same type.
  ///
  ///  * The collapsed loop will have the same trip count integer type as the
  ///    input loops. Therefore it is possible that the collapsed loop cannot
  ///    represent all iterations of the input loops. For instance, assuming a
  ///    32 bit integer type, and two input loops both iterating 2^16 times, the
  ///    theoretical trip count of the collapsed loop would be 2^32 iteration,
  ///    which cannot be represented in an 32-bit integer. Behavior is undefined
  ///    in this case.
  ///
  ///  * The trip counts of every input loop must be available at \p ComputeIP.
  ///    Non-rectangular loops are not yet supported.
  ///
  ///  * At each nest level, code between a surrounding loop and its nested loop
  ///    is hoisted into the loop body, and such code will be executed more
  ///    often than before collapsing (or not at all if any inner loop iteration
  ///    has a trip count of 0). This is permitted by the OpenMP specification.
  ///
  /// \param DL        Debug location for instructions added for collapsing,
  ///                  such as instructions to compute/derive the input loop's
  ///                  induction variables.
  /// \param Loops     Loops in the loop nest to collapse. Loops are specified
  ///                  from outermost-to-innermost and every control flow of a
  ///                  loop's body must pass through its directly nested loop.
  /// \param ComputeIP Where additional instruction that compute the collapsed
  ///                  trip count. If not set, defaults to before the generated
  ///                  loop.
  ///
  /// \returns The CanonicalLoopInfo object representing the collapsed loop.
  CanonicalLoopInfo *collapseLoops(DebugLoc DL,
                                   ArrayRef<CanonicalLoopInfo *> Loops,
                                   InsertPointTy ComputeIP);

  /// Get the default alignment value for given target
  ///
  /// \param TargetTriple   Target triple
  /// \param Features       StringMap which describes extra CPU features
  static unsigned getOpenMPDefaultSimdAlign(const Triple &TargetTriple,
                                            const StringMap<bool> &Features);

  /// Retrieve (or create if non-existent) the address of a declare
  /// target variable, used in conjunction with registerTargetGlobalVariable
  /// to create declare target global variables.
  ///
  /// \param CaptureClause - enumerator corresponding to the OpenMP capture
  /// clause used in conjunction with the variable being registered (link,
  /// to, enter).
  /// \param DeviceClause - enumerator corresponding to the OpenMP capture
  /// clause used in conjunction with the variable being registered (nohost,
  /// host, any)
  /// \param IsDeclaration - boolean stating if the variable being registered
  /// is a declaration-only and not a definition
  /// \param IsExternallyVisible - boolean stating if the variable is externally
  /// visible
  /// \param EntryInfo - Unique entry information for the value generated
  /// using getTargetEntryUniqueInfo, used to name generated pointer references
  /// to the declare target variable
  /// \param MangledName - the mangled name of the variable being registered
  /// \param GeneratedRefs - references generated by invocations of
  /// registerTargetGlobalVariable invoked from getAddrOfDeclareTargetVar,
  /// these are required by Clang for book keeping.
  /// \param OpenMPSIMD - if OpenMP SIMD mode is currently enabled
  /// \param TargetTriple - The OpenMP device target triple we are compiling
  /// for
  /// \param LlvmPtrTy - The type of the variable we are generating or
  /// retrieving an address for
  /// \param GlobalInitializer - a lambda function which creates a constant
  /// used for initializing a pointer reference to the variable in certain
  /// cases. If a nullptr is passed, it will default to utilising the original
  /// variable to initialize the pointer reference.
  /// \param VariableLinkage - a lambda function which returns the variables
  /// linkage type, if unspecified and a nullptr is given, it will instead
  /// utilise the linkage stored on the existing global variable in the
  /// LLVMModule.
  Constant *getAddrOfDeclareTargetVar(
      OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
      OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
      bool IsDeclaration, bool IsExternallyVisible,
      TargetRegionEntryInfo EntryInfo, StringRef MangledName,
      std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
      std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
      std::function<Constant *()> GlobalInitializer,
      std::function<GlobalValue::LinkageTypes()> VariableLinkage);

  /// Registers a target variable for device or host.
  ///
  /// \param CaptureClause - enumerator corresponding to the OpenMP capture
  /// clause used in conjunction with the variable being registered (link,
  /// to, enter).
  /// \param DeviceClause - enumerator corresponding to the OpenMP capture
  /// clause used in conjunction with the variable being registered (nohost,
  /// host, any)
  /// \param IsDeclaration - boolean stating if the variable being registered
  /// is a declaration-only and not a definition
  /// \param IsExternallyVisible - boolean stating if the variable is externally
  /// visible
  /// \param EntryInfo - Unique entry information for the value generated
  /// using getTargetEntryUniqueInfo, used to name generated pointer references
  /// to the declare target variable
  /// \param MangledName - the mangled name of the variable being registered
  /// \param GeneratedRefs - references generated by invocations of
  /// registerTargetGlobalVariable these are required by Clang for book
  /// keeping.
  /// \param OpenMPSIMD - if OpenMP SIMD mode is currently enabled
  /// \param TargetTriple - The OpenMP device target triple we are compiling
  /// for
  /// \param GlobalInitializer - a lambda function which creates a constant
  /// used for initializing a pointer reference to the variable in certain
  /// cases. If a nullptr is passed, it will default to utilising the original
  /// variable to initialize the pointer reference.
  /// \param VariableLinkage - a lambda function which returns the variables
  /// linkage type, if unspecified and a nullptr is given, it will instead
  /// utilise the linkage stored on the existing global variable in the
  /// LLVMModule.
  /// \param LlvmPtrTy - The type of the variable we are generating or
  /// retrieving an address for
  /// \param Addr - the original llvm value (addr) of the variable to be
  /// registered
  void registerTargetGlobalVariable(
      OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
      OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
      bool IsDeclaration, bool IsExternallyVisible,
      TargetRegionEntryInfo EntryInfo, StringRef MangledName,
      std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
      std::vector<Triple> TargetTriple,
      std::function<Constant *()> GlobalInitializer,
      std::function<GlobalValue::LinkageTypes()> VariableLinkage,
      Type *LlvmPtrTy, Constant *Addr);

  /// Get the offset of the OMP_MAP_MEMBER_OF field.
  unsigned getFlagMemberOffset();

  /// Get OMP_MAP_MEMBER_OF flag with extra bits reserved based on
  /// the position given.
  /// \param Position - A value indicating the position of the parent
  /// of the member in the kernel argument structure, often retrieved
  /// by the parents position in the combined information vectors used
  /// to generate the structure itself. Multiple children (member's of)
  /// with the same parent will use the same returned member flag.
  omp::OpenMPOffloadMappingFlags getMemberOfFlag(unsigned Position);

  /// Given an initial flag set, this function modifies it to contain
  /// the passed in MemberOfFlag generated from the getMemberOfFlag
  /// function. The results are dependent on the existing flag bits
  /// set in the original flag set.
  /// \param Flags - The original set of flags to be modified with the
  /// passed in MemberOfFlag.
  /// \param MemberOfFlag - A modified OMP_MAP_MEMBER_OF flag, adjusted
  /// slightly based on the getMemberOfFlag which adjusts the flag bits
  /// based on the members position in its parent.
  void setCorrectMemberOfFlag(omp::OpenMPOffloadMappingFlags &Flags,
                              omp::OpenMPOffloadMappingFlags MemberOfFlag);

private:
  /// Modifies the canonical loop to be a statically-scheduled workshare loop
  /// which is executed on the device
  ///
  /// This takes a \p CLI representing a canonical loop, such as the one
  /// created by \see createCanonicalLoop and emits additional instructions to
  /// turn it into a workshare loop. In particular, it calls to an OpenMP
  /// runtime function in the preheader to call OpenMP device rtl function
  /// which handles worksharing of loop body interations.
  ///
  /// \param DL       Debug location for instructions added for the
  ///                 workshare-loop construct itself.
  /// \param CLI      A descriptor of the canonical loop to workshare.
  /// \param AllocaIP An insertion point for Alloca instructions usable in the
  ///                 preheader of the loop.
  /// \param LoopType Information about type of loop worksharing.
  ///                 It corresponds to type of loop workshare OpenMP pragma.
  ///
  /// \returns Point where to insert code after the workshare construct.
  InsertPointTy applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
                                         InsertPointTy AllocaIP,
                                         omp::WorksharingLoopType LoopType);

  /// Modifies the canonical loop to be a statically-scheduled workshare loop.
  ///
  /// This takes a \p LoopInfo representing a canonical loop, such as the one
  /// created by \p createCanonicalLoop and emits additional instructions to
  /// turn it into a workshare loop. In particular, it calls to an OpenMP
  /// runtime function in the preheader to obtain the loop bounds to be used in
  /// the current thread, updates the relevant instructions in the canonical
  /// loop and calls to an OpenMP runtime finalization function after the loop.
  ///
  /// \param DL       Debug location for instructions added for the
  ///                 workshare-loop construct itself.
  /// \param CLI      A descriptor of the canonical loop to workshare.
  /// \param AllocaIP An insertion point for Alloca instructions usable in the
  ///                 preheader of the loop.
  /// \param NeedsBarrier Indicates whether a barrier must be inserted after
  ///                     the loop.
  ///
  /// \returns Point where to insert code after the workshare construct.
  InsertPointTy applyStaticWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
                                         InsertPointTy AllocaIP,
                                         bool NeedsBarrier);

  /// Modifies the canonical loop a statically-scheduled workshare loop with a
  /// user-specified chunk size.
  ///
  /// \param DL           Debug location for instructions added for the
  ///                     workshare-loop construct itself.
  /// \param CLI          A descriptor of the canonical loop to workshare.
  /// \param AllocaIP     An insertion point for Alloca instructions usable in
  ///                     the preheader of the loop.
  /// \param NeedsBarrier Indicates whether a barrier must be inserted after the
  ///                     loop.
  /// \param ChunkSize    The user-specified chunk size.
  ///
  /// \returns Point where to insert code after the workshare construct.
  InsertPointTy applyStaticChunkedWorkshareLoop(DebugLoc DL,
                                                CanonicalLoopInfo *CLI,
                                                InsertPointTy AllocaIP,
                                                bool NeedsBarrier,
                                                Value *ChunkSize);

  /// Modifies the canonical loop to be a dynamically-scheduled workshare loop.
  ///
  /// This takes a \p LoopInfo representing a canonical loop, such as the one
  /// created by \p createCanonicalLoop and emits additional instructions to
  /// turn it into a workshare loop. In particular, it calls to an OpenMP
  /// runtime function in the preheader to obtain, and then in each iteration
  /// to update the loop counter.
  ///
  /// \param DL       Debug location for instructions added for the
  ///                 workshare-loop construct itself.
  /// \param CLI      A descriptor of the canonical loop to workshare.
  /// \param AllocaIP An insertion point for Alloca instructions usable in the
  ///                 preheader of the loop.
  /// \param SchedType Type of scheduling to be passed to the init function.
  /// \param NeedsBarrier Indicates whether a barrier must be insterted after
  ///                     the loop.
  /// \param Chunk    The size of loop chunk considered as a unit when
  ///                 scheduling. If \p nullptr, defaults to 1.
  ///
  /// \returns Point where to insert code after the workshare construct.
  InsertPointTy applyDynamicWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
                                          InsertPointTy AllocaIP,
                                          omp::OMPScheduleType SchedType,
                                          bool NeedsBarrier,
                                          Value *Chunk = nullptr);

  /// Create alternative version of the loop to support if clause
  ///
  /// OpenMP if clause can require to generate second loop. This loop
  /// will be executed when if clause condition is not met. createIfVersion
  /// adds branch instruction to the copied loop if \p  ifCond is not met.
  ///
  /// \param Loop       Original loop which should be versioned.
  /// \param IfCond     Value which corresponds to if clause condition
  /// \param VMap       Value to value map to define relation between
  ///                   original and copied loop values and loop blocks.
  /// \param NamePrefix Optional name prefix for if.then if.else blocks.
  void createIfVersion(CanonicalLoopInfo *Loop, Value *IfCond,
                       ValueToValueMapTy &VMap, const Twine &NamePrefix = "");

public:
  /// Modifies the canonical loop to be a workshare loop.
  ///
  /// This takes a \p LoopInfo representing a canonical loop, such as the one
  /// created by \p createCanonicalLoop and emits additional instructions to
  /// turn it into a workshare loop. In particular, it calls to an OpenMP
  /// runtime function in the preheader to obtain the loop bounds to be used in
  /// the current thread, updates the relevant instructions in the canonical
  /// loop and calls to an OpenMP runtime finalization function after the loop.
  ///
  /// The concrete transformation is done by applyStaticWorkshareLoop,
  /// applyStaticChunkedWorkshareLoop, or applyDynamicWorkshareLoop, depending
  /// on the value of \p SchedKind and \p ChunkSize.
  ///
  /// \param DL       Debug location for instructions added for the
  ///                 workshare-loop construct itself.
  /// \param CLI      A descriptor of the canonical loop to workshare.
  /// \param AllocaIP An insertion point for Alloca instructions usable in the
  ///                 preheader of the loop.
  /// \param NeedsBarrier Indicates whether a barrier must be insterted after
  ///                     the loop.
  /// \param SchedKind Scheduling algorithm to use.
  /// \param ChunkSize The chunk size for the inner loop.
  /// \param HasSimdModifier Whether the simd modifier is present in the
  ///                        schedule clause.
  /// \param HasMonotonicModifier Whether the monotonic modifier is present in
  ///                             the schedule clause.
  /// \param HasNonmonotonicModifier Whether the nonmonotonic modifier is
  ///                                present in the schedule clause.
  /// \param HasOrderedClause Whether the (parameterless) ordered clause is
  ///                         present.
  /// \param LoopType Information about type of loop worksharing.
  ///                 It corresponds to type of loop workshare OpenMP pragma.
  ///
  /// \returns Point where to insert code after the workshare construct.
  InsertPointTy applyWorkshareLoop(
      DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
      bool NeedsBarrier,
      llvm::omp::ScheduleKind SchedKind = llvm::omp::OMP_SCHEDULE_Default,
      Value *ChunkSize = nullptr, bool HasSimdModifier = false,
      bool HasMonotonicModifier = false, bool HasNonmonotonicModifier = false,
      bool HasOrderedClause = false,
      omp::WorksharingLoopType LoopType =
          omp::WorksharingLoopType::ForStaticLoop);

  /// Tile a loop nest.
  ///
  /// Tiles the loops of \p Loops by the tile sizes in \p TileSizes. Loops in
  /// \p/ Loops must be perfectly nested, from outermost to innermost loop
  /// (i.e. Loops.front() is the outermost loop). The trip count llvm::Value
  /// of every loop and every tile sizes must be usable in the outermost
  /// loop's preheader. This implies that the loop nest is rectangular.
  ///
  /// Example:
  /// \code
  ///   for (int i = 0; i < 15; ++i) // Canonical loop "i"
  ///     for (int j = 0; j < 14; ++j) // Canonical loop "j"
  ///         body(i, j);
  /// \endcode
  ///
  /// After tiling with Loops={i,j} and TileSizes={5,7}, the loop is changed to
  /// \code
  ///   for (int i1 = 0; i1 < 3; ++i1)
  ///     for (int j1 = 0; j1 < 2; ++j1)
  ///       for (int i2 = 0; i2 < 5; ++i2)
  ///         for (int j2 = 0; j2 < 7; ++j2)
  ///           body(i1*3+i2, j1*3+j2);
  /// \endcode
  ///
  /// The returned vector are the loops {i1,j1,i2,j2}. The loops i1 and j1 are
  /// referred to the floor, and the loops i2 and j2 are the tiles. Tiling also
  /// handles non-constant trip counts, non-constant tile sizes and trip counts
  /// that are not multiples of the tile size. In the latter case the tile loop
  /// of the last floor-loop iteration will have fewer iterations than specified
  /// as its tile size.
  ///
  ///
  /// @param DL        Debug location for instructions added by tiling, for
  ///                  instance the floor- and tile trip count computation.
  /// @param Loops     Loops to tile. The CanonicalLoopInfo objects are
  ///                  invalidated by this method, i.e. should not used after
  ///                  tiling.
  /// @param TileSizes For each loop in \p Loops, the tile size for that
  ///                  dimensions.
  ///
  /// \returns A list of generated loops. Contains twice as many loops as the
  ///          input loop nest; the first half are the floor loops and the
  ///          second half are the tile loops.
  std::vector<CanonicalLoopInfo *>
  tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
            ArrayRef<Value *> TileSizes);

  /// Fully unroll a loop.
  ///
  /// Instead of unrolling the loop immediately (and duplicating its body
  /// instructions), it is deferred to LLVM's LoopUnrollPass by adding loop
  /// metadata.
  ///
  /// \param DL   Debug location for instructions added by unrolling.
  /// \param Loop The loop to unroll. The loop will be invalidated.
  void unrollLoopFull(DebugLoc DL, CanonicalLoopInfo *Loop);

  /// Fully or partially unroll a loop. How the loop is unrolled is determined
  /// using LLVM's LoopUnrollPass.
  ///
  /// \param DL   Debug location for instructions added by unrolling.
  /// \param Loop The loop to unroll. The loop will be invalidated.
  void unrollLoopHeuristic(DebugLoc DL, CanonicalLoopInfo *Loop);

  /// Partially unroll a loop.
  ///
  /// The CanonicalLoopInfo of the unrolled loop for use with chained
  /// loop-associated directive can be requested using \p UnrolledCLI. Not
  /// needing the CanonicalLoopInfo allows more efficient code generation by
  /// deferring the actual unrolling to the LoopUnrollPass using loop metadata.
  /// A loop-associated directive applied to the unrolled loop needs to know the
  /// new trip count which means that if using a heuristically determined unroll
  /// factor (\p Factor == 0), that factor must be computed immediately. We are
  /// using the same logic as the LoopUnrollPass to derived the unroll factor,
  /// but which assumes that some canonicalization has taken place (e.g.
  /// Mem2Reg, LICM, GVN, Inlining, etc.). That is, the heuristic will perform
  /// better when the unrolled loop's CanonicalLoopInfo is not needed.
  ///
  /// \param DL          Debug location for instructions added by unrolling.
  /// \param Loop        The loop to unroll. The loop will be invalidated.
  /// \param Factor      The factor to unroll the loop by. A factor of 0
  ///                    indicates that a heuristic should be used to determine
  ///                    the unroll-factor.
  /// \param UnrolledCLI If non-null, receives the CanonicalLoopInfo of the
  ///                    partially unrolled loop. Otherwise, uses loop metadata
  ///                    to defer unrolling to the LoopUnrollPass.
  void unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop, int32_t Factor,
                         CanonicalLoopInfo **UnrolledCLI);

  /// Add metadata to simd-ize a loop. If IfCond is not nullptr, the loop
  /// is cloned. The metadata which prevents vectorization is added to
  /// to the cloned loop. The cloned loop is executed when ifCond is evaluated
  /// to false.
  ///
  /// \param Loop        The loop to simd-ize.
  /// \param AlignedVars The map which containts pairs of the pointer
  ///                    and its corresponding alignment.
  /// \param IfCond      The value which corresponds to the if clause
  ///                    condition.
  /// \param Order       The enum to map order clause.
  /// \param Simdlen     The Simdlen length to apply to the simd loop.
  /// \param Safelen     The Safelen length to apply to the simd loop.
  void applySimd(CanonicalLoopInfo *Loop,
                 MapVector<Value *, Value *> AlignedVars, Value *IfCond,
                 omp::OrderKind Order, ConstantInt *Simdlen,
                 ConstantInt *Safelen);

  /// Generator for '#omp flush'
  ///
  /// \param Loc The location where the flush directive was encountered
  void createFlush(const LocationDescription &Loc);

  /// Generator for '#omp taskwait'
  ///
  /// \param Loc The location where the taskwait directive was encountered.
  void createTaskwait(const LocationDescription &Loc);

  /// Generator for '#omp taskyield'
  ///
  /// \param Loc The location where the taskyield directive was encountered.
  void createTaskyield(const LocationDescription &Loc);

  /// A struct to pack the relevant information for an OpenMP depend clause.
  struct DependData {
    omp::RTLDependenceKindTy DepKind = omp::RTLDependenceKindTy::DepUnknown;
    Type *DepValueType;
    Value *DepVal;
    explicit DependData() = default;
    DependData(omp::RTLDependenceKindTy DepKind, Type *DepValueType,
               Value *DepVal)
        : DepKind(DepKind), DepValueType(DepValueType), DepVal(DepVal) {}
  };

  /// Generator for `#omp task`
  ///
  /// \param Loc The location where the task construct was encountered.
  /// \param AllocaIP The insertion point to be used for alloca instructions.
  /// \param BodyGenCB Callback that will generate the region code.
  /// \param Tied True if the task is tied, false if the task is untied.
  /// \param Final i1 value which is `true` if the task is final, `false` if the
  ///              task is not final.
  /// \param IfCondition i1 value. If it evaluates to `false`, an undeferred
  ///                    task is generated, and the encountering thread must
  ///                    suspend the current task region, for which execution
  ///                    cannot be resumed until execution of the structured
  ///                    block that is associated with the generated task is
  ///                    completed.
  InsertPointTy createTask(const LocationDescription &Loc,
                           InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB,
                           bool Tied = true, Value *Final = nullptr,
                           Value *IfCondition = nullptr,
                           SmallVector<DependData> Dependencies = {});

  /// Generator for the taskgroup construct
  ///
  /// \param Loc The location where the taskgroup construct was encountered.
  /// \param AllocaIP The insertion point to be used for alloca instructions.
  /// \param BodyGenCB Callback that will generate the region code.
  InsertPointTy createTaskgroup(const LocationDescription &Loc,
                                InsertPointTy AllocaIP,
                                BodyGenCallbackTy BodyGenCB);

  using FileIdentifierInfoCallbackTy =
      std::function<std::tuple<std::string, uint64_t>()>;

  /// Creates a unique info for a target entry when provided a filename and
  /// line number from.
  ///
  /// \param CallBack A callback function which should return filename the entry
  /// resides in as well as the line number for the target entry
  /// \param ParentName The name of the parent the target entry resides in, if
  /// any.
  static TargetRegionEntryInfo
  getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack,
                           StringRef ParentName = "");

  /// Enum class for the RedctionGen CallBack type to be used.
  enum class ReductionGenCBKind { Clang, MLIR };

  /// ReductionGen CallBack for Clang
  ///
  /// \param CodeGenIP InsertPoint for CodeGen.
  /// \param Index Index of the ReductionInfo to generate code for.
  /// \param LHSPtr Optionally used by Clang to return the LHSPtr it used for
  /// codegen, used for fixup later.
  /// \param RHSPtr Optionally used by Clang to
  /// return the RHSPtr it used for codegen, used for fixup later.
  /// \param CurFn Optionally used by Clang to pass in the Current Function as
  /// Clang context may be old.
  using ReductionGenClangCBTy =
      std::function<InsertPointTy(InsertPointTy CodeGenIP, unsigned Index,
                                  Value **LHS, Value **RHS, Function *CurFn)>;

  /// ReductionGen CallBack for MLIR
  ///
  /// \param CodeGenIP InsertPoint for CodeGen.
  /// \param LHS Pass in the LHS Value to be used for CodeGen.
  /// \param RHS Pass in the RHS Value to be used for CodeGen.
  using ReductionGenCBTy = std::function<InsertPointTy(
      InsertPointTy CodeGenIP, Value *LHS, Value *RHS, Value *&Res)>;

  /// Functions used to generate atomic reductions. Such functions take two
  /// Values representing pointers to LHS and RHS of the reduction, as well as
  /// the element type of these pointers. They are expected to atomically
  /// update the LHS to the reduced value.
  using ReductionGenAtomicCBTy =
      std::function<InsertPointTy(InsertPointTy, Type *, Value *, Value *)>;

  /// Enum class for reduction evaluation types scalar, complex and aggregate.
  enum class EvalKind { Scalar, Complex, Aggregate };

  /// Information about an OpenMP reduction.
  struct ReductionInfo {
    ReductionInfo(Type *ElementType, Value *Variable, Value *PrivateVariable,
                  EvalKind EvaluationKind, ReductionGenCBTy ReductionGen,
                  ReductionGenClangCBTy ReductionGenClang,
                  ReductionGenAtomicCBTy AtomicReductionGen)
        : ElementType(ElementType), Variable(Variable),
          PrivateVariable(PrivateVariable), EvaluationKind(EvaluationKind),
          ReductionGen(ReductionGen), ReductionGenClang(ReductionGenClang),
          AtomicReductionGen(AtomicReductionGen) {}
    ReductionInfo(Value *PrivateVariable)
        : ElementType(nullptr), Variable(nullptr),
          PrivateVariable(PrivateVariable), EvaluationKind(EvalKind::Scalar),
          ReductionGen(), ReductionGenClang(), AtomicReductionGen() {}

    /// Reduction element type, must match pointee type of variable.
    Type *ElementType;

    /// Reduction variable of pointer type.
    Value *Variable;

    /// Thread-private partial reduction variable.
    Value *PrivateVariable;

    /// Reduction evaluation kind - scalar, complex or aggregate.
    EvalKind EvaluationKind;

    /// Callback for generating the reduction body. The IR produced by this will
    /// be used to combine two values in a thread-safe context, e.g., under
    /// lock or within the same thread, and therefore need not be atomic.
    ReductionGenCBTy ReductionGen;

    /// Clang callback for generating the reduction body. The IR produced by
    /// this will be used to combine two values in a thread-safe context, e.g.,
    /// under lock or within the same thread, and therefore need not be atomic.
    ReductionGenClangCBTy ReductionGenClang;

    /// Callback for generating the atomic reduction body, may be null. The IR
    /// produced by this will be used to atomically combine two values during
    /// reduction. If null, the implementation will use the non-atomic version
    /// along with the appropriate synchronization mechanisms.
    ReductionGenAtomicCBTy AtomicReductionGen;
  };

  enum class CopyAction : unsigned {
    // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
    // the warp using shuffle instructions.
    RemoteLaneToThread,
    // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
    ThreadCopy,
  };

  struct CopyOptionsTy {
    Value *RemoteLaneOffset = nullptr;
    Value *ScratchpadIndex = nullptr;
    Value *ScratchpadWidth = nullptr;
  };

  /// Supporting functions for Reductions CodeGen.
private:
  /// Emit the llvm.used metadata.
  void emitUsed(StringRef Name, std::vector<llvm::WeakTrackingVH> &List);

  /// Get the id of the current thread on the GPU.
  Value *getGPUThreadID();

  /// Get the GPU warp size.
  Value *getGPUWarpSize();

  /// Get the id of the warp in the block.
  /// We assume that the warp size is 32, which is always the case
  /// on the NVPTX device, to generate more efficient code.
  Value *getNVPTXWarpID();

  /// Get the id of the current lane in the Warp.
  /// We assume that the warp size is 32, which is always the case
  /// on the NVPTX device, to generate more efficient code.
  Value *getNVPTXLaneID();

  /// Cast value to the specified type.
  Value *castValueToType(InsertPointTy AllocaIP, Value *From, Type *ToType);

  /// This function creates calls to one of two shuffle functions to copy
  /// variables between lanes in a warp.
  Value *createRuntimeShuffleFunction(InsertPointTy AllocaIP, Value *Element,
                                      Type *ElementType, Value *Offset);

  /// Function to shuffle over the value from the remote lane.
  void shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr, Value *DstAddr,
                       Type *ElementType, Value *Offset,
                       Type *ReductionArrayTy);

  /// Emit instructions to copy a Reduce list, which contains partially
  /// aggregated values, in the specified direction.
  void emitReductionListCopy(
      InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
      ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
      CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr});

  /// Emit a helper that reduces data across two OpenMP threads (lanes)
  /// in the same warp.  It uses shuffle instructions to copy over data from
  /// a remote lane's stack.  The reduction algorithm performed is specified
  /// by the fourth parameter.
  ///
  /// Algorithm Versions.
  /// Full Warp Reduce (argument value 0):
  ///   This algorithm assumes that all 32 lanes are active and gathers
  ///   data from these 32 lanes, producing a single resultant value.
  /// Contiguous Partial Warp Reduce (argument value 1):
  ///   This algorithm assumes that only a *contiguous* subset of lanes
  ///   are active.  This happens for the last warp in a parallel region
  ///   when the user specified num_threads is not an integer multiple of
  ///   32.  This contiguous subset always starts with the zeroth lane.
  /// Partial Warp Reduce (argument value 2):
  ///   This algorithm gathers data from any number of lanes at any position.
  /// All reduced values are stored in the lowest possible lane.  The set
  /// of problems every algorithm addresses is a super set of those
  /// addressable by algorithms with a lower version number.  Overhead
  /// increases as algorithm version increases.
  ///
  /// Terminology
  /// Reduce element:
  ///   Reduce element refers to the individual data field with primitive
  ///   data types to be combined and reduced across threads.
  /// Reduce list:
  ///   Reduce list refers to a collection of local, thread-private
  ///   reduce elements.
  /// Remote Reduce list:
  ///   Remote Reduce list refers to a collection of remote (relative to
  ///   the current thread) reduce elements.
  ///
  /// We distinguish between three states of threads that are important to
  /// the implementation of this function.
  /// Alive threads:
  ///   Threads in a warp executing the SIMT instruction, as distinguished from
  ///   threads that are inactive due to divergent control flow.
  /// Active threads:
  ///   The minimal set of threads that has to be alive upon entry to this
  ///   function.  The computation is correct iff active threads are alive.
  ///   Some threads are alive but they are not active because they do not
  ///   contribute to the computation in any useful manner.  Turning them off
  ///   may introduce control flow overheads without any tangible benefits.
  /// Effective threads:
  ///   In order to comply with the argument requirements of the shuffle
  ///   function, we must keep all lanes holding data alive.  But at most
  ///   half of them perform value aggregation; we refer to this half of
  ///   threads as effective. The other half is simply handing off their
  ///   data.
  ///
  /// Procedure
  /// Value shuffle:
  ///   In this step active threads transfer data from higher lane positions
  ///   in the warp to lower lane positions, creating Remote Reduce list.
  /// Value aggregation:
  ///   In this step, effective threads combine their thread local Reduce list
  ///   with Remote Reduce list and store the result in the thread local
  ///   Reduce list.
  /// Value copy:
  ///   In this step, we deal with the assumption made by algorithm 2
  ///   (i.e. contiguity assumption).  When we have an odd number of lanes
  ///   active, say 2k+1, only k threads will be effective and therefore k
  ///   new values will be produced.  However, the Reduce list owned by the
  ///   (2k+1)th thread is ignored in the value aggregation.  Therefore
  ///   we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
  ///   that the contiguity assumption still holds.
  ///
  /// \param ReductionInfos Array type containing the ReductionOps.
  /// \param ReduceFn The reduction function.
  /// \param FuncAttrs Optional param to specify any function attributes that
  ///                  need to be copied to the new function.
  ///
  /// \return The ShuffleAndReduce function.
  Function *emitShuffleAndReduceFunction(
      ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
      Function *ReduceFn, AttributeList FuncAttrs);

  /// This function emits a helper that gathers Reduce lists from the first
  /// lane of every active warp to lanes in the first warp.
  ///
  /// void inter_warp_copy_func(void* reduce_data, num_warps)
  ///   shared smem[warp_size];
  ///   For all data entries D in reduce_data:
  ///     sync
  ///     If (I am the first lane in each warp)
  ///       Copy my local D to smem[warp_id]
  ///     sync
  ///     if (I am the first warp)
  ///       Copy smem[thread_id] to my local D
  ///
  /// \param Loc The insert and source location description.
  /// \param ReductionInfos Array type containing the ReductionOps.
  /// \param FuncAttrs Optional param to specify any function attributes that
  ///                  need to be copied to the new function.
  ///
  /// \return The InterWarpCopy function.
  Function *emitInterWarpCopyFunction(const LocationDescription &Loc,
                                      ArrayRef<ReductionInfo> ReductionInfos,
                                      AttributeList FuncAttrs);

  /// This function emits a helper that copies all the reduction variables from
  /// the team into the provided global buffer for the reduction variables.
  ///
  /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
  ///   For all data entries D in reduce_data:
  ///     Copy local D to buffer.D[Idx]
  ///
  /// \param ReductionInfos Array type containing the ReductionOps.
  /// \param ReductionsBufferTy The StructTy for the reductions buffer.
  /// \param FuncAttrs Optional param to specify any function attributes that
  ///                  need to be copied to the new function.
  ///
  /// \return The ListToGlobalCopy function.
  Function *emitListToGlobalCopyFunction(ArrayRef<ReductionInfo> ReductionInfos,
                                         Type *ReductionsBufferTy,
                                         AttributeList FuncAttrs);

  /// This function emits a helper that copies all the reduction variables from
  /// the team into the provided global buffer for the reduction variables.
  ///
  /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
  ///   For all data entries D in reduce_data:
  ///     Copy buffer.D[Idx] to local D;
  ///
  /// \param ReductionInfos Array type containing the ReductionOps.
  /// \param ReductionsBufferTy The StructTy for the reductions buffer.
  /// \param FuncAttrs Optional param to specify any function attributes that
  ///                  need to be copied to the new function.
  ///
  /// \return The GlobalToList function.
  Function *emitGlobalToListCopyFunction(ArrayRef<ReductionInfo> ReductionInfos,
                                         Type *ReductionsBufferTy,
                                         AttributeList FuncAttrs);

  /// This function emits a helper that reduces all the reduction variables from
  /// the team into the provided global buffer for the reduction variables.
  ///
  /// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
  ///  void *GlobPtrs[];
  ///  GlobPtrs[0] = (void*)&buffer.D0[Idx];
  ///  ...
  ///  GlobPtrs[N] = (void*)&buffer.DN[Idx];
  ///  reduce_function(GlobPtrs, reduce_data);
  ///
  /// \param ReductionInfos Array type containing the ReductionOps.
  /// \param ReduceFn The reduction function.
  /// \param ReductionsBufferTy The StructTy for the reductions buffer.
  /// \param FuncAttrs Optional param to specify any function attributes that
  ///                  need to be copied to the new function.
  ///
  /// \return The ListToGlobalReduce function.
  Function *
  emitListToGlobalReduceFunction(ArrayRef<ReductionInfo> ReductionInfos,
                                 Function *ReduceFn, Type *ReductionsBufferTy,
                                 AttributeList FuncAttrs);

  /// This function emits a helper that reduces all the reduction variables from
  /// the team into the provided global buffer for the reduction variables.
  ///
  /// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
  ///  void *GlobPtrs[];
  ///  GlobPtrs[0] = (void*)&buffer.D0[Idx];
  ///  ...
  ///  GlobPtrs[N] = (void*)&buffer.DN[Idx];
  ///  reduce_function(reduce_data, GlobPtrs);
  ///
  /// \param ReductionInfos Array type containing the ReductionOps.
  /// \param ReduceFn The reduction function.
  /// \param ReductionsBufferTy The StructTy for the reductions buffer.
  /// \param FuncAttrs Optional param to specify any function attributes that
  ///                  need to be copied to the new function.
  ///
  /// \return The GlobalToListReduce function.
  Function *
  emitGlobalToListReduceFunction(ArrayRef<ReductionInfo> ReductionInfos,
                                 Function *ReduceFn, Type *ReductionsBufferTy,
                                 AttributeList FuncAttrs);

  /// Get the function name of a reduction function.
  std::string getReductionFuncName(StringRef Name) const;

  /// Emits reduction function.
  /// \param ReducerName Name of the function calling the reduction.
  /// \param ReductionInfos Array type containing the ReductionOps.
  /// \param ReductionGenCBKind Optional param to specify Clang or MLIR
  ///                           CodeGenCB kind.
  /// \param FuncAttrs Optional param to specify any function attributes that
  ///                  need to be copied to the new function.
  ///
  /// \return The reduction function.
  Function *createReductionFunction(
      StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
      ReductionGenCBKind ReductionGenCBKind = ReductionGenCBKind::MLIR,
      AttributeList FuncAttrs = {});

public:
  ///
  /// Design of OpenMP reductions on the GPU
  ///
  /// Consider a typical OpenMP program with one or more reduction
  /// clauses:
  ///
  /// float foo;
  /// double bar;
  /// #pragma omp target teams distribute parallel for \
  ///             reduction(+:foo) reduction(*:bar)
  /// for (int i = 0; i < N; i++) {
  ///   foo += A[i]; bar *= B[i];
  /// }
  ///
  /// where 'foo' and 'bar' are reduced across all OpenMP threads in
  /// all teams.  In our OpenMP implementation on the NVPTX device an
  /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
  /// within a team are mapped to CUDA threads within a threadblock.
  /// Our goal is to efficiently aggregate values across all OpenMP
  /// threads such that:
  ///
  ///   - the compiler and runtime are logically concise, and
  ///   - the reduction is performed efficiently in a hierarchical
  ///     manner as follows: within OpenMP threads in the same warp,
  ///     across warps in a threadblock, and finally across teams on
  ///     the NVPTX device.
  ///
  /// Introduction to Decoupling
  ///
  /// We would like to decouple the compiler and the runtime so that the
  /// latter is ignorant of the reduction variables (number, data types)
  /// and the reduction operators.  This allows a simpler interface
  /// and implementation while still attaining good performance.
  ///
  /// Pseudocode for the aforementioned OpenMP program generated by the
  /// compiler is as follows:
  ///
  /// 1. Create private copies of reduction variables on each OpenMP
  ///    thread: 'foo_private', 'bar_private'
  /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
  ///    to it and writes the result in 'foo_private' and 'bar_private'
  ///    respectively.
  /// 3. Call the OpenMP runtime on the GPU to reduce within a team
  ///    and store the result on the team master:
  ///
  ///     __kmpc_nvptx_parallel_reduce_nowait_v2(...,
  ///        reduceData, shuffleReduceFn, interWarpCpyFn)
  ///
  ///     where:
  ///       struct ReduceData {
  ///         double *foo;
  ///         double *bar;
  ///       } reduceData
  ///       reduceData.foo = &foo_private
  ///       reduceData.bar = &bar_private
  ///
  ///     'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
  ///     auxiliary functions generated by the compiler that operate on
  ///     variables of type 'ReduceData'.  They aid the runtime perform
  ///     algorithmic steps in a data agnostic manner.
  ///
  ///     'shuffleReduceFn' is a pointer to a function that reduces data
  ///     of type 'ReduceData' across two OpenMP threads (lanes) in the
  ///     same warp.  It takes the following arguments as input:
  ///
  ///     a. variable of type 'ReduceData' on the calling lane,
  ///     b. its lane_id,
  ///     c. an offset relative to the current lane_id to generate a
  ///        remote_lane_id.  The remote lane contains the second
  ///        variable of type 'ReduceData' that is to be reduced.
  ///     d. an algorithm version parameter determining which reduction
  ///        algorithm to use.
  ///
  ///     'shuffleReduceFn' retrieves data from the remote lane using
  ///     efficient GPU shuffle intrinsics and reduces, using the
  ///     algorithm specified by the 4th parameter, the two operands
  ///     element-wise.  The result is written to the first operand.
  ///
  ///     Different reduction algorithms are implemented in different
  ///     runtime functions, all calling 'shuffleReduceFn' to perform
  ///     the essential reduction step.  Therefore, based on the 4th
  ///     parameter, this function behaves slightly differently to
  ///     cooperate with the runtime to ensure correctness under
  ///     different circumstances.
  ///
  ///     'InterWarpCpyFn' is a pointer to a function that transfers
  ///     reduced variables across warps.  It tunnels, through CUDA
  ///     shared memory, the thread-private data of type 'ReduceData'
  ///     from lane 0 of each warp to a lane in the first warp.
  /// 4. Call the OpenMP runtime on the GPU to reduce across teams.
  ///    The last team writes the global reduced value to memory.
  ///
  ///     ret = __kmpc_nvptx_teams_reduce_nowait(...,
  ///             reduceData, shuffleReduceFn, interWarpCpyFn,
  ///             scratchpadCopyFn, loadAndReduceFn)
  ///
  ///     'scratchpadCopyFn' is a helper that stores reduced
  ///     data from the team master to a scratchpad array in
  ///     global memory.
  ///
  ///     'loadAndReduceFn' is a helper that loads data from
  ///     the scratchpad array and reduces it with the input
  ///     operand.
  ///
  ///     These compiler generated functions hide address
  ///     calculation and alignment information from the runtime.
  /// 5. if ret == 1:
  ///     The team master of the last team stores the reduced
  ///     result to the globals in memory.
  ///     foo += reduceData.foo; bar *= reduceData.bar
  ///
  ///
  /// Warp Reduction Algorithms
  ///
  /// On the warp level, we have three algorithms implemented in the
  /// OpenMP runtime depending on the number of active lanes:
  ///
  /// Full Warp Reduction
  ///
  /// The reduce algorithm within a warp where all lanes are active
  /// is implemented in the runtime as follows:
  ///
  /// full_warp_reduce(void *reduce_data,
  ///                  kmp_ShuffleReductFctPtr ShuffleReduceFn) {
  ///   for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
  ///     ShuffleReduceFn(reduce_data, 0, offset, 0);
  /// }
  ///
  /// The algorithm completes in log(2, WARPSIZE) steps.
  ///
  /// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
  /// not used therefore we save instructions by not retrieving lane_id
  /// from the corresponding special registers.  The 4th parameter, which
  /// represents the version of the algorithm being used, is set to 0 to
  /// signify full warp reduction.
  ///
  /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
  ///
  /// #reduce_elem refers to an element in the local lane's data structure
  /// #remote_elem is retrieved from a remote lane
  /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
  /// reduce_elem = reduce_elem REDUCE_OP remote_elem;
  ///
  /// Contiguous Partial Warp Reduction
  ///
  /// This reduce algorithm is used within a warp where only the first
  /// 'n' (n <= WARPSIZE) lanes are active.  It is typically used when the
  /// number of OpenMP threads in a parallel region is not a multiple of
  /// WARPSIZE.  The algorithm is implemented in the runtime as follows:
  ///
  /// void
  /// contiguous_partial_reduce(void *reduce_data,
  ///                           kmp_ShuffleReductFctPtr ShuffleReduceFn,
  ///                           int size, int lane_id) {
  ///   int curr_size;
  ///   int offset;
  ///   curr_size = size;
  ///   mask = curr_size/2;
  ///   while (offset>0) {
  ///     ShuffleReduceFn(reduce_data, lane_id, offset, 1);
  ///     curr_size = (curr_size+1)/2;
  ///     offset = curr_size/2;
  ///   }
  /// }
  ///
  /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
  ///
  /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
  /// if (lane_id < offset)
  ///     reduce_elem = reduce_elem REDUCE_OP remote_elem
  /// else
  ///     reduce_elem = remote_elem
  ///
  /// This algorithm assumes that the data to be reduced are located in a
  /// contiguous subset of lanes starting from the first.  When there is
  /// an odd number of active lanes, the data in the last lane is not
  /// aggregated with any other lane's dat but is instead copied over.
  ///
  /// Dispersed Partial Warp Reduction
  ///
  /// This algorithm is used within a warp when any discontiguous subset of
  /// lanes are active.  It is used to implement the reduction operation
  /// across lanes in an OpenMP simd region or in a nested parallel region.
  ///
  /// void
  /// dispersed_partial_reduce(void *reduce_data,
  ///                          kmp_ShuffleReductFctPtr ShuffleReduceFn) {
  ///   int size, remote_id;
  ///   int logical_lane_id = number_of_active_lanes_before_me() * 2;
  ///   do {
  ///       remote_id = next_active_lane_id_right_after_me();
  ///       # the above function returns 0 of no active lane
  ///       # is present right after the current lane.
  ///       size = number_of_active_lanes_in_this_warp();
  ///       logical_lane_id /= 2;
  ///       ShuffleReduceFn(reduce_data, logical_lane_id,
  ///                       remote_id-1-threadIdx.x, 2);
  ///   } while (logical_lane_id % 2 == 0 && size > 1);
  /// }
  ///
  /// There is no assumption made about the initial state of the reduction.
  /// Any number of lanes (>=1) could be active at any position.  The reduction
  /// result is returned in the first active lane.
  ///
  /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
  ///
  /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
  /// if (lane_id % 2 == 0 && offset > 0)
  ///     reduce_elem = reduce_elem REDUCE_OP remote_elem
  /// else
  ///     reduce_elem = remote_elem
  ///
  ///
  /// Intra-Team Reduction
  ///
  /// This function, as implemented in the runtime call
  /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
  /// threads in a team.  It first reduces within a warp using the
  /// aforementioned algorithms.  We then proceed to gather all such
  /// reduced values at the first warp.
  ///
  /// The runtime makes use of the function 'InterWarpCpyFn', which copies
  /// data from each of the "warp master" (zeroth lane of each warp, where
  /// warp-reduced data is held) to the zeroth warp.  This step reduces (in
  /// a mathematical sense) the problem of reduction across warp masters in
  /// a block to the problem of warp reduction.
  ///
  ///
  /// Inter-Team Reduction
  ///
  /// Once a team has reduced its data to a single value, it is stored in
  /// a global scratchpad array.  Since each team has a distinct slot, this
  /// can be done without locking.
  ///
  /// The last team to write to the scratchpad array proceeds to reduce the
  /// scratchpad array.  One or more workers in the last team use the helper
  /// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
  /// the k'th worker reduces every k'th element.
  ///
  /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
  /// reduce across workers and compute a globally reduced value.
  ///
  /// \param Loc                The location where the reduction was
  ///                           encountered. Must be within the associate
  ///                           directive and after the last local access to the
  ///                           reduction variables.
  /// \param AllocaIP           An insertion point suitable for allocas usable
  ///                           in reductions.
  /// \param CodeGenIP           An insertion point suitable for code
  /// generation. \param ReductionInfos     A list of info on each reduction
  /// variable. \param IsNoWait           Optional flag set if the reduction is
  /// marked as
  ///                           nowait.
  /// \param IsTeamsReduction   Optional flag set if it is a teams
  ///                           reduction.
  /// \param HasDistribute      Optional flag set if it is a
  ///                           distribute reduction.
  /// \param GridValue          Optional GPU grid value.
  /// \param ReductionBufNum    Optional OpenMPCUDAReductionBufNumValue to be
  /// used for teams reduction.
  /// \param SrcLocInfo         Source location information global.
  InsertPointTy createReductionsGPU(
      const LocationDescription &Loc, InsertPointTy AllocaIP,
      InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
      bool IsNoWait = false, bool IsTeamsReduction = false,
      bool HasDistribute = false,
      ReductionGenCBKind ReductionGenCBKind = ReductionGenCBKind::MLIR,
      std::optional<omp::GV> GridValue = {}, unsigned ReductionBufNum = 1024,
      Value *SrcLocInfo = nullptr);

  // TODO: provide atomic and non-atomic reduction generators for reduction
  // operators defined by the OpenMP specification.

  /// Generator for '#omp reduction'.
  ///
  /// Emits the IR instructing the runtime to perform the specific kind of
  /// reductions. Expects reduction variables to have been privatized and
  /// initialized to reduction-neutral values separately. Emits the calls to
  /// runtime functions as well as the reduction function and the basic blocks
  /// performing the reduction atomically and non-atomically.
  ///
  /// The code emitted for the following:
  ///
  /// \code
  ///   type var_1;
  ///   type var_2;
  ///   #pragma omp <directive> reduction(reduction-op:var_1,var_2)
  ///   /* body */;
  /// \endcode
  ///
  /// corresponds to the following sketch.
  ///
  /// \code
  /// void _outlined_par() {
  ///   // N is the number of different reductions.
  ///   void *red_array[] = {privatized_var_1, privatized_var_2, ...};
  ///   switch(__kmpc_reduce(..., N, /*size of data in red array*/, red_array,
  ///                        _omp_reduction_func,
  ///                        _gomp_critical_user.reduction.var)) {
  ///   case 1: {
  ///     var_1 = var_1 <reduction-op> privatized_var_1;
  ///     var_2 = var_2 <reduction-op> privatized_var_2;
  ///     // ...
  ///    __kmpc_end_reduce(...);
  ///     break;
  ///   }
  ///   case 2: {
  ///     _Atomic<ReductionOp>(var_1, privatized_var_1);
  ///     _Atomic<ReductionOp>(var_2, privatized_var_2);
  ///     // ...
  ///     break;
  ///   }
  ///   default: break;
  ///   }
  /// }
  ///
  /// void _omp_reduction_func(void **lhs, void **rhs) {
  ///   *(type *)lhs[0] = *(type *)lhs[0] <reduction-op> *(type *)rhs[0];
  ///   *(type *)lhs[1] = *(type *)lhs[1] <reduction-op> *(type *)rhs[1];
  ///   // ...
  /// }
  /// \endcode
  ///
  /// \param Loc                The location where the reduction was
  ///                           encountered. Must be within the associate
  ///                           directive and after the last local access to the
  ///                           reduction variables.
  /// \param AllocaIP           An insertion point suitable for allocas usable
  ///                           in reductions.
  /// \param ReductionInfos     A list of info on each reduction variable.
  /// \param IsNoWait           A flag set if the reduction is marked as nowait.
  /// \param IsByRef            A flag set if the reduction is using reference
  /// or direct value.
  InsertPointTy createReductions(const LocationDescription &Loc,
                                 InsertPointTy AllocaIP,
                                 ArrayRef<ReductionInfo> ReductionInfos,
                                 ArrayRef<bool> IsByRef, bool IsNoWait = false);

  ///}

  /// Return the insertion point used by the underlying IRBuilder.
  InsertPointTy getInsertionPoint() { return Builder.saveIP(); }

  /// Update the internal location to \p Loc.
  bool updateToLocation(const LocationDescription &Loc) {
    Builder.restoreIP(Loc.IP);
    Builder.SetCurrentDebugLocation(Loc.DL);
    return Loc.IP.getBlock() != nullptr;
  }

  /// Return the function declaration for the runtime function with \p FnID.
  FunctionCallee getOrCreateRuntimeFunction(Module &M,
                                            omp::RuntimeFunction FnID);

  Function *getOrCreateRuntimeFunctionPtr(omp::RuntimeFunction FnID);

  /// Return the (LLVM-IR) string describing the source location \p LocStr.
  Constant *getOrCreateSrcLocStr(StringRef LocStr, uint32_t &SrcLocStrSize);

  /// Return the (LLVM-IR) string describing the default source location.
  Constant *getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize);

  /// Return the (LLVM-IR) string describing the source location identified by
  /// the arguments.
  Constant *getOrCreateSrcLocStr(StringRef FunctionName, StringRef FileName,
                                 unsigned Line, unsigned Column,
                                 uint32_t &SrcLocStrSize);

  /// Return the (LLVM-IR) string describing the DebugLoc \p DL. Use \p F as
  /// fallback if \p DL does not specify the function name.
  Constant *getOrCreateSrcLocStr(DebugLoc DL, uint32_t &SrcLocStrSize,
                                 Function *F = nullptr);

  /// Return the (LLVM-IR) string describing the source location \p Loc.
  Constant *getOrCreateSrcLocStr(const LocationDescription &Loc,
                                 uint32_t &SrcLocStrSize);

  /// Return an ident_t* encoding the source location \p SrcLocStr and \p Flags.
  /// TODO: Create a enum class for the Reserve2Flags
  Constant *getOrCreateIdent(Constant *SrcLocStr, uint32_t SrcLocStrSize,
                             omp::IdentFlag Flags = omp::IdentFlag(0),
                             unsigned Reserve2Flags = 0);

  /// Create a hidden global flag \p Name in the module with initial value \p
  /// Value.
  GlobalValue *createGlobalFlag(unsigned Value, StringRef Name);

  /// Generate control flow and cleanup for cancellation.
  ///
  /// \param CancelFlag Flag indicating if the cancellation is performed.
  /// \param CanceledDirective The kind of directive that is cancled.
  /// \param ExitCB Extra code to be generated in the exit block.
  void emitCancelationCheckImpl(Value *CancelFlag,
                                omp::Directive CanceledDirective,
                                FinalizeCallbackTy ExitCB = {});

  /// Generate a target region entry call.
  ///
  /// \param Loc The location at which the request originated and is fulfilled.
  /// \param AllocaIP The insertion point to be used for alloca instructions.
  /// \param Return Return value of the created function returned by reference.
  /// \param DeviceID Identifier for the device via the 'device' clause.
  /// \param NumTeams Numer of teams for the region via the 'num_teams' clause
  ///                 or 0 if unspecified and -1 if there is no 'teams' clause.
  /// \param NumThreads Number of threads via the 'thread_limit' clause.
  /// \param HostPtr Pointer to the host-side pointer of the target kernel.
  /// \param KernelArgs Array of arguments to the kernel.
  InsertPointTy emitTargetKernel(const LocationDescription &Loc,
                                 InsertPointTy AllocaIP, Value *&Return,
                                 Value *Ident, Value *DeviceID, Value *NumTeams,
                                 Value *NumThreads, Value *HostPtr,
                                 ArrayRef<Value *> KernelArgs);

  /// Generate a flush runtime call.
  ///
  /// \param Loc The location at which the request originated and is fulfilled.
  void emitFlush(const LocationDescription &Loc);

  /// The finalization stack made up of finalize callbacks currently in-flight,
  /// wrapped into FinalizationInfo objects that reference also the finalization
  /// target block and the kind of cancellable directive.
  SmallVector<FinalizationInfo, 8> FinalizationStack;

  /// Return true if the last entry in the finalization stack is of kind \p DK
  /// and cancellable.
  bool isLastFinalizationInfoCancellable(omp::Directive DK) {
    return !FinalizationStack.empty() &&
           FinalizationStack.back().IsCancellable &&
           FinalizationStack.back().DK == DK;
  }

  /// Generate a taskwait runtime call.
  ///
  /// \param Loc The location at which the request originated and is fulfilled.
  void emitTaskwaitImpl(const LocationDescription &Loc);

  /// Generate a taskyield runtime call.
  ///
  /// \param Loc The location at which the request originated and is fulfilled.
  void emitTaskyieldImpl(const LocationDescription &Loc);

  /// Return the current thread ID.
  ///
  /// \param Ident The ident (ident_t*) describing the query origin.
  Value *getOrCreateThreadID(Value *Ident);

  /// The OpenMPIRBuilder Configuration
  OpenMPIRBuilderConfig Config;

  /// The underlying LLVM-IR module
  Module &M;

  /// The LLVM-IR Builder used to create IR.
  IRBuilder<> Builder;

  /// Map to remember source location strings
  StringMap<Constant *> SrcLocStrMap;

  /// Map to remember existing ident_t*.
  DenseMap<std::pair<Constant *, uint64_t>, Constant *> IdentMap;

  /// Info manager to keep track of target regions.
  OffloadEntriesInfoManager OffloadInfoManager;

  /// The target triple of the underlying module.
  const Triple T;

  /// Helper that contains information about regions we need to outline
  /// during finalization.
  struct OutlineInfo {
    using PostOutlineCBTy = std::function<void(Function &)>;
    PostOutlineCBTy PostOutlineCB;
    BasicBlock *EntryBB, *ExitBB, *OuterAllocaBB;
    SmallVector<Value *, 2> ExcludeArgsFromAggregate;

    /// Collect all blocks in between EntryBB and ExitBB in both the given
    /// vector and set.
    void collectBlocks(SmallPtrSetImpl<BasicBlock *> &BlockSet,
                       SmallVectorImpl<BasicBlock *> &BlockVector);

    /// Return the function that contains the region to be outlined.
    Function *getFunction() const { return EntryBB->getParent(); }
  };

  /// Collection of regions that need to be outlined during finalization.
  SmallVector<OutlineInfo, 16> OutlineInfos;

  /// A collection of candidate target functions that's constant allocas will
  /// attempt to be raised on a call of finalize after all currently enqueued
  /// outline info's have been processed.
  SmallVector<llvm::Function *, 16> ConstantAllocaRaiseCandidates;

  /// Collection of owned canonical loop objects that eventually need to be
  /// free'd.
  std::forward_list<CanonicalLoopInfo> LoopInfos;

  /// Add a new region that will be outlined later.
  void addOutlineInfo(OutlineInfo &&OI) { OutlineInfos.emplace_back(OI); }

  /// An ordered map of auto-generated variables to their unique names.
  /// It stores variables with the following names: 1) ".gomp_critical_user_" +
  /// <critical_section_name> + ".var" for "omp critical" directives; 2)
  /// <mangled_name_for_global_var> + ".cache." for cache for threadprivate
  /// variables.
  StringMap<GlobalVariable *, BumpPtrAllocator> InternalVars;

  /// Computes the size of type in bytes.
  Value *getSizeInBytes(Value *BasePtr);

  // Emit a branch from the current block to the Target block only if
  // the current block has a terminator.
  void emitBranch(BasicBlock *Target);

  // If BB has no use then delete it and return. Else place BB after the current
  // block, if possible, or else at the end of the function. Also add a branch
  // from current block to BB if current block does not have a terminator.
  void emitBlock(BasicBlock *BB, Function *CurFn, bool IsFinished = false);

  /// Emits code for OpenMP 'if' clause using specified \a BodyGenCallbackTy
  /// Here is the logic:
  /// if (Cond) {
  ///   ThenGen();
  /// } else {
  ///   ElseGen();
  /// }
  void emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen,
                    BodyGenCallbackTy ElseGen, InsertPointTy AllocaIP = {});

  /// Create the global variable holding the offload mappings information.
  GlobalVariable *createOffloadMaptypes(SmallVectorImpl<uint64_t> &Mappings,
                                        std::string VarName);

  /// Create the global variable holding the offload names information.
  GlobalVariable *
  createOffloadMapnames(SmallVectorImpl<llvm::Constant *> &Names,
                        std::string VarName);

  struct MapperAllocas {
    AllocaInst *ArgsBase = nullptr;
    AllocaInst *Args = nullptr;
    AllocaInst *ArgSizes = nullptr;
  };

  /// Create the allocas instruction used in call to mapper functions.
  void createMapperAllocas(const LocationDescription &Loc,
                           InsertPointTy AllocaIP, unsigned NumOperands,
                           struct MapperAllocas &MapperAllocas);

  /// Create the call for the target mapper function.
  /// \param Loc The source location description.
  /// \param MapperFunc Function to be called.
  /// \param SrcLocInfo Source location information global.
  /// \param MaptypesArg The argument types.
  /// \param MapnamesArg The argument names.
  /// \param MapperAllocas The AllocaInst used for the call.
  /// \param DeviceID Device ID for the call.
  /// \param NumOperands Number of operands in the call.
  void emitMapperCall(const LocationDescription &Loc, Function *MapperFunc,
                      Value *SrcLocInfo, Value *MaptypesArg, Value *MapnamesArg,
                      struct MapperAllocas &MapperAllocas, int64_t DeviceID,
                      unsigned NumOperands);

  /// Container for the arguments used to pass data to the runtime library.
  struct TargetDataRTArgs {
    /// The array of base pointer passed to the runtime library.
    Value *BasePointersArray = nullptr;
    /// The array of section pointers passed to the runtime library.
    Value *PointersArray = nullptr;
    /// The array of sizes passed to the runtime library.
    Value *SizesArray = nullptr;
    /// The array of map types passed to the runtime library for the beginning
    /// of the region or for the entire region if there are no separate map
    /// types for the region end.
    Value *MapTypesArray = nullptr;
    /// The array of map types passed to the runtime library for the end of the
    /// region, or nullptr if there are no separate map types for the region
    /// end.
    Value *MapTypesArrayEnd = nullptr;
    /// The array of user-defined mappers passed to the runtime library.
    Value *MappersArray = nullptr;
    /// The array of original declaration names of mapped pointers sent to the
    /// runtime library for debugging
    Value *MapNamesArray = nullptr;

    explicit TargetDataRTArgs() {}
    explicit TargetDataRTArgs(Value *BasePointersArray, Value *PointersArray,
                              Value *SizesArray, Value *MapTypesArray,
                              Value *MapTypesArrayEnd, Value *MappersArray,
                              Value *MapNamesArray)
        : BasePointersArray(BasePointersArray), PointersArray(PointersArray),
          SizesArray(SizesArray), MapTypesArray(MapTypesArray),
          MapTypesArrayEnd(MapTypesArrayEnd), MappersArray(MappersArray),
          MapNamesArray(MapNamesArray) {}
  };

  /// Data structure that contains the needed information to construct the
  /// kernel args vector.
  struct TargetKernelArgs {
    /// Number of arguments passed to the runtime library.
    unsigned NumTargetItems;
    /// Arguments passed to the runtime library
    TargetDataRTArgs RTArgs;
    /// The number of iterations
    Value *NumIterations;
    /// The number of teams.
    Value *NumTeams;
    /// The number of threads.
    Value *NumThreads;
    /// The size of the dynamic shared memory.
    Value *DynCGGroupMem;
    /// True if the kernel has 'no wait' clause.
    bool HasNoWait;

    /// Constructor for TargetKernelArgs
    TargetKernelArgs(unsigned NumTargetItems, TargetDataRTArgs RTArgs,
                     Value *NumIterations, Value *NumTeams, Value *NumThreads,
                     Value *DynCGGroupMem, bool HasNoWait)
        : NumTargetItems(NumTargetItems), RTArgs(RTArgs),
          NumIterations(NumIterations), NumTeams(NumTeams),
          NumThreads(NumThreads), DynCGGroupMem(DynCGGroupMem),
          HasNoWait(HasNoWait) {}
  };

  /// Create the kernel args vector used by emitTargetKernel. This function
  /// creates various constant values that are used in the resulting args
  /// vector.
  static void getKernelArgsVector(TargetKernelArgs &KernelArgs,
                                  IRBuilderBase &Builder,
                                  SmallVector<Value *> &ArgsVector);

  /// Struct that keeps the information that should be kept throughout
  /// a 'target data' region.
  class TargetDataInfo {
    /// Set to true if device pointer information have to be obtained.
    bool RequiresDevicePointerInfo = false;
    /// Set to true if Clang emits separate runtime calls for the beginning and
    /// end of the region.  These calls might have separate map type arrays.
    bool SeparateBeginEndCalls = false;

  public:
    TargetDataRTArgs RTArgs;

    SmallMapVector<const Value *, std::pair<Value *, Value *>, 4>
        DevicePtrInfoMap;

    /// Indicate whether any user-defined mapper exists.
    bool HasMapper = false;
    /// The total number of pointers passed to the runtime library.
    unsigned NumberOfPtrs = 0u;

    explicit TargetDataInfo() {}
    explicit TargetDataInfo(bool RequiresDevicePointerInfo,
                            bool SeparateBeginEndCalls)
        : RequiresDevicePointerInfo(RequiresDevicePointerInfo),
          SeparateBeginEndCalls(SeparateBeginEndCalls) {}
    /// Clear information about the data arrays.
    void clearArrayInfo() {
      RTArgs = TargetDataRTArgs();
      HasMapper = false;
      NumberOfPtrs = 0u;
    }
    /// Return true if the current target data information has valid arrays.
    bool isValid() {
      return RTArgs.BasePointersArray && RTArgs.PointersArray &&
             RTArgs.SizesArray && RTArgs.MapTypesArray &&
             (!HasMapper || RTArgs.MappersArray) && NumberOfPtrs;
    }
    bool requiresDevicePointerInfo() { return RequiresDevicePointerInfo; }
    bool separateBeginEndCalls() { return SeparateBeginEndCalls; }
  };

  enum class DeviceInfoTy { None, Pointer, Address };
  using MapValuesArrayTy = SmallVector<Value *, 4>;
  using MapDeviceInfoArrayTy = SmallVector<DeviceInfoTy, 4>;
  using MapFlagsArrayTy = SmallVector<omp::OpenMPOffloadMappingFlags, 4>;
  using MapNamesArrayTy = SmallVector<Constant *, 4>;
  using MapDimArrayTy = SmallVector<uint64_t, 4>;
  using MapNonContiguousArrayTy = SmallVector<MapValuesArrayTy, 4>;

  /// This structure contains combined information generated for mappable
  /// clauses, including base pointers, pointers, sizes, map types, user-defined
  /// mappers, and non-contiguous information.
  struct MapInfosTy {
    struct StructNonContiguousInfo {
      bool IsNonContiguous = false;
      MapDimArrayTy Dims;
      MapNonContiguousArrayTy Offsets;
      MapNonContiguousArrayTy Counts;
      MapNonContiguousArrayTy Strides;
    };
    MapValuesArrayTy BasePointers;
    MapValuesArrayTy Pointers;
    MapDeviceInfoArrayTy DevicePointers;
    MapValuesArrayTy Sizes;
    MapFlagsArrayTy Types;
    MapNamesArrayTy Names;
    StructNonContiguousInfo NonContigInfo;

    /// Append arrays in \a CurInfo.
    void append(MapInfosTy &CurInfo) {
      BasePointers.append(CurInfo.BasePointers.begin(),
                          CurInfo.BasePointers.end());
      Pointers.append(CurInfo.Pointers.begin(), CurInfo.Pointers.end());
      DevicePointers.append(CurInfo.DevicePointers.begin(),
                            CurInfo.DevicePointers.end());
      Sizes.append(CurInfo.Sizes.begin(), CurInfo.Sizes.end());
      Types.append(CurInfo.Types.begin(), CurInfo.Types.end());
      Names.append(CurInfo.Names.begin(), CurInfo.Names.end());
      NonContigInfo.Dims.append(CurInfo.NonContigInfo.Dims.begin(),
                                CurInfo.NonContigInfo.Dims.end());
      NonContigInfo.Offsets.append(CurInfo.NonContigInfo.Offsets.begin(),
                                   CurInfo.NonContigInfo.Offsets.end());
      NonContigInfo.Counts.append(CurInfo.NonContigInfo.Counts.begin(),
                                  CurInfo.NonContigInfo.Counts.end());
      NonContigInfo.Strides.append(CurInfo.NonContigInfo.Strides.begin(),
                                   CurInfo.NonContigInfo.Strides.end());
    }
  };

  /// Callback function type for functions emitting the host fallback code that
  /// is executed when the kernel launch fails. It takes an insertion point as
  /// parameter where the code should be emitted. It returns an insertion point
  /// that points right after after the emitted code.
  using EmitFallbackCallbackTy = function_ref<InsertPointTy(InsertPointTy)>;

  /// Generate a target region entry call and host fallback call.
  ///
  /// \param Loc The location at which the request originated and is fulfilled.
  /// \param OutlinedFn The outlined kernel function.
  /// \param OutlinedFnID The ooulined function ID.
  /// \param EmitTargetCallFallbackCB Call back function to generate host
  ///        fallback code.
  /// \param Args Data structure holding information about the kernel arguments.
  /// \param DeviceID Identifier for the device via the 'device' clause.
  /// \param RTLoc Source location identifier
  /// \param AllocaIP The insertion point to be used for alloca instructions.
  InsertPointTy emitKernelLaunch(
      const LocationDescription &Loc, Function *OutlinedFn, Value *OutlinedFnID,
      EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
      Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP);

  /// Generate a target-task for the target construct
  ///
  /// \param OutlinedFn The outlined device/target kernel function.
  /// \param OutlinedFnID The ooulined function ID.
  /// \param EmitTargetCallFallbackCB Call back function to generate host
  ///        fallback code.
  /// \param Args Data structure holding information about the kernel arguments.
  /// \param DeviceID Identifier for the device via the 'device' clause.
  /// \param RTLoc Source location identifier
  /// \param AllocaIP The insertion point to be used for alloca instructions.
  /// \param Dependencies Vector of DependData objects holding information of
  ///        dependencies as specified by the 'depend' clause.
  /// \param HasNoWait True if the target construct had 'nowait' on it, false
  ///        otherwise
  InsertPointTy emitTargetTask(
      Function *OutlinedFn, Value *OutlinedFnID,
      EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
      Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP,
      SmallVector<OpenMPIRBuilder::DependData> &Dependencies, bool HasNoWait);

  /// Emit the arguments to be passed to the runtime library based on the
  /// arrays of base pointers, pointers, sizes, map types, and mappers.  If
  /// ForEndCall, emit map types to be passed for the end of the region instead
  /// of the beginning.
  void emitOffloadingArraysArgument(IRBuilderBase &Builder,
                                    OpenMPIRBuilder::TargetDataRTArgs &RTArgs,
                                    OpenMPIRBuilder::TargetDataInfo &Info,
                                    bool EmitDebug = false,
                                    bool ForEndCall = false);

  /// Emit an array of struct descriptors to be assigned to the offload args.
  void emitNonContiguousDescriptor(InsertPointTy AllocaIP,
                                   InsertPointTy CodeGenIP,
                                   MapInfosTy &CombinedInfo,
                                   TargetDataInfo &Info);

  /// Emit the arrays used to pass the captures and map information to the
  /// offloading runtime library. If there is no map or capture information,
  /// return nullptr by reference.
  void emitOffloadingArrays(
      InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
      TargetDataInfo &Info, bool IsNonContiguous = false,
      function_ref<void(unsigned int, Value *)> DeviceAddrCB = nullptr,
      function_ref<Value *(unsigned int)> CustomMapperCB = nullptr);

  /// Creates offloading entry for the provided entry ID \a ID, address \a
  /// Addr, size \a Size, and flags \a Flags.
  void createOffloadEntry(Constant *ID, Constant *Addr, uint64_t Size,
                          int32_t Flags, GlobalValue::LinkageTypes,
                          StringRef Name = "");

  /// The kind of errors that can occur when emitting the offload entries and
  /// metadata.
  enum EmitMetadataErrorKind {
    EMIT_MD_TARGET_REGION_ERROR,
    EMIT_MD_DECLARE_TARGET_ERROR,
    EMIT_MD_GLOBAL_VAR_LINK_ERROR
  };

  /// Callback function type
  using EmitMetadataErrorReportFunctionTy =
      std::function<void(EmitMetadataErrorKind, TargetRegionEntryInfo)>;

  // Emit the offloading entries and metadata so that the device codegen side
  // can easily figure out what to emit. The produced metadata looks like
  // this:
  //
  // !omp_offload.info = !{!1, ...}
  //
  // We only generate metadata for function that contain target regions.
  void createOffloadEntriesAndInfoMetadata(
      EmitMetadataErrorReportFunctionTy &ErrorReportFunction);

public:
  /// Generator for __kmpc_copyprivate
  ///
  /// \param Loc The source location description.
  /// \param BufSize Number of elements in the buffer.
  /// \param CpyBuf List of pointers to data to be copied.
  /// \param CpyFn function to call for copying data.
  /// \param DidIt flag variable; 1 for 'single' thread, 0 otherwise.
  ///
  /// \return The insertion position *after* the CopyPrivate call.

  InsertPointTy createCopyPrivate(const LocationDescription &Loc,
                                  llvm::Value *BufSize, llvm::Value *CpyBuf,
                                  llvm::Value *CpyFn, llvm::Value *DidIt);

  /// Generator for '#omp single'
  ///
  /// \param Loc The source location description.
  /// \param BodyGenCB Callback that will generate the region code.
  /// \param FiniCB Callback to finalize variable copies.
  /// \param IsNowait If false, a barrier is emitted.
  /// \param CPVars copyprivate variables.
  /// \param CPFuncs copy functions to use for each copyprivate variable.
  ///
  /// \returns The insertion position *after* the single call.
  InsertPointTy createSingle(const LocationDescription &Loc,
                             BodyGenCallbackTy BodyGenCB,
                             FinalizeCallbackTy FiniCB, bool IsNowait,
                             ArrayRef<llvm::Value *> CPVars = {},
                             ArrayRef<llvm::Function *> CPFuncs = {});

  /// Generator for '#omp master'
  ///
  /// \param Loc The insert and source location description.
  /// \param BodyGenCB Callback that will generate the region code.
  /// \param FiniCB Callback to finalize variable copies.
  ///
  /// \returns The insertion position *after* the master.
  InsertPointTy createMaster(const LocationDescription &Loc,
                             BodyGenCallbackTy BodyGenCB,
                             FinalizeCallbackTy FiniCB);

  /// Generator for '#omp masked'
  ///
  /// \param Loc The insert and source location description.
  /// \param BodyGenCB Callback that will generate the region code.
  /// \param FiniCB Callback to finialize variable copies.
  ///
  /// \returns The insertion position *after* the masked.
  InsertPointTy createMasked(const LocationDescription &Loc,
                             BodyGenCallbackTy BodyGenCB,
                             FinalizeCallbackTy FiniCB, Value *Filter);

  /// Generator for '#omp critical'
  ///
  /// \param Loc The insert and source location description.
  /// \param BodyGenCB Callback that will generate the region body code.
  /// \param FiniCB Callback to finalize variable copies.
  /// \param CriticalName name of the lock used by the critical directive
  /// \param HintInst Hint Instruction for hint clause associated with critical
  ///
  /// \returns The insertion position *after* the critical.
  InsertPointTy createCritical(const LocationDescription &Loc,
                               BodyGenCallbackTy BodyGenCB,
                               FinalizeCallbackTy FiniCB,
                               StringRef CriticalName, Value *HintInst);

  /// Generator for '#omp ordered depend (source | sink)'
  ///
  /// \param Loc The insert and source location description.
  /// \param AllocaIP The insertion point to be used for alloca instructions.
  /// \param NumLoops The number of loops in depend clause.
  /// \param StoreValues The value will be stored in vector address.
  /// \param Name The name of alloca instruction.
  /// \param IsDependSource If true, depend source; otherwise, depend sink.
  ///
  /// \return The insertion position *after* the ordered.
  InsertPointTy createOrderedDepend(const LocationDescription &Loc,
                                    InsertPointTy AllocaIP, unsigned NumLoops,
                                    ArrayRef<llvm::Value *> StoreValues,
                                    const Twine &Name, bool IsDependSource);

  /// Generator for '#omp ordered [threads | simd]'
  ///
  /// \param Loc The insert and source location description.
  /// \param BodyGenCB Callback that will generate the region code.
  /// \param FiniCB Callback to finalize variable copies.
  /// \param IsThreads If true, with threads clause or without clause;
  /// otherwise, with simd clause;
  ///
  /// \returns The insertion position *after* the ordered.
  InsertPointTy createOrderedThreadsSimd(const LocationDescription &Loc,
                                         BodyGenCallbackTy BodyGenCB,
                                         FinalizeCallbackTy FiniCB,
                                         bool IsThreads);

  /// Generator for '#omp sections'
  ///
  /// \param Loc The insert and source location description.
  /// \param AllocaIP The insertion points to be used for alloca instructions.
  /// \param SectionCBs Callbacks that will generate body of each section.
  /// \param PrivCB Callback to copy a given variable (think copy constructor).
  /// \param FiniCB Callback to finalize variable copies.
  /// \param IsCancellable Flag to indicate a cancellable parallel region.
  /// \param IsNowait If true, barrier - to ensure all sections are executed
  /// before moving forward will not be generated.
  /// \returns The insertion position *after* the sections.
  InsertPointTy createSections(const LocationDescription &Loc,
                               InsertPointTy AllocaIP,
                               ArrayRef<StorableBodyGenCallbackTy> SectionCBs,
                               PrivatizeCallbackTy PrivCB,
                               FinalizeCallbackTy FiniCB, bool IsCancellable,
                               bool IsNowait);

  /// Generator for '#omp section'
  ///
  /// \param Loc The insert and source location description.
  /// \param BodyGenCB Callback that will generate the region body code.
  /// \param FiniCB Callback to finalize variable copies.
  /// \returns The insertion position *after* the section.
  InsertPointTy createSection(const LocationDescription &Loc,
                              BodyGenCallbackTy BodyGenCB,
                              FinalizeCallbackTy FiniCB);

  /// Generator for `#omp teams`
  ///
  /// \param Loc The location where the teams construct was encountered.
  /// \param BodyGenCB Callback that will generate the region code.
  /// \param NumTeamsLower Lower bound on number of teams. If this is nullptr,
  ///        it is as if lower bound is specified as equal to upperbound. If
  ///        this is non-null, then upperbound must also be non-null.
  /// \param NumTeamsUpper Upper bound on the number of teams.
  /// \param ThreadLimit on the number of threads that may participate in a
  ///        contention group created by each team.
  /// \param IfExpr is the integer argument value of the if condition on the
  ///        teams clause.
  InsertPointTy
  createTeams(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
              Value *NumTeamsLower = nullptr, Value *NumTeamsUpper = nullptr,
              Value *ThreadLimit = nullptr, Value *IfExpr = nullptr);

  /// Generate conditional branch and relevant BasicBlocks through which private
  /// threads copy the 'copyin' variables from Master copy to threadprivate
  /// copies.
  ///
  /// \param IP insertion block for copyin conditional
  /// \param MasterVarPtr a pointer to the master variable
  /// \param PrivateVarPtr a pointer to the threadprivate variable
  /// \param IntPtrTy Pointer size type
  /// \param BranchtoEnd Create a branch between the copyin.not.master blocks
  //                 and copy.in.end block
  ///
  /// \returns The insertion point where copying operation to be emitted.
  InsertPointTy createCopyinClauseBlocks(InsertPointTy IP, Value *MasterAddr,
                                         Value *PrivateAddr,
                                         llvm::IntegerType *IntPtrTy,
                                         bool BranchtoEnd = true);

  /// Create a runtime call for kmpc_Alloc
  ///
  /// \param Loc The insert and source location description.
  /// \param Size Size of allocated memory space
  /// \param Allocator Allocator information instruction
  /// \param Name Name of call Instruction for OMP_alloc
  ///
  /// \returns CallInst to the OMP_Alloc call
  CallInst *createOMPAlloc(const LocationDescription &Loc, Value *Size,
                           Value *Allocator, std::string Name = "");

  /// Create a runtime call for kmpc_free
  ///
  /// \param Loc The insert and source location description.
  /// \param Addr Address of memory space to be freed
  /// \param Allocator Allocator information instruction
  /// \param Name Name of call Instruction for OMP_Free
  ///
  /// \returns CallInst to the OMP_Free call
  CallInst *createOMPFree(const LocationDescription &Loc, Value *Addr,
                          Value *Allocator, std::string Name = "");

  /// Create a runtime call for kmpc_threadprivate_cached
  ///
  /// \param Loc The insert and source location description.
  /// \param Pointer pointer to data to be cached
  /// \param Size size of data to be cached
  /// \param Name Name of call Instruction for callinst
  ///
  /// \returns CallInst to the thread private cache call.
  CallInst *createCachedThreadPrivate(const LocationDescription &Loc,
                                      llvm::Value *Pointer,
                                      llvm::ConstantInt *Size,
                                      const llvm::Twine &Name = Twine(""));

  /// Create a runtime call for __tgt_interop_init
  ///
  /// \param Loc The insert and source location description.
  /// \param InteropVar variable to be allocated
  /// \param InteropType type of interop operation
  /// \param Device devide to which offloading will occur
  /// \param NumDependences  number of dependence variables
  /// \param DependenceAddress pointer to dependence variables
  /// \param HaveNowaitClause does nowait clause exist
  ///
  /// \returns CallInst to the __tgt_interop_init call
  CallInst *createOMPInteropInit(const LocationDescription &Loc,
                                 Value *InteropVar,
                                 omp::OMPInteropType InteropType, Value *Device,
                                 Value *NumDependences,
                                 Value *DependenceAddress,
                                 bool HaveNowaitClause);

  /// Create a runtime call for __tgt_interop_destroy
  ///
  /// \param Loc The insert and source location description.
  /// \param InteropVar variable to be allocated
  /// \param Device devide to which offloading will occur
  /// \param NumDependences  number of dependence variables
  /// \param DependenceAddress pointer to dependence variables
  /// \param HaveNowaitClause does nowait clause exist
  ///
  /// \returns CallInst to the __tgt_interop_destroy call
  CallInst *createOMPInteropDestroy(const LocationDescription &Loc,
                                    Value *InteropVar, Value *Device,
                                    Value *NumDependences,
                                    Value *DependenceAddress,
                                    bool HaveNowaitClause);

  /// Create a runtime call for __tgt_interop_use
  ///
  /// \param Loc The insert and source location description.
  /// \param InteropVar variable to be allocated
  /// \param Device devide to which offloading will occur
  /// \param NumDependences  number of dependence variables
  /// \param DependenceAddress pointer to dependence variables
  /// \param HaveNowaitClause does nowait clause exist
  ///
  /// \returns CallInst to the __tgt_interop_use call
  CallInst *createOMPInteropUse(const LocationDescription &Loc,
                                Value *InteropVar, Value *Device,
                                Value *NumDependences, Value *DependenceAddress,
                                bool HaveNowaitClause);

  /// The `omp target` interface
  ///
  /// For more information about the usage of this interface,
  /// \see openmp/libomptarget/deviceRTLs/common/include/target.h
  ///
  ///{

  /// Create a runtime call for kmpc_target_init
  ///
  /// \param Loc The insert and source location description.
  /// \param IsSPMD Flag to indicate if the kernel is an SPMD kernel or not.
  /// \param MinThreads Minimal number of threads, or 0.
  /// \param MaxThreads Maximal number of threads, or 0.
  /// \param MinTeams Minimal number of teams, or 0.
  /// \param MaxTeams Maximal number of teams, or 0.
  InsertPointTy createTargetInit(const LocationDescription &Loc, bool IsSPMD,
                                 int32_t MinThreadsVal = 0,
                                 int32_t MaxThreadsVal = 0,
                                 int32_t MinTeamsVal = 0,
                                 int32_t MaxTeamsVal = 0);

  /// Create a runtime call for kmpc_target_deinit
  ///
  /// \param Loc The insert and source location description.
  /// \param TeamsReductionDataSize The maximal size of all the reduction data
  ///        for teams reduction.
  /// \param TeamsReductionBufferLength The number of elements (each of up to
  ///        \p TeamsReductionDataSize size), in the teams reduction buffer.
  void createTargetDeinit(const LocationDescription &Loc,
                          int32_t TeamsReductionDataSize = 0,
                          int32_t TeamsReductionBufferLength = 1024);

  ///}

  /// Helpers to read/write kernel annotations from the IR.
  ///
  ///{

  /// Read/write a bounds on threads for \p Kernel. Read will return 0 if none
  /// is set.
  static std::pair<int32_t, int32_t>
  readThreadBoundsForKernel(const Triple &T, Function &Kernel);
  static void writeThreadBoundsForKernel(const Triple &T, Function &Kernel,
                                         int32_t LB, int32_t UB);

  /// Read/write a bounds on teams for \p Kernel. Read will return 0 if none
  /// is set.
  static std::pair<int32_t, int32_t> readTeamBoundsForKernel(const Triple &T,
                                                             Function &Kernel);
  static void writeTeamsForKernel(const Triple &T, Function &Kernel, int32_t LB,
                                  int32_t UB);
  ///}

private:
  // Sets the function attributes expected for the outlined function
  void setOutlinedTargetRegionFunctionAttributes(Function *OutlinedFn);

  // Creates the function ID/Address for the given outlined function.
  // In the case of an embedded device function the address of the function is
  // used, in the case of a non-offload function a constant is created.
  Constant *createOutlinedFunctionID(Function *OutlinedFn,
                                     StringRef EntryFnIDName);

  // Creates the region entry address for the outlined function
  Constant *createTargetRegionEntryAddr(Function *OutlinedFunction,
                                        StringRef EntryFnName);

public:
  /// Functions used to generate a function with the given name.
  using FunctionGenCallback = std::function<Function *(StringRef FunctionName)>;

  /// Create a unique name for the entry function using the source location
  /// information of the current target region. The name will be something like:
  ///
  /// __omp_offloading_DD_FFFF_PP_lBB[_CC]
  ///
  /// where DD_FFFF is an ID unique to the file (device and file IDs), PP is the
  /// mangled name of the function that encloses the target region and BB is the
  /// line number of the target region. CC is a count added when more than one
  /// region is located at the same location.
  ///
  /// If this target outline function is not an offload entry, we don't need to
  /// register it. This may happen if it is guarded by an if clause that is
  /// false at compile time, or no target archs have been specified.
  ///
  /// The created target region ID is used by the runtime library to identify
  /// the current target region, so it only has to be unique and not
  /// necessarily point to anything. It could be the pointer to the outlined
  /// function that implements the target region, but we aren't using that so
  /// that the compiler doesn't need to keep that, and could therefore inline
  /// the host function if proven worthwhile during optimization. In the other
  /// hand, if emitting code for the device, the ID has to be the function
  /// address so that it can retrieved from the offloading entry and launched
  /// by the runtime library. We also mark the outlined function to have
  /// external linkage in case we are emitting code for the device, because
  /// these functions will be entry points to the device.
  ///
  /// \param InfoManager The info manager keeping track of the offload entries
  /// \param EntryInfo The entry information about the function
  /// \param GenerateFunctionCallback The callback function to generate the code
  /// \param OutlinedFunction Pointer to the outlined function
  /// \param EntryFnIDName Name of the ID o be created
  void emitTargetRegionFunction(TargetRegionEntryInfo &EntryInfo,
                                FunctionGenCallback &GenerateFunctionCallback,
                                bool IsOffloadEntry, Function *&OutlinedFn,
                                Constant *&OutlinedFnID);

  /// Registers the given function and sets up the attribtues of the function
  /// Returns the FunctionID.
  ///
  /// \param InfoManager The info manager keeping track of the offload entries
  /// \param EntryInfo The entry information about the function
  /// \param OutlinedFunction Pointer to the outlined function
  /// \param EntryFnName Name of the outlined function
  /// \param EntryFnIDName Name of the ID o be created
  Constant *registerTargetRegionFunction(TargetRegionEntryInfo &EntryInfo,
                                         Function *OutlinedFunction,
                                         StringRef EntryFnName,
                                         StringRef EntryFnIDName);

  /// Type of BodyGen to use for region codegen
  ///
  /// Priv: If device pointer privatization is required, emit the body of the
  /// region here. It will have to be duplicated: with and without
  /// privatization.
  /// DupNoPriv: If we need device pointer privatization, we need
  /// to emit the body of the region with no privatization in the 'else' branch
  /// of the conditional.
  /// NoPriv: If we don't require privatization of device
  /// pointers, we emit the body in between the runtime calls. This avoids
  /// duplicating the body code.
  enum BodyGenTy { Priv, DupNoPriv, NoPriv };

  /// Callback type for creating the map infos for the kernel parameters.
  /// \param CodeGenIP is the insertion point where code should be generated,
  ///        if any.
  using GenMapInfoCallbackTy =
      function_ref<MapInfosTy &(InsertPointTy CodeGenIP)>;

  /// Generator for '#omp target data'
  ///
  /// \param Loc The location where the target data construct was encountered.
  /// \param AllocaIP The insertion points to be used for alloca instructions.
  /// \param CodeGenIP The insertion point at which the target directive code
  /// should be placed.
  /// \param IsBegin If true then emits begin mapper call otherwise emits
  /// end mapper call.
  /// \param DeviceID Stores the DeviceID from the device clause.
  /// \param IfCond Value which corresponds to the if clause condition.
  /// \param Info Stores all information realted to the Target Data directive.
  /// \param GenMapInfoCB Callback that populates the MapInfos and returns.
  /// \param BodyGenCB Optional Callback to generate the region code.
  /// \param DeviceAddrCB Optional callback to generate code related to
  /// use_device_ptr and use_device_addr.
  /// \param CustomMapperCB Optional callback to generate code related to
  /// custom mappers.
  OpenMPIRBuilder::InsertPointTy createTargetData(
      const LocationDescription &Loc, InsertPointTy AllocaIP,
      InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
      TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB,
      omp::RuntimeFunction *MapperFunc = nullptr,
      function_ref<InsertPointTy(InsertPointTy CodeGenIP,
                                 BodyGenTy BodyGenType)>
          BodyGenCB = nullptr,
      function_ref<void(unsigned int, Value *)> DeviceAddrCB = nullptr,
      function_ref<Value *(unsigned int)> CustomMapperCB = nullptr,
      Value *SrcLocInfo = nullptr);

  using TargetBodyGenCallbackTy = function_ref<InsertPointTy(
      InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>;

  using TargetGenArgAccessorsCallbackTy = function_ref<InsertPointTy(
      Argument &Arg, Value *Input, Value *&RetVal, InsertPointTy AllocaIP,
      InsertPointTy CodeGenIP)>;

  /// Generator for '#omp target'
  ///
  /// \param Loc where the target data construct was encountered.
  /// \param CodeGenIP The insertion point where the call to the outlined
  /// function should be emitted.
  /// \param EntryInfo The entry information about the function.
  /// \param NumTeams Number of teams specified in the num_teams clause.
  /// \param NumThreads Number of teams specified in the thread_limit clause.
  /// \param Inputs The input values to the region that will be passed.
  /// as arguments to the outlined function.
  /// \param BodyGenCB Callback that will generate the region code.
  /// \param ArgAccessorFuncCB Callback that will generate accessors
  /// instructions for passed in target arguments where neccessary
  /// \param Dependencies A vector of DependData objects that carry
  // dependency information as passed in the depend clause
  InsertPointTy createTarget(const LocationDescription &Loc,
                             OpenMPIRBuilder::InsertPointTy AllocaIP,
                             OpenMPIRBuilder::InsertPointTy CodeGenIP,
                             TargetRegionEntryInfo &EntryInfo, int32_t NumTeams,
                             int32_t NumThreads,
                             SmallVectorImpl<Value *> &Inputs,
                             GenMapInfoCallbackTy GenMapInfoCB,
                             TargetBodyGenCallbackTy BodyGenCB,
                             TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
                             SmallVector<DependData> Dependencies = {});

  /// Returns __kmpc_for_static_init_* runtime function for the specified
  /// size \a IVSize and sign \a IVSigned. Will create a distribute call
  /// __kmpc_distribute_static_init* if \a IsGPUDistribute is set.
  FunctionCallee createForStaticInitFunction(unsigned IVSize, bool IVSigned,
                                             bool IsGPUDistribute);

  /// Returns __kmpc_dispatch_init_* runtime function for the specified
  /// size \a IVSize and sign \a IVSigned.
  FunctionCallee createDispatchInitFunction(unsigned IVSize, bool IVSigned);

  /// Returns __kmpc_dispatch_next_* runtime function for the specified
  /// size \a IVSize and sign \a IVSigned.
  FunctionCallee createDispatchNextFunction(unsigned IVSize, bool IVSigned);

  /// Returns __kmpc_dispatch_fini_* runtime function for the specified
  /// size \a IVSize and sign \a IVSigned.
  FunctionCallee createDispatchFiniFunction(unsigned IVSize, bool IVSigned);

  /// Returns __kmpc_dispatch_deinit runtime function.
  FunctionCallee createDispatchDeinitFunction();

  /// Declarations for LLVM-IR types (simple, array, function and structure) are
  /// generated below. Their names are defined and used in OpenMPKinds.def. Here
  /// we provide the declarations, the initializeTypes function will provide the
  /// values.
  ///
  ///{
#define OMP_TYPE(VarName, InitValue) Type *VarName = nullptr;
#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize)                             \
  ArrayType *VarName##Ty = nullptr;                                            \
  PointerType *VarName##PtrTy = nullptr;
#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...)                  \
  FunctionType *VarName = nullptr;                                             \
  PointerType *VarName##Ptr = nullptr;
#define OMP_STRUCT_TYPE(VarName, StrName, ...)                                 \
  StructType *VarName = nullptr;                                               \
  PointerType *VarName##Ptr = nullptr;
#include "llvm/Frontend/OpenMP/OMPKinds.def"

  ///}

private:
  /// Create all simple and struct types exposed by the runtime and remember
  /// the llvm::PointerTypes of them for easy access later.
  void initializeTypes(Module &M);

  /// Common interface for generating entry calls for OMP Directives.
  /// if the directive has a region/body, It will set the insertion
  /// point to the body
  ///
  /// \param OMPD Directive to generate entry blocks for
  /// \param EntryCall Call to the entry OMP Runtime Function
  /// \param ExitBB block where the region ends.
  /// \param Conditional indicate if the entry call result will be used
  ///        to evaluate a conditional of whether a thread will execute
  ///        body code or not.
  ///
  /// \return The insertion position in exit block
  InsertPointTy emitCommonDirectiveEntry(omp::Directive OMPD, Value *EntryCall,
                                         BasicBlock *ExitBB,
                                         bool Conditional = false);

  /// Common interface to finalize the region
  ///
  /// \param OMPD Directive to generate exiting code for
  /// \param FinIP Insertion point for emitting Finalization code and exit call
  /// \param ExitCall Call to the ending OMP Runtime Function
  /// \param HasFinalize indicate if the directive will require finalization
  ///         and has a finalization callback in the stack that
  ///        should be called.
  ///
  /// \return The insertion position in exit block
  InsertPointTy emitCommonDirectiveExit(omp::Directive OMPD,
                                        InsertPointTy FinIP,
                                        Instruction *ExitCall,
                                        bool HasFinalize = true);

  /// Common Interface to generate OMP inlined regions
  ///
  /// \param OMPD Directive to generate inlined region for
  /// \param EntryCall Call to the entry OMP Runtime Function
  /// \param ExitCall Call to the ending OMP Runtime Function
  /// \param BodyGenCB Body code generation callback.
  /// \param FiniCB Finalization Callback. Will be called when finalizing region
  /// \param Conditional indicate if the entry call result will be used
  ///        to evaluate a conditional of whether a thread will execute
  ///        body code or not.
  /// \param HasFinalize indicate if the directive will require finalization
  ///        and has a finalization callback in the stack that
  ///        should be called.
  /// \param IsCancellable if HasFinalize is set to true, indicate if the
  ///        the directive should be cancellable.
  /// \return The insertion point after the region

  InsertPointTy
  EmitOMPInlinedRegion(omp::Directive OMPD, Instruction *EntryCall,
                       Instruction *ExitCall, BodyGenCallbackTy BodyGenCB,
                       FinalizeCallbackTy FiniCB, bool Conditional = false,
                       bool HasFinalize = true, bool IsCancellable = false);

  /// Get the platform-specific name separator.
  /// \param Parts different parts of the final name that needs separation
  /// \param FirstSeparator First separator used between the initial two
  ///        parts of the name.
  /// \param Separator separator used between all of the rest consecutive
  ///        parts of the name
  static std::string getNameWithSeparators(ArrayRef<StringRef> Parts,
                                           StringRef FirstSeparator,
                                           StringRef Separator);

  /// Returns corresponding lock object for the specified critical region
  /// name. If the lock object does not exist it is created, otherwise the
  /// reference to the existing copy is returned.
  /// \param CriticalName Name of the critical region.
  ///
  Value *getOMPCriticalRegionLock(StringRef CriticalName);

  /// Callback type for Atomic Expression update
  /// ex:
  /// \code{.cpp}
  /// unsigned x = 0;
  /// #pragma omp atomic update
  /// x = Expr(x_old);  //Expr() is any legal operation
  /// \endcode
  ///
  /// \param XOld the value of the atomic memory address to use for update
  /// \param IRB reference to the IRBuilder to use
  ///
  /// \returns Value to update X to.
  using AtomicUpdateCallbackTy =
      const function_ref<Value *(Value *XOld, IRBuilder<> &IRB)>;

private:
  enum AtomicKind { Read, Write, Update, Capture, Compare };

  /// Determine whether to emit flush or not
  ///
  /// \param Loc    The insert and source location description.
  /// \param AO     The required atomic ordering
  /// \param AK     The OpenMP atomic operation kind used.
  ///
  /// \returns        wether a flush was emitted or not
  bool checkAndEmitFlushAfterAtomic(const LocationDescription &Loc,
                                    AtomicOrdering AO, AtomicKind AK);

  /// Emit atomic update for constructs: X = X BinOp Expr ,or X = Expr BinOp X
  /// For complex Operations: X = UpdateOp(X) => CmpExch X, old_X, UpdateOp(X)
  /// Only Scalar data types.
  ///
  /// \param AllocaIP      The insertion point to be used for alloca
  ///                   instructions.
  /// \param X                The target atomic pointer to be updated
  /// \param XElemTy    The element type of the atomic pointer.
  /// \param Expr            The value to update X with.
  /// \param AO                Atomic ordering of the generated atomic
  ///                   instructions.
  /// \param RMWOp          The binary operation used for update. If
  ///                   operation is not supported by atomicRMW,
  ///                   or belong to {FADD, FSUB, BAD_BINOP}.
  ///                   Then a `cmpExch` based    atomic will be generated.
  /// \param UpdateOp     Code generator for complex expressions that cannot be
  ///                   expressed through atomicrmw instruction.
  /// \param VolatileX         true if \a X volatile?
  /// \param IsXBinopExpr true if \a X is Left H.S. in Right H.S. part of the
  ///                     update expression, false otherwise.
  ///                     (e.g. true for X = X BinOp Expr)
  ///
  /// \returns A pair of the old value of X before the update, and the value
  ///          used for the update.
  std::pair<Value *, Value *>
  emitAtomicUpdate(InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
                   AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
                   AtomicUpdateCallbackTy &UpdateOp, bool VolatileX,
                   bool IsXBinopExpr);

  /// Emit the binary op. described by \p RMWOp, using \p Src1 and \p Src2 .
  ///
  /// \Return The instruction
  Value *emitRMWOpAsInstruction(Value *Src1, Value *Src2,
                                AtomicRMWInst::BinOp RMWOp);

public:
  /// a struct to pack relevant information while generating atomic Ops
  struct AtomicOpValue {
    Value *Var = nullptr;
    Type *ElemTy = nullptr;
    bool IsSigned = false;
    bool IsVolatile = false;
  };

  /// Emit atomic Read for : V = X --- Only Scalar data types.
  ///
  /// \param Loc    The insert and source location description.
  /// \param X            The target pointer to be atomically read
  /// \param V            Memory address where to store atomically read
  ///                         value
  /// \param AO            Atomic ordering of the generated atomic
  ///                         instructions.
  ///
  /// \return Insertion point after generated atomic read IR.
  InsertPointTy createAtomicRead(const LocationDescription &Loc,
                                 AtomicOpValue &X, AtomicOpValue &V,
                                 AtomicOrdering AO);

  /// Emit atomic write for : X = Expr --- Only Scalar data types.
  ///
  /// \param Loc    The insert and source location description.
  /// \param X            The target pointer to be atomically written to
  /// \param Expr        The value to store.
  /// \param AO            Atomic ordering of the generated atomic
  ///               instructions.
  ///
  /// \return Insertion point after generated atomic Write IR.
  InsertPointTy createAtomicWrite(const LocationDescription &Loc,
                                  AtomicOpValue &X, Value *Expr,
                                  AtomicOrdering AO);

  /// Emit atomic update for constructs: X = X BinOp Expr ,or X = Expr BinOp X
  /// For complex Operations: X = UpdateOp(X) => CmpExch X, old_X, UpdateOp(X)
  /// Only Scalar data types.
  ///
  /// \param Loc      The insert and source location description.
  /// \param AllocaIP The insertion point to be used for alloca instructions.
  /// \param X        The target atomic pointer to be updated
  /// \param Expr     The value to update X with.
  /// \param AO       Atomic ordering of the generated atomic instructions.
  /// \param RMWOp    The binary operation used for update. If operation
  ///                 is    not supported by atomicRMW, or belong to
  ///                    {FADD, FSUB, BAD_BINOP}. Then a `cmpExch` based
  ///                 atomic will be generated.
  /// \param UpdateOp     Code generator for complex expressions that cannot be
  ///                   expressed through atomicrmw instruction.
  /// \param IsXBinopExpr true if \a X is Left H.S. in Right H.S. part of the
  ///                     update expression, false otherwise.
  ///                        (e.g. true for X = X BinOp Expr)
  ///
  /// \return Insertion point after generated atomic update IR.
  InsertPointTy createAtomicUpdate(const LocationDescription &Loc,
                                   InsertPointTy AllocaIP, AtomicOpValue &X,
                                   Value *Expr, AtomicOrdering AO,
                                   AtomicRMWInst::BinOp RMWOp,
                                   AtomicUpdateCallbackTy &UpdateOp,
                                   bool IsXBinopExpr);

  /// Emit atomic update for constructs: --- Only Scalar data types
  /// V = X; X = X BinOp Expr ,
  /// X = X BinOp Expr; V = X,
  /// V = X; X = Expr BinOp X,
  /// X = Expr BinOp X; V = X,
  /// V = X; X = UpdateOp(X),
  /// X = UpdateOp(X); V = X,
  ///
  /// \param Loc        The insert and source location description.
  /// \param AllocaIP   The insertion point to be used for alloca instructions.
  /// \param X          The target atomic pointer to be updated
  /// \param V          Memory address where to store captured value
  /// \param Expr       The value to update X with.
  /// \param AO         Atomic ordering of the generated atomic instructions
  /// \param RMWOp      The binary operation used for update. If
  ///                   operation is not supported by atomicRMW, or belong to
  ///                      {FADD, FSUB, BAD_BINOP}. Then a cmpExch based
  ///                   atomic will be generated.
  /// \param UpdateOp   Code generator for complex expressions that cannot be
  ///                   expressed through atomicrmw instruction.
  /// \param UpdateExpr true if X is an in place update of the form
  ///                   X = X BinOp Expr or X = Expr BinOp X
  /// \param IsXBinopExpr true if X is Left H.S. in Right H.S. part of the
  ///                     update expression, false otherwise.
  ///                     (e.g. true for X = X BinOp Expr)
  /// \param IsPostfixUpdate true if original value of 'x' must be stored in
  ///                        'v', not an updated one.
  ///
  /// \return Insertion point after generated atomic capture IR.
  InsertPointTy
  createAtomicCapture(const LocationDescription &Loc, InsertPointTy AllocaIP,
                      AtomicOpValue &X, AtomicOpValue &V, Value *Expr,
                      AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
                      AtomicUpdateCallbackTy &UpdateOp, bool UpdateExpr,
                      bool IsPostfixUpdate, bool IsXBinopExpr);

  /// Emit atomic compare for constructs: --- Only scalar data types
  /// cond-expr-stmt:
  /// x = x ordop expr ? expr : x;
  /// x = expr ordop x ? expr : x;
  /// x = x == e ? d : x;
  /// x = e == x ? d : x; (this one is not in the spec)
  /// cond-update-stmt:
  /// if (x ordop expr) { x = expr; }
  /// if (expr ordop x) { x = expr; }
  /// if (x == e) { x = d; }
  /// if (e == x) { x = d; } (this one is not in the spec)
  /// conditional-update-capture-atomic:
  /// v = x; cond-update-stmt; (IsPostfixUpdate=true, IsFailOnly=false)
  /// cond-update-stmt; v = x; (IsPostfixUpdate=false, IsFailOnly=false)
  /// if (x == e) { x = d; } else { v = x; } (IsPostfixUpdate=false,
  ///                                         IsFailOnly=true)
  /// r = x == e; if (r) { x = d; } (IsPostfixUpdate=false, IsFailOnly=false)
  /// r = x == e; if (r) { x = d; } else { v = x; } (IsPostfixUpdate=false,
  ///                                                IsFailOnly=true)
  ///
  /// \param Loc          The insert and source location description.
  /// \param X            The target atomic pointer to be updated.
  /// \param V            Memory address where to store captured value (for
  ///                     compare capture only).
  /// \param R            Memory address where to store comparison result
  ///                     (for compare capture with '==' only).
  /// \param E            The expected value ('e') for forms that use an
  ///                     equality comparison or an expression ('expr') for
  ///                     forms that use 'ordop' (logically an atomic maximum or
  ///                     minimum).
  /// \param D            The desired value for forms that use an equality
  ///                     comparison. If forms that use 'ordop', it should be
  ///                     \p nullptr.
  /// \param AO           Atomic ordering of the generated atomic instructions.
  /// \param Op           Atomic compare operation. It can only be ==, <, or >.
  /// \param IsXBinopExpr True if the conditional statement is in the form where
  ///                     x is on LHS. It only matters for < or >.
  /// \param IsPostfixUpdate  True if original value of 'x' must be stored in
  ///                         'v', not an updated one (for compare capture
  ///                         only).
  /// \param IsFailOnly   True if the original value of 'x' is stored to 'v'
  ///                     only when the comparison fails. This is only valid for
  ///                     the case the comparison is '=='.
  ///
  /// \return Insertion point after generated atomic capture IR.
  InsertPointTy
  createAtomicCompare(const LocationDescription &Loc, AtomicOpValue &X,
                      AtomicOpValue &V, AtomicOpValue &R, Value *E, Value *D,
                      AtomicOrdering AO, omp::OMPAtomicCompareOp Op,
                      bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly);
  InsertPointTy createAtomicCompare(const LocationDescription &Loc,
                                    AtomicOpValue &X, AtomicOpValue &V,
                                    AtomicOpValue &R, Value *E, Value *D,
                                    AtomicOrdering AO,
                                    omp::OMPAtomicCompareOp Op,
                                    bool IsXBinopExpr, bool IsPostfixUpdate,
                                    bool IsFailOnly, AtomicOrdering Failure);

  /// Create the control flow structure of a canonical OpenMP loop.
  ///
  /// The emitted loop will be disconnected, i.e. no edge to the loop's
  /// preheader and no terminator in the AfterBB. The OpenMPIRBuilder's
  /// IRBuilder location is not preserved.
  ///
  /// \param DL        DebugLoc used for the instructions in the skeleton.
  /// \param TripCount Value to be used for the trip count.
  /// \param F         Function in which to insert the BasicBlocks.
  /// \param PreInsertBefore  Where to insert BBs that execute before the body,
  ///                         typically the body itself.
  /// \param PostInsertBefore Where to insert BBs that execute after the body.
  /// \param Name      Base name used to derive BB
  ///                  and instruction names.
  ///
  /// \returns The CanonicalLoopInfo that represents the emitted loop.
  CanonicalLoopInfo *createLoopSkeleton(DebugLoc DL, Value *TripCount,
                                        Function *F,
                                        BasicBlock *PreInsertBefore,
                                        BasicBlock *PostInsertBefore,
                                        const Twine &Name = {});
  /// OMP Offload Info Metadata name string
  const std::string ompOffloadInfoName = "omp_offload.info";

  /// Loads all the offload entries information from the host IR
  /// metadata. This function is only meant to be used with device code
  /// generation.
  ///
  /// \param M         Module to load Metadata info from. Module passed maybe
  /// loaded from bitcode file, i.e, different from OpenMPIRBuilder::M module.
  void loadOffloadInfoMetadata(Module &M);

  /// Loads all the offload entries information from the host IR
  /// metadata read from the file passed in as the HostFilePath argument. This
  /// function is only meant to be used with device code generation.
  ///
  /// \param HostFilePath The path to the host IR file,
  /// used to load in offload metadata for the device, allowing host and device
  /// to maintain the same metadata mapping.
  void loadOffloadInfoMetadata(StringRef HostFilePath);

  /// Gets (if variable with the given name already exist) or creates
  /// internal global variable with the specified Name. The created variable has
  /// linkage CommonLinkage by default and is initialized by null value.
  /// \param Ty Type of the global variable. If it is exist already the type
  /// must be the same.
  /// \param Name Name of the variable.
  GlobalVariable *getOrCreateInternalVariable(Type *Ty, const StringRef &Name,
                                              unsigned AddressSpace = 0);
};

/// Class to represented the control flow structure of an OpenMP canonical loop.
///
/// The control-flow structure is standardized for easy consumption by
/// directives associated with loops. For instance, the worksharing-loop
/// construct may change this control flow such that each loop iteration is
/// executed on only one thread. The constraints of a canonical loop in brief
/// are:
///
///  * The number of loop iterations must have been computed before entering the
///    loop.
///
///  * Has an (unsigned) logical induction variable that starts at zero and
///    increments by one.
///
///  * The loop's CFG itself has no side-effects. The OpenMP specification
///    itself allows side-effects, but the order in which they happen, including
///    how often or whether at all, is unspecified. We expect that the frontend
///    will emit those side-effect instructions somewhere (e.g. before the loop)
///    such that the CanonicalLoopInfo itself can be side-effect free.
///
/// Keep in mind that CanonicalLoopInfo is meant to only describe a repeated
/// execution of a loop body that satifies these constraints. It does NOT
/// represent arbitrary SESE regions that happen to contain a loop. Do not use
/// CanonicalLoopInfo for such purposes.
///
/// The control flow can be described as follows:
///
///     Preheader
///        |
///  /-> Header
///  |     |
///  |    Cond---\
///  |     |     |
///  |    Body   |
///  |    | |    |
///  |   <...>   |
///  |    | |    |
///   \--Latch   |
///              |
///             Exit
///              |
///            After
///
/// The loop is thought to start at PreheaderIP (at the Preheader's terminator,
/// including) and end at AfterIP (at the After's first instruction, excluding).
/// That is, instructions in the Preheader and After blocks (except the
/// Preheader's terminator) are out of CanonicalLoopInfo's control and may have
/// side-effects. Typically, the Preheader is used to compute the loop's trip
/// count. The instructions from BodyIP (at the Body block's first instruction,
/// excluding) until the Latch are also considered outside CanonicalLoopInfo's
/// control and thus can have side-effects. The body block is the single entry
/// point into the loop body, which may contain arbitrary control flow as long
/// as all control paths eventually branch to the Latch block.
///
/// TODO: Consider adding another standardized BasicBlock between Body CFG and
/// Latch to guarantee that there is only a single edge to the latch. It would
/// make loop transformations easier to not needing to consider multiple
/// predecessors of the latch (See redirectAllPredecessorsTo) and would give us
/// an equivalant to PreheaderIP, AfterIP and BodyIP for inserting code that
/// executes after each body iteration.
///
/// There must be no loop-carried dependencies through llvm::Values. This is
/// equivalant to that the Latch has no PHINode and the Header's only PHINode is
/// for the induction variable.
///
/// All code in Header, Cond, Latch and Exit (plus the terminator of the
/// Preheader) are CanonicalLoopInfo's responsibility and their build-up checked
/// by assertOK(). They are expected to not be modified unless explicitly
/// modifying the CanonicalLoopInfo through a methods that applies a OpenMP
/// loop-associated construct such as applyWorkshareLoop, tileLoops, unrollLoop,
/// etc. These methods usually invalidate the CanonicalLoopInfo and re-use its
/// basic blocks. After invalidation, the CanonicalLoopInfo must not be used
/// anymore as its underlying control flow may not exist anymore.
/// Loop-transformation methods such as tileLoops, collapseLoops and unrollLoop
/// may also return a new CanonicalLoopInfo that can be passed to other
/// loop-associated construct implementing methods. These loop-transforming
/// methods may either create a new CanonicalLoopInfo usually using
/// createLoopSkeleton and invalidate the input CanonicalLoopInfo, or reuse and
/// modify one of the input CanonicalLoopInfo and return it as representing the
/// modified loop. What is done is an implementation detail of
/// transformation-implementing method and callers should always assume that the
/// CanonicalLoopInfo passed to it is invalidated and a new object is returned.
/// Returned CanonicalLoopInfo have the same structure and guarantees as the one
/// created by createCanonicalLoop, such that transforming methods do not have
/// to special case where the CanonicalLoopInfo originated from.
///
/// Generally, methods consuming CanonicalLoopInfo do not need an
/// OpenMPIRBuilder::InsertPointTy as argument, but use the locations of the
/// CanonicalLoopInfo to insert new or modify existing instructions. Unless
/// documented otherwise, methods consuming CanonicalLoopInfo do not invalidate
/// any InsertPoint that is outside CanonicalLoopInfo's control. Specifically,
/// any InsertPoint in the Preheader, After or Block can still be used after
/// calling such a method.
///
/// TODO: Provide mechanisms for exception handling and cancellation points.
///
/// Defined outside OpenMPIRBuilder because nested classes cannot be
/// forward-declared, e.g. to avoid having to include the entire OMPIRBuilder.h.
class CanonicalLoopInfo {
  friend class OpenMPIRBuilder;

private:
  BasicBlock *Header = nullptr;
  BasicBlock *Cond = nullptr;
  BasicBlock *Latch = nullptr;
  BasicBlock *Exit = nullptr;

  /// Add the control blocks of this loop to \p BBs.
  ///
  /// This does not include any block from the body, including the one returned
  /// by getBody().
  ///
  /// FIXME: This currently includes the Preheader and After blocks even though
  /// their content is (mostly) not under CanonicalLoopInfo's control.
  /// Re-evaluated whether this makes sense.
  void collectControlBlocks(SmallVectorImpl<BasicBlock *> &BBs);

  /// Sets the number of loop iterations to the given value. This value must be
  /// valid in the condition block (i.e., defined in the preheader) and is
  /// interpreted as an unsigned integer.
  void setTripCount(Value *TripCount);

  /// Replace all uses of the canonical induction variable in the loop body with
  /// a new one.
  ///
  /// The intended use case is to update the induction variable for an updated
  /// iteration space such that it can stay normalized in the 0...tripcount-1
  /// range.
  ///
  /// The \p Updater is called with the (presumable updated) current normalized
  /// induction variable and is expected to return the value that uses of the
  /// pre-updated induction values should use instead, typically dependent on
  /// the new induction variable. This is a lambda (instead of e.g. just passing
  /// the new value) to be able to distinguish the uses of the pre-updated
  /// induction variable and uses of the induction varible to compute the
  /// updated induction variable value.
  void mapIndVar(llvm::function_ref<Value *(Instruction *)> Updater);

public:
  /// Returns whether this object currently represents the IR of a loop. If
  /// returning false, it may have been consumed by a loop transformation or not
  /// been intialized. Do not use in this case;
  bool isValid() const { return Header; }

  /// The preheader ensures that there is only a single edge entering the loop.
  /// Code that must be execute before any loop iteration can be emitted here,
  /// such as computing the loop trip count and begin lifetime markers. Code in
  /// the preheader is not considered part of the canonical loop.
  BasicBlock *getPreheader() const;

  /// The header is the entry for each iteration. In the canonical control flow,
  /// it only contains the PHINode for the induction variable.
  BasicBlock *getHeader() const {
    assert(isValid() && "Requires a valid canonical loop");
    return Header;
  }

  /// The condition block computes whether there is another loop iteration. If
  /// yes, branches to the body; otherwise to the exit block.
  BasicBlock *getCond() const {
    assert(isValid() && "Requires a valid canonical loop");
    return Cond;
  }

  /// The body block is the single entry for a loop iteration and not controlled
  /// by CanonicalLoopInfo. It can contain arbitrary control flow but must
  /// eventually branch to the \p Latch block.
  BasicBlock *getBody() const {
    assert(isValid() && "Requires a valid canonical loop");
    return cast<BranchInst>(Cond->getTerminator())->getSuccessor(0);
  }

  /// Reaching the latch indicates the end of the loop body code. In the
  /// canonical control flow, it only contains the increment of the induction
  /// variable.
  BasicBlock *getLatch() const {
    assert(isValid() && "Requires a valid canonical loop");
    return Latch;
  }

  /// Reaching the exit indicates no more iterations are being executed.
  BasicBlock *getExit() const {
    assert(isValid() && "Requires a valid canonical loop");
    return Exit;
  }

  /// The after block is intended for clean-up code such as lifetime end
  /// markers. It is separate from the exit block to ensure, analogous to the
  /// preheader, it having just a single entry edge and being free from PHI
  /// nodes should there be multiple loop exits (such as from break
  /// statements/cancellations).
  BasicBlock *getAfter() const {
    assert(isValid() && "Requires a valid canonical loop");
    return Exit->getSingleSuccessor();
  }

  /// Returns the llvm::Value containing the number of loop iterations. It must
  /// be valid in the preheader and always interpreted as an unsigned integer of
  /// any bit-width.
  Value *getTripCount() const {
    assert(isValid() && "Requires a valid canonical loop");
    Instruction *CmpI = &Cond->front();
    assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
    return CmpI->getOperand(1);
  }

  /// Returns the instruction representing the current logical induction
  /// variable. Always unsigned, always starting at 0 with an increment of one.
  Instruction *getIndVar() const {
    assert(isValid() && "Requires a valid canonical loop");
    Instruction *IndVarPHI = &Header->front();
    assert(isa<PHINode>(IndVarPHI) && "First inst must be the IV PHI");
    return IndVarPHI;
  }

  /// Return the type of the induction variable (and the trip count).
  Type *getIndVarType() const {
    assert(isValid() && "Requires a valid canonical loop");
    return getIndVar()->getType();
  }

  /// Return the insertion point for user code before the loop.
  OpenMPIRBuilder::InsertPointTy getPreheaderIP() const {
    assert(isValid() && "Requires a valid canonical loop");
    BasicBlock *Preheader = getPreheader();
    return {Preheader, std::prev(Preheader->end())};
  };

  /// Return the insertion point for user code in the body.
  OpenMPIRBuilder::InsertPointTy getBodyIP() const {
    assert(isValid() && "Requires a valid canonical loop");
    BasicBlock *Body = getBody();
    return {Body, Body->begin()};
  };

  /// Return the insertion point for user code after the loop.
  OpenMPIRBuilder::InsertPointTy getAfterIP() const {
    assert(isValid() && "Requires a valid canonical loop");
    BasicBlock *After = getAfter();
    return {After, After->begin()};
  };

  Function *getFunction() const {
    assert(isValid() && "Requires a valid canonical loop");
    return Header->getParent();
  }

  /// Consistency self-check.
  void assertOK() const;

  /// Invalidate this loop. That is, the underlying IR does not fulfill the
  /// requirements of an OpenMP canonical loop anymore.
  void invalidate();
};

} // end namespace llvm

#endif // LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H
:: Command execute ::
Enter:	Select: