#include "llvm/Transforms/Vectorize.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/AliasSetTracker.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <algorithm>
#include <functional>
#include <map>
#include <tuple>
using namespace llvm;
using namespace llvm::PatternMatch;
#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME
STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
static cl::opt<bool>
EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
cl::desc("Enable if-conversion during vectorization."));
static cl::opt<unsigned>
TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16),
cl::Hidden,
cl::desc("Don't vectorize loops with a constant "
"trip count that is smaller than this "
"value."));
static cl::opt<bool> MaximizeBandwidth(
"vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
cl::desc("Maximize bandwidth when selecting vectorization factor which "
"will be determined by the smallest type in loop."));
static cl::opt<bool> EnableMemAccessVersioning(
"enable-mem-access-versioning", cl::init(true), cl::Hidden,
cl::desc("Enable symblic stride memory access versioning"));
static cl::opt<bool> EnableInterleavedMemAccesses(
"enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
static cl::opt<unsigned> MaxInterleaveGroupFactor(
"max-interleave-group-factor", cl::Hidden,
cl::desc("Maximum factor for an interleaved access group (default = 8)"),
cl::init(8));
static const unsigned TinyTripCountInterleaveThreshold = 128;
static cl::opt<unsigned> ForceTargetNumScalarRegs(
"force-target-num-scalar-regs", cl::init(0), cl::Hidden,
cl::desc("A flag that overrides the target's number of scalar registers."));
static cl::opt<unsigned> ForceTargetNumVectorRegs(
"force-target-num-vector-regs", cl::init(0), cl::Hidden,
cl::desc("A flag that overrides the target's number of vector registers."));
static const unsigned MaxInterleaveFactor = 16;
static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
"force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
cl::desc("A flag that overrides the target's max interleave factor for "
"scalar loops."));
static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
"force-target-max-vector-interleave", cl::init(0), cl::Hidden,
cl::desc("A flag that overrides the target's max interleave factor for "
"vectorized loops."));
static cl::opt<unsigned> ForceTargetInstructionCost(
"force-target-instruction-cost", cl::init(0), cl::Hidden,
cl::desc("A flag that overrides the target's expected cost for "
"an instruction to a single constant value. Mostly "
"useful for getting consistent testing."));
static cl::opt<unsigned> SmallLoopCost(
"small-loop-cost", cl::init(20), cl::Hidden,
cl::desc(
"The cost of a loop that is considered 'small' by the interleaver."));
static cl::opt<bool> LoopVectorizeWithBlockFrequency(
"loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden,
cl::desc("Enable the use of the block frequency analysis to access PGO "
"heuristics minimizing code growth in cold regions and being more "
"aggressive in hot regions."));
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
"enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
cl::desc(
"Enable runtime interleaving until load/store ports are saturated"));
static cl::opt<unsigned> NumberOfStoresToPredicate(
"vectorize-num-stores-pred", cl::init(1), cl::Hidden,
cl::desc("Max number of stores to be predicated behind an if."));
static cl::opt<bool> EnableIndVarRegisterHeur(
"enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
cl::desc("Count the induction variable only once when interleaving"));
static cl::opt<bool> EnableCondStoresVectorization(
"enable-cond-stores-vec", cl::init(false), cl::Hidden,
cl::desc("Enable if predication of stores during vectorization."));
static cl::opt<unsigned> MaxNestedScalarReductionIC(
"max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
cl::desc("The maximum interleave count to use when interleaving a scalar "
"reduction in a nested loop."));
static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
"pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
cl::desc("The maximum allowed number of runtime memory checks with a "
"vectorize(enable) pragma."));
static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
"vectorize-scev-check-threshold", cl::init(16), cl::Hidden,
cl::desc("The maximum number of SCEV checks allowed."));
static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
"pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden,
cl::desc("The maximum number of SCEV checks allowed with a "
"vectorize(enable) pragma"));
namespace {
class LoopVectorizeHints;
class LoopVectorizationLegality;
class LoopVectorizationCostModel;
class LoopVectorizationRequirements;
class VectorizationReport : public LoopAccessReport {
public:
VectorizationReport(Instruction *I = nullptr)
: LoopAccessReport("loop not vectorized: ", I) {}
explicit VectorizationReport(const LoopAccessReport &R)
: LoopAccessReport(Twine("loop not vectorized: ") + R.str(),
R.getInstr()) {}
};
static Type* ToVectorTy(Type *Scalar, unsigned VF) {
if (Scalar->isVoidTy() || VF == 1)
return Scalar;
return VectorType::get(Scalar, VF);
}
static GetElementPtrInst *getGEPInstruction(Value *Ptr) {
if (isa<GetElementPtrInst>(Ptr))
return cast<GetElementPtrInst>(Ptr);
if (isa<BitCastInst>(Ptr) &&
isa<GetElementPtrInst>(cast<BitCastInst>(Ptr)->getOperand(0))) {
Type *BitcastTy = Ptr->getType();
Type *GEPTy = cast<BitCastInst>(Ptr)->getSrcTy();
if (!isa<PointerType>(BitcastTy) || !isa<PointerType>(GEPTy))
return nullptr;
Type *Pointee1Ty = cast<PointerType>(BitcastTy)->getPointerElementType();
Type *Pointee2Ty = cast<PointerType>(GEPTy)->getPointerElementType();
const DataLayout &DL = cast<BitCastInst>(Ptr)->getModule()->getDataLayout();
if (DL.getTypeSizeInBits(Pointee1Ty) == DL.getTypeSizeInBits(Pointee2Ty))
return cast<GetElementPtrInst>(cast<BitCastInst>(Ptr)->getOperand(0));
}
return nullptr;
}
class InnerLoopVectorizer {
public:
InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
DominatorTree *DT, const TargetLibraryInfo *TLI,
const TargetTransformInfo *TTI, unsigned VecWidth,
unsigned UnrollFactor, SCEVUnionPredicate &Preds)
: OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()),
Induction(nullptr), OldInduction(nullptr), WidenMap(UnrollFactor),
TripCount(nullptr), VectorTripCount(nullptr), Legal(nullptr),
AddedSafetyChecks(false), Preds(Preds) {}
void vectorize(LoopVectorizationLegality *L,
DenseMap<Instruction*,uint64_t> MinimumBitWidths) {
MinBWs = MinimumBitWidths;
Legal = L;
createEmptyLoop();
vectorizeLoop();
}
bool IsSafetyChecksAdded() {
return AddedSafetyChecks;
}
virtual ~InnerLoopVectorizer() {}
protected:
typedef SmallVector<PHINode*, 4> PhiVector;
typedef SmallVector<Value*, 2> VectorParts;
typedef DenseMap<std::pair<BasicBlock*, BasicBlock*>,
VectorParts> EdgeMaskCache;
void createEmptyLoop();
PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
Value *Step, Instruction *DL);
virtual void vectorizeLoop();
void fixLCSSAPHIs();
void truncateToMinimalBitwidths();
VectorParts createBlockInMask(BasicBlock *BB);
VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
void vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV);
void widenPHIInstruction(Instruction *PN, VectorParts &Entry,
unsigned UF, unsigned VF, PhiVector *PV);
void updateAnalysis();
virtual void scalarizeInstruction(Instruction *Instr,
bool IfPredicateStore=false);
virtual void vectorizeMemoryInstruction(Instruction *Instr);
virtual Value *getBroadcastInstrs(Value *V);
virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step);
VectorParts &getVectorValue(Value *V);
void vectorizeInterleaveGroup(Instruction *Instr);
virtual Value *reverseVector(Value *Vec);
Value *getOrCreateTripCount(Loop *NewLoop);
Value *getOrCreateVectorTripCount(Loop *NewLoop);
void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
void emitVectorLoopEnteredCheck(Loop *L, BasicBlock *Bypass);
void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
struct ValueMap {
ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {}
bool has(Value *Key) const { return MapStorage.count(Key); }
VectorParts &splat(Value *Key, Value *Val) {
VectorParts &Entry = MapStorage[Key];
Entry.assign(UF, Val);
return Entry;
}
VectorParts &get(Value *Key) {
VectorParts &Entry = MapStorage[Key];
if (Entry.empty())
Entry.resize(UF);
assert(Entry.size() == UF);
return Entry;
}
private:
unsigned UF;
std::map<Value *, VectorParts> MapStorage;
};
Loop *OrigLoop;
ScalarEvolution *SE;
LoopInfo *LI;
DominatorTree *DT;
AliasAnalysis *AA;
const TargetLibraryInfo *TLI;
const TargetTransformInfo *TTI;
unsigned VF;
protected:
unsigned UF;
IRBuilder<> Builder;
BasicBlock *LoopVectorPreHeader;
BasicBlock *LoopScalarPreHeader;
BasicBlock *LoopMiddleBlock;
BasicBlock *LoopExitBlock;
SmallVector<BasicBlock *, 4> LoopVectorBody;
BasicBlock *LoopScalarBody;
SmallVector<BasicBlock *, 4> LoopBypassBlocks;
PHINode *Induction;
PHINode *OldInduction;
ValueMap WidenMap;
SmallVector<std::pair<StoreInst*,Value*>, 4> PredicatedStores;
EdgeMaskCache MaskCache;
Value *TripCount;
Value *VectorTripCount;
DenseMap<Instruction*,uint64_t> MinBWs;
LoopVectorizationLegality *Legal;
bool AddedSafetyChecks;
SCEVUnionPredicate &Preds;
};
class InnerLoopUnroller : public InnerLoopVectorizer {
public:
InnerLoopUnroller(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
DominatorTree *DT, const TargetLibraryInfo *TLI,
const TargetTransformInfo *TTI, unsigned UnrollFactor,
SCEVUnionPredicate &Preds)
: InnerLoopVectorizer(OrigLoop, SE, LI, DT, TLI, TTI, 1, UnrollFactor,
Preds) {}
private:
void scalarizeInstruction(Instruction *Instr,
bool IfPredicateStore = false) override;
void vectorizeMemoryInstruction(Instruction *Instr) override;
Value *getBroadcastInstrs(Value *V) override;
Value *getStepVector(Value *Val, int StartIdx, Value *Step) override;
Value *reverseVector(Value *Vec) override;
};
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
if (!I)
return I;
DebugLoc Empty;
if (I->getDebugLoc() != Empty)
return I;
for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
if (OpInst->getDebugLoc() != Empty)
return OpInst;
}
return I;
}
static void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr))
B.SetCurrentDebugLocation(Inst->getDebugLoc());
else
B.SetCurrentDebugLocation(DebugLoc());
}
#ifndef NDEBUG
static std::string getDebugLocString(const Loop *L) {
std::string Result;
if (L) {
raw_string_ostream OS(Result);
if (const DebugLoc LoopDbgLoc = L->getStartLoc())
LoopDbgLoc.print(OS);
else
OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
OS.flush();
}
return Result;
}
#endif
static void propagateMetadata(Instruction *To, const Instruction *From) {
SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
From->getAllMetadataOtherThanDebugLoc(Metadata);
for (auto M : Metadata) {
unsigned Kind = M.first;
if (Kind != LLVMContext::MD_tbaa &&
Kind != LLVMContext::MD_alias_scope &&
Kind != LLVMContext::MD_noalias &&
Kind != LLVMContext::MD_fpmath &&
Kind != LLVMContext::MD_nontemporal)
continue;
To->setMetadata(Kind, M.second);
}
}
static void propagateMetadata(SmallVectorImpl<Value *> &To, const Instruction *From) {
for (Value *V : To)
if (Instruction *I = dyn_cast<Instruction>(V))
propagateMetadata(I, From);
}
class InterleaveGroup {
public:
InterleaveGroup(Instruction *Instr, int Stride, unsigned Align)
: Align(Align), SmallestKey(0), LargestKey(0), InsertPos(Instr) {
assert(Align && "The alignment should be non-zero");
Factor = std::abs(Stride);
assert(Factor > 1 && "Invalid interleave factor");
Reverse = Stride < 0;
Members[0] = Instr;
}
bool isReverse() const { return Reverse; }
unsigned getFactor() const { return Factor; }
unsigned getAlignment() const { return Align; }
unsigned getNumMembers() const { return Members.size(); }
bool insertMember(Instruction *Instr, int Index, unsigned NewAlign) {
assert(NewAlign && "The new member's alignment should be non-zero");
int Key = Index + SmallestKey;
if (Members.count(Key))
return false;
if (Key > LargestKey) {
if (Index >= static_cast<int>(Factor))
return false;
LargestKey = Key;
} else if (Key < SmallestKey) {
if (LargestKey - Key >= static_cast<int>(Factor))
return false;
SmallestKey = Key;
}
Align = std::min(Align, NewAlign);
Members[Key] = Instr;
return true;
}
Instruction *getMember(unsigned Index) const {
int Key = SmallestKey + Index;
if (!Members.count(Key))
return nullptr;
return Members.find(Key)->second;
}
unsigned getIndex(Instruction *Instr) const {
for (auto I : Members)
if (I.second == Instr)
return I.first - SmallestKey;
llvm_unreachable("InterleaveGroup contains no such member");
}
Instruction *getInsertPos() const { return InsertPos; }
void setInsertPos(Instruction *Inst) { InsertPos = Inst; }
private:
unsigned Factor; bool Reverse;
unsigned Align;
DenseMap<int, Instruction *> Members;
int SmallestKey;
int LargestKey;
Instruction *InsertPos;
};
class InterleavedAccessInfo {
public:
InterleavedAccessInfo(ScalarEvolution *SE, Loop *L, DominatorTree *DT,
SCEVUnionPredicate &Preds)
: SE(SE), TheLoop(L), DT(DT), Preds(Preds) {}
~InterleavedAccessInfo() {
SmallSet<InterleaveGroup *, 4> DelSet;
for (auto &I : InterleaveGroupMap)
DelSet.insert(I.second);
for (auto *Ptr : DelSet)
delete Ptr;
}
void analyzeInterleaving(const ValueToValueMap &Strides);
bool isInterleaved(Instruction *Instr) const {
return InterleaveGroupMap.count(Instr);
}
InterleaveGroup *getInterleaveGroup(Instruction *Instr) const {
if (InterleaveGroupMap.count(Instr))
return InterleaveGroupMap.find(Instr)->second;
return nullptr;
}
private:
ScalarEvolution *SE;
Loop *TheLoop;
DominatorTree *DT;
SCEVUnionPredicate &Preds;
DenseMap<Instruction *, InterleaveGroup *> InterleaveGroupMap;
struct StrideDescriptor {
StrideDescriptor(int Stride, const SCEV *Scev, unsigned Size,
unsigned Align)
: Stride(Stride), Scev(Scev), Size(Size), Align(Align) {}
StrideDescriptor() : Stride(0), Scev(nullptr), Size(0), Align(0) {}
int Stride; const SCEV *Scev; unsigned Size; unsigned Align; };
InterleaveGroup *createInterleaveGroup(Instruction *Instr, int Stride,
unsigned Align) {
assert(!InterleaveGroupMap.count(Instr) &&
"Already in an interleaved access group");
InterleaveGroupMap[Instr] = new InterleaveGroup(Instr, Stride, Align);
return InterleaveGroupMap[Instr];
}
void releaseGroup(InterleaveGroup *Group) {
for (unsigned i = 0; i < Group->getFactor(); i++)
if (Instruction *Member = Group->getMember(i))
InterleaveGroupMap.erase(Member);
delete Group;
}
void collectConstStridedAccesses(
MapVector<Instruction *, StrideDescriptor> &StrideAccesses,
const ValueToValueMap &Strides);
};
class LoopVectorizeHints {
enum HintKind {
HK_WIDTH,
HK_UNROLL,
HK_FORCE
};
struct Hint {
const char * Name;
unsigned Value; HintKind Kind;
Hint(const char * Name, unsigned Value, HintKind Kind)
: Name(Name), Value(Value), Kind(Kind) { }
bool validate(unsigned Val) {
switch (Kind) {
case HK_WIDTH:
return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth;
case HK_UNROLL:
return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
case HK_FORCE:
return (Val <= 1);
}
return false;
}
};
Hint Width;
Hint Interleave;
Hint Force;
static StringRef Prefix() { return "llvm.loop."; }
public:
enum ForceKind {
FK_Undefined = -1, FK_Disabled = 0, FK_Enabled = 1, };
LoopVectorizeHints(const Loop *L, bool DisableInterleaving)
: Width("vectorize.width", VectorizerParams::VectorizationFactor,
HK_WIDTH),
Interleave("interleave.count", DisableInterleaving, HK_UNROLL),
Force("vectorize.enable", FK_Undefined, HK_FORCE),
TheLoop(L) {
getHintsFromMetadata();
if (VectorizerParams::isInterleaveForced())
Interleave.Value = VectorizerParams::VectorizationInterleave;
DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs()
<< "LV: Interleaving disabled by the pass manager\n");
}
void setAlreadyVectorized() {
Width.Value = Interleave.Value = 1;
Hint Hints[] = {Width, Interleave};
writeHintsToMetadata(Hints);
}
bool allowVectorization(Function *F, Loop *L, bool AlwaysVectorize) const {
if (getForce() == LoopVectorizeHints::FK_Disabled) {
DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
emitOptimizationRemarkAnalysis(F->getContext(),
vectorizeAnalysisPassName(), *F,
L->getStartLoc(), emitRemark());
return false;
}
if (!AlwaysVectorize && getForce() != LoopVectorizeHints::FK_Enabled) {
DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
emitOptimizationRemarkAnalysis(F->getContext(),
vectorizeAnalysisPassName(), *F,
L->getStartLoc(), emitRemark());
return false;
}
if (getWidth() == 1 && getInterleave() == 1) {
DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
emitOptimizationRemarkAnalysis(
F->getContext(), vectorizeAnalysisPassName(), *F, L->getStartLoc(),
"loop not vectorized: vectorization and interleaving are explicitly "
"disabled, or vectorize width and interleave count are both set to "
"1");
return false;
}
return true;
}
std::string emitRemark() const {
VectorizationReport R;
if (Force.Value == LoopVectorizeHints::FK_Disabled)
R << "vectorization is explicitly disabled";
else {
R << "use -Rpass-analysis=loop-vectorize for more info";
if (Force.Value == LoopVectorizeHints::FK_Enabled) {
R << " (Force=true";
if (Width.Value != 0)
R << ", Vector Width=" << Width.Value;
if (Interleave.Value != 0)
R << ", Interleave Count=" << Interleave.Value;
R << ")";
}
}
return R.str();
}
unsigned getWidth() const { return Width.Value; }
unsigned getInterleave() const { return Interleave.Value; }
enum ForceKind getForce() const { return (ForceKind)Force.Value; }
const char *vectorizeAnalysisPassName() const {
if (getWidth() == 1)
return LV_NAME;
if (getForce() == LoopVectorizeHints::FK_Disabled)
return LV_NAME;
if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth() == 0)
return LV_NAME;
return DiagnosticInfo::AlwaysPrint;
}
bool allowReordering() const {
return getForce() == LoopVectorizeHints::FK_Enabled || getWidth() > 1;
}
private:
void getHintsFromMetadata() {
MDNode *LoopID = TheLoop->getLoopID();
if (!LoopID)
return;
assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
const MDString *S = nullptr;
SmallVector<Metadata *, 4> Args;
if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
if (!MD || MD->getNumOperands() == 0)
continue;
S = dyn_cast<MDString>(MD->getOperand(0));
for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
Args.push_back(MD->getOperand(i));
} else {
S = dyn_cast<MDString>(LoopID->getOperand(i));
assert(Args.size() == 0 && "too many arguments for MDString");
}
if (!S)
continue;
StringRef Name = S->getString();
if (Args.size() == 1)
setHint(Name, Args[0]);
}
}
void setHint(StringRef Name, Metadata *Arg) {
if (!Name.startswith(Prefix()))
return;
Name = Name.substr(Prefix().size(), StringRef::npos);
const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
if (!C) return;
unsigned Val = C->getZExtValue();
Hint *Hints[] = {&Width, &Interleave, &Force};
for (auto H : Hints) {
if (Name == H->Name) {
if (H->validate(Val))
H->Value = Val;
else
DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
break;
}
}
}
MDNode *createHintMetadata(StringRef Name, unsigned V) const {
LLVMContext &Context = TheLoop->getHeader()->getContext();
Metadata *MDs[] = {MDString::get(Context, Name),
ConstantAsMetadata::get(
ConstantInt::get(Type::getInt32Ty(Context), V))};
return MDNode::get(Context, MDs);
}
bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes) {
MDString* Name = dyn_cast<MDString>(Node->getOperand(0));
if (!Name)
return false;
for (auto H : HintTypes)
if (Name->getString().endswith(H.Name))
return true;
return false;
}
void writeHintsToMetadata(ArrayRef<Hint> HintTypes) {
if (HintTypes.size() == 0)
return;
SmallVector<Metadata *, 4> MDs(1);
MDNode *LoopID = TheLoop->getLoopID();
if (LoopID) {
for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
MDNode *Node = cast<MDNode>(LoopID->getOperand(i));
if (!matchesHintMetadataName(Node, HintTypes))
MDs.push_back(Node);
}
}
for (auto H : HintTypes)
MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value));
LLVMContext &Context = TheLoop->getHeader()->getContext();
MDNode *NewLoopID = MDNode::get(Context, MDs);
NewLoopID->replaceOperandWith(0, NewLoopID);
TheLoop->setLoopID(NewLoopID);
}
const Loop *TheLoop;
};
static void emitAnalysisDiag(const Function *TheFunction, const Loop *TheLoop,
const LoopVectorizeHints &Hints,
const LoopAccessReport &Message) {
const char *Name = Hints.vectorizeAnalysisPassName();
LoopAccessReport::emitAnalysis(Message, TheFunction, TheLoop, Name);
}
static void emitMissedWarning(Function *F, Loop *L,
const LoopVectorizeHints &LH) {
emitOptimizationRemarkMissed(F->getContext(), LV_NAME, *F, L->getStartLoc(),
LH.emitRemark());
if (LH.getForce() == LoopVectorizeHints::FK_Enabled) {
if (LH.getWidth() != 1)
emitLoopVectorizeWarning(
F->getContext(), *F, L->getStartLoc(),
"failed explicitly specified loop vectorization");
else if (LH.getInterleave() != 1)
emitLoopInterleaveWarning(
F->getContext(), *F, L->getStartLoc(),
"failed explicitly specified loop interleaving");
}
}
class LoopVectorizationLegality {
public:
LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DominatorTree *DT,
TargetLibraryInfo *TLI, AliasAnalysis *AA,
Function *F, const TargetTransformInfo *TTI,
LoopAccessAnalysis *LAA,
LoopVectorizationRequirements *R,
const LoopVectorizeHints *H,
SCEVUnionPredicate &Preds)
: NumPredStores(0), TheLoop(L), SE(SE), TLI(TLI), TheFunction(F),
TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr),
InterleaveInfo(SE, L, DT, Preds), Induction(nullptr),
WidestIndTy(nullptr), HasFunNoNaNAttr(false), Requirements(R), Hints(H),
Preds(Preds) {}
typedef DenseMap<PHINode *, RecurrenceDescriptor> ReductionList;
typedef MapVector<PHINode*, InductionDescriptor> InductionList;
bool canVectorize();
PHINode *getInduction() { return Induction; }
ReductionList *getReductionVars() { return &Reductions; }
InductionList *getInductionVars() { return &Inductions; }
Type *getWidestInductionType() { return WidestIndTy; }
bool isInductionVariable(const Value *V);
bool blockNeedsPredication(BasicBlock *BB);
int isConsecutivePtr(Value *Ptr);
bool isUniform(Value *V);
bool isUniformAfterVectorization(Instruction* I) { return Uniforms.count(I); }
const RuntimePointerChecking *getRuntimePointerChecking() const {
return LAI->getRuntimePointerChecking();
}
const LoopAccessInfo *getLAI() const {
return LAI;
}
bool isAccessInterleaved(Instruction *Instr) {
return InterleaveInfo.isInterleaved(Instr);
}
const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) {
return InterleaveInfo.getInterleaveGroup(Instr);
}
unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); }
bool hasStride(Value *V) { return StrideSet.count(V); }
bool mustCheckStrides() { return !StrideSet.empty(); }
SmallPtrSet<Value *, 8>::iterator strides_begin() {
return StrideSet.begin();
}
SmallPtrSet<Value *, 8>::iterator strides_end() { return StrideSet.end(); }
bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
return isConsecutivePtr(Ptr) && TTI->isLegalMaskedStore(DataType);
}
bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
return isConsecutivePtr(Ptr) && TTI->isLegalMaskedLoad(DataType);
}
bool isMaskRequired(const Instruction* I) {
return (MaskedOp.count(I) != 0);
}
unsigned getNumStores() const {
return LAI->getNumStores();
}
unsigned getNumLoads() const {
return LAI->getNumLoads();
}
unsigned getNumPredStores() const {
return NumPredStores;
}
private:
bool canVectorizeInstrs();
bool canVectorizeMemory();
bool canVectorizeWithIfConvert();
void collectLoopUniforms();
bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs);
void collectStridedAccess(Value *LoadOrStoreInst);
void emitAnalysis(const LoopAccessReport &Message) const {
emitAnalysisDiag(TheFunction, TheLoop, *Hints, Message);
}
unsigned NumPredStores;
Loop *TheLoop;
ScalarEvolution *SE;
TargetLibraryInfo *TLI;
Function *TheFunction;
const TargetTransformInfo *TTI;
DominatorTree *DT;
LoopAccessAnalysis *LAA;
const LoopAccessInfo *LAI;
InterleavedAccessInfo InterleaveInfo;
PHINode *Induction;
ReductionList Reductions;
InductionList Inductions;
Type *WidestIndTy;
SmallPtrSet<Value*, 4> AllowedExit;
SmallPtrSet<Instruction*, 4> Uniforms;
bool HasFunNoNaNAttr;
LoopVectorizationRequirements *Requirements;
const LoopVectorizeHints *Hints;
ValueToValueMap Strides;
SmallPtrSet<Value *, 8> StrideSet;
SmallPtrSet<const Instruction *, 8> MaskedOp;
SCEVUnionPredicate &Preds;
};
class LoopVectorizationCostModel {
public:
LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI,
LoopVectorizationLegality *Legal,
const TargetTransformInfo &TTI,
const TargetLibraryInfo *TLI, DemandedBits *DB,
AssumptionCache *AC, const Function *F,
const LoopVectorizeHints *Hints,
SmallPtrSetImpl<const Value *> &ValuesToIgnore,
SCEVUnionPredicate &Preds)
: TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
TheFunction(F), Hints(Hints), ValuesToIgnore(ValuesToIgnore) {}
struct VectorizationFactor {
unsigned Width; unsigned Cost; };
VectorizationFactor selectVectorizationFactor(bool OptForSize);
std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
unsigned selectInterleaveCount(bool OptForSize, unsigned VF,
unsigned LoopCost);
unsigned computeInterleaveCount(bool OptForSize, unsigned VF,
unsigned LoopCost);
struct RegisterUsage {
unsigned LoopInvariantRegs;
unsigned MaxLocalUsers;
unsigned NumInstructions;
};
SmallVector<RegisterUsage, 8>
calculateRegisterUsage(const SmallVector<unsigned, 8> &VFs);
private:
unsigned expectedCost(unsigned VF);
unsigned getInstructionCost(Instruction *I, unsigned VF);
bool isConsecutiveLoadOrStore(Instruction *I);
void emitAnalysis(const LoopAccessReport &Message) const {
emitAnalysisDiag(TheFunction, TheLoop, *Hints, Message);
}
public:
DenseMap<Instruction*,uint64_t> MinBWs;
Loop *TheLoop;
ScalarEvolution *SE;
LoopInfo *LI;
LoopVectorizationLegality *Legal;
const TargetTransformInfo &TTI;
const TargetLibraryInfo *TLI;
DemandedBits *DB;
const Function *TheFunction;
const LoopVectorizeHints *Hints;
const SmallPtrSetImpl<const Value *> &ValuesToIgnore;
};
class LoopVectorizationRequirements {
public:
LoopVectorizationRequirements()
: NumRuntimePointerChecks(0), UnsafeAlgebraInst(nullptr) {}
void addUnsafeAlgebraInst(Instruction *I) {
if (!UnsafeAlgebraInst)
UnsafeAlgebraInst = I;
}
void addRuntimePointerChecks(unsigned Num) { NumRuntimePointerChecks = Num; }
bool doesNotMeet(Function *F, Loop *L, const LoopVectorizeHints &Hints) {
const char *Name = Hints.vectorizeAnalysisPassName();
bool Failed = false;
if (UnsafeAlgebraInst && !Hints.allowReordering()) {
emitOptimizationRemarkAnalysisFPCommute(
F->getContext(), Name, *F, UnsafeAlgebraInst->getDebugLoc(),
VectorizationReport() << "cannot prove it is safe to reorder "
"floating-point operations");
Failed = true;
}
bool PragmaThresholdReached =
NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
bool ThresholdReached =
NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
if ((ThresholdReached && !Hints.allowReordering()) ||
PragmaThresholdReached) {
emitOptimizationRemarkAnalysisAliasing(
F->getContext(), Name, *F, L->getStartLoc(),
VectorizationReport()
<< "cannot prove it is safe to reorder memory operations");
DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
Failed = true;
}
return Failed;
}
private:
unsigned NumRuntimePointerChecks;
Instruction *UnsafeAlgebraInst;
};
static void addInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) {
if (L.empty())
return V.push_back(&L);
for (Loop *InnerL : L)
addInnerLoop(*InnerL, V);
}
struct LoopVectorize : public FunctionPass {
static char ID;
explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true)
: FunctionPass(ID),
DisableUnrolling(NoUnrolling),
AlwaysVectorize(AlwaysVectorize) {
initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
}
ScalarEvolution *SE;
LoopInfo *LI;
TargetTransformInfo *TTI;
DominatorTree *DT;
BlockFrequencyInfo *BFI;
TargetLibraryInfo *TLI;
DemandedBits *DB;
AliasAnalysis *AA;
AssumptionCache *AC;
LoopAccessAnalysis *LAA;
bool DisableUnrolling;
bool AlwaysVectorize;
BlockFrequency ColdEntryFreq;
bool runOnFunction(Function &F) override {
SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
TLI = TLIP ? &TLIP->getTLI() : nullptr;
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
LAA = &getAnalysis<LoopAccessAnalysis>();
DB = &getAnalysis<DemandedBits>();
const BranchProbability ColdProb(1, 5); ColdEntryFreq = BlockFrequency(BFI->getEntryFreq()) * ColdProb;
if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2)
return false;
SmallVector<Loop *, 8> Worklist;
for (Loop *L : *LI)
addInnerLoop(*L, Worklist);
LoopsAnalyzed += Worklist.size();
bool Changed = false;
while (!Worklist.empty())
Changed |= processLoop(Worklist.pop_back_val());
return Changed;
}
static void AddRuntimeUnrollDisableMetaData(Loop *L) {
SmallVector<Metadata *, 4> MDs;
MDs.push_back(nullptr);
bool IsUnrollMetadata = false;
MDNode *LoopID = L->getLoopID();
if (LoopID) {
for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
if (MD) {
const MDString *S = dyn_cast<MDString>(MD->getOperand(0));
IsUnrollMetadata =
S && S->getString().startswith("llvm.loop.unroll.disable");
}
MDs.push_back(LoopID->getOperand(i));
}
}
if (!IsUnrollMetadata) {
LLVMContext &Context = L->getHeader()->getContext();
SmallVector<Metadata *, 1> DisableOperands;
DisableOperands.push_back(
MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
MDNode *DisableNode = MDNode::get(Context, DisableOperands);
MDs.push_back(DisableNode);
MDNode *NewLoopID = MDNode::get(Context, MDs);
NewLoopID->replaceOperandWith(0, NewLoopID);
L->setLoopID(NewLoopID);
}
}
bool processLoop(Loop *L) {
assert(L->empty() && "Only process inner loops.");
#ifndef NDEBUG
const std::string DebugLocStr = getDebugLocString(L);
#endif
DEBUG(dbgs() << "\nLV: Checking a loop in \""
<< L->getHeader()->getParent()->getName() << "\" from "
<< DebugLocStr << "\n");
LoopVectorizeHints Hints(L, DisableUnrolling);
DEBUG(dbgs() << "LV: Loop hints:"
<< " force="
<< (Hints.getForce() == LoopVectorizeHints::FK_Disabled
? "disabled"
: (Hints.getForce() == LoopVectorizeHints::FK_Enabled
? "enabled"
: "?")) << " width=" << Hints.getWidth()
<< " unroll=" << Hints.getInterleave() << "\n");
Function *F = L->getHeader()->getParent();
if (!Hints.allowVectorization(F, L, AlwaysVectorize)) {
DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
return false;
}
const unsigned TC = SE->getSmallConstantTripCount(L);
if (TC > 0u && TC < TinyTripCountVectorThreshold) {
DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
<< "This loop is not worth vectorizing.");
if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
else {
DEBUG(dbgs() << "\n");
emitAnalysisDiag(F, L, Hints, VectorizationReport()
<< "vectorization is not beneficial "
"and is not explicitly forced");
return false;
}
}
SCEVUnionPredicate Preds;
LoopVectorizationRequirements Requirements;
LoopVectorizationLegality LVL(L, SE, DT, TLI, AA, F, TTI, LAA,
&Requirements, &Hints, Preds);
if (!LVL.canVectorize()) {
DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
emitMissedWarning(F, L, Hints);
return false;
}
SmallPtrSet<const Value *, 32> ValuesToIgnore;
CodeMetrics::collectEphemeralValues(L, AC, ValuesToIgnore);
for (auto &Reduction : *LVL.getReductionVars()) {
RecurrenceDescriptor &RedDes = Reduction.second;
SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
ValuesToIgnore.insert(Casts.begin(), Casts.end());
}
LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, TLI, DB, AC, F, &Hints,
ValuesToIgnore, Preds);
bool OptForSize = Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
F->optForSize();
if (LoopVectorizeWithBlockFrequency) {
BlockFrequency LoopEntryFreq = BFI->getBlockFreq(L->getLoopPreheader());
if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
LoopEntryFreq < ColdEntryFreq)
OptForSize = true;
}
if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
"attribute is used.\n");
emitAnalysisDiag(
F, L, Hints,
VectorizationReport()
<< "loop not vectorized due to NoImplicitFloat attribute");
emitMissedWarning(F, L, Hints);
return false;
}
const LoopVectorizationCostModel::VectorizationFactor VF =
CM.selectVectorizationFactor(OptForSize);
unsigned IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost);
unsigned UserIC = Hints.getInterleave();
std::string VecDiagMsg, IntDiagMsg;
bool VectorizeLoop = true, InterleaveLoop = true;
if (Requirements.doesNotMeet(F, L, Hints)) {
DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
"requirements.\n");
emitMissedWarning(F, L, Hints);
return false;
}
if (VF.Width == 1) {
DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
VecDiagMsg =
"the cost-model indicates that vectorization is not beneficial";
VectorizeLoop = false;
}
if (IC == 1 && UserIC <= 1) {
DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
IntDiagMsg =
"the cost-model indicates that interleaving is not beneficial";
InterleaveLoop = false;
if (UserIC == 1)
IntDiagMsg +=
" and is explicitly disabled or interleave count is set to 1";
} else if (IC > 1 && UserIC == 1) {
DEBUG(dbgs()
<< "LV: Interleaving is beneficial but is explicitly disabled.");
IntDiagMsg = "the cost-model indicates that interleaving is beneficial "
"but is explicitly disabled or interleave count is set to 1";
InterleaveLoop = false;
}
IC = UserIC > 0 ? UserIC : IC;
const char *VAPassName = Hints.vectorizeAnalysisPassName();
if (!VectorizeLoop && !InterleaveLoop) {
emitOptimizationRemarkAnalysis(F->getContext(), VAPassName, *F,
L->getStartLoc(), VecDiagMsg);
emitOptimizationRemarkAnalysis(F->getContext(), LV_NAME, *F,
L->getStartLoc(), IntDiagMsg);
return false;
} else if (!VectorizeLoop && InterleaveLoop) {
DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
emitOptimizationRemarkAnalysis(F->getContext(), VAPassName, *F,
L->getStartLoc(), VecDiagMsg);
} else if (VectorizeLoop && !InterleaveLoop) {
DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
<< DebugLocStr << '\n');
emitOptimizationRemarkAnalysis(F->getContext(), LV_NAME, *F,
L->getStartLoc(), IntDiagMsg);
} else if (VectorizeLoop && InterleaveLoop) {
DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
<< DebugLocStr << '\n');
DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
}
if (!VectorizeLoop) {
assert(IC > 1 && "interleave count should not be 1 or 0");
InnerLoopUnroller Unroller(L, SE, LI, DT, TLI, TTI, IC, Preds);
Unroller.vectorize(&LVL, CM.MinBWs);
emitOptimizationRemark(F->getContext(), LV_NAME, *F, L->getStartLoc(),
Twine("interleaved loop (interleaved count: ") +
Twine(IC) + ")");
} else {
InnerLoopVectorizer LB(L, SE, LI, DT, TLI, TTI, VF.Width, IC, Preds);
LB.vectorize(&LVL, CM.MinBWs);
++LoopsVectorized;
if (!LB.IsSafetyChecksAdded())
AddRuntimeUnrollDisableMetaData(L);
emitOptimizationRemark(F->getContext(), LV_NAME, *F, L->getStartLoc(),
Twine("vectorized loop (vectorization width: ") +
Twine(VF.Width) + ", interleaved count: " +
Twine(IC) + ")");
}
Hints.setAlreadyVectorized();
DEBUG(verifyFunction(*L->getHeader()->getParent()));
return true;
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();
AU.addRequiredID(LoopSimplifyID);
AU.addRequiredID(LCSSAID);
AU.addRequired<BlockFrequencyInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<LoopInfoWrapperPass>();
AU.addRequired<ScalarEvolutionWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<LoopAccessAnalysis>();
AU.addRequired<DemandedBits>();
AU.addPreserved<LoopInfoWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<BasicAAWrapperPass>();
AU.addPreserved<AAResultsWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
}
};
}
Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
Instruction *Instr = dyn_cast<Instruction>(V);
bool NewInstr =
(Instr && std::find(LoopVectorBody.begin(), LoopVectorBody.end(),
Instr->getParent()) != LoopVectorBody.end());
bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;
IRBuilder<>::InsertPointGuard Guard(Builder);
if (Invariant)
Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
return Shuf;
}
Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx,
Value *Step) {
assert(Val->getType()->isVectorTy() && "Must be a vector");
assert(Val->getType()->getScalarType()->isIntegerTy() &&
"Elem must be an integer");
assert(Step->getType() == Val->getType()->getScalarType() &&
"Step has wrong type");
Type *ITy = Val->getType()->getScalarType();
VectorType *Ty = cast<VectorType>(Val->getType());
int VLen = Ty->getNumElements();
SmallVector<Constant*, 8> Indices;
for (int i = 0; i < VLen; ++i)
Indices.push_back(ConstantInt::get(ITy, StartIdx + i));
Constant *Cv = ConstantVector::get(Indices);
assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
Step = Builder.CreateVectorSplat(VLen, Step);
assert(Step->getType() == Val->getType() && "Invalid step vec");
Step = Builder.CreateMul(Cv, Step);
return Builder.CreateAdd(Val, Step, "induction");
}
int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr");
if (Ptr->getType()->getPointerElementType()->isAggregateType())
return 0;
PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr);
if (Phi && Inductions.count(Phi)) {
InductionDescriptor II = Inductions[Phi];
return II.getConsecutiveDirection();
}
GetElementPtrInst *Gep = getGEPInstruction(Ptr);
if (!Gep)
return 0;
unsigned NumOperands = Gep->getNumOperands();
Value *GpPtr = Gep->getPointerOperand();
Phi = dyn_cast<PHINode>(GpPtr);
if (Phi && Inductions.count(Phi)) {
PointerType *GepPtrType = cast<PointerType>(GpPtr->getType());
if (GepPtrType->getElementType()->isAggregateType())
return 0;
for (unsigned i = 1; i < NumOperands; ++i)
if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
return 0;
InductionDescriptor II = Inductions[Phi];
return II.getConsecutiveDirection();
}
unsigned InductionOperand = getGEPInductionOperand(Gep);
for (unsigned i = 0; i != NumOperands; ++i)
if (i != InductionOperand &&
!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
return 0;
const SCEV *Last = nullptr;
if (!Strides.count(Gep))
Last = SE->getSCEV(Gep->getOperand(InductionOperand));
else {
Last = replaceSymbolicStrideSCEV(SE, Strides, Preds,
Gep->getOperand(InductionOperand), Gep);
if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(Last))
Last =
(C->getSCEVType() == scSignExtend || C->getSCEVType() == scZeroExtend)
? C->getOperand()
: Last;
}
if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) {
const SCEV *Step = AR->getStepRecurrence(*SE);
if (Step->isOne())
return 1;
if (Step->isAllOnesValue())
return -1;
}
return 0;
}
bool LoopVectorizationLegality::isUniform(Value *V) {
return LAI->isUniform(V);
}
InnerLoopVectorizer::VectorParts&
InnerLoopVectorizer::getVectorValue(Value *V) {
assert(V != Induction && "The new induction variable should not be used.");
assert(!V->getType()->isVectorTy() && "Can't widen a vector");
if (Legal->hasStride(V))
V = ConstantInt::get(V->getType(), 1);
if (WidenMap.has(V))
return WidenMap.get(V);
Value *B = getBroadcastInstrs(V);
return WidenMap.splat(V, B);
}
Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
assert(Vec->getType()->isVectorTy() && "Invalid type");
SmallVector<Constant*, 8> ShuffleMask;
for (unsigned i = 0; i < VF; ++i)
ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
ConstantVector::get(ShuffleMask),
"reverse");
}
static Constant *getInterleavedMask(IRBuilder<> &Builder, unsigned VF,
unsigned NumVec) {
SmallVector<Constant *, 16> Mask;
for (unsigned i = 0; i < VF; i++)
for (unsigned j = 0; j < NumVec; j++)
Mask.push_back(Builder.getInt32(j * VF + i));
return ConstantVector::get(Mask);
}
static Constant *getStridedMask(IRBuilder<> &Builder, unsigned Start,
unsigned Stride, unsigned VF) {
SmallVector<Constant *, 16> Mask;
for (unsigned i = 0; i < VF; i++)
Mask.push_back(Builder.getInt32(Start + i * Stride));
return ConstantVector::get(Mask);
}
static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned NumInt,
unsigned NumUndef) {
SmallVector<Constant *, 16> Mask;
for (unsigned i = 0; i < NumInt; i++)
Mask.push_back(Builder.getInt32(i));
Constant *Undef = UndefValue::get(Builder.getInt32Ty());
for (unsigned i = 0; i < NumUndef; i++)
Mask.push_back(Undef);
return ConstantVector::get(Mask);
}
static Value *ConcatenateTwoVectors(IRBuilder<> &Builder, Value *V1,
Value *V2) {
VectorType *VecTy1 = dyn_cast<VectorType>(V1->getType());
VectorType *VecTy2 = dyn_cast<VectorType>(V2->getType());
assert(VecTy1 && VecTy2 &&
VecTy1->getScalarType() == VecTy2->getScalarType() &&
"Expect two vectors with the same element type");
unsigned NumElts1 = VecTy1->getNumElements();
unsigned NumElts2 = VecTy2->getNumElements();
assert(NumElts1 >= NumElts2 && "Unexpect the first vector has less elements");
if (NumElts1 > NumElts2) {
Constant *ExtMask =
getSequentialMask(Builder, NumElts2, NumElts1 - NumElts2);
V2 = Builder.CreateShuffleVector(V2, UndefValue::get(VecTy2), ExtMask);
}
Constant *Mask = getSequentialMask(Builder, NumElts1 + NumElts2, 0);
return Builder.CreateShuffleVector(V1, V2, Mask);
}
static Value *ConcatenateVectors(IRBuilder<> &Builder,
ArrayRef<Value *> InputList) {
unsigned NumVec = InputList.size();
assert(NumVec > 1 && "Should be at least two vectors");
SmallVector<Value *, 8> ResList;
ResList.append(InputList.begin(), InputList.end());
do {
SmallVector<Value *, 8> TmpList;
for (unsigned i = 0; i < NumVec - 1; i += 2) {
Value *V0 = ResList[i], *V1 = ResList[i + 1];
assert((V0->getType() == V1->getType() || i == NumVec - 2) &&
"Only the last vector may have a different type");
TmpList.push_back(ConcatenateTwoVectors(Builder, V0, V1));
}
if (NumVec % 2 != 0)
TmpList.push_back(ResList[NumVec - 1]);
ResList = TmpList;
NumVec = ResList.size();
} while (NumVec > 1);
return ResList[0];
}
void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
const InterleaveGroup *Group = Legal->getInterleavedAccessGroup(Instr);
assert(Group && "Fail to get an interleaved access group.");
if (Instr != Group->getInsertPos())
return;
LoadInst *LI = dyn_cast<LoadInst>(Instr);
StoreInst *SI = dyn_cast<StoreInst>(Instr);
Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
Type *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
unsigned InterleaveFactor = Group->getFactor();
Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
Type *PtrTy = VecTy->getPointerTo(Ptr->getType()->getPointerAddressSpace());
setDebugLocFromInst(Builder, Ptr);
VectorParts &PtrParts = getVectorValue(Ptr);
SmallVector<Value *, 2> NewPtrs;
unsigned Index = Group->getIndex(Instr);
for (unsigned Part = 0; Part < UF; Part++) {
Value *NewPtr = Builder.CreateExtractElement(
PtrParts[Part],
Group->isReverse() ? Builder.getInt32(VF - 1) : Builder.getInt32(0));
NewPtr = Builder.CreateGEP(NewPtr, Builder.getInt32(-Index));
NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
}
setDebugLocFromInst(Builder, Instr);
Value *UndefVec = UndefValue::get(VecTy);
if (LI) {
for (unsigned Part = 0; Part < UF; Part++) {
Instruction *NewLoadInstr = Builder.CreateAlignedLoad(
NewPtrs[Part], Group->getAlignment(), "wide.vec");
for (unsigned i = 0; i < InterleaveFactor; i++) {
Instruction *Member = Group->getMember(i);
if (!Member)
continue;
Constant *StrideMask = getStridedMask(Builder, i, InterleaveFactor, VF);
Value *StridedVec = Builder.CreateShuffleVector(
NewLoadInstr, UndefVec, StrideMask, "strided.vec");
if (Member->getType() != ScalarTy) {
VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
StridedVec = Builder.CreateBitOrPointerCast(StridedVec, OtherVTy);
}
VectorParts &Entry = WidenMap.get(Member);
Entry[Part] =
Group->isReverse() ? reverseVector(StridedVec) : StridedVec;
}
propagateMetadata(NewLoadInstr, Instr);
}
return;
}
VectorType *SubVT = VectorType::get(ScalarTy, VF);
for (unsigned Part = 0; Part < UF; Part++) {
SmallVector<Value *, 4> StoredVecs;
for (unsigned i = 0; i < InterleaveFactor; i++) {
Instruction *Member = Group->getMember(i);
assert(Member && "Fail to get a member from an interleaved store group");
Value *StoredVec =
getVectorValue(dyn_cast<StoreInst>(Member)->getValueOperand())[Part];
if (Group->isReverse())
StoredVec = reverseVector(StoredVec);
if (StoredVec->getType() != SubVT)
StoredVec = Builder.CreateBitOrPointerCast(StoredVec, SubVT);
StoredVecs.push_back(StoredVec);
}
Value *WideVec = ConcatenateVectors(Builder, StoredVecs);
Constant *IMask = getInterleavedMask(Builder, VF, InterleaveFactor);
Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
"interleaved.vec");
Instruction *NewStoreInstr =
Builder.CreateAlignedStore(IVec, NewPtrs[Part], Group->getAlignment());
propagateMetadata(NewStoreInstr, Instr);
}
}
void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
LoadInst *LI = dyn_cast<LoadInst>(Instr);
StoreInst *SI = dyn_cast<StoreInst>(Instr);
assert((LI || SI) && "Invalid Load/Store instruction");
if (Legal->isAccessInterleaved(Instr))
return vectorizeInterleaveGroup(Instr);
Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType();
Type *DataTy = VectorType::get(ScalarDataTy, VF);
Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment();
const DataLayout &DL = Instr->getModule()->getDataLayout();
if (!Alignment)
Alignment = DL.getABITypeAlignment(ScalarDataTy);
unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
unsigned ScalarAllocatedSize = DL.getTypeAllocSize(ScalarDataTy);
unsigned VectorElementSize = DL.getTypeStoreSize(DataTy) / VF;
if (SI && Legal->blockNeedsPredication(SI->getParent()) &&
!Legal->isMaskRequired(SI))
return scalarizeInstruction(Instr, true);
if (ScalarAllocatedSize != VectorElementSize)
return scalarizeInstruction(Instr);
int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
bool Reverse = ConsecutiveStride < 0;
bool UniformLoad = LI && Legal->isUniform(Ptr);
if (!ConsecutiveStride || UniformLoad)
return scalarizeInstruction(Instr);
Constant *Zero = Builder.getInt32(0);
VectorParts &Entry = WidenMap.get(Instr);
GetElementPtrInst *Gep = getGEPInstruction(Ptr);
if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) {
setDebugLocFromInst(Builder, Gep);
Value *PtrOperand = Gep->getPointerOperand();
Value *FirstBasePtr = getVectorValue(PtrOperand)[0];
FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero);
GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
Gep2->setOperand(0, FirstBasePtr);
Gep2->setName("gep.indvar.base");
Ptr = Builder.Insert(Gep2);
} else if (Gep) {
setDebugLocFromInst(Builder, Gep);
assert(SE->isLoopInvariant(SE->getSCEV(Gep->getPointerOperand()),
OrigLoop) && "Base ptr must be invariant");
unsigned NumOperands = Gep->getNumOperands();
unsigned InductionOperand = getGEPInductionOperand(Gep);
GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
for (unsigned i = 0; i < NumOperands; ++i) {
Value *GepOperand = Gep->getOperand(i);
Instruction *GepOperandInst = dyn_cast<Instruction>(GepOperand);
if (i == InductionOperand ||
(GepOperandInst && OrigLoop->contains(GepOperandInst))) {
assert((i == InductionOperand ||
SE->isLoopInvariant(SE->getSCEV(GepOperandInst), OrigLoop)) &&
"Must be last index or loop invariant");
VectorParts &GEPParts = getVectorValue(GepOperand);
Value *Index = GEPParts[0];
Index = Builder.CreateExtractElement(Index, Zero);
Gep2->setOperand(i, Index);
Gep2->setName("gep.indvar.idx");
}
}
Ptr = Builder.Insert(Gep2);
} else {
assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
setDebugLocFromInst(Builder, Ptr);
VectorParts &PtrVal = getVectorValue(Ptr);
Ptr = Builder.CreateExtractElement(PtrVal[0], Zero);
}
VectorParts Mask = createBlockInMask(Instr->getParent());
if (SI) {
assert(!Legal->isUniform(SI->getPointerOperand()) &&
"We do not allow storing to uniform addresses");
setDebugLocFromInst(Builder, SI);
VectorParts StoredVal = getVectorValue(SI->getValueOperand());
for (unsigned Part = 0; Part < UF; ++Part) {
Value *PartPtr =
Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));
if (Reverse) {
StoredVal[Part] = reverseVector(StoredVal[Part]);
PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
Mask[Part] = reverseVector(Mask[Part]);
}
Value *VecPtr = Builder.CreateBitCast(PartPtr,
DataTy->getPointerTo(AddressSpace));
Instruction *NewSI;
if (Legal->isMaskRequired(SI))
NewSI = Builder.CreateMaskedStore(StoredVal[Part], VecPtr, Alignment,
Mask[Part]);
else
NewSI = Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment);
propagateMetadata(NewSI, SI);
}
return;
}
assert(LI && "Must have a load instruction");
setDebugLocFromInst(Builder, LI);
for (unsigned Part = 0; Part < UF; ++Part) {
Value *PartPtr =
Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));
if (Reverse) {
PartPtr = Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(-Part * VF));
PartPtr = Builder.CreateGEP(nullptr, PartPtr, Builder.getInt32(1 - VF));
Mask[Part] = reverseVector(Mask[Part]);
}
Instruction* NewLI;
Value *VecPtr = Builder.CreateBitCast(PartPtr,
DataTy->getPointerTo(AddressSpace));
if (Legal->isMaskRequired(LI))
NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
UndefValue::get(DataTy),
"wide.masked.load");
else
NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load");
propagateMetadata(NewLI, LI);
Entry[Part] = Reverse ? reverseVector(NewLI) : NewLI;
}
}
void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredicateStore) {
assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
SmallVector<VectorParts, 4> Params;
setDebugLocFromInst(Builder, Instr);
for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
Value *SrcOp = Instr->getOperand(op);
if (SrcOp == OldInduction) {
Params.push_back(getVectorValue(SrcOp));
continue;
}
Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);
if (SrcInst && OrigLoop->contains(SrcInst)) {
assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
Params.push_back(WidenMap.get(SrcInst));
} else {
VectorParts Scalars;
Scalars.append(UF, SrcOp);
Params.push_back(Scalars);
}
}
assert(Params.size() == Instr->getNumOperands() &&
"Invalid number of operands");
bool IsVoidRetTy = Instr->getType()->isVoidTy();
Value *UndefVec = IsVoidRetTy ? nullptr :
UndefValue::get(VectorType::get(Instr->getType(), VF));
VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
VectorParts Cond;
if (IfPredicateStore) {
assert(Instr->getParent()->getSinglePredecessor() &&
"Only support single predecessor blocks");
Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
Instr->getParent());
}
for (unsigned Part = 0; Part < UF; ++Part) {
for (unsigned Width = 0; Width < VF; ++Width) {
Value *Cmp = nullptr;
if (IfPredicateStore) {
Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Width));
Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, ConstantInt::get(Cmp->getType(), 1));
}
Instruction *Cloned = Instr->clone();
if (!IsVoidRetTy)
Cloned->setName(Instr->getName() + ".cloned");
for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
Value *Op = Params[op][Part];
if (Op->getType()->isVectorTy())
Op = Builder.CreateExtractElement(Op, Builder.getInt32(Width));
Cloned->setOperand(op, Op);
}
Builder.Insert(Cloned);
if (!IsVoidRetTy)
VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned,
Builder.getInt32(Width));
if (IfPredicateStore)
PredicatedStores.push_back(std::make_pair(cast<StoreInst>(Cloned),
Cmp));
}
}
}
PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
Value *End, Value *Step,
Instruction *DL) {
BasicBlock *Header = L->getHeader();
BasicBlock *Latch = L->getLoopLatch();
if (!Latch)
Latch = Header;
IRBuilder<> Builder(&*Header->getFirstInsertionPt());
setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction));
auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
Builder.SetInsertPoint(Latch->getTerminator());
Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
Induction->addIncoming(Start, L->getLoopPreheader());
Induction->addIncoming(Next, Latch);
Value *ICmp = Builder.CreateICmpEQ(Next, End);
Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
Latch->getTerminator()->eraseFromParent();
return Induction;
}
Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
if (TripCount)
return TripCount;
IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(OrigLoop);
assert(BackedgeTakenCount != SE->getCouldNotCompute() && "Invalid loop count");
Type *IdxTy = Legal->getWidestInductionType();
if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
IdxTy->getPrimitiveSizeInBits())
BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
const SCEV *ExitCount = SE->getAddExpr(
BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
SCEVExpander Exp(*SE, DL, "induction");
TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
L->getLoopPreheader()->getTerminator());
if (TripCount->getType()->isPointerTy())
TripCount =
CastInst::CreatePointerCast(TripCount, IdxTy,
"exitcount.ptrcnt.to.int",
L->getLoopPreheader()->getTerminator());
return TripCount;
}
Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
if (VectorTripCount)
return VectorTripCount;
Value *TC = getOrCreateTripCount(L);
IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
Constant *Step = ConstantInt::get(TC->getType(), VF * UF);
Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
return VectorTripCount;
}
void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
BasicBlock *Bypass) {
Value *Count = getOrCreateTripCount(L);
BasicBlock *BB = L->getLoopPreheader();
IRBuilder<> Builder(BB->getTerminator());
Value *CheckMinIters =
Builder.CreateICmpULT(Count,
ConstantInt::get(Count->getType(), VF * UF),
"min.iters.check");
BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(),
"min.iters.checked");
if (L->getParentLoop())
L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
ReplaceInstWithInst(BB->getTerminator(),
BranchInst::Create(Bypass, NewBB, CheckMinIters));
LoopBypassBlocks.push_back(BB);
}
void InnerLoopVectorizer::emitVectorLoopEnteredCheck(Loop *L,
BasicBlock *Bypass) {
Value *TC = getOrCreateVectorTripCount(L);
BasicBlock *BB = L->getLoopPreheader();
IRBuilder<> Builder(BB->getTerminator());
Value *Cmp = Builder.CreateICmpEQ(TC, Constant::getNullValue(TC->getType()),
"cmp.zero");
BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(),
"vector.ph");
if (L->getParentLoop())
L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
ReplaceInstWithInst(BB->getTerminator(),
BranchInst::Create(Bypass, NewBB, Cmp));
LoopBypassBlocks.push_back(BB);
}
void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
BasicBlock *BB = L->getLoopPreheader();
SCEVExpander Exp(*SE, Bypass->getModule()->getDataLayout(), "scev.check");
Value *SCEVCheck = Exp.expandCodeForPredicate(&Preds, BB->getTerminator());
if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
if (C->isZero())
return;
BB->setName("vector.scevcheck");
auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
if (L->getParentLoop())
L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
ReplaceInstWithInst(BB->getTerminator(),
BranchInst::Create(Bypass, NewBB, SCEVCheck));
LoopBypassBlocks.push_back(BB);
AddedSafetyChecks = true;
}
void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L,
BasicBlock *Bypass) {
BasicBlock *BB = L->getLoopPreheader();
Instruction *FirstCheckInst;
Instruction *MemRuntimeCheck;
std::tie(FirstCheckInst, MemRuntimeCheck) =
Legal->getLAI()->addRuntimeChecks(BB->getTerminator());
if (!MemRuntimeCheck)
return;
BB->setName("vector.memcheck");
auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
if (L->getParentLoop())
L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
ReplaceInstWithInst(BB->getTerminator(),
BranchInst::Create(Bypass, NewBB, MemRuntimeCheck));
LoopBypassBlocks.push_back(BB);
AddedSafetyChecks = true;
}
void InnerLoopVectorizer::createEmptyLoop() {
BasicBlock *OldBasicBlock = OrigLoop->getHeader();
BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
BasicBlock *ExitBlock = OrigLoop->getExitBlock();
assert(VectorPH && "Invalid loop structure");
assert(ExitBlock && "Must have an exit block");
OldInduction = Legal->getInduction();
Type *IdxTy = Legal->getWidestInductionType();
BasicBlock *VecBody =
VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
BasicBlock *MiddleBlock =
VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
BasicBlock *ScalarPH =
MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
Loop* Lp = new Loop();
Loop *ParentLoop = OrigLoop->getParentLoop();
if (ParentLoop) {
ParentLoop->addChildLoop(Lp);
ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
} else {
LI->addTopLevelLoop(Lp);
}
Lp->addBasicBlockToLoop(VecBody, *LI);
Value *Count = getOrCreateTripCount(Lp);
Value *StartIdx = ConstantInt::get(IdxTy, 0);
emitMinimumIterationCountCheck(Lp, ScalarPH);
emitVectorLoopEnteredCheck(Lp, ScalarPH);
emitSCEVChecks(Lp, ScalarPH);
emitMemRuntimeChecks(Lp, ScalarPH);
Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
Constant *Step = ConstantInt::get(IdxTy, VF * UF);
Induction =
createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
getDebugLocFromInstOrOperands(OldInduction));
LoopVectorizationLegality::InductionList::iterator I, E;
LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
for (I = List->begin(), E = List->end(); I != E; ++I) {
PHINode *OrigPhi = I->first;
InductionDescriptor II = I->second;
PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3,
"bc.resume.val",
ScalarPH->getTerminator());
Value *EndValue;
if (OrigPhi == OldInduction) {
EndValue = CountRoundDown;
} else {
IRBuilder<> B(LoopBypassBlocks.back()->getTerminator());
Value *CRD = B.CreateSExtOrTrunc(CountRoundDown,
II.getStepValue()->getType(),
"cast.crd");
EndValue = II.transform(B, CRD);
EndValue->setName("ind.end");
}
BCResumeVal->addIncoming(EndValue, MiddleBlock);
unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
BCResumeVal->addIncoming(II.getStartValue(), LoopBypassBlocks[I]);
OrigPhi->setIncomingValue(BlockIdx, BCResumeVal);
}
Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
CountRoundDown, "cmp.n",
MiddleBlock->getTerminator());
ReplaceInstWithInst(MiddleBlock->getTerminator(),
BranchInst::Create(ExitBlock, ScalarPH, CmpN));
Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt());
LoopVectorPreHeader = Lp->getLoopPreheader();
LoopScalarPreHeader = ScalarPH;
LoopMiddleBlock = MiddleBlock;
LoopExitBlock = ExitBlock;
LoopVectorBody.push_back(VecBody);
LoopScalarBody = OldBasicBlock;
LoopVectorizeHints Hints(Lp, true);
Hints.setAlreadyVectorized();
}
namespace {
struct CSEDenseMapInfo {
static bool canHandle(Instruction *I) {
return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
}
static inline Instruction *getEmptyKey() {
return DenseMapInfo<Instruction *>::getEmptyKey();
}
static inline Instruction *getTombstoneKey() {
return DenseMapInfo<Instruction *>::getTombstoneKey();
}
static unsigned getHashValue(Instruction *I) {
assert(canHandle(I) && "Unknown instruction!");
return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
I->value_op_end()));
}
static bool isEqual(Instruction *LHS, Instruction *RHS) {
if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
LHS == getTombstoneKey() || RHS == getTombstoneKey())
return LHS == RHS;
return LHS->isIdenticalTo(RHS);
}
};
}
static bool isPredicatedBlock(unsigned BlockNum) {
return BlockNum % 2;
}
static void cse(SmallVector<BasicBlock *, 4> &BBs) {
SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
for (unsigned i = 0, e = BBs.size(); i != e; ++i) {
BasicBlock *BB = BBs[i];
for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
Instruction *In = &*I++;
if (!CSEDenseMapInfo::canHandle(In))
continue;
if (Instruction *V = CSEMap.lookup(In)) {
In->replaceAllUsesWith(V);
In->eraseFromParent();
continue;
}
if (isPredicatedBlock(i))
continue;
CSEMap[In] = In;
}
}
}
static Value *addFastMathFlag(Value *V) {
if (isa<FPMathOperator>(V)){
FastMathFlags Flags;
Flags.setUnsafeAlgebra();
cast<Instruction>(V)->setFastMathFlags(Flags);
}
return V;
}
static unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract,
const TargetTransformInfo &TTI) {
if (Ty->isVoidTy())
return 0;
assert(Ty->isVectorTy() && "Can only scalarize vectors");
unsigned Cost = 0;
for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
if (Insert)
Cost += TTI.getVectorInstrCost(Instruction::InsertElement, Ty, i);
if (Extract)
Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, Ty, i);
}
return Cost;
}
static unsigned getVectorCallCost(CallInst *CI, unsigned VF,
const TargetTransformInfo &TTI,
const TargetLibraryInfo *TLI,
bool &NeedToScalarize) {
Function *F = CI->getCalledFunction();
StringRef FnName = CI->getCalledFunction()->getName();
Type *ScalarRetTy = CI->getType();
SmallVector<Type *, 4> Tys, ScalarTys;
for (auto &ArgOp : CI->arg_operands())
ScalarTys.push_back(ArgOp->getType());
unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
if (VF == 1)
return ScalarCallCost;
Type *RetTy = ToVectorTy(ScalarRetTy, VF);
for (unsigned i = 0, ie = ScalarTys.size(); i != ie; ++i)
Tys.push_back(ToVectorTy(ScalarTys[i], VF));
unsigned ScalarizationCost =
getScalarizationOverhead(RetTy, true, false, TTI);
for (unsigned i = 0, ie = Tys.size(); i != ie; ++i)
ScalarizationCost += getScalarizationOverhead(Tys[i], false, true, TTI);
unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
NeedToScalarize = true;
if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
return Cost;
unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
if (VectorCallCost < Cost) {
NeedToScalarize = false;
return VectorCallCost;
}
return Cost;
}
static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF,
const TargetTransformInfo &TTI,
const TargetLibraryInfo *TLI) {
Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
assert(ID && "Expected intrinsic call!");
Type *RetTy = ToVectorTy(CI->getType(), VF);
SmallVector<Type *, 4> Tys;
for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF));
return TTI.getIntrinsicInstrCost(ID, RetTy, Tys);
}
static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
IntegerType *I1 = cast<IntegerType>(T1->getVectorElementType());
IntegerType *I2 = cast<IntegerType>(T2->getVectorElementType());
return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
}
static Type *largestIntegerVectorType(Type *T1, Type *T2) {
IntegerType *I1 = cast<IntegerType>(T1->getVectorElementType());
IntegerType *I2 = cast<IntegerType>(T2->getVectorElementType());
return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
}
void InnerLoopVectorizer::truncateToMinimalBitwidths() {
for (auto &KV : MinBWs) {
VectorParts &Parts = WidenMap.get(KV.first);
for (Value *&I : Parts) {
if (I->use_empty())
continue;
Type *OriginalTy = I->getType();
Type *ScalarTruncatedTy = IntegerType::get(OriginalTy->getContext(),
KV.second);
Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
OriginalTy->getVectorNumElements());
if (TruncatedTy == OriginalTy)
continue;
IRBuilder<> B(cast<Instruction>(I));
auto ShrinkOperand = [&](Value *V) -> Value* {
if (auto *ZI = dyn_cast<ZExtInst>(V))
if (ZI->getSrcTy() == TruncatedTy)
return ZI->getOperand(0);
return B.CreateZExtOrTrunc(V, TruncatedTy);
};
Value *NewI = nullptr;
if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
NewI = B.CreateBinOp(BO->getOpcode(),
ShrinkOperand(BO->getOperand(0)),
ShrinkOperand(BO->getOperand(1)));
cast<BinaryOperator>(NewI)->copyIRFlags(I);
} else if (ICmpInst *CI = dyn_cast<ICmpInst>(I)) {
NewI = B.CreateICmp(CI->getPredicate(),
ShrinkOperand(CI->getOperand(0)),
ShrinkOperand(CI->getOperand(1)));
} else if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
NewI = B.CreateSelect(SI->getCondition(),
ShrinkOperand(SI->getTrueValue()),
ShrinkOperand(SI->getFalseValue()));
} else if (CastInst *CI = dyn_cast<CastInst>(I)) {
switch (CI->getOpcode()) {
default: llvm_unreachable("Unhandled cast!");
case Instruction::Trunc:
NewI = ShrinkOperand(CI->getOperand(0));
break;
case Instruction::SExt:
NewI = B.CreateSExtOrTrunc(CI->getOperand(0),
smallestIntegerVectorType(OriginalTy,
TruncatedTy));
break;
case Instruction::ZExt:
NewI = B.CreateZExtOrTrunc(CI->getOperand(0),
smallestIntegerVectorType(OriginalTy,
TruncatedTy));
break;
}
} else if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(I)) {
auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
auto *O0 =
B.CreateZExtOrTrunc(SI->getOperand(0),
VectorType::get(ScalarTruncatedTy, Elements0));
auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
auto *O1 =
B.CreateZExtOrTrunc(SI->getOperand(1),
VectorType::get(ScalarTruncatedTy, Elements1));
NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
} else if (isa<LoadInst>(I)) {
continue;
} else {
llvm_unreachable("Unhandled instruction type!");
}
NewI->takeName(cast<Instruction>(I));
Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
I->replaceAllUsesWith(Res);
cast<Instruction>(I)->eraseFromParent();
I = Res;
}
}
for (auto &KV : MinBWs) {
VectorParts &Parts = WidenMap.get(KV.first);
for (Value *&I : Parts) {
ZExtInst *Inst = dyn_cast<ZExtInst>(I);
if (Inst && Inst->use_empty()) {
Value *NewI = Inst->getOperand(0);
Inst->eraseFromParent();
I = NewI;
}
}
}
}
void InnerLoopVectorizer::vectorizeLoop() {
Constant *Zero = Builder.getInt32(0);
PhiVector RdxPHIsToFix;
LoopBlocksDFS DFS(OrigLoop);
DFS.perform(LI);
for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
be = DFS.endRPO(); bb != be; ++bb)
vectorizeBlockInLoop(*bb, &RdxPHIsToFix);
if (VF > 1)
truncateToMinimalBitwidths();
for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end();
it != e; ++it) {
PHINode *RdxPhi = *it;
assert(RdxPhi && "Unable to recover vectorized PHI");
assert(Legal->getReductionVars()->count(RdxPhi) &&
"Unable to find the reduction variable");
RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[RdxPhi];
RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
RdxDesc.getMinMaxRecurrenceKind();
setDebugLocFromInst(Builder, ReductionStartValue);
Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator());
VectorParts &VectorExit = getVectorValue(LoopExitInst);
Type *VecTy = VectorExit[0]->getType();
Value *Identity;
Value *VectorStart;
if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
RK == RecurrenceDescriptor::RK_FloatMinMax) {
if (VF == 1) {
VectorStart = Identity = ReductionStartValue;
} else {
VectorStart = Identity =
Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
}
} else {
Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
RK, VecTy->getScalarType());
if (VF == 1) {
Identity = Iden;
VectorStart = ReductionStartValue;
} else {
Identity = ConstantVector::getSplat(VF, Iden);
VectorStart =
Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
}
}
VectorParts &VecRdxPhi = WidenMap.get(RdxPhi);
BasicBlock *Latch = OrigLoop->getLoopLatch();
Value *LoopVal = RdxPhi->getIncomingValueForBlock(Latch);
VectorParts &Val = getVectorValue(LoopVal);
for (unsigned part = 0; part < UF; ++part) {
Value *StartVal = (part == 0) ? VectorStart : Identity;
cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal,
LoopVectorPreHeader);
cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part],
LoopVectorBody.back());
}
Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
VectorParts RdxParts = getVectorValue(LoopExitInst);
setDebugLocFromInst(Builder, LoopExitInst);
if (VF > 1 && RdxPhi->getType() != RdxDesc.getRecurrenceType()) {
Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
Builder.SetInsertPoint(LoopVectorBody.back()->getTerminator());
for (unsigned part = 0; part < UF; ++part) {
Value *Trunc = Builder.CreateTrunc(RdxParts[part], RdxVecTy);
Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
: Builder.CreateZExt(Trunc, VecTy);
for (Value::user_iterator UI = RdxParts[part]->user_begin();
UI != RdxParts[part]->user_end();)
if (*UI != Trunc) {
(*UI++)->replaceUsesOfWith(RdxParts[part], Extnd);
RdxParts[part] = Extnd;
} else {
++UI;
}
}
Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
for (unsigned part = 0; part < UF; ++part)
RdxParts[part] = Builder.CreateTrunc(RdxParts[part], RdxVecTy);
}
Value *ReducedPartRdx = RdxParts[0];
unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
setDebugLocFromInst(Builder, ReducedPartRdx);
for (unsigned part = 1; part < UF; ++part) {
if (Op != Instruction::ICmp && Op != Instruction::FCmp)
ReducedPartRdx = addFastMathFlag(
Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxParts[part],
ReducedPartRdx, "bin.rdx"));
else
ReducedPartRdx = RecurrenceDescriptor::createMinMaxOp(
Builder, MinMaxKind, ReducedPartRdx, RdxParts[part]);
}
if (VF > 1) {
assert(isPowerOf2_32(VF) &&
"Reduction emission only supported for pow2 vectors!");
Value *TmpVec = ReducedPartRdx;
SmallVector<Constant*, 32> ShuffleMask(VF, nullptr);
for (unsigned i = VF; i != 1; i >>= 1) {
for (unsigned j = 0; j != i/2; ++j)
ShuffleMask[j] = Builder.getInt32(i/2 + j);
std::fill(&ShuffleMask[i/2], ShuffleMask.end(),
UndefValue::get(Builder.getInt32Ty()));
Value *Shuf =
Builder.CreateShuffleVector(TmpVec,
UndefValue::get(TmpVec->getType()),
ConstantVector::get(ShuffleMask),
"rdx.shuf");
if (Op != Instruction::ICmp && Op != Instruction::FCmp)
TmpVec = addFastMathFlag(Builder.CreateBinOp(
(Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx"));
else
TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind,
TmpVec, Shuf);
}
ReducedPartRdx = Builder.CreateExtractElement(TmpVec,
Builder.getInt32(0));
if (RdxPhi->getType() != RdxDesc.getRecurrenceType())
ReducedPartRdx =
RdxDesc.isSigned()
? Builder.CreateSExt(ReducedPartRdx, RdxPhi->getType())
: Builder.CreateZExt(ReducedPartRdx, RdxPhi->getType());
}
PHINode *BCBlockPhi = PHINode::Create(RdxPhi->getType(), 2, "bc.merge.rdx",
LoopScalarPreHeader->getTerminator());
for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
if (!LCSSAPhi) break;
assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
if (LCSSAPhi->getIncomingValue(0) == LoopExitInst) {
LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
break;
}
}
int IncomingEdgeBlockIdx =
(RdxPhi)->getBasicBlockIndex(OrigLoop->getLoopLatch());
assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
(RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
(RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
}
fixLCSSAPHIs();
updateAnalysis();
for (auto KV : PredicatedStores) {
BasicBlock::iterator I(KV.first);
auto *BB = SplitBlock(I->getParent(), &*std::next(I), DT, LI);
auto *T = SplitBlockAndInsertIfThen(KV.second, &*I, false,
nullptr, DT);
I->moveBefore(T);
I->getParent()->setName("pred.store.if");
BB->setName("pred.store.continue");
}
DEBUG(DT->verifyDomTree());
cse(LoopVectorBody);
}
void InnerLoopVectorizer::fixLCSSAPHIs() {
for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
if (!LCSSAPhi) break;
if (LCSSAPhi->getNumIncomingValues() == 1)
LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()),
LoopMiddleBlock);
}
}
InnerLoopVectorizer::VectorParts
InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) &&
"Invalid edge");
std::pair<BasicBlock*, BasicBlock*> Edge(Src, Dst);
EdgeMaskCache::iterator ECEntryIt = MaskCache.find(Edge);
if (ECEntryIt != MaskCache.end())
return ECEntryIt->second;
VectorParts SrcMask = createBlockInMask(Src);
BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
assert(BI && "Unexpected terminator found");
if (BI->isConditional()) {
VectorParts EdgeMask = getVectorValue(BI->getCondition());
if (BI->getSuccessor(0) != Dst)
for (unsigned part = 0; part < UF; ++part)
EdgeMask[part] = Builder.CreateNot(EdgeMask[part]);
for (unsigned part = 0; part < UF; ++part)
EdgeMask[part] = Builder.CreateAnd(EdgeMask[part], SrcMask[part]);
MaskCache[Edge] = EdgeMask;
return EdgeMask;
}
MaskCache[Edge] = SrcMask;
return SrcMask;
}
InnerLoopVectorizer::VectorParts
InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
if (OrigLoop->getHeader() == BB) {
Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1);
return getVectorValue(C);
}
Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0);
VectorParts BlockMask = getVectorValue(Zero);
for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it) {
VectorParts EM = createEdgeMask(*it, BB);
for (unsigned part = 0; part < UF; ++part)
BlockMask[part] = Builder.CreateOr(BlockMask[part], EM[part]);
}
return BlockMask;
}
void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
InnerLoopVectorizer::VectorParts &Entry,
unsigned UF, unsigned VF, PhiVector *PV) {
PHINode* P = cast<PHINode>(PN);
if (Legal->getReductionVars()->count(P)) {
for (unsigned part = 0; part < UF; ++part) {
Type *VecTy = (VF == 1) ? PN->getType() :
VectorType::get(PN->getType(), VF);
Entry[part] = PHINode::Create(
VecTy, 2, "vec.phi", &*LoopVectorBody.back()->getFirstInsertionPt());
}
PV->push_back(P);
return;
}
setDebugLocFromInst(Builder, P);
if (P->getParent() != OrigLoop->getHeader()) {
unsigned NumIncoming = P->getNumIncomingValues();
for (unsigned In = 0; In < NumIncoming; In++) {
VectorParts Cond = createEdgeMask(P->getIncomingBlock(In),
P->getParent());
VectorParts &In0 = getVectorValue(P->getIncomingValue(In));
for (unsigned part = 0; part < UF; ++part) {
if (In == 0)
Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
In0[part]);
else
Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
Entry[part], "predphi");
}
}
return;
}
assert(Legal->getInductionVars()->count(P) &&
"Not an induction variable");
InductionDescriptor II = Legal->getInductionVars()->lookup(P);
switch (II.getKind()) {
case InductionDescriptor::IK_NoInduction:
llvm_unreachable("Unknown induction");
case InductionDescriptor::IK_IntInduction: {
assert(P->getType() == II.getStartValue()->getType() && "Types must match");
Value *V = Induction;
if (P != OldInduction) {
V = Builder.CreateSExtOrTrunc(Induction, P->getType());
V = II.transform(Builder, V);
V->setName("offset.idx");
}
Value *Broadcasted = getBroadcastInstrs(V);
for (unsigned part = 0; part < UF; ++part)
Entry[part] = getStepVector(Broadcasted, VF * part, II.getStepValue());
return;
}
case InductionDescriptor::IK_PtrInduction:
assert(P->getType()->isPointerTy() && "Unexpected type.");
Value *PtrInd = Induction;
PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStepValue()->getType());
for (unsigned part = 0; part < UF; ++part) {
if (VF == 1) {
int EltIndex = part;
Constant *Idx = ConstantInt::get(PtrInd->getType(), EltIndex);
Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
Value *SclrGep = II.transform(Builder, GlobalIdx);
SclrGep->setName("next.gep");
Entry[part] = SclrGep;
continue;
}
Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
for (unsigned int i = 0; i < VF; ++i) {
int EltIndex = i + part * VF;
Constant *Idx = ConstantInt::get(PtrInd->getType(), EltIndex);
Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
Value *SclrGep = II.transform(Builder, GlobalIdx);
SclrGep->setName("next.gep");
VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
Builder.getInt32(i),
"insert.gep");
}
Entry[part] = VecVal;
}
return;
}
}
void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
VectorParts &Entry = WidenMap.get(&*it);
switch (it->getOpcode()) {
case Instruction::Br:
continue;
case Instruction::PHI: {
widenPHIInstruction(&*it, Entry, UF, VF, PV);
continue;
}
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
case Instruction::FSub:
case Instruction::Mul:
case Instruction::FMul:
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::FDiv:
case Instruction::URem:
case Instruction::SRem:
case Instruction::FRem:
case Instruction::Shl:
case Instruction::LShr:
case Instruction::AShr:
case Instruction::And:
case Instruction::Or:
case Instruction::Xor: {
BinaryOperator *BinOp = dyn_cast<BinaryOperator>(it);
setDebugLocFromInst(Builder, BinOp);
VectorParts &A = getVectorValue(it->getOperand(0));
VectorParts &B = getVectorValue(it->getOperand(1));
for (unsigned Part = 0; Part < UF; ++Part) {
Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]);
if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V))
VecOp->copyIRFlags(BinOp);
Entry[Part] = V;
}
propagateMetadata(Entry, &*it);
break;
}
case Instruction::Select: {
bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(it->getOperand(0)),
OrigLoop);
setDebugLocFromInst(Builder, &*it);
VectorParts &Cond = getVectorValue(it->getOperand(0));
VectorParts &Op0 = getVectorValue(it->getOperand(1));
VectorParts &Op1 = getVectorValue(it->getOperand(2));
Value *ScalarCond = (VF == 1) ? Cond[0] :
Builder.CreateExtractElement(Cond[0], Builder.getInt32(0));
for (unsigned Part = 0; Part < UF; ++Part) {
Entry[Part] = Builder.CreateSelect(
InvariantCond ? ScalarCond : Cond[Part],
Op0[Part],
Op1[Part]);
}
propagateMetadata(Entry, &*it);
break;
}
case Instruction::ICmp:
case Instruction::FCmp: {
bool FCmp = (it->getOpcode() == Instruction::FCmp);
CmpInst *Cmp = dyn_cast<CmpInst>(it);
setDebugLocFromInst(Builder, &*it);
VectorParts &A = getVectorValue(it->getOperand(0));
VectorParts &B = getVectorValue(it->getOperand(1));
for (unsigned Part = 0; Part < UF; ++Part) {
Value *C = nullptr;
if (FCmp) {
C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]);
cast<FCmpInst>(C)->copyFastMathFlags(&*it);
} else {
C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]);
}
Entry[Part] = C;
}
propagateMetadata(Entry, &*it);
break;
}
case Instruction::Store:
case Instruction::Load:
vectorizeMemoryInstruction(&*it);
break;
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
case Instruction::FPToSI:
case Instruction::FPExt:
case Instruction::PtrToInt:
case Instruction::IntToPtr:
case Instruction::SIToFP:
case Instruction::UIToFP:
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::BitCast: {
CastInst *CI = dyn_cast<CastInst>(it);
setDebugLocFromInst(Builder, &*it);
if (CI->getOperand(0) == OldInduction &&
it->getOpcode() == Instruction::Trunc) {
Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction,
CI->getType());
Value *Broadcasted = getBroadcastInstrs(ScalarCast);
InductionDescriptor II = Legal->getInductionVars()->lookup(OldInduction);
Constant *Step =
ConstantInt::getSigned(CI->getType(), II.getStepValue()->getSExtValue());
for (unsigned Part = 0; Part < UF; ++Part)
Entry[Part] = getStepVector(Broadcasted, VF * Part, Step);
propagateMetadata(Entry, &*it);
break;
}
Type *DestTy = (VF == 1) ? CI->getType() :
VectorType::get(CI->getType(), VF);
VectorParts &A = getVectorValue(it->getOperand(0));
for (unsigned Part = 0; Part < UF; ++Part)
Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy);
propagateMetadata(Entry, &*it);
break;
}
case Instruction::Call: {
if (isa<DbgInfoIntrinsic>(it))
break;
setDebugLocFromInst(Builder, &*it);
Module *M = BB->getParent()->getParent();
CallInst *CI = cast<CallInst>(it);
StringRef FnName = CI->getCalledFunction()->getName();
Function *F = CI->getCalledFunction();
Type *RetTy = ToVectorTy(CI->getType(), VF);
SmallVector<Type *, 4> Tys;
for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF));
Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
if (ID &&
(ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
ID == Intrinsic::lifetime_start)) {
scalarizeInstruction(&*it);
break;
}
bool NeedToScalarize;
unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
bool UseVectorIntrinsic =
ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
if (!UseVectorIntrinsic && NeedToScalarize) {
scalarizeInstruction(&*it);
break;
}
for (unsigned Part = 0; Part < UF; ++Part) {
SmallVector<Value *, 4> Args;
for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
Value *Arg = CI->getArgOperand(i);
if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) {
VectorParts &VectorArg = getVectorValue(CI->getArgOperand(i));
Arg = VectorArg[Part];
}
Args.push_back(Arg);
}
Function *VectorF;
if (UseVectorIntrinsic) {
Type *TysForDecl[] = {CI->getType()};
if (VF > 1)
TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
} else {
StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
assert(!VFnName.empty() && "Vector function name is empty.");
VectorF = M->getFunction(VFnName);
if (!VectorF) {
FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
VectorF =
Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
VectorF->copyAttributesFrom(F);
}
}
assert(VectorF && "Can't create vector function.");
Entry[Part] = Builder.CreateCall(VectorF, Args);
}
propagateMetadata(Entry, &*it);
break;
}
default:
scalarizeInstruction(&*it);
break;
} }}
void InnerLoopVectorizer::updateAnalysis() {
SE->forgetLoop(OrigLoop);
assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
"Entry does not dominate exit.");
for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]);
DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back());
assert(LoopVectorBody.size() == 1 && "Expected single block loop!");
DT->addNewBlock(LoopVectorBody[0], LoopVectorPreHeader);
DT->addNewBlock(LoopMiddleBlock, LoopVectorBody.back());
DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
DEBUG(DT->verifyDomTree());
}
static bool canIfConvertPHINodes(BasicBlock *BB) {
for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
PHINode *Phi = dyn_cast<PHINode>(I);
if (!Phi)
return true;
for (unsigned p = 0, e = Phi->getNumIncomingValues(); p != e; ++p)
if (Constant *C = dyn_cast<Constant>(Phi->getIncomingValue(p)))
if (C->canTrap())
return false;
}
return true;
}
bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
if (!EnableIfConversion) {
emitAnalysis(VectorizationReport() << "if-conversion is disabled");
return false;
}
assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
SmallPtrSet<Value *, 8> SafePointes;
for (Loop::block_iterator BI = TheLoop->block_begin(),
BE = TheLoop->block_end(); BI != BE; ++BI) {
BasicBlock *BB = *BI;
if (blockNeedsPredication(BB))
continue;
for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
if (LoadInst *LI = dyn_cast<LoadInst>(I))
SafePointes.insert(LI->getPointerOperand());
else if (StoreInst *SI = dyn_cast<StoreInst>(I))
SafePointes.insert(SI->getPointerOperand());
}
}
BasicBlock *Header = TheLoop->getHeader();
for (Loop::block_iterator BI = TheLoop->block_begin(),
BE = TheLoop->block_end(); BI != BE; ++BI) {
BasicBlock *BB = *BI;
if (!isa<BranchInst>(BB->getTerminator())) {
emitAnalysis(VectorizationReport(BB->getTerminator())
<< "loop contains a switch statement");
return false;
}
if (blockNeedsPredication(BB)) {
if (!blockCanBePredicated(BB, SafePointes)) {
emitAnalysis(VectorizationReport(BB->getTerminator())
<< "control flow cannot be substituted for a select");
return false;
}
} else if (BB != Header && !canIfConvertPHINodes(BB)) {
emitAnalysis(VectorizationReport(BB->getTerminator())
<< "control flow cannot be substituted for a select");
return false;
}
}
return true;
}
bool LoopVectorizationLegality::canVectorize() {
if (!TheLoop->getLoopPreheader()) {
emitAnalysis(
VectorizationReport() <<
"loop control flow is not understood by vectorizer");
return false;
}
if (!TheLoop->empty()) {
emitAnalysis(VectorizationReport() << "loop is not the innermost loop");
return false;
}
if (TheLoop->getNumBackEdges() != 1) {
emitAnalysis(
VectorizationReport() <<
"loop control flow is not understood by vectorizer");
return false;
}
if (!TheLoop->getExitingBlock()) {
emitAnalysis(
VectorizationReport() <<
"loop control flow is not understood by vectorizer");
return false;
}
if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
emitAnalysis(
VectorizationReport() <<
"loop control flow is not understood by vectorizer");
return false;
}
DEBUG(dbgs() << "LV: Found a loop: " <<
TheLoop->getHeader()->getName() << '\n');
unsigned NumBlocks = TheLoop->getNumBlocks();
if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
return false;
}
const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop);
if (ExitCount == SE->getCouldNotCompute()) {
emitAnalysis(VectorizationReport() <<
"could not determine number of loop iterations");
DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
return false;
}
if (!canVectorizeInstrs()) {
DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
return false;
}
if (!canVectorizeMemory()) {
DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
return false;
}
collectLoopUniforms();
DEBUG(dbgs() << "LV: We can vectorize this loop"
<< (LAI->getRuntimePointerChecking()->Need
? " (with a runtime bound check)"
: "")
<< "!\n");
bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
UseInterleaved = EnableInterleavedMemAccesses;
if (UseInterleaved)
InterleaveInfo.analyzeInterleaving(Strides);
unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;
if (Preds.getComplexity() > SCEVThreshold) {
emitAnalysis(VectorizationReport()
<< "Too many SCEV assumptions need to be made and checked "
<< "at runtime");
DEBUG(dbgs() << "LV: Too many SCEV checks needed.\n");
return false;
}
return true;
}
static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
if (Ty->isPointerTy())
return DL.getIntPtrType(Ty);
if (Ty->getScalarSizeInBits() < 32)
return Type::getInt32Ty(Ty->getContext());
return Ty;
}
static Type* getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
Ty0 = convertPointerToIntegerType(DL, Ty0);
Ty1 = convertPointerToIntegerType(DL, Ty1);
if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
return Ty0;
return Ty1;
}
static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
SmallPtrSetImpl<Value *> &Reductions) {
if (!Reductions.count(Inst))
for (User *U : Inst->users()) {
Instruction *UI = cast<Instruction>(U);
if (!TheLoop->contains(UI)) {
DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n');
return true;
}
}
return false;
}
bool LoopVectorizationLegality::canVectorizeInstrs() {
BasicBlock *Header = TheLoop->getHeader();
Function &F = *Header->getParent();
const DataLayout &DL = F.getParent()->getDataLayout();
if (F.hasFnAttribute("no-nans-fp-math"))
HasFunNoNaNAttr =
F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
for (Loop::block_iterator bb = TheLoop->block_begin(),
be = TheLoop->block_end(); bb != be; ++bb) {
for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
++it) {
if (PHINode *Phi = dyn_cast<PHINode>(it)) {
Type *PhiTy = Phi->getType();
if (!PhiTy->isIntegerTy() &&
!PhiTy->isFloatingPointTy() &&
!PhiTy->isPointerTy()) {
emitAnalysis(VectorizationReport(&*it)
<< "loop control flow is not understood by vectorizer");
DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n");
return false;
}
if (*bb != Header) {
if (!hasOutsideLoopUser(TheLoop, &*it, AllowedExit))
continue;
emitAnalysis(VectorizationReport(&*it) <<
"value could not be identified as "
"an induction or reduction variable");
return false;
}
if (Phi->getNumIncomingValues() != 2) {
emitAnalysis(VectorizationReport(&*it)
<< "control flow not understood by vectorizer");
DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
return false;
}
InductionDescriptor ID;
if (InductionDescriptor::isInductionPHI(Phi, SE, ID)) {
Inductions[Phi] = ID;
if (!WidestIndTy)
WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
else
WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
if (ID.getKind() == InductionDescriptor::IK_IntInduction &&
ID.getStepValue()->isOne() &&
isa<Constant>(ID.getStartValue()) &&
cast<Constant>(ID.getStartValue())->isNullValue()) {
if (!Induction || PhiTy == WidestIndTy)
Induction = Phi;
}
DEBUG(dbgs() << "LV: Found an induction variable.\n");
if (hasOutsideLoopUser(TheLoop, &*it, AllowedExit)) {
emitAnalysis(VectorizationReport(&*it) <<
"use of induction value outside of the "
"loop is not handled by vectorizer");
return false;
}
continue;
}
if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop,
Reductions[Phi])) {
if (Reductions[Phi].hasUnsafeAlgebra())
Requirements->addUnsafeAlgebraInst(
Reductions[Phi].getUnsafeAlgebraInst());
AllowedExit.insert(Reductions[Phi].getLoopExitInstr());
continue;
}
emitAnalysis(VectorizationReport(&*it) <<
"value that could not be identified as "
"reduction is used outside the loop");
DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
return false;
}
CallInst *CI = dyn_cast<CallInst>(it);
if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI) &&
!(CI->getCalledFunction() && TLI &&
TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) {
emitAnalysis(VectorizationReport(&*it)
<< "call instruction cannot be vectorized");
DEBUG(dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n");
return false;
}
if (CI &&
hasVectorInstrinsicScalarOpd(getIntrinsicIDForCall(CI, TLI), 1)) {
if (!SE->isLoopInvariant(SE->getSCEV(CI->getOperand(1)), TheLoop)) {
emitAnalysis(VectorizationReport(&*it)
<< "intrinsic instruction cannot be vectorized");
DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n");
return false;
}
}
if ((!VectorType::isValidElementType(it->getType()) &&
!it->getType()->isVoidTy()) || isa<ExtractElementInst>(it)) {
emitAnalysis(VectorizationReport(&*it)
<< "instruction return type cannot be vectorized");
DEBUG(dbgs() << "LV: Found unvectorizable type.\n");
return false;
}
if (StoreInst *ST = dyn_cast<StoreInst>(it)) {
Type *T = ST->getValueOperand()->getType();
if (!VectorType::isValidElementType(T)) {
emitAnalysis(VectorizationReport(ST) <<
"store instruction cannot be vectorized");
return false;
}
if (EnableMemAccessVersioning)
collectStridedAccess(ST);
}
if (EnableMemAccessVersioning)
if (LoadInst *LI = dyn_cast<LoadInst>(it))
collectStridedAccess(LI);
if (hasOutsideLoopUser(TheLoop, &*it, AllowedExit)) {
emitAnalysis(VectorizationReport(&*it) <<
"value cannot be used outside the loop");
return false;
}
}
}
if (!Induction) {
DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
if (Inductions.empty()) {
emitAnalysis(VectorizationReport()
<< "loop induction variable could not be identified");
return false;
}
}
if (Induction && WidestIndTy != Induction->getType())
Induction = nullptr;
return true;
}
void LoopVectorizationLegality::collectStridedAccess(Value *MemAccess) {
Value *Ptr = nullptr;
if (LoadInst *LI = dyn_cast<LoadInst>(MemAccess))
Ptr = LI->getPointerOperand();
else if (StoreInst *SI = dyn_cast<StoreInst>(MemAccess))
Ptr = SI->getPointerOperand();
else
return;
Value *Stride = getStrideFromPointer(Ptr, SE, TheLoop);
if (!Stride)
return;
DEBUG(dbgs() << "LV: Found a strided access that we can version");
DEBUG(dbgs() << " Ptr: " << *Ptr << " Stride: " << *Stride << "\n");
Strides[Ptr] = Stride;
StrideSet.insert(Stride);
}
void LoopVectorizationLegality::collectLoopUniforms() {
std::vector<Value*> Worklist;
BasicBlock *Latch = TheLoop->getLoopLatch();
Worklist.push_back(Latch->getTerminator()->getOperand(0));
for (Loop::block_iterator B = TheLoop->block_begin(),
BE = TheLoop->block_end(); B != BE; ++B)
for (BasicBlock::iterator I = (*B)->begin(), IE = (*B)->end();
I != IE; ++I)
if (I->getType()->isPointerTy() && isConsecutivePtr(&*I))
Worklist.insert(Worklist.end(), I->op_begin(), I->op_end());
while (!Worklist.empty()) {
Instruction *I = dyn_cast<Instruction>(Worklist.back());
Worklist.pop_back();
if (!I || !TheLoop->contains(I) || isa<PHINode>(I))
continue;
Uniforms.insert(I);
Worklist.insert(Worklist.end(), I->op_begin(), I->op_end());
}
}
bool LoopVectorizationLegality::canVectorizeMemory() {
LAI = &LAA->getInfo(TheLoop, Strides);
auto &OptionalReport = LAI->getReport();
if (OptionalReport)
emitAnalysis(VectorizationReport(*OptionalReport));
if (!LAI->canVectorizeMemory())
return false;
if (LAI->hasStoreToLoopInvariantAddress()) {
emitAnalysis(
VectorizationReport()
<< "write to a loop invariant address could not be vectorized");
DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
return false;
}
Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks());
Preds.add(&LAI->Preds);
return true;
}
bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
Value *In0 = const_cast<Value*>(V);
PHINode *PN = dyn_cast_or_null<PHINode>(In0);
if (!PN)
return false;
return Inductions.count(PN);
}
bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
}
bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB,
SmallPtrSetImpl<Value *> &SafePtrs) {
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
for (Instruction::op_iterator OI = it->op_begin(), OE = it->op_end();
OI != OE; ++OI) {
if (Constant *C = dyn_cast<Constant>(*OI))
if (C->canTrap())
return false;
}
if (it->mayReadFromMemory()) {
LoadInst *LI = dyn_cast<LoadInst>(it);
if (!LI)
return false;
if (!SafePtrs.count(LI->getPointerOperand())) {
if (isLegalMaskedLoad(LI->getType(), LI->getPointerOperand())) {
MaskedOp.insert(LI);
continue;
}
return false;
}
}
if (it->mayWriteToMemory()) {
StoreInst *SI = dyn_cast<StoreInst>(it);
if (!SI)
return false;
bool isSafePtr = (SafePtrs.count(SI->getPointerOperand()) != 0);
bool isSinglePredecessor = SI->getParent()->getSinglePredecessor();
if (++NumPredStores > NumberOfStoresToPredicate || !isSafePtr ||
!isSinglePredecessor) {
bool isLegalMaskedOp =
isLegalMaskedStore(SI->getValueOperand()->getType(),
SI->getPointerOperand());
if (isLegalMaskedOp) {
--NumPredStores;
MaskedOp.insert(SI);
continue;
}
return false;
}
}
if (it->mayThrow())
return false;
switch (it->getOpcode()) {
default: continue;
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::URem:
case Instruction::SRem:
return false;
}
}
return true;
}
void InterleavedAccessInfo::collectConstStridedAccesses(
MapVector<Instruction *, StrideDescriptor> &StrideAccesses,
const ValueToValueMap &Strides) {
SmallVector<Instruction *, 16> AccessList;
for (auto *BB : TheLoop->getBlocks()) {
bool IsPred = LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
for (auto &I : *BB) {
if (!isa<LoadInst>(&I) && !isa<StoreInst>(&I))
continue;
if (IsPred)
return;
AccessList.push_back(&I);
}
}
if (AccessList.empty())
return;
auto &DL = TheLoop->getHeader()->getModule()->getDataLayout();
for (auto I : AccessList) {
LoadInst *LI = dyn_cast<LoadInst>(I);
StoreInst *SI = dyn_cast<StoreInst>(I);
Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
int Stride = isStridedPtr(SE, Ptr, TheLoop, Strides, Preds);
unsigned Factor = std::abs(Stride);
if (Factor < 2 || Factor > MaxInterleaveGroupFactor)
continue;
const SCEV *Scev = replaceSymbolicStrideSCEV(SE, Strides, Preds, Ptr);
PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
unsigned Size = DL.getTypeAllocSize(PtrTy->getElementType());
unsigned Align = LI ? LI->getAlignment() : SI->getAlignment();
if (!Align)
Align = DL.getABITypeAlignment(PtrTy->getElementType());
StrideAccesses[I] = StrideDescriptor(Stride, Scev, Size, Align);
}
}
void InterleavedAccessInfo::analyzeInterleaving(
const ValueToValueMap &Strides) {
DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n");
MapVector<Instruction *, StrideDescriptor> StrideAccesses;
collectConstStridedAccesses(StrideAccesses, Strides);
if (StrideAccesses.empty())
return;
SmallSetVector<InterleaveGroup *, 4> StoreGroups;
for (auto I = StrideAccesses.rbegin(), E = StrideAccesses.rend(); I != E;
++I) {
Instruction *A = I->first;
StrideDescriptor DesA = I->second;
InterleaveGroup *Group = getInterleaveGroup(A);
if (!Group) {
DEBUG(dbgs() << "LV: Creating an interleave group with:" << *A << '\n');
Group = createInterleaveGroup(A, DesA.Stride, DesA.Align);
}
if (A->mayWriteToMemory())
StoreGroups.insert(Group);
for (auto II = std::next(I); II != E; ++II) {
Instruction *B = II->first;
StrideDescriptor DesB = II->second;
if (isInterleaved(B) || A->mayReadFromMemory() != B->mayReadFromMemory())
continue;
if (DesB.Stride != DesA.Stride || DesB.Size != DesA.Size)
continue;
const SCEVConstant *DistToA =
dyn_cast<SCEVConstant>(SE->getMinusSCEV(DesB.Scev, DesA.Scev));
if (!DistToA)
continue;
int DistanceToA = DistToA->getValue()->getValue().getSExtValue();
if (DistanceToA % static_cast<int>(DesA.Size))
continue;
int IndexB =
Group->getIndex(A) + DistanceToA / static_cast<int>(DesA.Size);
if (Group->insertMember(B, IndexB, DesB.Align)) {
DEBUG(dbgs() << "LV: Inserted:" << *B << '\n'
<< " into the interleave group with" << *A << '\n');
InterleaveGroupMap[B] = Group;
if (B->mayReadFromMemory())
Group->setInsertPos(B);
}
} }
for (InterleaveGroup *Group : StoreGroups)
if (Group->getNumMembers() != Group->getFactor())
releaseGroup(Group);
}
LoopVectorizationCostModel::VectorizationFactor
LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
VectorizationFactor Factor = { 1U, 0U };
if (OptForSize && Legal->getRuntimePointerChecking()->Need) {
emitAnalysis(VectorizationReport() <<
"runtime pointer checks needed. Enable vectorization of this "
"loop with '#pragma clang loop vectorize(enable)' when "
"compiling with -Os/-Oz");
DEBUG(dbgs() <<
"LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n");
return Factor;
}
if (!EnableCondStoresVectorization && Legal->getNumPredStores()) {
emitAnalysis(VectorizationReport() <<
"store that is conditionally executed prevents vectorization");
DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
return Factor;
}
unsigned TC = SE->getSmallConstantTripCount(TheLoop);
DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
unsigned SmallestType, WidestType;
std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
unsigned WidestRegister = TTI.getRegisterBitWidth(true);
unsigned MaxSafeDepDist = -1U;
if (Legal->getMaxSafeDepDistBytes() != -1U)
MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
WidestRegister = ((WidestRegister < MaxSafeDepDist) ?
WidestRegister : MaxSafeDepDist);
unsigned MaxVectorSize = WidestRegister / WidestType;
DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType << " / "
<< WidestType << " bits.\n");
DEBUG(dbgs() << "LV: The Widest register is: "
<< WidestRegister << " bits.\n");
if (MaxVectorSize == 0) {
DEBUG(dbgs() << "LV: The target has no vector registers.\n");
MaxVectorSize = 1;
}
assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements"
" into one vector!");
unsigned VF = MaxVectorSize;
if (MaximizeBandwidth && !OptForSize) {
SmallVector<unsigned, 8> VFs;
unsigned NewMaxVectorSize = WidestRegister / SmallestType;
for (unsigned VS = MaxVectorSize; VS <= NewMaxVectorSize; VS *= 2)
VFs.push_back(VS);
auto RUs = calculateRegisterUsage(VFs);
unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true);
for (int i = RUs.size() - 1; i >= 0; --i) {
if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
VF = VFs[i];
break;
}
}
}
if (OptForSize) {
if (TC < 2) {
emitAnalysis
(VectorizationReport() <<
"unable to calculate the loop count due to complex control flow");
DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
return Factor;
}
VF = TC % MaxVectorSize;
if (VF == 0)
VF = MaxVectorSize;
else {
emitAnalysis(VectorizationReport() <<
"cannot optimize for size and vectorize at the "
"same time. Enable vectorization of this loop "
"with '#pragma clang loop vectorize(enable)' "
"when compiling with -Os/-Oz");
DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
return Factor;
}
}
int UserVF = Hints->getWidth();
if (UserVF != 0) {
assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
Factor.Width = UserVF;
return Factor;
}
float Cost = expectedCost(1);
#ifndef NDEBUG
const float ScalarCost = Cost;
#endif
unsigned Width = 1;
DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
if (ForceVectorization && VF > 1) {
Width = 2;
Cost = expectedCost(Width) / (float)Width;
}
for (unsigned i=2; i <= VF; i*=2) {
float VectorCost = expectedCost(i) / (float)i;
DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " <<
(int)VectorCost << ".\n");
if (VectorCost < Cost) {
Cost = VectorCost;
Width = i;
}
}
DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
<< "LV: Vectorization seems to be not beneficial, "
<< "but was forced by a user.\n");
DEBUG(dbgs() << "LV: Selecting VF: "<< Width << ".\n");
Factor.Width = Width;
Factor.Cost = Width * Cost;
return Factor;
}
std::pair<unsigned, unsigned>
LoopVectorizationCostModel::getSmallestAndWidestTypes() {
unsigned MinWidth = -1U;
unsigned MaxWidth = 8;
const DataLayout &DL = TheFunction->getParent()->getDataLayout();
for (Loop::block_iterator bb = TheLoop->block_begin(),
be = TheLoop->block_end(); bb != be; ++bb) {
BasicBlock *BB = *bb;
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
Type *T = it->getType();
if (ValuesToIgnore.count(&*it))
continue;
if (!isa<LoadInst>(it) && !isa<StoreInst>(it) && !isa<PHINode>(it))
continue;
if (PHINode *PN = dyn_cast<PHINode>(it)) {
if (!Legal->getReductionVars()->count(PN))
continue;
RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
T = RdxDesc.getRecurrenceType();
}
if (StoreInst *ST = dyn_cast<StoreInst>(it))
T = ST->getValueOperand()->getType();
if (T->isPointerTy() && !isConsecutiveLoadOrStore(&*it))
continue;
MinWidth = std::min(MinWidth,
(unsigned)DL.getTypeSizeInBits(T->getScalarType()));
MaxWidth = std::max(MaxWidth,
(unsigned)DL.getTypeSizeInBits(T->getScalarType()));
}
}
return {MinWidth, MaxWidth};
}
unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
unsigned VF,
unsigned LoopCost) {
if (OptForSize)
return 1;
if (Legal->getMaxSafeDepDistBytes() != -1U)
return 1;
unsigned TC = SE->getSmallConstantTripCount(TheLoop);
if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
return 1;
unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters <<
" registers\n");
if (VF == 1) {
if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
TargetNumRegisters = ForceTargetNumScalarRegs;
} else {
if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
TargetNumRegisters = ForceTargetNumVectorRegs;
}
RegisterUsage R = calculateRegisterUsage({VF})[0];
R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
R.NumInstructions = std::max(R.NumInstructions, 1U);
unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
R.MaxLocalUsers);
if (EnableIndVarRegisterHeur)
IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
std::max(1U, (R.MaxLocalUsers - 1)));
unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
if (VF == 1) {
if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
} else {
if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
}
if (LoopCost == 0)
LoopCost = expectedCost(VF);
if (IC > MaxInterleaveCount)
IC = MaxInterleaveCount;
else if (IC < 1)
IC = 1;
if (VF > 1 && Legal->getReductionVars()->size()) {
DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
return IC;
}
bool InterleavingRequiresRuntimePointerCheck =
(VF == 1 && Legal->getRuntimePointerChecking()->Need);
DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
unsigned SmallIC =
std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
unsigned NumStores = Legal->getNumStores();
unsigned NumLoads = Legal->getNumLoads();
unsigned StoresIC = IC / (NumStores ? NumStores : 1);
unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
if (Legal->getReductionVars()->size() &&
TheLoop->getLoopDepth() > 1) {
unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
SmallIC = std::min(SmallIC, F);
StoresIC = std::min(StoresIC, F);
LoadsIC = std::min(LoadsIC, F);
}
if (EnableLoadStoreRuntimeInterleave &&
std::max(StoresIC, LoadsIC) > SmallIC) {
DEBUG(dbgs() << "LV: Interleaving to saturate store or load ports.\n");
return std::max(StoresIC, LoadsIC);
}
DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
return SmallIC;
}
bool HasReductions = (Legal->getReductionVars()->size() > 0);
if (TTI.enableAggressiveInterleaving(HasReductions)) {
DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
return IC;
}
DEBUG(dbgs() << "LV: Not Interleaving.\n");
return 1;
}
SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(
const SmallVector<unsigned, 8> &VFs) {
LoopBlocksDFS DFS(TheLoop);
DFS.perform(LI);
RegisterUsage RU;
RU.NumInstructions = 0;
typedef DenseMap<Instruction*, unsigned> IntervalMap;
DenseMap<unsigned, Instruction*> IdxToInstr;
IntervalMap EndPoint;
SmallSet<Instruction*, 8> Ends;
SmallPtrSet<Value*, 8> LoopInvariants;
unsigned Index = 0;
for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
be = DFS.endRPO(); bb != be; ++bb) {
RU.NumInstructions += (*bb)->size();
for (Instruction &I : **bb) {
IdxToInstr[Index++] = &I;
for (unsigned i = 0; i < I.getNumOperands(); ++i) {
Value *U = I.getOperand(i);
Instruction *Instr = dyn_cast<Instruction>(U);
if (!Instr) continue;
if (!TheLoop->contains(Instr)) {
LoopInvariants.insert(Instr);
continue;
}
EndPoint[Instr] = Index;
Ends.insert(Instr);
}
}
}
typedef SmallVector<Instruction*, 2> InstrList;
DenseMap<unsigned, InstrList> TransposeEnds;
for (IntervalMap::iterator it = EndPoint.begin(), e = EndPoint.end();
it != e; ++it)
TransposeEnds[it->second].push_back(it->first);
SmallSet<Instruction*, 8> OpenIntervals;
unsigned MaxSafeDepDist = -1U;
if (Legal->getMaxSafeDepDistBytes() != -1U)
MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
unsigned WidestRegister =
std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
const DataLayout &DL = TheFunction->getParent()->getDataLayout();
SmallVector<RegisterUsage, 8> RUs(VFs.size());
SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0);
DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
};
for (unsigned int i = 0; i < Index; ++i) {
Instruction *I = IdxToInstr[i];
if (!Ends.count(I)) continue;
if (ValuesToIgnore.count(I))
continue;
InstrList &List = TransposeEnds[i];
for (unsigned int j = 0, e = List.size(); j < e; ++j)
OpenIntervals.erase(List[j]);
for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
if (VFs[j] == 1) {
MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size());
continue;
}
unsigned RegUsage = 0;
for (auto Inst : OpenIntervals)
RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
}
DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
<< OpenIntervals.size() << '\n');
OpenIntervals.insert(I);
}
for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
unsigned Invariant = 0;
if (VFs[i] == 1)
Invariant = LoopInvariants.size();
else {
for (auto Inst : LoopInvariants)
Invariant += GetRegUsage(Inst->getType(), VFs[i]);
}
DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n');
DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n');
DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n');
DEBUG(dbgs() << "LV(REG): LoopSize: " << RU.NumInstructions << '\n');
RU.LoopInvariantRegs = Invariant;
RU.MaxLocalUsers = MaxUsages[i];
RUs[i] = RU;
}
return RUs;
}
unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
unsigned Cost = 0;
for (Loop::block_iterator bb = TheLoop->block_begin(),
be = TheLoop->block_end(); bb != be; ++bb) {
unsigned BlockCost = 0;
BasicBlock *BB = *bb;
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
if (isa<DbgInfoIntrinsic>(it))
continue;
if (ValuesToIgnore.count(&*it))
continue;
unsigned C = getInstructionCost(&*it, VF);
if (ForceTargetInstructionCost.getNumOccurrences() > 0)
C = ForceTargetInstructionCost;
BlockCost += C;
DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF " <<
VF << " For instruction: " << *it << '\n');
}
if (VF == 1 && Legal->blockNeedsPredication(*bb))
BlockCost /= 2;
Cost += BlockCost;
}
return Cost;
}
static bool isLikelyComplexAddressComputation(Value *Ptr,
LoopVectorizationLegality *Legal,
ScalarEvolution *SE,
const Loop *TheLoop) {
GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
if (!Gep)
return true;
unsigned NumOperands = Gep->getNumOperands();
for (unsigned i = 1; i < NumOperands; ++i) {
Value *Opd = Gep->getOperand(i);
if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
!Legal->isInductionVariable(Opd))
return true;
}
unsigned MaxMergeDistance = 64;
const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Ptr));
if (!AddRec)
return true;
const SCEV *Step = AddRec->getStepRecurrence(*SE);
const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
if (!C)
return true;
const APInt &APStepVal = C->getValue()->getValue();
if (APStepVal.getBitWidth() > 64)
return true;
int64_t StepVal = APStepVal.getSExtValue();
return StepVal > MaxMergeDistance;
}
static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
return Legal->hasStride(I->getOperand(0)) ||
Legal->hasStride(I->getOperand(1));
}
unsigned
LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
if (Legal->isUniformAfterVectorization(I))
VF = 1;
Type *RetTy = I->getType();
if (VF > 1 && MinBWs.count(I))
RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
Type *VectorTy = ToVectorTy(RetTy, VF);
switch (I->getOpcode()) {
case Instruction::GetElementPtr:
return 0;
case Instruction::Br: {
return TTI.getCFInstrCost(I->getOpcode());
}
case Instruction::PHI:
return 0;
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
case Instruction::FSub:
case Instruction::Mul:
case Instruction::FMul:
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::FDiv:
case Instruction::URem:
case Instruction::SRem:
case Instruction::FRem:
case Instruction::Shl:
case Instruction::LShr:
case Instruction::AShr:
case Instruction::And:
case Instruction::Or:
case Instruction::Xor: {
if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
return 0;
TargetTransformInfo::OperandValueKind Op1VK =
TargetTransformInfo::OK_AnyValue;
TargetTransformInfo::OperandValueKind Op2VK =
TargetTransformInfo::OK_AnyValue;
TargetTransformInfo::OperandValueProperties Op1VP =
TargetTransformInfo::OP_None;
TargetTransformInfo::OperandValueProperties Op2VP =
TargetTransformInfo::OP_None;
Value *Op2 = I->getOperand(1);
if (isa<ConstantInt>(Op2)) {
ConstantInt *CInt = cast<ConstantInt>(Op2);
if (CInt && CInt->getValue().isPowerOf2())
Op2VP = TargetTransformInfo::OP_PowerOf2;
Op2VK = TargetTransformInfo::OK_UniformConstantValue;
} else if (isa<ConstantVector>(Op2) || isa<ConstantDataVector>(Op2)) {
Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
Constant *SplatValue = cast<Constant>(Op2)->getSplatValue();
if (SplatValue) {
ConstantInt *CInt = dyn_cast<ConstantInt>(SplatValue);
if (CInt && CInt->getValue().isPowerOf2())
Op2VP = TargetTransformInfo::OP_PowerOf2;
Op2VK = TargetTransformInfo::OK_UniformConstantValue;
}
}
return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK,
Op1VP, Op2VP);
}
case Instruction::Select: {
SelectInst *SI = cast<SelectInst>(I);
const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
Type *CondTy = SI->getCondition()->getType();
if (!ScalarCond)
CondTy = VectorType::get(CondTy, VF);
return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
}
case Instruction::ICmp:
case Instruction::FCmp: {
Type *ValTy = I->getOperand(0)->getType();
Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
auto It = MinBWs.find(Op0AsInstruction);
if (VF > 1 && It != MinBWs.end())
ValTy = IntegerType::get(ValTy->getContext(), It->second);
VectorTy = ToVectorTy(ValTy, VF);
return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy);
}
case Instruction::Store:
case Instruction::Load: {
StoreInst *SI = dyn_cast<StoreInst>(I);
LoadInst *LI = dyn_cast<LoadInst>(I);
Type *ValTy = (SI ? SI->getValueOperand()->getType() :
LI->getType());
VectorTy = ToVectorTy(ValTy, VF);
unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment();
unsigned AS = SI ? SI->getPointerAddressSpace() :
LI->getPointerAddressSpace();
Value *Ptr = SI ? SI->getPointerOperand() : LI->getPointerOperand();
if (VF == 1)
return TTI.getAddressComputationCost(VectorTy) +
TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
if (Legal->isAccessInterleaved(I)) {
auto Group = Legal->getInterleavedAccessGroup(I);
assert(Group && "Fail to get an interleaved access group.");
if (Group->getInsertPos() != I)
return 0;
unsigned InterleaveFactor = Group->getFactor();
Type *WideVecTy =
VectorType::get(VectorTy->getVectorElementType(),
VectorTy->getVectorNumElements() * InterleaveFactor);
SmallVector<unsigned, 4> Indices;
if (LI) {
for (unsigned i = 0; i < InterleaveFactor; i++)
if (Group->getMember(i))
Indices.push_back(i);
}
unsigned Cost = TTI.getInterleavedMemoryOpCost(
I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
Group->getAlignment(), AS);
if (Group->isReverse())
Cost +=
Group->getNumMembers() *
TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
return Cost;
}
int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
bool Reverse = ConsecutiveStride < 0;
const DataLayout &DL = I->getModule()->getDataLayout();
unsigned ScalarAllocatedSize = DL.getTypeAllocSize(ValTy);
unsigned VectorElementSize = DL.getTypeStoreSize(VectorTy) / VF;
if (!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) {
bool IsComplexComputation =
isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop);
unsigned Cost = 0;
Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
for (unsigned i = 0; i < VF; ++i) {
Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i);
Cost += TTI.getVectorInstrCost(SI ? Instruction::ExtractElement :
Instruction::InsertElement,
VectorTy, i);
}
Cost += VF * TTI.getAddressComputationCost(PtrTy, IsComplexComputation);
Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
Alignment, AS);
return Cost;
}
unsigned Cost = TTI.getAddressComputationCost(VectorTy);
if (Legal->isMaskRequired(I))
Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment,
AS);
else
Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
if (Reverse)
Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
VectorTy, 0);
return Cost;
}
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
case Instruction::FPToSI:
case Instruction::FPExt:
case Instruction::PtrToInt:
case Instruction::IntToPtr:
case Instruction::SIToFP:
case Instruction::UIToFP:
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::BitCast: {
if (I->getOpcode() == Instruction::Trunc &&
Legal->isInductionVariable(I->getOperand(0)))
return TTI.getCastInstrCost(I->getOpcode(), I->getType(),
I->getOperand(0)->getType());
Type *SrcScalarTy = I->getOperand(0)->getType();
Type *SrcVecTy = ToVectorTy(SrcScalarTy, VF);
if (VF > 1 && MinBWs.count(I)) {
Type *MinVecTy = VectorTy;
if (I->getOpcode() == Instruction::Trunc) {
SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
VectorTy = largestIntegerVectorType(ToVectorTy(I->getType(), VF),
MinVecTy);
} else if (I->getOpcode() == Instruction::ZExt ||
I->getOpcode() == Instruction::SExt) {
SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
VectorTy = smallestIntegerVectorType(ToVectorTy(I->getType(), VF),
MinVecTy);
}
}
return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
}
case Instruction::Call: {
bool NeedToScalarize;
CallInst *CI = cast<CallInst>(I);
unsigned CallCost = getVectorCallCost(CI, VF, TTI, TLI, NeedToScalarize);
if (getIntrinsicIDForCall(CI, TLI))
return std::min(CallCost, getVectorIntrinsicCost(CI, VF, TTI, TLI));
return CallCost;
}
default: {
unsigned Cost = 0;
if (!RetTy->isVoidTy() && VF != 1) {
unsigned InsCost = TTI.getVectorInstrCost(Instruction::InsertElement,
VectorTy);
unsigned ExtCost = TTI.getVectorInstrCost(Instruction::ExtractElement,
VectorTy);
Cost += VF * (InsCost + ExtCost * I->getNumOperands());
}
Cost += VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy);
return Cost;
}
}}
char LoopVectorize::ID = 0;
static const char lv_name[] = "Loop Vectorization";
INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LCSSA)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis)
INITIALIZE_PASS_DEPENDENCY(DemandedBits)
INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
namespace llvm {
Pass *createLoopVectorizePass(bool NoUnrolling, bool AlwaysVectorize) {
return new LoopVectorize(NoUnrolling, AlwaysVectorize);
}
}
bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
if (StoreInst *ST = dyn_cast<StoreInst>(Inst))
return Legal->isConsecutivePtr(ST->getPointerOperand()) != 0;
if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
return Legal->isConsecutivePtr(LI->getPointerOperand()) != 0;
return false;
}
void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
bool IfPredicateStore) {
assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
SmallVector<VectorParts, 4> Params;
setDebugLocFromInst(Builder, Instr);
for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
Value *SrcOp = Instr->getOperand(op);
if (SrcOp == OldInduction) {
Params.push_back(getVectorValue(SrcOp));
continue;
}
Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);
if (SrcInst && OrigLoop->contains(SrcInst)) {
assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
Params.push_back(WidenMap.get(SrcInst));
} else {
VectorParts Scalars;
Scalars.append(UF, SrcOp);
Params.push_back(Scalars);
}
}
assert(Params.size() == Instr->getNumOperands() &&
"Invalid number of operands");
bool IsVoidRetTy = Instr->getType()->isVoidTy();
Value *UndefVec = IsVoidRetTy ? nullptr :
UndefValue::get(Instr->getType());
VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
VectorParts Cond;
if (IfPredicateStore) {
assert(Instr->getParent()->getSinglePredecessor() &&
"Only support single predecessor blocks");
Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
Instr->getParent());
}
for (unsigned Part = 0; Part < UF; ++Part) {
Value *Cmp = nullptr;
if (IfPredicateStore) {
if (Cond[Part]->getType()->isVectorTy())
Cond[Part] =
Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0));
Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cond[Part],
ConstantInt::get(Cond[Part]->getType(), 1));
}
Instruction *Cloned = Instr->clone();
if (!IsVoidRetTy)
Cloned->setName(Instr->getName() + ".cloned");
for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
Value *Op = Params[op][Part];
Cloned->setOperand(op, Op);
}
Builder.Insert(Cloned);
if (!IsVoidRetTy)
VecResults[Part] = Cloned;
if (IfPredicateStore)
PredicatedStores.push_back(std::make_pair(cast<StoreInst>(Cloned),
Cmp));
}
}
void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) {
StoreInst *SI = dyn_cast<StoreInst>(Instr);
bool IfPredicateStore = (SI && Legal->blockNeedsPredication(SI->getParent()));
return scalarizeInstruction(Instr, IfPredicateStore);
}
Value *InnerLoopUnroller::reverseVector(Value *Vec) {
return Vec;
}
Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) {
return V;
}
Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step) {
Type *ITy = Val->getType();
assert(!ITy->isVectorTy() && "Val must be a scalar");
Constant *C = ConstantInt::get(ITy, StartIdx);
return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
}