#include "llvm/Transforms/Vectorize.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AliasSetTracker.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/VectorUtils.h"
#include <algorithm>
#include <map>
#include <tuple>
using namespace llvm;
using namespace llvm::PatternMatch;
#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME
STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
static cl::opt<bool>
EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
cl::desc("Enable if-conversion during vectorization."));
static cl::opt<unsigned>
TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16),
cl::Hidden,
cl::desc("Don't vectorize loops with a constant "
"trip count that is smaller than this "
"value."));
static cl::opt<bool> EnableMemAccessVersioning(
"enable-mem-access-versioning", cl::init(true), cl::Hidden,
cl::desc("Enable symblic stride memory access versioning"));
static const unsigned TinyTripCountUnrollThreshold = 128;
static cl::opt<unsigned> ForceTargetNumScalarRegs(
"force-target-num-scalar-regs", cl::init(0), cl::Hidden,
cl::desc("A flag that overrides the target's number of scalar registers."));
static cl::opt<unsigned> ForceTargetNumVectorRegs(
"force-target-num-vector-regs", cl::init(0), cl::Hidden,
cl::desc("A flag that overrides the target's number of vector registers."));
static const unsigned MaxInterleaveFactor = 16;
static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
"force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
cl::desc("A flag that overrides the target's max interleave factor for "
"scalar loops."));
static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
"force-target-max-vector-interleave", cl::init(0), cl::Hidden,
cl::desc("A flag that overrides the target's max interleave factor for "
"vectorized loops."));
static cl::opt<unsigned> ForceTargetInstructionCost(
"force-target-instruction-cost", cl::init(0), cl::Hidden,
cl::desc("A flag that overrides the target's expected cost for "
"an instruction to a single constant value. Mostly "
"useful for getting consistent testing."));
static cl::opt<unsigned> SmallLoopCost(
"small-loop-cost", cl::init(20), cl::Hidden,
cl::desc("The cost of a loop that is considered 'small' by the unroller."));
static cl::opt<bool> LoopVectorizeWithBlockFrequency(
"loop-vectorize-with-block-frequency", cl::init(false), cl::Hidden,
cl::desc("Enable the use of the block frequency analysis to access PGO "
"heuristics minimizing code growth in cold regions and being more "
"aggressive in hot regions."));
static cl::opt<bool> EnableLoadStoreRuntimeUnroll(
"enable-loadstore-runtime-unroll", cl::init(true), cl::Hidden,
cl::desc("Enable runtime unrolling until load/store ports are saturated"));
static cl::opt<unsigned> NumberOfStoresToPredicate(
"vectorize-num-stores-pred", cl::init(1), cl::Hidden,
cl::desc("Max number of stores to be predicated behind an if."));
static cl::opt<bool> EnableIndVarRegisterHeur(
"enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
cl::desc("Count the induction variable only once when unrolling"));
static cl::opt<bool> EnableCondStoresVectorization(
"enable-cond-stores-vec", cl::init(false), cl::Hidden,
cl::desc("Enable if predication of stores during vectorization."));
static cl::opt<unsigned> MaxNestedScalarReductionUF(
"max-nested-scalar-reduction-unroll", cl::init(2), cl::Hidden,
cl::desc("The maximum unroll factor to use when unrolling a scalar "
"reduction in a nested loop."));
namespace {
class LoopVectorizationLegality;
class LoopVectorizationCostModel;
class LoopVectorizeHints;
class VectorizationReport : public LoopAccessReport {
public:
VectorizationReport(Instruction *I = nullptr)
: LoopAccessReport("loop not vectorized: ", I) {}
explicit VectorizationReport(const LoopAccessReport &R)
: LoopAccessReport(Twine("loop not vectorized: ") + R.str(),
R.getInstr()) {}
};
static Type* ToVectorTy(Type *Scalar, unsigned VF) {
if (Scalar->isVoidTy() || VF == 1)
return Scalar;
return VectorType::get(Scalar, VF);
}
class InnerLoopVectorizer {
public:
InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
DominatorTree *DT,
const TargetLibraryInfo *TLI,
const TargetTransformInfo *TTI, unsigned VecWidth,
unsigned UnrollFactor)
: OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()),
Induction(nullptr), OldInduction(nullptr), WidenMap(UnrollFactor),
Legal(nullptr) {}
void vectorize(LoopVectorizationLegality *L) {
Legal = L;
createEmptyLoop();
vectorizeLoop();
updateAnalysis();
}
virtual ~InnerLoopVectorizer() {}
protected:
typedef SmallVector<PHINode*, 4> PhiVector;
typedef SmallVector<Value*, 2> VectorParts;
typedef DenseMap<std::pair<BasicBlock*, BasicBlock*>,
VectorParts> EdgeMaskCache;
std::pair<Instruction *, Instruction *> addStrideCheck(Instruction *Loc);
void createEmptyLoop();
virtual void vectorizeLoop();
void fixLCSSAPHIs();
VectorParts createBlockInMask(BasicBlock *BB);
VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
void vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV);
void widenPHIInstruction(Instruction *PN, VectorParts &Entry,
unsigned UF, unsigned VF, PhiVector *PV);
void updateAnalysis();
virtual void scalarizeInstruction(Instruction *Instr,
bool IfPredicateStore=false);
virtual void vectorizeMemoryInstruction(Instruction *Instr);
virtual Value *getBroadcastInstrs(Value *V);
virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step);
VectorParts &getVectorValue(Value *V);
virtual Value *reverseVector(Value *Vec);
struct ValueMap {
ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {}
bool has(Value *Key) const { return MapStorage.count(Key); }
VectorParts &splat(Value *Key, Value *Val) {
VectorParts &Entry = MapStorage[Key];
Entry.assign(UF, Val);
return Entry;
}
VectorParts &get(Value *Key) {
VectorParts &Entry = MapStorage[Key];
if (Entry.empty())
Entry.resize(UF);
assert(Entry.size() == UF);
return Entry;
}
private:
unsigned UF;
std::map<Value *, VectorParts> MapStorage;
};
Loop *OrigLoop;
ScalarEvolution *SE;
LoopInfo *LI;
DominatorTree *DT;
AliasAnalysis *AA;
const TargetLibraryInfo *TLI;
const TargetTransformInfo *TTI;
unsigned VF;
protected:
unsigned UF;
IRBuilder<> Builder;
BasicBlock *LoopVectorPreHeader;
BasicBlock *LoopScalarPreHeader;
BasicBlock *LoopMiddleBlock;
BasicBlock *LoopExitBlock;
SmallVector<BasicBlock *, 4> LoopVectorBody;
BasicBlock *LoopScalarBody;
SmallVector<BasicBlock *, 4> LoopBypassBlocks;
PHINode *Induction;
PHINode *OldInduction;
Value *ExtendedIdx;
ValueMap WidenMap;
EdgeMaskCache MaskCache;
LoopVectorizationLegality *Legal;
};
class InnerLoopUnroller : public InnerLoopVectorizer {
public:
InnerLoopUnroller(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
DominatorTree *DT,
const TargetLibraryInfo *TLI,
const TargetTransformInfo *TTI, unsigned UnrollFactor)
: InnerLoopVectorizer(OrigLoop, SE, LI, DT, TLI, TTI, 1,
UnrollFactor) {}
private:
void scalarizeInstruction(Instruction *Instr,
bool IfPredicateStore = false) override;
void vectorizeMemoryInstruction(Instruction *Instr) override;
Value *getBroadcastInstrs(Value *V) override;
Value *getStepVector(Value *Val, int StartIdx, Value *Step) override;
Value *reverseVector(Value *Vec) override;
};
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
if (!I)
return I;
DebugLoc Empty;
if (I->getDebugLoc() != Empty)
return I;
for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
if (OpInst->getDebugLoc() != Empty)
return OpInst;
}
return I;
}
static void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr))
B.SetCurrentDebugLocation(Inst->getDebugLoc());
else
B.SetCurrentDebugLocation(DebugLoc());
}
#ifndef NDEBUG
static std::string getDebugLocString(const Loop *L) {
std::string Result;
if (L) {
raw_string_ostream OS(Result);
if (const DebugLoc LoopDbgLoc = L->getStartLoc())
LoopDbgLoc.print(OS);
else
OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
OS.flush();
}
return Result;
}
#endif
static void propagateMetadata(Instruction *To, const Instruction *From) {
SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
From->getAllMetadataOtherThanDebugLoc(Metadata);
for (auto M : Metadata) {
unsigned Kind = M.first;
if (Kind != LLVMContext::MD_tbaa &&
Kind != LLVMContext::MD_alias_scope &&
Kind != LLVMContext::MD_noalias &&
Kind != LLVMContext::MD_fpmath)
continue;
To->setMetadata(Kind, M.second);
}
}
static void propagateMetadata(SmallVectorImpl<Value *> &To, const Instruction *From) {
for (Value *V : To)
if (Instruction *I = dyn_cast<Instruction>(V))
propagateMetadata(I, From);
}
class LoopVectorizationLegality {
public:
LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DominatorTree *DT,
TargetLibraryInfo *TLI, AliasAnalysis *AA,
Function *F, const TargetTransformInfo *TTI,
LoopAccessAnalysis *LAA)
: NumPredStores(0), TheLoop(L), SE(SE), TLI(TLI), TheFunction(F),
TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), Induction(nullptr),
WidestIndTy(nullptr), HasFunNoNaNAttr(false) {}
enum ReductionKind {
RK_NoReduction, RK_IntegerAdd, RK_IntegerMult, RK_IntegerOr, RK_IntegerAnd, RK_IntegerXor, RK_IntegerMinMax, RK_FloatAdd, RK_FloatMult, RK_FloatMinMax };
enum InductionKind {
IK_NoInduction, IK_IntInduction, IK_PtrInduction };
enum MinMaxReductionKind {
MRK_Invalid,
MRK_UIntMin,
MRK_UIntMax,
MRK_SIntMin,
MRK_SIntMax,
MRK_FloatMin,
MRK_FloatMax
};
struct ReductionDescriptor {
ReductionDescriptor() : StartValue(nullptr), LoopExitInstr(nullptr),
Kind(RK_NoReduction), MinMaxKind(MRK_Invalid) {}
ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K,
MinMaxReductionKind MK)
: StartValue(Start), LoopExitInstr(Exit), Kind(K), MinMaxKind(MK) {}
TrackingVH<Value> StartValue;
Instruction *LoopExitInstr;
ReductionKind Kind;
MinMaxReductionKind MinMaxKind;
};
struct ReductionInstDesc {
ReductionInstDesc(bool IsRedux, Instruction *I) :
IsReduction(IsRedux), PatternLastInst(I), MinMaxKind(MRK_Invalid) {}
ReductionInstDesc(Instruction *I, MinMaxReductionKind K) :
IsReduction(true), PatternLastInst(I), MinMaxKind(K) {}
bool IsReduction;
Instruction *PatternLastInst;
MinMaxReductionKind MinMaxKind;
};
struct InductionInfo {
InductionInfo(Value *Start, InductionKind K, ConstantInt *Step)
: StartValue(Start), IK(K), StepValue(Step) {
assert(IK != IK_NoInduction && "Not an induction");
assert(StartValue && "StartValue is null");
assert(StepValue && !StepValue->isZero() && "StepValue is zero");
assert((IK != IK_PtrInduction || StartValue->getType()->isPointerTy()) &&
"StartValue is not a pointer for pointer induction");
assert((IK != IK_IntInduction || StartValue->getType()->isIntegerTy()) &&
"StartValue is not an integer for integer induction");
assert(StepValue->getType()->isIntegerTy() &&
"StepValue is not an integer");
}
InductionInfo()
: StartValue(nullptr), IK(IK_NoInduction), StepValue(nullptr) {}
int getConsecutiveDirection() const {
if (StepValue && (StepValue->isOne() || StepValue->isMinusOne()))
return StepValue->getSExtValue();
return 0;
}
Value *transform(IRBuilder<> &B, Value *Index) const {
switch (IK) {
case IK_IntInduction:
assert(Index->getType() == StartValue->getType() &&
"Index type does not match StartValue type");
if (StepValue->isMinusOne())
return B.CreateSub(StartValue, Index);
if (!StepValue->isOne())
Index = B.CreateMul(Index, StepValue);
return B.CreateAdd(StartValue, Index);
case IK_PtrInduction:
assert(Index->getType() == StepValue->getType() &&
"Index type does not match StepValue type");
if (StepValue->isMinusOne())
Index = B.CreateNeg(Index);
else if (!StepValue->isOne())
Index = B.CreateMul(Index, StepValue);
return B.CreateGEP(StartValue, Index);
case IK_NoInduction:
return nullptr;
}
llvm_unreachable("invalid enum");
}
TrackingVH<Value> StartValue;
InductionKind IK;
ConstantInt *StepValue;
};
typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList;
typedef MapVector<PHINode*, InductionInfo> InductionList;
bool canVectorize();
PHINode *getInduction() { return Induction; }
ReductionList *getReductionVars() { return &Reductions; }
InductionList *getInductionVars() { return &Inductions; }
Type *getWidestInductionType() { return WidestIndTy; }
bool isInductionVariable(const Value *V);
bool blockNeedsPredication(BasicBlock *BB);
int isConsecutivePtr(Value *Ptr);
bool isUniform(Value *V);
bool isUniformAfterVectorization(Instruction* I) { return Uniforms.count(I); }
const LoopAccessInfo::RuntimePointerCheck *getRuntimePointerCheck() const {
return LAI->getRuntimePointerCheck();
}
const LoopAccessInfo *getLAI() const {
return LAI;
}
static Constant *getReductionIdentity(ReductionKind K, Type *Tp);
unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); }
bool hasStride(Value *V) { return StrideSet.count(V); }
bool mustCheckStrides() { return !StrideSet.empty(); }
SmallPtrSet<Value *, 8>::iterator strides_begin() {
return StrideSet.begin();
}
SmallPtrSet<Value *, 8>::iterator strides_end() { return StrideSet.end(); }
bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
return TTI->isLegalMaskedStore(DataType, isConsecutivePtr(Ptr));
}
bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
return TTI->isLegalMaskedLoad(DataType, isConsecutivePtr(Ptr));
}
bool isMaskRequired(const Instruction* I) {
return (MaskedOp.count(I) != 0);
}
unsigned getNumStores() const {
return LAI->getNumStores();
}
unsigned getNumLoads() const {
return LAI->getNumLoads();
}
unsigned getNumPredStores() const {
return NumPredStores;
}
private:
bool canVectorizeInstrs();
bool canVectorizeMemory();
bool canVectorizeWithIfConvert();
void collectLoopUniforms();
bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs);
bool AddReductionVar(PHINode *Phi, ReductionKind Kind);
ReductionInstDesc isReductionInstr(Instruction *I, ReductionKind Kind,
ReductionInstDesc &Desc);
static ReductionInstDesc isMinMaxSelectCmpPattern(Instruction *I,
ReductionInstDesc &Prev);
InductionKind isInductionVariable(PHINode *Phi, ConstantInt *&StepValue);
void collectStridedAccess(Value *LoadOrStoreInst);
void emitAnalysis(const LoopAccessReport &Message) {
LoopAccessReport::emitAnalysis(Message, TheFunction, TheLoop, LV_NAME);
}
unsigned NumPredStores;
Loop *TheLoop;
ScalarEvolution *SE;
TargetLibraryInfo *TLI;
Function *TheFunction;
const TargetTransformInfo *TTI;
DominatorTree *DT;
LoopAccessAnalysis *LAA;
const LoopAccessInfo *LAI;
PHINode *Induction;
ReductionList Reductions;
InductionList Inductions;
Type *WidestIndTy;
SmallPtrSet<Value*, 4> AllowedExit;
SmallPtrSet<Instruction*, 4> Uniforms;
bool HasFunNoNaNAttr;
ValueToValueMap Strides;
SmallPtrSet<Value *, 8> StrideSet;
SmallPtrSet<const Instruction*, 8> MaskedOp;
};
class LoopVectorizationCostModel {
public:
LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI,
LoopVectorizationLegality *Legal,
const TargetTransformInfo &TTI,
const TargetLibraryInfo *TLI, AssumptionCache *AC,
const Function *F, const LoopVectorizeHints *Hints)
: TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI),
TheFunction(F), Hints(Hints) {
CodeMetrics::collectEphemeralValues(L, AC, EphValues);
}
struct VectorizationFactor {
unsigned Width; unsigned Cost; };
VectorizationFactor selectVectorizationFactor(bool OptForSize);
unsigned getWidestType();
unsigned selectUnrollFactor(bool OptForSize, unsigned VF, unsigned LoopCost);
struct RegisterUsage {
unsigned LoopInvariantRegs;
unsigned MaxLocalUsers;
unsigned NumInstructions;
};
RegisterUsage calculateRegisterUsage();
private:
unsigned expectedCost(unsigned VF);
unsigned getInstructionCost(Instruction *I, unsigned VF);
bool isConsecutiveLoadOrStore(Instruction *I);
void emitAnalysis(const LoopAccessReport &Message) {
LoopAccessReport::emitAnalysis(Message, TheFunction, TheLoop, LV_NAME);
}
SmallPtrSet<const Value *, 32> EphValues;
Loop *TheLoop;
ScalarEvolution *SE;
LoopInfo *LI;
LoopVectorizationLegality *Legal;
const TargetTransformInfo &TTI;
const TargetLibraryInfo *TLI;
const Function *TheFunction;
const LoopVectorizeHints *Hints;
};
class LoopVectorizeHints {
enum HintKind {
HK_WIDTH,
HK_UNROLL,
HK_FORCE
};
struct Hint {
const char * Name;
unsigned Value; HintKind Kind;
Hint(const char * Name, unsigned Value, HintKind Kind)
: Name(Name), Value(Value), Kind(Kind) { }
bool validate(unsigned Val) {
switch (Kind) {
case HK_WIDTH:
return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth;
case HK_UNROLL:
return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
case HK_FORCE:
return (Val <= 1);
}
return false;
}
};
Hint Width;
Hint Interleave;
Hint Force;
static StringRef Prefix() { return "llvm.loop."; }
public:
enum ForceKind {
FK_Undefined = -1, FK_Disabled = 0, FK_Enabled = 1, };
LoopVectorizeHints(const Loop *L, bool DisableInterleaving)
: Width("vectorize.width", VectorizerParams::VectorizationFactor,
HK_WIDTH),
Interleave("interleave.count", DisableInterleaving, HK_UNROLL),
Force("vectorize.enable", FK_Undefined, HK_FORCE),
TheLoop(L) {
getHintsFromMetadata();
if (VectorizerParams::isInterleaveForced())
Interleave.Value = VectorizerParams::VectorizationInterleave;
DEBUG(if (DisableInterleaving && Interleave.Value == 1) dbgs()
<< "LV: Interleaving disabled by the pass manager\n");
}
void setAlreadyVectorized() {
Width.Value = Interleave.Value = 1;
Hint Hints[] = {Width, Interleave};
writeHintsToMetadata(Hints);
}
std::string emitRemark() const {
VectorizationReport R;
if (Force.Value == LoopVectorizeHints::FK_Disabled)
R << "vectorization is explicitly disabled";
else {
R << "use -Rpass-analysis=loop-vectorize for more info";
if (Force.Value == LoopVectorizeHints::FK_Enabled) {
R << " (Force=true";
if (Width.Value != 0)
R << ", Vector Width=" << Width.Value;
if (Interleave.Value != 0)
R << ", Interleave Count=" << Interleave.Value;
R << ")";
}
}
return R.str();
}
unsigned getWidth() const { return Width.Value; }
unsigned getInterleave() const { return Interleave.Value; }
enum ForceKind getForce() const { return (ForceKind)Force.Value; }
private:
void getHintsFromMetadata() {
MDNode *LoopID = TheLoop->getLoopID();
if (!LoopID)
return;
assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
const MDString *S = nullptr;
SmallVector<Metadata *, 4> Args;
if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
if (!MD || MD->getNumOperands() == 0)
continue;
S = dyn_cast<MDString>(MD->getOperand(0));
for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
Args.push_back(MD->getOperand(i));
} else {
S = dyn_cast<MDString>(LoopID->getOperand(i));
assert(Args.size() == 0 && "too many arguments for MDString");
}
if (!S)
continue;
StringRef Name = S->getString();
if (Args.size() == 1)
setHint(Name, Args[0]);
}
}
void setHint(StringRef Name, Metadata *Arg) {
if (!Name.startswith(Prefix()))
return;
Name = Name.substr(Prefix().size(), StringRef::npos);
const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
if (!C) return;
unsigned Val = C->getZExtValue();
Hint *Hints[] = {&Width, &Interleave, &Force};
for (auto H : Hints) {
if (Name == H->Name) {
if (H->validate(Val))
H->Value = Val;
else
DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
break;
}
}
}
MDNode *createHintMetadata(StringRef Name, unsigned V) const {
LLVMContext &Context = TheLoop->getHeader()->getContext();
Metadata *MDs[] = {MDString::get(Context, Name),
ConstantAsMetadata::get(
ConstantInt::get(Type::getInt32Ty(Context), V))};
return MDNode::get(Context, MDs);
}
bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes) {
MDString* Name = dyn_cast<MDString>(Node->getOperand(0));
if (!Name)
return false;
for (auto H : HintTypes)
if (Name->getString().endswith(H.Name))
return true;
return false;
}
void writeHintsToMetadata(ArrayRef<Hint> HintTypes) {
if (HintTypes.size() == 0)
return;
SmallVector<Metadata *, 4> MDs(1);
MDNode *LoopID = TheLoop->getLoopID();
if (LoopID) {
for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
MDNode *Node = cast<MDNode>(LoopID->getOperand(i));
if (!matchesHintMetadataName(Node, HintTypes))
MDs.push_back(Node);
}
}
for (auto H : HintTypes)
MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value));
LLVMContext &Context = TheLoop->getHeader()->getContext();
MDNode *NewLoopID = MDNode::get(Context, MDs);
NewLoopID->replaceOperandWith(0, NewLoopID);
TheLoop->setLoopID(NewLoopID);
}
const Loop *TheLoop;
};
static void emitMissedWarning(Function *F, Loop *L,
const LoopVectorizeHints &LH) {
emitOptimizationRemarkMissed(F->getContext(), DEBUG_TYPE, *F,
L->getStartLoc(), LH.emitRemark());
if (LH.getForce() == LoopVectorizeHints::FK_Enabled) {
if (LH.getWidth() != 1)
emitLoopVectorizeWarning(
F->getContext(), *F, L->getStartLoc(),
"failed explicitly specified loop vectorization");
else if (LH.getInterleave() != 1)
emitLoopInterleaveWarning(
F->getContext(), *F, L->getStartLoc(),
"failed explicitly specified loop interleaving");
}
}
static void addInnerLoop(Loop &L, SmallVectorImpl<Loop *> &V) {
if (L.empty())
return V.push_back(&L);
for (Loop *InnerL : L)
addInnerLoop(*InnerL, V);
}
struct LoopVectorize : public FunctionPass {
static char ID;
explicit LoopVectorize(bool NoUnrolling = false, bool AlwaysVectorize = true)
: FunctionPass(ID),
DisableUnrolling(NoUnrolling),
AlwaysVectorize(AlwaysVectorize) {
initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
}
ScalarEvolution *SE;
LoopInfo *LI;
TargetTransformInfo *TTI;
DominatorTree *DT;
BlockFrequencyInfo *BFI;
TargetLibraryInfo *TLI;
AliasAnalysis *AA;
AssumptionCache *AC;
LoopAccessAnalysis *LAA;
bool DisableUnrolling;
bool AlwaysVectorize;
BlockFrequency ColdEntryFreq;
bool runOnFunction(Function &F) override {
SE = &getAnalysis<ScalarEvolution>();
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
BFI = &getAnalysis<BlockFrequencyInfo>();
auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
TLI = TLIP ? &TLIP->getTLI() : nullptr;
AA = &getAnalysis<AliasAnalysis>();
AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
LAA = &getAnalysis<LoopAccessAnalysis>();
const BranchProbability ColdProb(1, 5); ColdEntryFreq = BlockFrequency(BFI->getEntryFreq()) * ColdProb;
if (!TTI->getNumberOfRegisters(true))
return false;
SmallVector<Loop *, 8> Worklist;
for (Loop *L : *LI)
addInnerLoop(*L, Worklist);
LoopsAnalyzed += Worklist.size();
bool Changed = false;
while (!Worklist.empty())
Changed |= processLoop(Worklist.pop_back_val());
return Changed;
}
bool processLoop(Loop *L) {
assert(L->empty() && "Only process inner loops.");
#ifndef NDEBUG
const std::string DebugLocStr = getDebugLocString(L);
#endif
DEBUG(dbgs() << "\nLV: Checking a loop in \""
<< L->getHeader()->getParent()->getName() << "\" from "
<< DebugLocStr << "\n");
LoopVectorizeHints Hints(L, DisableUnrolling);
DEBUG(dbgs() << "LV: Loop hints:"
<< " force="
<< (Hints.getForce() == LoopVectorizeHints::FK_Disabled
? "disabled"
: (Hints.getForce() == LoopVectorizeHints::FK_Enabled
? "enabled"
: "?")) << " width=" << Hints.getWidth()
<< " unroll=" << Hints.getInterleave() << "\n");
Function *F = L->getHeader()->getParent();
if (Hints.getForce() == LoopVectorizeHints::FK_Disabled) {
DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F,
L->getStartLoc(), Hints.emitRemark());
return false;
}
if (!AlwaysVectorize && Hints.getForce() != LoopVectorizeHints::FK_Enabled) {
DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
emitOptimizationRemarkAnalysis(F->getContext(), DEBUG_TYPE, *F,
L->getStartLoc(), Hints.emitRemark());
return false;
}
if (Hints.getWidth() == 1 && Hints.getInterleave() == 1) {
DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
emitOptimizationRemarkAnalysis(
F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
"loop not vectorized: vector width and interleave count are "
"explicitly set to 1");
return false;
}
const unsigned TC = SE->getSmallConstantTripCount(L);
if (TC > 0u && TC < TinyTripCountVectorThreshold) {
DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
<< "This loop is not worth vectorizing.");
if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
else {
DEBUG(dbgs() << "\n");
emitOptimizationRemarkAnalysis(
F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
"vectorization is not beneficial and is not explicitly forced");
return false;
}
}
LoopVectorizationLegality LVL(L, SE, DT, TLI, AA, F, TTI, LAA);
if (!LVL.canVectorize()) {
DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
emitMissedWarning(F, L, Hints);
return false;
}
LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, TLI, AC, F, &Hints);
bool OptForSize = Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
F->hasFnAttribute(Attribute::OptimizeForSize);
if (LoopVectorizeWithBlockFrequency) {
BlockFrequency LoopEntryFreq = BFI->getBlockFreq(L->getLoopPreheader());
if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
LoopEntryFreq < ColdEntryFreq)
OptForSize = true;
}
if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
"attribute is used.\n");
emitOptimizationRemarkAnalysis(
F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
"loop not vectorized due to NoImplicitFloat attribute");
emitMissedWarning(F, L, Hints);
return false;
}
const LoopVectorizationCostModel::VectorizationFactor VF =
CM.selectVectorizationFactor(OptForSize);
const unsigned UF =
CM.selectUnrollFactor(OptForSize, VF.Width, VF.Cost);
DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width << ") in "
<< DebugLocStr << '\n');
DEBUG(dbgs() << "LV: Unroll Factor is " << UF << '\n');
if (VF.Width == 1) {
DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial\n");
if (UF == 1) {
emitOptimizationRemarkAnalysis(
F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
"not beneficial to vectorize and user disabled interleaving");
return false;
}
DEBUG(dbgs() << "LV: Trying to at least unroll the loops.\n");
emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
Twine("unrolled with interleaving factor " +
Twine(UF) +
" (vectorization not beneficial)"));
InnerLoopUnroller Unroller(L, SE, LI, DT, TLI, TTI, UF);
Unroller.vectorize(&LVL);
} else {
InnerLoopVectorizer LB(L, SE, LI, DT, TLI, TTI, VF.Width, UF);
LB.vectorize(&LVL);
++LoopsVectorized;
emitOptimizationRemark(
F->getContext(), DEBUG_TYPE, *F, L->getStartLoc(),
Twine("vectorized loop (vectorization factor: ") + Twine(VF.Width) +
", unrolling interleave factor: " + Twine(UF) + ")");
}
Hints.setAlreadyVectorized();
DEBUG(verifyFunction(*L->getHeader()->getParent()));
return true;
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();
AU.addRequiredID(LoopSimplifyID);
AU.addRequiredID(LCSSAID);
AU.addRequired<BlockFrequencyInfo>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<LoopInfoWrapperPass>();
AU.addRequired<ScalarEvolution>();
AU.addRequired<TargetTransformInfoWrapperPass>();
AU.addRequired<AliasAnalysis>();
AU.addRequired<LoopAccessAnalysis>();
AU.addPreserved<LoopInfoWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<AliasAnalysis>();
}
};
}
Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
Instruction *Instr = dyn_cast<Instruction>(V);
bool NewInstr =
(Instr && std::find(LoopVectorBody.begin(), LoopVectorBody.end(),
Instr->getParent()) != LoopVectorBody.end());
bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;
IRBuilder<>::InsertPointGuard Guard(Builder);
if (Invariant)
Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
return Shuf;
}
Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx,
Value *Step) {
assert(Val->getType()->isVectorTy() && "Must be a vector");
assert(Val->getType()->getScalarType()->isIntegerTy() &&
"Elem must be an integer");
assert(Step->getType() == Val->getType()->getScalarType() &&
"Step has wrong type");
Type *ITy = Val->getType()->getScalarType();
VectorType *Ty = cast<VectorType>(Val->getType());
int VLen = Ty->getNumElements();
SmallVector<Constant*, 8> Indices;
for (int i = 0; i < VLen; ++i)
Indices.push_back(ConstantInt::get(ITy, StartIdx + i));
Constant *Cv = ConstantVector::get(Indices);
assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
Step = Builder.CreateVectorSplat(VLen, Step);
assert(Step->getType() == Val->getType() && "Invalid step vec");
Step = Builder.CreateMul(Cv, Step);
return Builder.CreateAdd(Val, Step, "induction");
}
static unsigned getGEPInductionOperand(const GetElementPtrInst *Gep) {
const DataLayout &DL = Gep->getModule()->getDataLayout();
unsigned LastOperand = Gep->getNumOperands() - 1;
unsigned GEPAllocSize = DL.getTypeAllocSize(
cast<PointerType>(Gep->getType()->getScalarType())->getElementType());
while (LastOperand > 1 && match(Gep->getOperand(LastOperand), m_Zero())) {
gep_type_iterator GEPTI = gep_type_begin(Gep);
std::advance(GEPTI, LastOperand - 1);
if (DL.getTypeAllocSize(*GEPTI) != GEPAllocSize)
break;
--LastOperand;
}
return LastOperand;
}
int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr");
if (Ptr->getType()->getPointerElementType()->isAggregateType())
return 0;
PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr);
if (Phi && Inductions.count(Phi)) {
InductionInfo II = Inductions[Phi];
return II.getConsecutiveDirection();
}
GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr);
if (!Gep)
return 0;
unsigned NumOperands = Gep->getNumOperands();
Value *GpPtr = Gep->getPointerOperand();
Phi = dyn_cast<PHINode>(GpPtr);
if (Phi && Inductions.count(Phi)) {
PointerType *GepPtrType = cast<PointerType>(GpPtr->getType());
if (GepPtrType->getElementType()->isAggregateType())
return 0;
for (unsigned i = 1; i < NumOperands; ++i)
if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
return 0;
InductionInfo II = Inductions[Phi];
return II.getConsecutiveDirection();
}
unsigned InductionOperand = getGEPInductionOperand(Gep);
for (unsigned i = 0; i != NumOperands; ++i)
if (i != InductionOperand &&
!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
return 0;
const SCEV *Last = nullptr;
if (!Strides.count(Gep))
Last = SE->getSCEV(Gep->getOperand(InductionOperand));
else {
Last = replaceSymbolicStrideSCEV(SE, Strides,
Gep->getOperand(InductionOperand), Gep);
if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(Last))
Last =
(C->getSCEVType() == scSignExtend || C->getSCEVType() == scZeroExtend)
? C->getOperand()
: Last;
}
if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) {
const SCEV *Step = AR->getStepRecurrence(*SE);
if (Step->isOne())
return 1;
if (Step->isAllOnesValue())
return -1;
}
return 0;
}
bool LoopVectorizationLegality::isUniform(Value *V) {
return LAI->isUniform(V);
}
InnerLoopVectorizer::VectorParts&
InnerLoopVectorizer::getVectorValue(Value *V) {
assert(V != Induction && "The new induction variable should not be used.");
assert(!V->getType()->isVectorTy() && "Can't widen a vector");
if (Legal->hasStride(V))
V = ConstantInt::get(V->getType(), 1);
if (WidenMap.has(V))
return WidenMap.get(V);
Value *B = getBroadcastInstrs(V);
return WidenMap.splat(V, B);
}
Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
assert(Vec->getType()->isVectorTy() && "Invalid type");
SmallVector<Constant*, 8> ShuffleMask;
for (unsigned i = 0; i < VF; ++i)
ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
ConstantVector::get(ShuffleMask),
"reverse");
}
void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
LoadInst *LI = dyn_cast<LoadInst>(Instr);
StoreInst *SI = dyn_cast<StoreInst>(Instr);
assert((LI || SI) && "Invalid Load/Store instruction");
Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType();
Type *DataTy = VectorType::get(ScalarDataTy, VF);
Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment();
const DataLayout &DL = Instr->getModule()->getDataLayout();
if (!Alignment)
Alignment = DL.getABITypeAlignment(ScalarDataTy);
unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
unsigned ScalarAllocatedSize = DL.getTypeAllocSize(ScalarDataTy);
unsigned VectorElementSize = DL.getTypeStoreSize(DataTy) / VF;
if (SI && Legal->blockNeedsPredication(SI->getParent()) &&
!Legal->isMaskRequired(SI))
return scalarizeInstruction(Instr, true);
if (ScalarAllocatedSize != VectorElementSize)
return scalarizeInstruction(Instr);
int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
bool Reverse = ConsecutiveStride < 0;
bool UniformLoad = LI && Legal->isUniform(Ptr);
if (!ConsecutiveStride || UniformLoad)
return scalarizeInstruction(Instr);
Constant *Zero = Builder.getInt32(0);
VectorParts &Entry = WidenMap.get(Instr);
GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) {
setDebugLocFromInst(Builder, Gep);
Value *PtrOperand = Gep->getPointerOperand();
Value *FirstBasePtr = getVectorValue(PtrOperand)[0];
FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero);
GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
Gep2->setOperand(0, FirstBasePtr);
Gep2->setName("gep.indvar.base");
Ptr = Builder.Insert(Gep2);
} else if (Gep) {
setDebugLocFromInst(Builder, Gep);
assert(SE->isLoopInvariant(SE->getSCEV(Gep->getPointerOperand()),
OrigLoop) && "Base ptr must be invariant");
unsigned NumOperands = Gep->getNumOperands();
unsigned InductionOperand = getGEPInductionOperand(Gep);
GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
for (unsigned i = 0; i < NumOperands; ++i) {
Value *GepOperand = Gep->getOperand(i);
Instruction *GepOperandInst = dyn_cast<Instruction>(GepOperand);
if (i == InductionOperand ||
(GepOperandInst && OrigLoop->contains(GepOperandInst))) {
assert((i == InductionOperand ||
SE->isLoopInvariant(SE->getSCEV(GepOperandInst), OrigLoop)) &&
"Must be last index or loop invariant");
VectorParts &GEPParts = getVectorValue(GepOperand);
Value *Index = GEPParts[0];
Index = Builder.CreateExtractElement(Index, Zero);
Gep2->setOperand(i, Index);
Gep2->setName("gep.indvar.idx");
}
}
Ptr = Builder.Insert(Gep2);
} else {
assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
setDebugLocFromInst(Builder, Ptr);
VectorParts &PtrVal = getVectorValue(Ptr);
Ptr = Builder.CreateExtractElement(PtrVal[0], Zero);
}
VectorParts Mask = createBlockInMask(Instr->getParent());
if (SI) {
assert(!Legal->isUniform(SI->getPointerOperand()) &&
"We do not allow storing to uniform addresses");
setDebugLocFromInst(Builder, SI);
VectorParts StoredVal = getVectorValue(SI->getValueOperand());
for (unsigned Part = 0; Part < UF; ++Part) {
Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF));
if (Reverse) {
StoredVal[Part] = reverseVector(StoredVal[Part]);
PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
Mask[Part] = reverseVector(Mask[Part]);
}
Value *VecPtr = Builder.CreateBitCast(PartPtr,
DataTy->getPointerTo(AddressSpace));
Instruction *NewSI;
if (Legal->isMaskRequired(SI))
NewSI = Builder.CreateMaskedStore(StoredVal[Part], VecPtr, Alignment,
Mask[Part]);
else
NewSI = Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment);
propagateMetadata(NewSI, SI);
}
return;
}
assert(LI && "Must have a load instruction");
setDebugLocFromInst(Builder, LI);
for (unsigned Part = 0; Part < UF; ++Part) {
Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF));
if (Reverse) {
PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
Mask[Part] = reverseVector(Mask[Part]);
}
Instruction* NewLI;
Value *VecPtr = Builder.CreateBitCast(PartPtr,
DataTy->getPointerTo(AddressSpace));
if (Legal->isMaskRequired(LI))
NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part],
UndefValue::get(DataTy),
"wide.masked.load");
else
NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load");
propagateMetadata(NewLI, LI);
Entry[Part] = Reverse ? reverseVector(NewLI) : NewLI;
}
}
void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredicateStore) {
assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
SmallVector<VectorParts, 4> Params;
setDebugLocFromInst(Builder, Instr);
for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
Value *SrcOp = Instr->getOperand(op);
if (SrcOp == OldInduction) {
Params.push_back(getVectorValue(SrcOp));
continue;
}
Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);
if (SrcInst && OrigLoop->contains(SrcInst)) {
assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
Params.push_back(WidenMap.get(SrcInst));
} else {
VectorParts Scalars;
Scalars.append(UF, SrcOp);
Params.push_back(Scalars);
}
}
assert(Params.size() == Instr->getNumOperands() &&
"Invalid number of operands");
bool IsVoidRetTy = Instr->getType()->isVoidTy();
Value *UndefVec = IsVoidRetTy ? nullptr :
UndefValue::get(VectorType::get(Instr->getType(), VF));
VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
Instruction *InsertPt = Builder.GetInsertPoint();
BasicBlock *IfBlock = Builder.GetInsertBlock();
BasicBlock *CondBlock = nullptr;
VectorParts Cond;
Loop *VectorLp = nullptr;
if (IfPredicateStore) {
assert(Instr->getParent()->getSinglePredecessor() &&
"Only support single predecessor blocks");
Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
Instr->getParent());
VectorLp = LI->getLoopFor(IfBlock);
assert(VectorLp && "Must have a loop for this block");
}
for (unsigned Part = 0; Part < UF; ++Part) {
for (unsigned Width = 0; Width < VF; ++Width) {
Value *Cmp = nullptr;
if (IfPredicateStore) {
Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Width));
Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, ConstantInt::get(Cmp->getType(), 1));
CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
LoopVectorBody.push_back(CondBlock);
VectorLp->addBasicBlockToLoop(CondBlock, *LI);
Builder.SetInsertPoint(InsertPt);
}
Instruction *Cloned = Instr->clone();
if (!IsVoidRetTy)
Cloned->setName(Instr->getName() + ".cloned");
for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
Value *Op = Params[op][Part];
if (Op->getType()->isVectorTy())
Op = Builder.CreateExtractElement(Op, Builder.getInt32(Width));
Cloned->setOperand(op, Op);
}
Builder.Insert(Cloned);
if (!IsVoidRetTy)
VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned,
Builder.getInt32(Width));
if (IfPredicateStore) {
BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
LoopVectorBody.push_back(NewIfBlock);
VectorLp->addBasicBlockToLoop(NewIfBlock, *LI);
Builder.SetInsertPoint(InsertPt);
Instruction *OldBr = IfBlock->getTerminator();
BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
OldBr->eraseFromParent();
IfBlock = NewIfBlock;
}
}
}
}
static Instruction *getFirstInst(Instruction *FirstInst, Value *V,
Instruction *Loc) {
if (FirstInst)
return FirstInst;
if (Instruction *I = dyn_cast<Instruction>(V))
return I->getParent() == Loc->getParent() ? I : nullptr;
return nullptr;
}
std::pair<Instruction *, Instruction *>
InnerLoopVectorizer::addStrideCheck(Instruction *Loc) {
Instruction *tnullptr = nullptr;
if (!Legal->mustCheckStrides())
return std::pair<Instruction *, Instruction *>(tnullptr, tnullptr);
IRBuilder<> ChkBuilder(Loc);
Value *Check = nullptr;
Instruction *FirstInst = nullptr;
for (SmallPtrSet<Value *, 8>::iterator SI = Legal->strides_begin(),
SE = Legal->strides_end();
SI != SE; ++SI) {
Value *Ptr = stripIntegerCast(*SI);
Value *C = ChkBuilder.CreateICmpNE(Ptr, ConstantInt::get(Ptr->getType(), 1),
"stride.chk");
FirstInst = getFirstInst(FirstInst, C, Loc);
if (Check)
Check = ChkBuilder.CreateOr(Check, C);
else
Check = C;
}
LLVMContext &Ctx = Loc->getContext();
Instruction *TheCheck =
BinaryOperator::CreateAnd(Check, ConstantInt::getTrue(Ctx));
ChkBuilder.Insert(TheCheck, "stride.not.one");
FirstInst = getFirstInst(FirstInst, TheCheck, Loc);
return std::make_pair(FirstInst, TheCheck);
}
void InnerLoopVectorizer::createEmptyLoop() {
BasicBlock *OldBasicBlock = OrigLoop->getHeader();
BasicBlock *BypassBlock = OrigLoop->getLoopPreheader();
BasicBlock *ExitBlock = OrigLoop->getExitBlock();
assert(BypassBlock && "Invalid loop structure");
assert(ExitBlock && "Must have an exit block");
OldInduction = Legal->getInduction();
Type *IdxTy = Legal->getWidestInductionType();
const SCEV *ExitCount = SE->getBackedgeTakenCount(OrigLoop);
assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");
if (ExitCount->getType()->getPrimitiveSizeInBits() >
IdxTy->getPrimitiveSizeInBits())
ExitCount = SE->getTruncateOrNoop(ExitCount, IdxTy);
const SCEV *BackedgeTakeCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy);
ExitCount = SE->getAddExpr(BackedgeTakeCount,
SE->getConstant(BackedgeTakeCount->getType(), 1));
const DataLayout &DL = OldBasicBlock->getModule()->getDataLayout();
SCEVExpander Exp(*SE, DL, "induction");
Value *BackedgeCount =
Exp.expandCodeFor(BackedgeTakeCount, BackedgeTakeCount->getType(),
BypassBlock->getTerminator());
if (BackedgeCount->getType()->isPointerTy())
BackedgeCount = CastInst::CreatePointerCast(BackedgeCount, IdxTy,
"backedge.ptrcnt.to.int",
BypassBlock->getTerminator());
Instruction *CheckBCOverflow =
CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, BackedgeCount,
Constant::getAllOnesValue(BackedgeCount->getType()),
"backedge.overflow", BypassBlock->getTerminator());
Builder.SetInsertPoint(BypassBlock->getTerminator());
Value *StartIdx = ExtendedIdx = OldInduction ?
Builder.CreateZExt(OldInduction->getIncomingValueForBlock(BypassBlock),
IdxTy):
ConstantInt::get(IdxTy, 0);
Instruction *OverflowCheckAnchor = BinaryOperator::CreateAdd(
StartIdx, ConstantInt::get(IdxTy, 0), "overflow.check.anchor",
BypassBlock->getTerminator());
Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
BypassBlock->getTerminator());
LoopBypassBlocks.push_back(BypassBlock);
BasicBlock *VectorPH =
BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
BasicBlock *VecBody =
VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
BasicBlock *MiddleBlock =
VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
BasicBlock *ScalarPH =
MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
Loop* Lp = new Loop();
Loop *ParentLoop = OrigLoop->getParentLoop();
if (ParentLoop) {
ParentLoop->addChildLoop(Lp);
ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
ParentLoop->addBasicBlockToLoop(VectorPH, *LI);
ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
} else {
LI->addTopLevelLoop(Lp);
}
Lp->addBasicBlockToLoop(VecBody, *LI);
Builder.SetInsertPoint(VecBody->getFirstNonPHI());
setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction));
Induction = Builder.CreatePHI(IdxTy, 2, "index");
Constant *Step = ConstantInt::get(IdxTy, VF * UF);
IRBuilder<> BypassBuilder(BypassBlock->getTerminator());
setDebugLocFromInst(BypassBuilder,
getDebugLocFromInstOrOperands(OldInduction));
if (Count->getType() != IdxTy) {
if (ExitCount->getType()->isPointerTy())
Count = BypassBuilder.CreatePointerCast(Count, IdxTy, "ptrcnt.to.int");
else
Count = BypassBuilder.CreateZExtOrTrunc(Count, IdxTy, "cnt.cast");
}
Value *IdxEnd = BypassBuilder.CreateAdd(Count, StartIdx, "end.idx");
Value *R = BypassBuilder.CreateURem(Count, Step, "n.mod.vf");
Value *CountRoundDown = BypassBuilder.CreateSub(Count, R, "n.vec");
Value *IdxEndRoundDown = BypassBuilder.CreateAdd(CountRoundDown, StartIdx,
"end.idx.rnd.down");
Value *Cmp =
BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx, "cmp.zero");
BasicBlock *LastBypassBlock = BypassBlock;
{
auto PastOverflowCheck =
std::next(BasicBlock::iterator(OverflowCheckAnchor));
BasicBlock *CheckBlock =
LastBypassBlock->splitBasicBlock(PastOverflowCheck, "overflow.checked");
if (ParentLoop)
ParentLoop->addBasicBlockToLoop(CheckBlock, *LI);
LoopBypassBlocks.push_back(CheckBlock);
Instruction *OldTerm = LastBypassBlock->getTerminator();
BranchInst::Create(ScalarPH, CheckBlock, CheckBCOverflow, OldTerm);
OldTerm->eraseFromParent();
LastBypassBlock = CheckBlock;
}
Instruction *StrideCheck;
Instruction *FirstCheckInst;
std::tie(FirstCheckInst, StrideCheck) =
addStrideCheck(LastBypassBlock->getTerminator());
if (StrideCheck) {
BasicBlock *CheckBlock =
LastBypassBlock->splitBasicBlock(FirstCheckInst, "vector.stridecheck");
if (ParentLoop)
ParentLoop->addBasicBlockToLoop(CheckBlock, *LI);
LoopBypassBlocks.push_back(CheckBlock);
Instruction *OldTerm = LastBypassBlock->getTerminator();
BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm);
OldTerm->eraseFromParent();
Cmp = StrideCheck;
LastBypassBlock = CheckBlock;
}
Instruction *MemRuntimeCheck;
std::tie(FirstCheckInst, MemRuntimeCheck) =
Legal->getLAI()->addRuntimeCheck(LastBypassBlock->getTerminator());
if (MemRuntimeCheck) {
BasicBlock *CheckBlock =
LastBypassBlock->splitBasicBlock(FirstCheckInst, "vector.memcheck");
if (ParentLoop)
ParentLoop->addBasicBlockToLoop(CheckBlock, *LI);
LoopBypassBlocks.push_back(CheckBlock);
Instruction *OldTerm = LastBypassBlock->getTerminator();
BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm);
OldTerm->eraseFromParent();
Cmp = MemRuntimeCheck;
LastBypassBlock = CheckBlock;
}
LastBypassBlock->getTerminator()->eraseFromParent();
BranchInst::Create(MiddleBlock, VectorPH, Cmp,
LastBypassBlock);
PHINode *ResumeIndex = nullptr;
LoopVectorizationLegality::InductionList::iterator I, E;
LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
BypassBuilder.SetInsertPoint(LoopBypassBlocks.back()->getTerminator());
for (I = List->begin(), E = List->end(); I != E; ++I) {
PHINode *OrigPhi = I->first;
LoopVectorizationLegality::InductionInfo II = I->second;
Type *ResumeValTy = (OrigPhi == OldInduction) ? IdxTy : OrigPhi->getType();
PHINode *ResumeVal = PHINode::Create(ResumeValTy, 2, "resume.val",
MiddleBlock->getTerminator());
PHINode *TruncResumeVal = (OrigPhi == OldInduction) ?
PHINode::Create(OrigPhi->getType(), 2, "trunc.resume.val",
MiddleBlock->getTerminator()) : nullptr;
PHINode *BCResumeVal = PHINode::Create(ResumeValTy, 3, "bc.resume.val",
ScalarPH->getTerminator());
BCResumeVal->addIncoming(ResumeVal, MiddleBlock);
PHINode *BCTruncResumeVal = nullptr;
if (OrigPhi == OldInduction) {
BCTruncResumeVal =
PHINode::Create(OrigPhi->getType(), 2, "bc.trunc.resume.val",
ScalarPH->getTerminator());
BCTruncResumeVal->addIncoming(TruncResumeVal, MiddleBlock);
}
Value *EndValue = nullptr;
switch (II.IK) {
case LoopVectorizationLegality::IK_NoInduction:
llvm_unreachable("Unknown induction");
case LoopVectorizationLegality::IK_IntInduction: {
assert(OrigPhi->getType()->isIntegerTy() && "Invalid type");
if (OrigPhi == OldInduction) {
EndValue =
BypassBuilder.CreateTrunc(IdxEndRoundDown, OrigPhi->getType());
for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
TruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);
TruncResumeVal->addIncoming(EndValue, VecBody);
BCTruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]);
EndValue = IdxEndRoundDown;
ResumeIndex = ResumeVal;
break;
}
Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown,
II.StartValue->getType(),
"cast.crd");
EndValue = II.transform(BypassBuilder, CRD);
EndValue->setName("ind.end");
break;
}
case LoopVectorizationLegality::IK_PtrInduction: {
Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown,
II.StepValue->getType(),
"cast.crd");
EndValue = II.transform(BypassBuilder, CRD);
EndValue->setName("ptr.ind.end");
break;
}
}
for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) {
if (OrigPhi == OldInduction)
ResumeVal->addIncoming(StartIdx, LoopBypassBlocks[I]);
else
ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);
}
ResumeVal->addIncoming(EndValue, VecBody);
unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
if (OrigPhi == OldInduction) {
BCResumeVal->addIncoming(StartIdx, LoopBypassBlocks[0]);
OrigPhi->setIncomingValue(BlockIdx, BCTruncResumeVal);
} else {
BCResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[0]);
OrigPhi->setIncomingValue(BlockIdx, BCResumeVal);
}
}
if (!OldInduction){
assert(!ResumeIndex && "Unexpected resume value found");
ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val",
MiddleBlock->getTerminator());
for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]);
ResumeIndex->addIncoming(IdxEndRoundDown, VecBody);
}
assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() &&
"Invalid resume Index");
Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd,
ResumeIndex, "cmp.n",
MiddleBlock->getTerminator());
BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator());
MiddleBlock->getTerminator()->eraseFromParent();
Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next");
Induction->addIncoming(StartIdx, VectorPH);
Induction->addIncoming(NextIdx, VecBody);
Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown);
Builder.CreateCondBr(ICmp, MiddleBlock, VecBody);
VecBody->getTerminator()->eraseFromParent();
Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
LoopVectorPreHeader = VectorPH;
LoopScalarPreHeader = ScalarPH;
LoopMiddleBlock = MiddleBlock;
LoopExitBlock = ExitBlock;
LoopVectorBody.push_back(VecBody);
LoopScalarBody = OldBasicBlock;
LoopVectorizeHints Hints(Lp, true);
Hints.setAlreadyVectorized();
}
Constant*
LoopVectorizationLegality::getReductionIdentity(ReductionKind K, Type *Tp) {
switch (K) {
case RK_IntegerXor:
case RK_IntegerAdd:
case RK_IntegerOr:
return ConstantInt::get(Tp, 0);
case RK_IntegerMult:
return ConstantInt::get(Tp, 1);
case RK_IntegerAnd:
return ConstantInt::get(Tp, -1, true);
case RK_FloatMult:
return ConstantFP::get(Tp, 1.0L);
case RK_FloatAdd:
return ConstantFP::get(Tp, 0.0L);
default:
llvm_unreachable("Unknown reduction kind");
}
}
static unsigned
getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) {
switch (Kind) {
case LoopVectorizationLegality::RK_IntegerAdd:
return Instruction::Add;
case LoopVectorizationLegality::RK_IntegerMult:
return Instruction::Mul;
case LoopVectorizationLegality::RK_IntegerOr:
return Instruction::Or;
case LoopVectorizationLegality::RK_IntegerAnd:
return Instruction::And;
case LoopVectorizationLegality::RK_IntegerXor:
return Instruction::Xor;
case LoopVectorizationLegality::RK_FloatMult:
return Instruction::FMul;
case LoopVectorizationLegality::RK_FloatAdd:
return Instruction::FAdd;
case LoopVectorizationLegality::RK_IntegerMinMax:
return Instruction::ICmp;
case LoopVectorizationLegality::RK_FloatMinMax:
return Instruction::FCmp;
default:
llvm_unreachable("Unknown reduction operation");
}
}
Value *createMinMaxOp(IRBuilder<> &Builder,
LoopVectorizationLegality::MinMaxReductionKind RK,
Value *Left,
Value *Right) {
CmpInst::Predicate P = CmpInst::ICMP_NE;
switch (RK) {
default:
llvm_unreachable("Unknown min/max reduction kind");
case LoopVectorizationLegality::MRK_UIntMin:
P = CmpInst::ICMP_ULT;
break;
case LoopVectorizationLegality::MRK_UIntMax:
P = CmpInst::ICMP_UGT;
break;
case LoopVectorizationLegality::MRK_SIntMin:
P = CmpInst::ICMP_SLT;
break;
case LoopVectorizationLegality::MRK_SIntMax:
P = CmpInst::ICMP_SGT;
break;
case LoopVectorizationLegality::MRK_FloatMin:
P = CmpInst::FCMP_OLT;
break;
case LoopVectorizationLegality::MRK_FloatMax:
P = CmpInst::FCMP_OGT;
break;
}
Value *Cmp;
if (RK == LoopVectorizationLegality::MRK_FloatMin ||
RK == LoopVectorizationLegality::MRK_FloatMax)
Cmp = Builder.CreateFCmp(P, Left, Right, "rdx.minmax.cmp");
else
Cmp = Builder.CreateICmp(P, Left, Right, "rdx.minmax.cmp");
Value *Select = Builder.CreateSelect(Cmp, Left, Right, "rdx.minmax.select");
return Select;
}
namespace {
struct CSEDenseMapInfo {
static bool canHandle(Instruction *I) {
return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
}
static inline Instruction *getEmptyKey() {
return DenseMapInfo<Instruction *>::getEmptyKey();
}
static inline Instruction *getTombstoneKey() {
return DenseMapInfo<Instruction *>::getTombstoneKey();
}
static unsigned getHashValue(Instruction *I) {
assert(canHandle(I) && "Unknown instruction!");
return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
I->value_op_end()));
}
static bool isEqual(Instruction *LHS, Instruction *RHS) {
if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
LHS == getTombstoneKey() || RHS == getTombstoneKey())
return LHS == RHS;
return LHS->isIdenticalTo(RHS);
}
};
}
static bool isPredicatedBlock(unsigned BlockNum) {
return BlockNum % 2;
}
static void cse(SmallVector<BasicBlock *, 4> &BBs) {
SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
for (unsigned i = 0, e = BBs.size(); i != e; ++i) {
BasicBlock *BB = BBs[i];
for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
Instruction *In = I++;
if (!CSEDenseMapInfo::canHandle(In))
continue;
if (Instruction *V = CSEMap.lookup(In)) {
In->replaceAllUsesWith(V);
In->eraseFromParent();
continue;
}
if (isPredicatedBlock(i))
continue;
CSEMap[In] = In;
}
}
}
static Value *addFastMathFlag(Value *V) {
if (isa<FPMathOperator>(V)){
FastMathFlags Flags;
Flags.setUnsafeAlgebra();
cast<Instruction>(V)->setFastMathFlags(Flags);
}
return V;
}
static unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract,
const TargetTransformInfo &TTI) {
if (Ty->isVoidTy())
return 0;
assert(Ty->isVectorTy() && "Can only scalarize vectors");
unsigned Cost = 0;
for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
if (Insert)
Cost += TTI.getVectorInstrCost(Instruction::InsertElement, Ty, i);
if (Extract)
Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, Ty, i);
}
return Cost;
}
static unsigned getVectorCallCost(CallInst *CI, unsigned VF,
const TargetTransformInfo &TTI,
const TargetLibraryInfo *TLI,
bool &NeedToScalarize) {
Function *F = CI->getCalledFunction();
StringRef FnName = CI->getCalledFunction()->getName();
Type *ScalarRetTy = CI->getType();
SmallVector<Type *, 4> Tys, ScalarTys;
for (auto &ArgOp : CI->arg_operands())
ScalarTys.push_back(ArgOp->getType());
unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
if (VF == 1)
return ScalarCallCost;
Type *RetTy = ToVectorTy(ScalarRetTy, VF);
for (unsigned i = 0, ie = ScalarTys.size(); i != ie; ++i)
Tys.push_back(ToVectorTy(ScalarTys[i], VF));
unsigned ScalarizationCost =
getScalarizationOverhead(RetTy, true, false, TTI);
for (unsigned i = 0, ie = Tys.size(); i != ie; ++i)
ScalarizationCost += getScalarizationOverhead(Tys[i], false, true, TTI);
unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
NeedToScalarize = true;
if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
return Cost;
unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
if (VectorCallCost < Cost) {
NeedToScalarize = false;
return VectorCallCost;
}
return Cost;
}
static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF,
const TargetTransformInfo &TTI,
const TargetLibraryInfo *TLI) {
Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
assert(ID && "Expected intrinsic call!");
Type *RetTy = ToVectorTy(CI->getType(), VF);
SmallVector<Type *, 4> Tys;
for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF));
return TTI.getIntrinsicInstrCost(ID, RetTy, Tys);
}
void InnerLoopVectorizer::vectorizeLoop() {
Constant *Zero = Builder.getInt32(0);
PhiVector RdxPHIsToFix;
LoopBlocksDFS DFS(OrigLoop);
DFS.perform(LI);
for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
be = DFS.endRPO(); bb != be; ++bb)
vectorizeBlockInLoop(*bb, &RdxPHIsToFix);
for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end();
it != e; ++it) {
PHINode *RdxPhi = *it;
assert(RdxPhi && "Unable to recover vectorized PHI");
assert(Legal->getReductionVars()->count(RdxPhi) &&
"Unable to find the reduction variable");
LoopVectorizationLegality::ReductionDescriptor RdxDesc =
(*Legal->getReductionVars())[RdxPhi];
setDebugLocFromInst(Builder, RdxDesc.StartValue);
Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator());
VectorParts &VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
Type *VecTy = VectorExit[0]->getType();
Value *Identity;
Value *VectorStart;
if (RdxDesc.Kind == LoopVectorizationLegality::RK_IntegerMinMax ||
RdxDesc.Kind == LoopVectorizationLegality::RK_FloatMinMax) {
if (VF == 1) {
VectorStart = Identity = RdxDesc.StartValue;
} else {
VectorStart = Identity = Builder.CreateVectorSplat(VF,
RdxDesc.StartValue,
"minmax.ident");
}
} else {
Constant *Iden =
LoopVectorizationLegality::getReductionIdentity(RdxDesc.Kind,
VecTy->getScalarType());
if (VF == 1) {
Identity = Iden;
VectorStart = RdxDesc.StartValue;
} else {
Identity = ConstantVector::getSplat(VF, Iden);
VectorStart = Builder.CreateInsertElement(Identity,
RdxDesc.StartValue, Zero);
}
}
VectorParts &VecRdxPhi = WidenMap.get(RdxPhi);
BasicBlock *Latch = OrigLoop->getLoopLatch();
Value *LoopVal = RdxPhi->getIncomingValueForBlock(Latch);
VectorParts &Val = getVectorValue(LoopVal);
for (unsigned part = 0; part < UF; ++part) {
Value *StartVal = (part == 0) ? VectorStart : Identity;
cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal,
LoopVectorPreHeader);
cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part],
LoopVectorBody.back());
}
Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());
VectorParts RdxParts;
setDebugLocFromInst(Builder, RdxDesc.LoopExitInstr);
for (unsigned part = 0; part < UF; ++part) {
VectorParts &RdxExitVal = getVectorValue(RdxDesc.LoopExitInstr);
PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
Value *StartVal = (part == 0) ? VectorStart : Identity;
for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]);
NewPhi->addIncoming(RdxExitVal[part],
LoopVectorBody.back());
RdxParts.push_back(NewPhi);
}
Value *ReducedPartRdx = RdxParts[0];
unsigned Op = getReductionBinOp(RdxDesc.Kind);
setDebugLocFromInst(Builder, ReducedPartRdx);
for (unsigned part = 1; part < UF; ++part) {
if (Op != Instruction::ICmp && Op != Instruction::FCmp)
ReducedPartRdx = addFastMathFlag(
Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxParts[part],
ReducedPartRdx, "bin.rdx"));
else
ReducedPartRdx = createMinMaxOp(Builder, RdxDesc.MinMaxKind,
ReducedPartRdx, RdxParts[part]);
}
if (VF > 1) {
assert(isPowerOf2_32(VF) &&
"Reduction emission only supported for pow2 vectors!");
Value *TmpVec = ReducedPartRdx;
SmallVector<Constant*, 32> ShuffleMask(VF, nullptr);
for (unsigned i = VF; i != 1; i >>= 1) {
for (unsigned j = 0; j != i/2; ++j)
ShuffleMask[j] = Builder.getInt32(i/2 + j);
std::fill(&ShuffleMask[i/2], ShuffleMask.end(),
UndefValue::get(Builder.getInt32Ty()));
Value *Shuf =
Builder.CreateShuffleVector(TmpVec,
UndefValue::get(TmpVec->getType()),
ConstantVector::get(ShuffleMask),
"rdx.shuf");
if (Op != Instruction::ICmp && Op != Instruction::FCmp)
TmpVec = addFastMathFlag(Builder.CreateBinOp(
(Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx"));
else
TmpVec = createMinMaxOp(Builder, RdxDesc.MinMaxKind, TmpVec, Shuf);
}
ReducedPartRdx = Builder.CreateExtractElement(TmpVec,
Builder.getInt32(0));
}
PHINode *BCBlockPhi = PHINode::Create(RdxPhi->getType(), 2, "bc.merge.rdx",
LoopScalarPreHeader->getTerminator());
BCBlockPhi->addIncoming(RdxDesc.StartValue, LoopBypassBlocks[0]);
BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
if (!LCSSAPhi) break;
assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) {
LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
break;
}
}
int IncomingEdgeBlockIdx =
(RdxPhi)->getBasicBlockIndex(OrigLoop->getLoopLatch());
assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
(RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
(RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr);
}
fixLCSSAPHIs();
cse(LoopVectorBody);
}
void InnerLoopVectorizer::fixLCSSAPHIs() {
for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
if (!LCSSAPhi) break;
if (LCSSAPhi->getNumIncomingValues() == 1)
LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()),
LoopMiddleBlock);
}
}
InnerLoopVectorizer::VectorParts
InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) &&
"Invalid edge");
std::pair<BasicBlock*, BasicBlock*> Edge(Src, Dst);
EdgeMaskCache::iterator ECEntryIt = MaskCache.find(Edge);
if (ECEntryIt != MaskCache.end())
return ECEntryIt->second;
VectorParts SrcMask = createBlockInMask(Src);
BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
assert(BI && "Unexpected terminator found");
if (BI->isConditional()) {
VectorParts EdgeMask = getVectorValue(BI->getCondition());
if (BI->getSuccessor(0) != Dst)
for (unsigned part = 0; part < UF; ++part)
EdgeMask[part] = Builder.CreateNot(EdgeMask[part]);
for (unsigned part = 0; part < UF; ++part)
EdgeMask[part] = Builder.CreateAnd(EdgeMask[part], SrcMask[part]);
MaskCache[Edge] = EdgeMask;
return EdgeMask;
}
MaskCache[Edge] = SrcMask;
return SrcMask;
}
InnerLoopVectorizer::VectorParts
InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
if (OrigLoop->getHeader() == BB) {
Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1);
return getVectorValue(C);
}
Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0);
VectorParts BlockMask = getVectorValue(Zero);
for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it) {
VectorParts EM = createEdgeMask(*it, BB);
for (unsigned part = 0; part < UF; ++part)
BlockMask[part] = Builder.CreateOr(BlockMask[part], EM[part]);
}
return BlockMask;
}
void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
InnerLoopVectorizer::VectorParts &Entry,
unsigned UF, unsigned VF, PhiVector *PV) {
PHINode* P = cast<PHINode>(PN);
if (Legal->getReductionVars()->count(P)) {
for (unsigned part = 0; part < UF; ++part) {
Type *VecTy = (VF == 1) ? PN->getType() :
VectorType::get(PN->getType(), VF);
Entry[part] = PHINode::Create(VecTy, 2, "vec.phi",
LoopVectorBody.back()-> getFirstInsertionPt());
}
PV->push_back(P);
return;
}
setDebugLocFromInst(Builder, P);
if (P->getParent() != OrigLoop->getHeader()) {
unsigned NumIncoming = P->getNumIncomingValues();
for (unsigned In = 0; In < NumIncoming; In++) {
VectorParts Cond = createEdgeMask(P->getIncomingBlock(In),
P->getParent());
VectorParts &In0 = getVectorValue(P->getIncomingValue(In));
for (unsigned part = 0; part < UF; ++part) {
if (In == 0)
Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
In0[part]);
else
Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
Entry[part], "predphi");
}
}
return;
}
assert(Legal->getInductionVars()->count(P) &&
"Not an induction variable");
LoopVectorizationLegality::InductionInfo II =
Legal->getInductionVars()->lookup(P);
switch (II.IK) {
case LoopVectorizationLegality::IK_NoInduction:
llvm_unreachable("Unknown induction");
case LoopVectorizationLegality::IK_IntInduction: {
assert(P->getType() == II.StartValue->getType() && "Types must match");
Type *PhiTy = P->getType();
Value *Broadcasted;
if (P == OldInduction) {
Broadcasted = Builder.CreateTrunc(Induction, PhiTy);
} else {
Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx,
"normalized.idx");
NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy);
Broadcasted = II.transform(Builder, NormalizedIdx);
Broadcasted->setName("offset.idx");
}
Broadcasted = getBroadcastInstrs(Broadcasted);
for (unsigned part = 0; part < UF; ++part)
Entry[part] = getStepVector(Broadcasted, VF * part, II.StepValue);
return;
}
case LoopVectorizationLegality::IK_PtrInduction:
assert(P->getType()->isPointerTy() && "Unexpected type.");
Value *NormalizedIdx =
Builder.CreateSub(Induction, ExtendedIdx, "normalized.idx");
NormalizedIdx =
Builder.CreateSExtOrTrunc(NormalizedIdx, II.StepValue->getType());
for (unsigned part = 0; part < UF; ++part) {
if (VF == 1) {
int EltIndex = part;
Constant *Idx = ConstantInt::get(NormalizedIdx->getType(), EltIndex);
Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx);
Value *SclrGep = II.transform(Builder, GlobalIdx);
SclrGep->setName("next.gep");
Entry[part] = SclrGep;
continue;
}
Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
for (unsigned int i = 0; i < VF; ++i) {
int EltIndex = i + part * VF;
Constant *Idx = ConstantInt::get(NormalizedIdx->getType(), EltIndex);
Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx);
Value *SclrGep = II.transform(Builder, GlobalIdx);
SclrGep->setName("next.gep");
VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
Builder.getInt32(i),
"insert.gep");
}
Entry[part] = VecVal;
}
return;
}
}
void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
VectorParts &Entry = WidenMap.get(it);
switch (it->getOpcode()) {
case Instruction::Br:
continue;
case Instruction::PHI: {
widenPHIInstruction(it, Entry, UF, VF, PV);
continue;
}
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
case Instruction::FSub:
case Instruction::Mul:
case Instruction::FMul:
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::FDiv:
case Instruction::URem:
case Instruction::SRem:
case Instruction::FRem:
case Instruction::Shl:
case Instruction::LShr:
case Instruction::AShr:
case Instruction::And:
case Instruction::Or:
case Instruction::Xor: {
BinaryOperator *BinOp = dyn_cast<BinaryOperator>(it);
setDebugLocFromInst(Builder, BinOp);
VectorParts &A = getVectorValue(it->getOperand(0));
VectorParts &B = getVectorValue(it->getOperand(1));
for (unsigned Part = 0; Part < UF; ++Part) {
Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]);
if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V))
VecOp->copyIRFlags(BinOp);
Entry[Part] = V;
}
propagateMetadata(Entry, it);
break;
}
case Instruction::Select: {
bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(it->getOperand(0)),
OrigLoop);
setDebugLocFromInst(Builder, it);
VectorParts &Cond = getVectorValue(it->getOperand(0));
VectorParts &Op0 = getVectorValue(it->getOperand(1));
VectorParts &Op1 = getVectorValue(it->getOperand(2));
Value *ScalarCond = (VF == 1) ? Cond[0] :
Builder.CreateExtractElement(Cond[0], Builder.getInt32(0));
for (unsigned Part = 0; Part < UF; ++Part) {
Entry[Part] = Builder.CreateSelect(
InvariantCond ? ScalarCond : Cond[Part],
Op0[Part],
Op1[Part]);
}
propagateMetadata(Entry, it);
break;
}
case Instruction::ICmp:
case Instruction::FCmp: {
bool FCmp = (it->getOpcode() == Instruction::FCmp);
CmpInst *Cmp = dyn_cast<CmpInst>(it);
setDebugLocFromInst(Builder, it);
VectorParts &A = getVectorValue(it->getOperand(0));
VectorParts &B = getVectorValue(it->getOperand(1));
for (unsigned Part = 0; Part < UF; ++Part) {
Value *C = nullptr;
if (FCmp)
C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]);
else
C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]);
Entry[Part] = C;
}
propagateMetadata(Entry, it);
break;
}
case Instruction::Store:
case Instruction::Load:
vectorizeMemoryInstruction(it);
break;
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
case Instruction::FPToSI:
case Instruction::FPExt:
case Instruction::PtrToInt:
case Instruction::IntToPtr:
case Instruction::SIToFP:
case Instruction::UIToFP:
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::BitCast: {
CastInst *CI = dyn_cast<CastInst>(it);
setDebugLocFromInst(Builder, it);
if (CI->getOperand(0) == OldInduction &&
it->getOpcode() == Instruction::Trunc) {
Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction,
CI->getType());
Value *Broadcasted = getBroadcastInstrs(ScalarCast);
LoopVectorizationLegality::InductionInfo II =
Legal->getInductionVars()->lookup(OldInduction);
Constant *Step =
ConstantInt::getSigned(CI->getType(), II.StepValue->getSExtValue());
for (unsigned Part = 0; Part < UF; ++Part)
Entry[Part] = getStepVector(Broadcasted, VF * Part, Step);
propagateMetadata(Entry, it);
break;
}
Type *DestTy = (VF == 1) ? CI->getType() :
VectorType::get(CI->getType(), VF);
VectorParts &A = getVectorValue(it->getOperand(0));
for (unsigned Part = 0; Part < UF; ++Part)
Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy);
propagateMetadata(Entry, it);
break;
}
case Instruction::Call: {
if (isa<DbgInfoIntrinsic>(it))
break;
setDebugLocFromInst(Builder, it);
Module *M = BB->getParent()->getParent();
CallInst *CI = cast<CallInst>(it);
StringRef FnName = CI->getCalledFunction()->getName();
Function *F = CI->getCalledFunction();
Type *RetTy = ToVectorTy(CI->getType(), VF);
SmallVector<Type *, 4> Tys;
for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF));
Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
if (ID &&
(ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
ID == Intrinsic::lifetime_start)) {
scalarizeInstruction(it);
break;
}
bool NeedToScalarize;
unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
bool UseVectorIntrinsic =
ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
if (!UseVectorIntrinsic && NeedToScalarize) {
scalarizeInstruction(it);
break;
}
for (unsigned Part = 0; Part < UF; ++Part) {
SmallVector<Value *, 4> Args;
for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
Value *Arg = CI->getArgOperand(i);
if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) {
VectorParts &VectorArg = getVectorValue(CI->getArgOperand(i));
Arg = VectorArg[Part];
}
Args.push_back(Arg);
}
Function *VectorF;
if (UseVectorIntrinsic) {
Type *TysForDecl[] = {CI->getType()};
if (VF > 1)
TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
} else {
StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
assert(!VFnName.empty() && "Vector function name is empty.");
VectorF = M->getFunction(VFnName);
if (!VectorF) {
FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
VectorF =
Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
VectorF->copyAttributesFrom(F);
}
}
assert(VectorF && "Can't create vector function.");
Entry[Part] = Builder.CreateCall(VectorF, Args);
}
propagateMetadata(Entry, it);
break;
}
default:
scalarizeInstruction(it);
break;
} }}
void InnerLoopVectorizer::updateAnalysis() {
SE->forgetLoop(OrigLoop);
assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
"Entry does not dominate exit.");
for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]);
DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back());
for (unsigned i = 0, e = LoopVectorBody.size(); i != e; ++i) {
if (i == 0)
DT->addNewBlock(LoopVectorBody[0], LoopVectorPreHeader);
else if (isPredicatedBlock(i)) {
DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-1]);
} else {
DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-2]);
}
}
DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks[1]);
DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
DEBUG(DT->verifyDomTree());
}
static bool canIfConvertPHINodes(BasicBlock *BB) {
for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
PHINode *Phi = dyn_cast<PHINode>(I);
if (!Phi)
return true;
for (unsigned p = 0, e = Phi->getNumIncomingValues(); p != e; ++p)
if (Constant *C = dyn_cast<Constant>(Phi->getIncomingValue(p)))
if (C->canTrap())
return false;
}
return true;
}
bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
if (!EnableIfConversion) {
emitAnalysis(VectorizationReport() << "if-conversion is disabled");
return false;
}
assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
SmallPtrSet<Value *, 8> SafePointes;
for (Loop::block_iterator BI = TheLoop->block_begin(),
BE = TheLoop->block_end(); BI != BE; ++BI) {
BasicBlock *BB = *BI;
if (blockNeedsPredication(BB))
continue;
for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
if (LoadInst *LI = dyn_cast<LoadInst>(I))
SafePointes.insert(LI->getPointerOperand());
else if (StoreInst *SI = dyn_cast<StoreInst>(I))
SafePointes.insert(SI->getPointerOperand());
}
}
BasicBlock *Header = TheLoop->getHeader();
for (Loop::block_iterator BI = TheLoop->block_begin(),
BE = TheLoop->block_end(); BI != BE; ++BI) {
BasicBlock *BB = *BI;
if (!isa<BranchInst>(BB->getTerminator())) {
emitAnalysis(VectorizationReport(BB->getTerminator())
<< "loop contains a switch statement");
return false;
}
if (blockNeedsPredication(BB)) {
if (!blockCanBePredicated(BB, SafePointes)) {
emitAnalysis(VectorizationReport(BB->getTerminator())
<< "control flow cannot be substituted for a select");
return false;
}
} else if (BB != Header && !canIfConvertPHINodes(BB)) {
emitAnalysis(VectorizationReport(BB->getTerminator())
<< "control flow cannot be substituted for a select");
return false;
}
}
return true;
}
bool LoopVectorizationLegality::canVectorize() {
if (!TheLoop->getLoopPreheader()) {
emitAnalysis(
VectorizationReport() <<
"loop control flow is not understood by vectorizer");
return false;
}
if (!TheLoop->getSubLoopsVector().empty()) {
emitAnalysis(VectorizationReport() << "loop is not the innermost loop");
return false;
}
if (TheLoop->getNumBackEdges() != 1) {
emitAnalysis(
VectorizationReport() <<
"loop control flow is not understood by vectorizer");
return false;
}
if (!TheLoop->getExitingBlock()) {
emitAnalysis(
VectorizationReport() <<
"loop control flow is not understood by vectorizer");
return false;
}
if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
emitAnalysis(
VectorizationReport() <<
"loop control flow is not understood by vectorizer");
return false;
}
DEBUG(dbgs() << "LV: Found a loop: " <<
TheLoop->getHeader()->getName() << '\n');
unsigned NumBlocks = TheLoop->getNumBlocks();
if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
return false;
}
const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop);
if (ExitCount == SE->getCouldNotCompute()) {
emitAnalysis(VectorizationReport() <<
"could not determine number of loop iterations");
DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
return false;
}
if (!canVectorizeInstrs()) {
DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
return false;
}
if (!canVectorizeMemory()) {
DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
return false;
}
collectLoopUniforms();
DEBUG(dbgs() << "LV: We can vectorize this loop" <<
(LAI->getRuntimePointerCheck()->Need ? " (with a runtime bound check)" :
"")
<<"!\n");
return true;
}
static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
if (Ty->isPointerTy())
return DL.getIntPtrType(Ty);
if (Ty->getScalarSizeInBits() < 32)
return Type::getInt32Ty(Ty->getContext());
return Ty;
}
static Type* getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
Ty0 = convertPointerToIntegerType(DL, Ty0);
Ty1 = convertPointerToIntegerType(DL, Ty1);
if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
return Ty0;
return Ty1;
}
static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
SmallPtrSetImpl<Value *> &Reductions) {
if (!Reductions.count(Inst))
for (User *U : Inst->users()) {
Instruction *UI = cast<Instruction>(U);
if (!TheLoop->contains(UI)) {
DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n');
return true;
}
}
return false;
}
bool LoopVectorizationLegality::canVectorizeInstrs() {
BasicBlock *PreHeader = TheLoop->getLoopPreheader();
BasicBlock *Header = TheLoop->getHeader();
Function &F = *Header->getParent();
const DataLayout &DL = F.getParent()->getDataLayout();
if (F.hasFnAttribute("no-nans-fp-math"))
HasFunNoNaNAttr =
F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
for (Loop::block_iterator bb = TheLoop->block_begin(),
be = TheLoop->block_end(); bb != be; ++bb) {
for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
++it) {
if (PHINode *Phi = dyn_cast<PHINode>(it)) {
Type *PhiTy = Phi->getType();
if (!PhiTy->isIntegerTy() &&
!PhiTy->isFloatingPointTy() &&
!PhiTy->isPointerTy()) {
emitAnalysis(VectorizationReport(it)
<< "loop control flow is not understood by vectorizer");
DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n");
return false;
}
if (*bb != Header) {
if (!hasOutsideLoopUser(TheLoop, it, AllowedExit))
continue;
emitAnalysis(VectorizationReport(it) <<
"value could not be identified as "
"an induction or reduction variable");
return false;
}
if (Phi->getNumIncomingValues() != 2) {
emitAnalysis(VectorizationReport(it)
<< "control flow not understood by vectorizer");
DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
return false;
}
Value *StartValue = Phi->getIncomingValueForBlock(PreHeader);
ConstantInt *StepValue = nullptr;
InductionKind IK = isInductionVariable(Phi, StepValue);
if (IK_NoInduction != IK) {
if (!WidestIndTy)
WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
else
WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
if (IK == IK_IntInduction && StepValue->isOne()) {
if (!Induction || PhiTy == WidestIndTy)
Induction = Phi;
}
DEBUG(dbgs() << "LV: Found an induction variable.\n");
Inductions[Phi] = InductionInfo(StartValue, IK, StepValue);
if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) {
emitAnalysis(VectorizationReport(it) <<
"use of induction value outside of the "
"loop is not handled by vectorizer");
return false;
}
continue;
}
if (AddReductionVar(Phi, RK_IntegerAdd)) {
DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n");
continue;
}
if (AddReductionVar(Phi, RK_IntegerMult)) {
DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n");
continue;
}
if (AddReductionVar(Phi, RK_IntegerOr)) {
DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n");
continue;
}
if (AddReductionVar(Phi, RK_IntegerAnd)) {
DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n");
continue;
}
if (AddReductionVar(Phi, RK_IntegerXor)) {
DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n");
continue;
}
if (AddReductionVar(Phi, RK_IntegerMinMax)) {
DEBUG(dbgs() << "LV: Found a MINMAX reduction PHI."<< *Phi <<"\n");
continue;
}
if (AddReductionVar(Phi, RK_FloatMult)) {
DEBUG(dbgs() << "LV: Found an FMult reduction PHI."<< *Phi <<"\n");
continue;
}
if (AddReductionVar(Phi, RK_FloatAdd)) {
DEBUG(dbgs() << "LV: Found an FAdd reduction PHI."<< *Phi <<"\n");
continue;
}
if (AddReductionVar(Phi, RK_FloatMinMax)) {
DEBUG(dbgs() << "LV: Found an float MINMAX reduction PHI."<< *Phi <<
"\n");
continue;
}
emitAnalysis(VectorizationReport(it) <<
"value that could not be identified as "
"reduction is used outside the loop");
DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
return false;
}
CallInst *CI = dyn_cast<CallInst>(it);
if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI) &&
!(CI->getCalledFunction() && TLI &&
TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) {
emitAnalysis(VectorizationReport(it) <<
"call instruction cannot be vectorized");
DEBUG(dbgs() << "LV: Found a non-intrinsic, non-libfunc callsite.\n");
return false;
}
if (CI &&
hasVectorInstrinsicScalarOpd(getIntrinsicIDForCall(CI, TLI), 1)) {
if (!SE->isLoopInvariant(SE->getSCEV(CI->getOperand(1)), TheLoop)) {
emitAnalysis(VectorizationReport(it)
<< "intrinsic instruction cannot be vectorized");
DEBUG(dbgs() << "LV: Found unvectorizable intrinsic " << *CI << "\n");
return false;
}
}
if ((!VectorType::isValidElementType(it->getType()) &&
!it->getType()->isVoidTy()) || isa<ExtractElementInst>(it)) {
emitAnalysis(VectorizationReport(it)
<< "instruction return type cannot be vectorized");
DEBUG(dbgs() << "LV: Found unvectorizable type.\n");
return false;
}
if (StoreInst *ST = dyn_cast<StoreInst>(it)) {
Type *T = ST->getValueOperand()->getType();
if (!VectorType::isValidElementType(T)) {
emitAnalysis(VectorizationReport(ST) <<
"store instruction cannot be vectorized");
return false;
}
if (EnableMemAccessVersioning)
collectStridedAccess(ST);
}
if (EnableMemAccessVersioning)
if (LoadInst *LI = dyn_cast<LoadInst>(it))
collectStridedAccess(LI);
if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) {
emitAnalysis(VectorizationReport(it) <<
"value cannot be used outside the loop");
return false;
}
}
}
if (!Induction) {
DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
if (Inductions.empty()) {
emitAnalysis(VectorizationReport()
<< "loop induction variable could not be identified");
return false;
}
}
return true;
}
static Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, Loop *Lp) {
GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
if (!GEP)
return Ptr;
unsigned InductionOperand = getGEPInductionOperand(GEP);
for (unsigned i = 0, e = GEP->getNumOperands(); i != e; ++i)
if (i != InductionOperand &&
!SE->isLoopInvariant(SE->getSCEV(GEP->getOperand(i)), Lp))
return Ptr;
return GEP->getOperand(InductionOperand);
}
static Value *getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty) {
Value *UniqueCast = nullptr;
for (User *U : Ptr->users()) {
CastInst *CI = dyn_cast<CastInst>(U);
if (CI && CI->getType() == Ty) {
if (!UniqueCast)
UniqueCast = CI;
else
return nullptr;
}
}
return UniqueCast;
}
static Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp) {
const PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
if (!PtrTy || PtrTy->isAggregateType())
return nullptr;
Value *OrigPtr = Ptr;
int64_t PtrAccessSize = 1;
Ptr = stripGetElementPtr(Ptr, SE, Lp);
const SCEV *V = SE->getSCEV(Ptr);
if (Ptr != OrigPtr)
while (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(V))
V = C->getOperand();
const SCEVAddRecExpr *S = dyn_cast<SCEVAddRecExpr>(V);
if (!S)
return nullptr;
V = S->getStepRecurrence(*SE);
if (!V)
return nullptr;
if (OrigPtr == Ptr) {
const DataLayout &DL = Lp->getHeader()->getModule()->getDataLayout();
DL.getTypeAllocSize(PtrTy->getElementType());
if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(V)) {
if (M->getOperand(0)->getSCEVType() != scConstant)
return nullptr;
const APInt &APStepVal =
cast<SCEVConstant>(M->getOperand(0))->getValue()->getValue();
if (APStepVal.getBitWidth() > 64)
return nullptr;
int64_t StepVal = APStepVal.getSExtValue();
if (PtrAccessSize != StepVal)
return nullptr;
V = M->getOperand(1);
}
}
Type *StripedOffRecurrenceCast = nullptr;
if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(V)) {
StripedOffRecurrenceCast = C->getType();
V = C->getOperand();
}
const SCEVUnknown *U = dyn_cast<SCEVUnknown>(V);
if (!U)
return nullptr;
Value *Stride = U->getValue();
if (!Lp->isLoopInvariant(Stride))
return nullptr;
if (StripedOffRecurrenceCast)
Stride = getUniqueCastUse(Stride, Lp, StripedOffRecurrenceCast);
return Stride;
}
void LoopVectorizationLegality::collectStridedAccess(Value *MemAccess) {
Value *Ptr = nullptr;
if (LoadInst *LI = dyn_cast<LoadInst>(MemAccess))
Ptr = LI->getPointerOperand();
else if (StoreInst *SI = dyn_cast<StoreInst>(MemAccess))
Ptr = SI->getPointerOperand();
else
return;
Value *Stride = getStrideFromPointer(Ptr, SE, TheLoop);
if (!Stride)
return;
DEBUG(dbgs() << "LV: Found a strided access that we can version");
DEBUG(dbgs() << " Ptr: " << *Ptr << " Stride: " << *Stride << "\n");
Strides[Ptr] = Stride;
StrideSet.insert(Stride);
}
void LoopVectorizationLegality::collectLoopUniforms() {
std::vector<Value*> Worklist;
BasicBlock *Latch = TheLoop->getLoopLatch();
Worklist.push_back(Latch->getTerminator()->getOperand(0));
for (Loop::block_iterator B = TheLoop->block_begin(),
BE = TheLoop->block_end(); B != BE; ++B)
for (BasicBlock::iterator I = (*B)->begin(), IE = (*B)->end();
I != IE; ++I)
if (I->getType()->isPointerTy() && isConsecutivePtr(I))
Worklist.insert(Worklist.end(), I->op_begin(), I->op_end());
while (!Worklist.empty()) {
Instruction *I = dyn_cast<Instruction>(Worklist.back());
Worklist.pop_back();
if (!I || !TheLoop->contains(I) || isa<PHINode>(I))
continue;
Uniforms.insert(I);
Worklist.insert(Worklist.end(), I->op_begin(), I->op_end());
}
}
bool LoopVectorizationLegality::canVectorizeMemory() {
LAI = &LAA->getInfo(TheLoop, Strides);
auto &OptionalReport = LAI->getReport();
if (OptionalReport)
emitAnalysis(VectorizationReport(*OptionalReport));
if (!LAI->canVectorizeMemory())
return false;
if (LAI->getNumRuntimePointerChecks() >
VectorizerParams::RuntimeMemoryCheckThreshold) {
emitAnalysis(VectorizationReport()
<< LAI->getNumRuntimePointerChecks() << " exceeds limit of "
<< VectorizerParams::RuntimeMemoryCheckThreshold
<< " dependent memory operations checked at runtime");
DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
return false;
}
return true;
}
static bool hasMultipleUsesOf(Instruction *I,
SmallPtrSetImpl<Instruction *> &Insts) {
unsigned NumUses = 0;
for(User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use) {
if (Insts.count(dyn_cast<Instruction>(*Use)))
++NumUses;
if (NumUses > 1)
return true;
}
return false;
}
static bool areAllUsesIn(Instruction *I, SmallPtrSetImpl<Instruction *> &Set) {
for(User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use)
if (!Set.count(dyn_cast<Instruction>(*Use)))
return false;
return true;
}
bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
ReductionKind Kind) {
if (Phi->getNumIncomingValues() != 2)
return false;
if (Phi->getParent() != TheLoop->getHeader())
return false;
Value *RdxStart = Phi->getIncomingValueForBlock(TheLoop->getLoopPreheader());
Instruction *ExitInstruction = nullptr;
bool FoundReduxOp = false;
bool FoundStartPHI = false;
unsigned NumCmpSelectPatternInst = 0;
ReductionInstDesc ReduxDesc(false, nullptr);
SmallPtrSet<Instruction *, 8> VisitedInsts;
SmallVector<Instruction *, 8> Worklist;
Worklist.push_back(Phi);
VisitedInsts.insert(Phi);
while (!Worklist.empty()) {
Instruction *Cur = Worklist.back();
Worklist.pop_back();
if (Cur->use_empty())
return false;
bool IsAPhi = isa<PHINode>(Cur);
if (Cur != Phi && IsAPhi && Cur->getParent() == Phi->getParent())
return false;
if (!Cur->isCommutative() && !IsAPhi && !isa<SelectInst>(Cur) &&
!isa<ICmpInst>(Cur) && !isa<FCmpInst>(Cur) &&
!VisitedInsts.count(dyn_cast<Instruction>(Cur->getOperand(0))))
return false;
ReduxDesc = isReductionInstr(Cur, Kind, ReduxDesc);
if (!ReduxDesc.IsReduction)
return false;
if (!IsAPhi && Kind != RK_IntegerMinMax && Kind != RK_FloatMinMax &&
hasMultipleUsesOf(Cur, VisitedInsts))
return false;
if(IsAPhi && Cur != Phi && !areAllUsesIn(Cur, VisitedInsts))
return false;
if (Kind == RK_IntegerMinMax && (isa<ICmpInst>(Cur) ||
isa<SelectInst>(Cur)))
++NumCmpSelectPatternInst;
if (Kind == RK_FloatMinMax && (isa<FCmpInst>(Cur) ||
isa<SelectInst>(Cur)))
++NumCmpSelectPatternInst;
FoundReduxOp |= !IsAPhi;
SmallVector<Instruction *, 8> NonPHIs;
SmallVector<Instruction *, 8> PHIs;
for (User *U : Cur->users()) {
Instruction *UI = cast<Instruction>(U);
BasicBlock *Parent = UI->getParent();
if (!TheLoop->contains(Parent)) {
if (ExitInstruction != nullptr || Cur == Phi)
return false;
if (std::find(Phi->op_begin(), Phi->op_end(), Cur) == Phi->op_end())
return false;
ExitInstruction = Cur;
continue;
}
ReductionInstDesc IgnoredVal(false, nullptr);
if (VisitedInsts.insert(UI).second) {
if (isa<PHINode>(UI))
PHIs.push_back(UI);
else
NonPHIs.push_back(UI);
} else if (!isa<PHINode>(UI) &&
((!isa<FCmpInst>(UI) &&
!isa<ICmpInst>(UI) &&
!isa<SelectInst>(UI)) ||
!isMinMaxSelectCmpPattern(UI, IgnoredVal).IsReduction))
return false;
if (UI == Phi)
FoundStartPHI = true;
}
Worklist.append(PHIs.begin(), PHIs.end());
Worklist.append(NonPHIs.begin(), NonPHIs.end());
}
if ((Kind == RK_IntegerMinMax || Kind == RK_FloatMinMax) &&
NumCmpSelectPatternInst != 2)
return false;
if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction)
return false;
AllowedExit.insert(ExitInstruction);
ReductionDescriptor RD(RdxStart, ExitInstruction, Kind,
ReduxDesc.MinMaxKind);
Reductions[Phi] = RD;
return true;
}
LoopVectorizationLegality::ReductionInstDesc
LoopVectorizationLegality::isMinMaxSelectCmpPattern(Instruction *I,
ReductionInstDesc &Prev) {
assert((isa<ICmpInst>(I) || isa<FCmpInst>(I) || isa<SelectInst>(I)) &&
"Expect a select instruction");
Instruction *Cmp = nullptr;
SelectInst *Select = nullptr;
if ((Cmp = dyn_cast<ICmpInst>(I)) || (Cmp = dyn_cast<FCmpInst>(I))) {
if (!Cmp->hasOneUse() || !(Select = dyn_cast<SelectInst>(*I->user_begin())))
return ReductionInstDesc(false, I);
return ReductionInstDesc(Select, Prev.MinMaxKind);
}
if (!(Select = dyn_cast<SelectInst>(I)))
return ReductionInstDesc(false, I);
if (!(Cmp = dyn_cast<ICmpInst>(I->getOperand(0))) &&
!(Cmp = dyn_cast<FCmpInst>(I->getOperand(0))))
return ReductionInstDesc(false, I);
if (!Cmp->hasOneUse())
return ReductionInstDesc(false, I);
Value *CmpLeft;
Value *CmpRight;
if (m_UMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
return ReductionInstDesc(Select, MRK_UIntMin);
else if (m_UMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
return ReductionInstDesc(Select, MRK_UIntMax);
else if (m_SMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
return ReductionInstDesc(Select, MRK_SIntMax);
else if (m_SMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
return ReductionInstDesc(Select, MRK_SIntMin);
else if (m_OrdFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
return ReductionInstDesc(Select, MRK_FloatMin);
else if (m_OrdFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
return ReductionInstDesc(Select, MRK_FloatMax);
else if (m_UnordFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
return ReductionInstDesc(Select, MRK_FloatMin);
else if (m_UnordFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
return ReductionInstDesc(Select, MRK_FloatMax);
return ReductionInstDesc(false, I);
}
LoopVectorizationLegality::ReductionInstDesc
LoopVectorizationLegality::isReductionInstr(Instruction *I,
ReductionKind Kind,
ReductionInstDesc &Prev) {
bool FP = I->getType()->isFloatingPointTy();
bool FastMath = FP && I->hasUnsafeAlgebra();
switch (I->getOpcode()) {
default:
return ReductionInstDesc(false, I);
case Instruction::PHI:
if (FP && (Kind != RK_FloatMult && Kind != RK_FloatAdd &&
Kind != RK_FloatMinMax))
return ReductionInstDesc(false, I);
return ReductionInstDesc(I, Prev.MinMaxKind);
case Instruction::Sub:
case Instruction::Add:
return ReductionInstDesc(Kind == RK_IntegerAdd, I);
case Instruction::Mul:
return ReductionInstDesc(Kind == RK_IntegerMult, I);
case Instruction::And:
return ReductionInstDesc(Kind == RK_IntegerAnd, I);
case Instruction::Or:
return ReductionInstDesc(Kind == RK_IntegerOr, I);
case Instruction::Xor:
return ReductionInstDesc(Kind == RK_IntegerXor, I);
case Instruction::FMul:
return ReductionInstDesc(Kind == RK_FloatMult && FastMath, I);
case Instruction::FSub:
case Instruction::FAdd:
return ReductionInstDesc(Kind == RK_FloatAdd && FastMath, I);
case Instruction::FCmp:
case Instruction::ICmp:
case Instruction::Select:
if (Kind != RK_IntegerMinMax &&
(!HasFunNoNaNAttr || Kind != RK_FloatMinMax))
return ReductionInstDesc(false, I);
return isMinMaxSelectCmpPattern(I, Prev);
}
}
LoopVectorizationLegality::InductionKind
LoopVectorizationLegality::isInductionVariable(PHINode *Phi,
ConstantInt *&StepValue) {
Type *PhiTy = Phi->getType();
if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy())
return IK_NoInduction;
const SCEV *PhiScev = SE->getSCEV(Phi);
const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
if (!AR) {
DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
return IK_NoInduction;
}
const SCEV *Step = AR->getStepRecurrence(*SE);
const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
if (!C)
return IK_NoInduction;
ConstantInt *CV = C->getValue();
if (PhiTy->isIntegerTy()) {
StepValue = CV;
return IK_IntInduction;
}
assert(PhiTy->isPointerTy() && "The PHI must be a pointer");
Type *PointerElementType = PhiTy->getPointerElementType();
if (!PointerElementType->isSized())
return IK_NoInduction;
const DataLayout &DL = Phi->getModule()->getDataLayout();
int64_t Size = static_cast<int64_t>(DL.getTypeAllocSize(PointerElementType));
if (!Size)
return IK_NoInduction;
int64_t CVSize = CV->getSExtValue();
if (CVSize % Size)
return IK_NoInduction;
StepValue = ConstantInt::getSigned(CV->getType(), CVSize / Size);
return IK_PtrInduction;
}
bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
Value *In0 = const_cast<Value*>(V);
PHINode *PN = dyn_cast_or_null<PHINode>(In0);
if (!PN)
return false;
return Inductions.count(PN);
}
bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
}
bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB,
SmallPtrSetImpl<Value *> &SafePtrs) {
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
for (Instruction::op_iterator OI = it->op_begin(), OE = it->op_end();
OI != OE; ++OI) {
if (Constant *C = dyn_cast<Constant>(*OI))
if (C->canTrap())
return false;
}
if (it->mayReadFromMemory()) {
LoadInst *LI = dyn_cast<LoadInst>(it);
if (!LI)
return false;
if (!SafePtrs.count(LI->getPointerOperand())) {
if (isLegalMaskedLoad(LI->getType(), LI->getPointerOperand())) {
MaskedOp.insert(LI);
continue;
}
return false;
}
}
if (it->mayWriteToMemory()) {
StoreInst *SI = dyn_cast<StoreInst>(it);
if (!SI)
return false;
bool isSafePtr = (SafePtrs.count(SI->getPointerOperand()) != 0);
bool isSinglePredecessor = SI->getParent()->getSinglePredecessor();
if (++NumPredStores > NumberOfStoresToPredicate || !isSafePtr ||
!isSinglePredecessor) {
bool isLegalMaskedOp =
isLegalMaskedStore(SI->getValueOperand()->getType(),
SI->getPointerOperand());
if (isLegalMaskedOp) {
--NumPredStores;
MaskedOp.insert(SI);
continue;
}
return false;
}
}
if (it->mayThrow())
return false;
switch (it->getOpcode()) {
default: continue;
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::URem:
case Instruction::SRem:
return false;
}
}
return true;
}
LoopVectorizationCostModel::VectorizationFactor
LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
VectorizationFactor Factor = { 1U, 0U };
if (OptForSize && Legal->getRuntimePointerCheck()->Need) {
emitAnalysis(VectorizationReport() <<
"runtime pointer checks needed. Enable vectorization of this "
"loop with '#pragma clang loop vectorize(enable)' when "
"compiling with -Os");
DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n");
return Factor;
}
if (!EnableCondStoresVectorization && Legal->getNumPredStores()) {
emitAnalysis(VectorizationReport() <<
"store that is conditionally executed prevents vectorization");
DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
return Factor;
}
unsigned TC = SE->getSmallConstantTripCount(TheLoop);
DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
unsigned WidestType = getWidestType();
unsigned WidestRegister = TTI.getRegisterBitWidth(true);
unsigned MaxSafeDepDist = -1U;
if (Legal->getMaxSafeDepDistBytes() != -1U)
MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
WidestRegister = ((WidestRegister < MaxSafeDepDist) ?
WidestRegister : MaxSafeDepDist);
unsigned MaxVectorSize = WidestRegister / WidestType;
DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n");
DEBUG(dbgs() << "LV: The Widest register is: "
<< WidestRegister << " bits.\n");
if (MaxVectorSize == 0) {
DEBUG(dbgs() << "LV: The target has no vector registers.\n");
MaxVectorSize = 1;
}
assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements"
" into one vector!");
unsigned VF = MaxVectorSize;
if (OptForSize) {
if (TC < 2) {
emitAnalysis
(VectorizationReport() <<
"unable to calculate the loop count due to complex control flow");
DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n");
return Factor;
}
VF = TC % MaxVectorSize;
if (VF == 0)
VF = MaxVectorSize;
if (VF < 2) {
emitAnalysis(VectorizationReport() <<
"cannot optimize for size and vectorize at the "
"same time. Enable vectorization of this loop "
"with '#pragma clang loop vectorize(enable)' "
"when compiling with -Os");
DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n");
return Factor;
}
}
int UserVF = Hints->getWidth();
if (UserVF != 0) {
assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
Factor.Width = UserVF;
return Factor;
}
float Cost = expectedCost(1);
#ifndef NDEBUG
const float ScalarCost = Cost;
#endif
unsigned Width = 1;
DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
if (ForceVectorization && VF > 1) {
Width = 2;
Cost = expectedCost(Width) / (float)Width;
}
for (unsigned i=2; i <= VF; i*=2) {
float VectorCost = expectedCost(i) / (float)i;
DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " <<
(int)VectorCost << ".\n");
if (VectorCost < Cost) {
Cost = VectorCost;
Width = i;
}
}
DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
<< "LV: Vectorization seems to be not beneficial, "
<< "but was forced by a user.\n");
DEBUG(dbgs() << "LV: Selecting VF: "<< Width << ".\n");
Factor.Width = Width;
Factor.Cost = Width * Cost;
return Factor;
}
unsigned LoopVectorizationCostModel::getWidestType() {
unsigned MaxWidth = 8;
const DataLayout &DL = TheFunction->getParent()->getDataLayout();
for (Loop::block_iterator bb = TheLoop->block_begin(),
be = TheLoop->block_end(); bb != be; ++bb) {
BasicBlock *BB = *bb;
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
Type *T = it->getType();
if (EphValues.count(it))
continue;
if (!isa<LoadInst>(it) && !isa<StoreInst>(it) && !isa<PHINode>(it))
continue;
if (PHINode *PN = dyn_cast<PHINode>(it))
if (!Legal->getReductionVars()->count(PN))
continue;
if (StoreInst *ST = dyn_cast<StoreInst>(it))
T = ST->getValueOperand()->getType();
if (T->isPointerTy() && !isConsecutiveLoadOrStore(it))
continue;
MaxWidth = std::max(MaxWidth,
(unsigned)DL.getTypeSizeInBits(T->getScalarType()));
}
}
return MaxWidth;
}
unsigned
LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
unsigned VF,
unsigned LoopCost) {
int UserUF = Hints->getInterleave();
if (UserUF != 0)
return UserUF;
if (OptForSize)
return 1;
if (Legal->getMaxSafeDepDistBytes() != -1U)
return 1;
unsigned TC = SE->getSmallConstantTripCount(TheLoop);
if (TC > 1 && TC < TinyTripCountUnrollThreshold)
return 1;
unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters <<
" registers\n");
if (VF == 1) {
if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
TargetNumRegisters = ForceTargetNumScalarRegs;
} else {
if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
TargetNumRegisters = ForceTargetNumVectorRegs;
}
LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage();
R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
R.NumInstructions = std::max(R.NumInstructions, 1U);
unsigned UF = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
R.MaxLocalUsers);
if (EnableIndVarRegisterHeur)
UF = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
std::max(1U, (R.MaxLocalUsers - 1)));
unsigned MaxInterleaveSize = TTI.getMaxInterleaveFactor();
if (VF == 1) {
if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
MaxInterleaveSize = ForceTargetMaxScalarInterleaveFactor;
} else {
if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
MaxInterleaveSize = ForceTargetMaxVectorInterleaveFactor;
}
if (LoopCost == 0)
LoopCost = expectedCost(VF);
if (UF > MaxInterleaveSize)
UF = MaxInterleaveSize;
else if (UF < 1)
UF = 1;
if (VF > 1 && Legal->getReductionVars()->size()) {
DEBUG(dbgs() << "LV: Unrolling because of reductions.\n");
return UF;
}
bool UnrollingRequiresRuntimePointerCheck =
(VF == 1 && Legal->getRuntimePointerCheck()->Need);
DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
if (!UnrollingRequiresRuntimePointerCheck &&
LoopCost < SmallLoopCost) {
unsigned SmallUF = std::min(UF, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
unsigned NumStores = Legal->getNumStores();
unsigned NumLoads = Legal->getNumLoads();
unsigned StoresUF = UF / (NumStores ? NumStores : 1);
unsigned LoadsUF = UF / (NumLoads ? NumLoads : 1);
if (Legal->getReductionVars()->size() &&
TheLoop->getLoopDepth() > 1) {
unsigned F = static_cast<unsigned>(MaxNestedScalarReductionUF);
SmallUF = std::min(SmallUF, F);
StoresUF = std::min(StoresUF, F);
LoadsUF = std::min(LoadsUF, F);
}
if (EnableLoadStoreRuntimeUnroll && std::max(StoresUF, LoadsUF) > SmallUF) {
DEBUG(dbgs() << "LV: Unrolling to saturate store or load ports.\n");
return std::max(StoresUF, LoadsUF);
}
DEBUG(dbgs() << "LV: Unrolling to reduce branch cost.\n");
return SmallUF;
}
DEBUG(dbgs() << "LV: Not Unrolling.\n");
return 1;
}
LoopVectorizationCostModel::RegisterUsage
LoopVectorizationCostModel::calculateRegisterUsage() {
LoopBlocksDFS DFS(TheLoop);
DFS.perform(LI);
RegisterUsage R;
R.NumInstructions = 0;
typedef DenseMap<Instruction*, unsigned> IntervalMap;
DenseMap<unsigned, Instruction*> IdxToInstr;
IntervalMap EndPoint;
SmallSet<Instruction*, 8> Ends;
SmallPtrSet<Value*, 8> LoopInvariants;
unsigned Index = 0;
for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
be = DFS.endRPO(); bb != be; ++bb) {
R.NumInstructions += (*bb)->size();
for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
++it) {
Instruction *I = it;
IdxToInstr[Index++] = I;
for (unsigned i = 0; i < I->getNumOperands(); ++i) {
Value *U = I->getOperand(i);
Instruction *Instr = dyn_cast<Instruction>(U);
if (!Instr) continue;
if (!TheLoop->contains(Instr)) {
LoopInvariants.insert(Instr);
continue;
}
EndPoint[Instr] = Index;
Ends.insert(Instr);
}
}
}
typedef SmallVector<Instruction*, 2> InstrList;
DenseMap<unsigned, InstrList> TransposeEnds;
for (IntervalMap::iterator it = EndPoint.begin(), e = EndPoint.end();
it != e; ++it)
TransposeEnds[it->second].push_back(it->first);
SmallSet<Instruction*, 8> OpenIntervals;
unsigned MaxUsage = 0;
DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
for (unsigned int i = 0; i < Index; ++i) {
Instruction *I = IdxToInstr[i];
if (!Ends.count(I)) continue;
if (EphValues.count(I))
continue;
InstrList &List = TransposeEnds[i];
for (unsigned int j=0, e = List.size(); j < e; ++j)
OpenIntervals.erase(List[j]);
MaxUsage = std::max(MaxUsage, OpenIntervals.size());
DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " <<
OpenIntervals.size() << '\n');
OpenIntervals.insert(I);
}
unsigned Invariant = LoopInvariants.size();
DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << '\n');
DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n');
DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << '\n');
R.LoopInvariantRegs = Invariant;
R.MaxLocalUsers = MaxUsage;
return R;
}
unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
unsigned Cost = 0;
for (Loop::block_iterator bb = TheLoop->block_begin(),
be = TheLoop->block_end(); bb != be; ++bb) {
unsigned BlockCost = 0;
BasicBlock *BB = *bb;
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
if (isa<DbgInfoIntrinsic>(it))
continue;
if (EphValues.count(it))
continue;
unsigned C = getInstructionCost(it, VF);
if (ForceTargetInstructionCost.getNumOccurrences() > 0)
C = ForceTargetInstructionCost;
BlockCost += C;
DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF " <<
VF << " For instruction: " << *it << '\n');
}
if (VF == 1 && Legal->blockNeedsPredication(*bb))
BlockCost /= 2;
Cost += BlockCost;
}
return Cost;
}
static bool isLikelyComplexAddressComputation(Value *Ptr,
LoopVectorizationLegality *Legal,
ScalarEvolution *SE,
const Loop *TheLoop) {
GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
if (!Gep)
return true;
unsigned NumOperands = Gep->getNumOperands();
for (unsigned i = 1; i < NumOperands; ++i) {
Value *Opd = Gep->getOperand(i);
if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
!Legal->isInductionVariable(Opd))
return true;
}
unsigned MaxMergeDistance = 64;
const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Ptr));
if (!AddRec)
return true;
const SCEV *Step = AddRec->getStepRecurrence(*SE);
const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
if (!C)
return true;
const APInt &APStepVal = C->getValue()->getValue();
if (APStepVal.getBitWidth() > 64)
return true;
int64_t StepVal = APStepVal.getSExtValue();
return StepVal > MaxMergeDistance;
}
static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
if (Legal->hasStride(I->getOperand(0)) || Legal->hasStride(I->getOperand(1)))
return true;
return false;
}
unsigned
LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
if (Legal->isUniformAfterVectorization(I))
VF = 1;
Type *RetTy = I->getType();
Type *VectorTy = ToVectorTy(RetTy, VF);
switch (I->getOpcode()) {
case Instruction::GetElementPtr:
return 0;
case Instruction::Br: {
return TTI.getCFInstrCost(I->getOpcode());
}
case Instruction::PHI:
return 0;
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
case Instruction::FSub:
case Instruction::Mul:
case Instruction::FMul:
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::FDiv:
case Instruction::URem:
case Instruction::SRem:
case Instruction::FRem:
case Instruction::Shl:
case Instruction::LShr:
case Instruction::AShr:
case Instruction::And:
case Instruction::Or:
case Instruction::Xor: {
if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
return 0;
TargetTransformInfo::OperandValueKind Op1VK =
TargetTransformInfo::OK_AnyValue;
TargetTransformInfo::OperandValueKind Op2VK =
TargetTransformInfo::OK_AnyValue;
TargetTransformInfo::OperandValueProperties Op1VP =
TargetTransformInfo::OP_None;
TargetTransformInfo::OperandValueProperties Op2VP =
TargetTransformInfo::OP_None;
Value *Op2 = I->getOperand(1);
if (isa<ConstantInt>(Op2)) {
ConstantInt *CInt = cast<ConstantInt>(Op2);
if (CInt && CInt->getValue().isPowerOf2())
Op2VP = TargetTransformInfo::OP_PowerOf2;
Op2VK = TargetTransformInfo::OK_UniformConstantValue;
} else if (isa<ConstantVector>(Op2) || isa<ConstantDataVector>(Op2)) {
Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
Constant *SplatValue = cast<Constant>(Op2)->getSplatValue();
if (SplatValue) {
ConstantInt *CInt = dyn_cast<ConstantInt>(SplatValue);
if (CInt && CInt->getValue().isPowerOf2())
Op2VP = TargetTransformInfo::OP_PowerOf2;
Op2VK = TargetTransformInfo::OK_UniformConstantValue;
}
}
return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK,
Op1VP, Op2VP);
}
case Instruction::Select: {
SelectInst *SI = cast<SelectInst>(I);
const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
Type *CondTy = SI->getCondition()->getType();
if (!ScalarCond)
CondTy = VectorType::get(CondTy, VF);
return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
}
case Instruction::ICmp:
case Instruction::FCmp: {
Type *ValTy = I->getOperand(0)->getType();
VectorTy = ToVectorTy(ValTy, VF);
return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy);
}
case Instruction::Store:
case Instruction::Load: {
StoreInst *SI = dyn_cast<StoreInst>(I);
LoadInst *LI = dyn_cast<LoadInst>(I);
Type *ValTy = (SI ? SI->getValueOperand()->getType() :
LI->getType());
VectorTy = ToVectorTy(ValTy, VF);
unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment();
unsigned AS = SI ? SI->getPointerAddressSpace() :
LI->getPointerAddressSpace();
Value *Ptr = SI ? SI->getPointerOperand() : LI->getPointerOperand();
if (VF == 1)
return TTI.getAddressComputationCost(VectorTy) +
TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
bool Reverse = ConsecutiveStride < 0;
const DataLayout &DL = I->getModule()->getDataLayout();
unsigned ScalarAllocatedSize = DL.getTypeAllocSize(ValTy);
unsigned VectorElementSize = DL.getTypeStoreSize(VectorTy) / VF;
if (!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) {
bool IsComplexComputation =
isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop);
unsigned Cost = 0;
Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
for (unsigned i = 0; i < VF; ++i) {
Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i);
Cost += TTI.getVectorInstrCost(SI ? Instruction::ExtractElement :
Instruction::InsertElement,
VectorTy, i);
}
Cost += VF * TTI.getAddressComputationCost(PtrTy, IsComplexComputation);
Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
Alignment, AS);
return Cost;
}
unsigned Cost = TTI.getAddressComputationCost(VectorTy);
Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
if (Reverse)
Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
VectorTy, 0);
return Cost;
}
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
case Instruction::FPToSI:
case Instruction::FPExt:
case Instruction::PtrToInt:
case Instruction::IntToPtr:
case Instruction::SIToFP:
case Instruction::UIToFP:
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::BitCast: {
if (I->getOpcode() == Instruction::Trunc &&
Legal->isInductionVariable(I->getOperand(0)))
return TTI.getCastInstrCost(I->getOpcode(), I->getType(),
I->getOperand(0)->getType());
Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
}
case Instruction::Call: {
bool NeedToScalarize;
CallInst *CI = cast<CallInst>(I);
unsigned CallCost = getVectorCallCost(CI, VF, TTI, TLI, NeedToScalarize);
if (getIntrinsicIDForCall(CI, TLI))
return std::min(CallCost, getVectorIntrinsicCost(CI, VF, TTI, TLI));
return CallCost;
}
default: {
unsigned Cost = 0;
if (!RetTy->isVoidTy() && VF != 1) {
unsigned InsCost = TTI.getVectorInstrCost(Instruction::InsertElement,
VectorTy);
unsigned ExtCost = TTI.getVectorInstrCost(Instruction::ExtractElement,
VectorTy);
Cost += VF * (InsCost + ExtCost * I->getNumOperands());
}
Cost += VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy);
return Cost;
}
}}
char LoopVectorize::ID = 0;
static const char lv_name[] = "Loop Vectorization";
INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfo)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
INITIALIZE_PASS_DEPENDENCY(LCSSA)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis)
INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
namespace llvm {
Pass *createLoopVectorizePass(bool NoUnrolling, bool AlwaysVectorize) {
return new LoopVectorize(NoUnrolling, AlwaysVectorize);
}
}
bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
if (StoreInst *ST = dyn_cast<StoreInst>(Inst))
return Legal->isConsecutivePtr(ST->getPointerOperand()) != 0;
if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
return Legal->isConsecutivePtr(LI->getPointerOperand()) != 0;
return false;
}
void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
bool IfPredicateStore) {
assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
SmallVector<VectorParts, 4> Params;
setDebugLocFromInst(Builder, Instr);
for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
Value *SrcOp = Instr->getOperand(op);
if (SrcOp == OldInduction) {
Params.push_back(getVectorValue(SrcOp));
continue;
}
Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);
if (SrcInst && OrigLoop->contains(SrcInst)) {
assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
Params.push_back(WidenMap.get(SrcInst));
} else {
VectorParts Scalars;
Scalars.append(UF, SrcOp);
Params.push_back(Scalars);
}
}
assert(Params.size() == Instr->getNumOperands() &&
"Invalid number of operands");
bool IsVoidRetTy = Instr->getType()->isVoidTy();
Value *UndefVec = IsVoidRetTy ? nullptr :
UndefValue::get(Instr->getType());
VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
Instruction *InsertPt = Builder.GetInsertPoint();
BasicBlock *IfBlock = Builder.GetInsertBlock();
BasicBlock *CondBlock = nullptr;
VectorParts Cond;
Loop *VectorLp = nullptr;
if (IfPredicateStore) {
assert(Instr->getParent()->getSinglePredecessor() &&
"Only support single predecessor blocks");
Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
Instr->getParent());
VectorLp = LI->getLoopFor(IfBlock);
assert(VectorLp && "Must have a loop for this block");
}
for (unsigned Part = 0; Part < UF; ++Part) {
Value *Cmp = nullptr;
if (IfPredicateStore) {
if (Cond[Part]->getType()->isVectorTy())
Cond[Part] =
Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0));
Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cond[Part],
ConstantInt::get(Cond[Part]->getType(), 1));
CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
LoopVectorBody.push_back(CondBlock);
VectorLp->addBasicBlockToLoop(CondBlock, *LI);
Builder.SetInsertPoint(InsertPt);
}
Instruction *Cloned = Instr->clone();
if (!IsVoidRetTy)
Cloned->setName(Instr->getName() + ".cloned");
for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
Value *Op = Params[op][Part];
Cloned->setOperand(op, Op);
}
Builder.Insert(Cloned);
if (!IsVoidRetTy)
VecResults[Part] = Cloned;
if (IfPredicateStore) {
BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
LoopVectorBody.push_back(NewIfBlock);
VectorLp->addBasicBlockToLoop(NewIfBlock, *LI);
Builder.SetInsertPoint(InsertPt);
Instruction *OldBr = IfBlock->getTerminator();
BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
OldBr->eraseFromParent();
IfBlock = NewIfBlock;
}
}
}
void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) {
StoreInst *SI = dyn_cast<StoreInst>(Instr);
bool IfPredicateStore = (SI && Legal->blockNeedsPredication(SI->getParent()));
return scalarizeInstruction(Instr, IfPredicateStore);
}
Value *InnerLoopUnroller::reverseVector(Value *Vec) {
return Vec;
}
Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) {
return V;
}
Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step) {
Type *ITy = Val->getType();
assert(!ITy->isVectorTy() && "Val must be a scalar");
Constant *C = ConstantInt::get(ITy, StartIdx);
return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
}