#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME
#include "llvm/Transforms/Vectorize.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AliasSetTracker.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/Verifier.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/PatternMatch.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/ValueHandle.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <map>
using namespace llvm;
using namespace llvm::PatternMatch;
static cl::opt<unsigned>
VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
cl::desc("Sets the SIMD width. Zero is autoselect."));
static cl::opt<unsigned>
VectorizationUnroll("force-vector-unroll", cl::init(0), cl::Hidden,
cl::desc("Sets the vectorization unroll count. "
"Zero is autoselect."));
static cl::opt<bool>
EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
cl::desc("Enable if-conversion during vectorization."));
static const unsigned TinyTripCountVectorThreshold = 16;
static const unsigned TinyTripCountUnrollThreshold = 128;
static const unsigned RuntimeMemoryCheckThreshold = 8;
static const char*
AlreadyVectorizedMDName = "llvm.vectorizer.already_vectorized";
namespace {
class LoopVectorizationLegality;
class LoopVectorizationCostModel;
class InnerLoopVectorizer {
public:
InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
DominatorTree *DT, DataLayout *DL,
const TargetLibraryInfo *TLI, unsigned VecWidth,
unsigned UnrollFactor)
: OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), DL(DL), TLI(TLI),
VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()), Induction(0),
OldInduction(0), WidenMap(UnrollFactor) {}
void vectorize(LoopVectorizationLegality *Legal) {
createEmptyLoop(Legal);
vectorizeLoop(Legal);
updateAnalysis();
}
private:
typedef SmallVector<PHINode*, 4> PhiVector;
typedef SmallVector<Value*, 2> VectorParts;
Instruction *addRuntimeCheck(LoopVectorizationLegality *Legal,
Instruction *Loc);
void createEmptyLoop(LoopVectorizationLegality *Legal);
void vectorizeLoop(LoopVectorizationLegality *Legal);
VectorParts createBlockInMask(BasicBlock *BB);
VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
void vectorizeBlockInLoop(LoopVectorizationLegality *Legal, BasicBlock *BB,
PhiVector *PV);
void updateAnalysis();
void scalarizeInstruction(Instruction *Instr);
void vectorizeMemoryInstruction(Instruction *Instr,
LoopVectorizationLegality *Legal);
Value *getBroadcastInstrs(Value *V);
Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate);
VectorParts &getVectorValue(Value *V);
Value *reverseVector(Value *Vec);
struct ValueMap {
ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {}
bool has(Value *Key) { return MapStoreage.count(Key); }
VectorParts &splat(Value *Key, Value *Val) {
MapStoreage[Key].clear();
MapStoreage[Key].append(UF, Val);
return MapStoreage[Key];
}
VectorParts &get(Value *Key) {
if (!has(Key))
MapStoreage[Key].resize(UF);
return MapStoreage[Key];
}
unsigned UF;
std::map<Value*, VectorParts> MapStoreage;
};
Loop *OrigLoop;
ScalarEvolution *SE;
LoopInfo *LI;
DominatorTree *DT;
DataLayout *DL;
const TargetLibraryInfo *TLI;
unsigned VF;
unsigned UF;
IRBuilder<> Builder;
BasicBlock *LoopVectorPreHeader;
BasicBlock *LoopScalarPreHeader;
BasicBlock *LoopMiddleBlock;
BasicBlock *LoopExitBlock;
BasicBlock *LoopVectorBody;
BasicBlock *LoopScalarBody;
SmallVector<BasicBlock *, 4> LoopBypassBlocks;
PHINode *Induction;
PHINode *OldInduction;
Value *ExtendedIdx;
ValueMap WidenMap;
};
class LoopVectorizationLegality {
public:
LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DataLayout *DL,
DominatorTree *DT, TargetTransformInfo* TTI,
AliasAnalysis *AA, TargetLibraryInfo *TLI)
: TheLoop(L), SE(SE), DL(DL), DT(DT), TTI(TTI), AA(AA), TLI(TLI),
Induction(0), WidestIndTy(0) {}
enum ReductionKind {
RK_NoReduction, RK_IntegerAdd, RK_IntegerMult, RK_IntegerOr, RK_IntegerAnd, RK_IntegerXor, RK_IntegerMinMax, RK_FloatAdd, RK_FloatMult };
enum InductionKind {
IK_NoInduction, IK_IntInduction, IK_ReverseIntInduction, IK_PtrInduction, IK_ReversePtrInduction };
enum MinMaxReductionKind {
MRK_Invalid,
MRK_UIntMin,
MRK_UIntMax,
MRK_SIntMin,
MRK_SIntMax
};
struct ReductionDescriptor {
ReductionDescriptor() : StartValue(0), LoopExitInstr(0),
Kind(RK_NoReduction), MinMaxKind(MRK_Invalid) {}
ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K,
MinMaxReductionKind MK)
: StartValue(Start), LoopExitInstr(Exit), Kind(K), MinMaxKind(MK) {}
TrackingVH<Value> StartValue;
Instruction *LoopExitInstr;
ReductionKind Kind;
MinMaxReductionKind MinMaxKind;
};
struct ReductionInstDesc {
ReductionInstDesc(bool IsRedux, Instruction *I) :
IsReduction(IsRedux), PatternLastInst(I), MinMaxKind(MRK_Invalid) {}
ReductionInstDesc(Instruction *I, MinMaxReductionKind K) :
IsReduction(true), PatternLastInst(I), MinMaxKind(K) {}
bool IsReduction;
Instruction *PatternLastInst;
MinMaxReductionKind MinMaxKind;
};
struct RuntimePointerCheck {
RuntimePointerCheck() : Need(false) {}
void reset() {
Need = false;
Pointers.clear();
Starts.clear();
Ends.clear();
}
void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr);
bool Need;
SmallVector<TrackingVH<Value>, 2> Pointers;
SmallVector<const SCEV*, 2> Starts;
SmallVector<const SCEV*, 2> Ends;
SmallVector<bool, 2> IsWritePtr;
};
struct InductionInfo {
InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {}
InductionInfo() : StartValue(0), IK(IK_NoInduction) {}
TrackingVH<Value> StartValue;
InductionKind IK;
};
typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList;
typedef MapVector<PHINode*, InductionInfo> InductionList;
typedef DenseMap<Value*, Instruction* > AliasMap;
typedef DenseMap<Value*, std::vector<Instruction*> > AliasMultiMap;
bool canVectorize();
PHINode *getInduction() { return Induction; }
ReductionList *getReductionVars() { return &Reductions; }
InductionList *getInductionVars() { return &Inductions; }
Type *getWidestInductionType() { return WidestIndTy; }
bool isInductionVariable(const Value *V);
bool blockNeedsPredication(BasicBlock *BB);
int isConsecutivePtr(Value *Ptr);
bool isUniform(Value *V);
bool isUniformAfterVectorization(Instruction* I) { return Uniforms.count(I); }
RuntimePointerCheck *getRuntimePointerCheck() { return &PtrRtCheck; }
static Constant *getReductionIdentity(ReductionKind K, Type *Tp,
MinMaxReductionKind MinMaxK);
private:
bool canVectorizeInstrs();
bool canVectorizeMemory();
bool canVectorizeWithIfConvert();
void collectLoopUniforms();
bool blockCanBePredicated(BasicBlock *BB);
bool AddReductionVar(PHINode *Phi, ReductionKind Kind);
ReductionInstDesc isReductionInstr(Instruction *I, ReductionKind Kind,
ReductionInstDesc &Desc);
static ReductionInstDesc isMinMaxSelectCmpPattern(Instruction *I,
ReductionInstDesc &Prev);
InductionKind isInductionVariable(PHINode *Phi);
bool hasComputableBounds(Value *Ptr);
bool hasPossibleGlobalWriteReorder(Value *Object,
Instruction *Inst,
AliasMultiMap &WriteObjects,
unsigned MaxByteWidth);
AliasAnalysis::Location getLoadStoreLocation(Instruction *Inst);
Loop *TheLoop;
ScalarEvolution *SE;
DataLayout *DL;
DominatorTree *DT;
TargetTransformInfo *TTI;
AliasAnalysis *AA;
TargetLibraryInfo *TLI;
PHINode *Induction;
ReductionList Reductions;
InductionList Inductions;
Type *WidestIndTy;
SmallPtrSet<Value*, 4> AllowedExit;
SmallPtrSet<Instruction*, 4> Uniforms;
RuntimePointerCheck PtrRtCheck;
};
class LoopVectorizationCostModel {
public:
LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI,
LoopVectorizationLegality *Legal,
const TargetTransformInfo &TTI,
DataLayout *DL, const TargetLibraryInfo *TLI)
: TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), DL(DL), TLI(TLI) {}
struct VectorizationFactor {
unsigned Width; unsigned Cost; };
VectorizationFactor selectVectorizationFactor(bool OptForSize, unsigned UserVF);
unsigned getWidestType();
unsigned selectUnrollFactor(bool OptForSize, unsigned UserUF, unsigned VF,
unsigned LoopCost);
struct RegisterUsage {
unsigned LoopInvariantRegs;
unsigned MaxLocalUsers;
unsigned NumInstructions;
};
RegisterUsage calculateRegisterUsage();
private:
unsigned expectedCost(unsigned VF);
unsigned getInstructionCost(Instruction *I, unsigned VF);
static Type* ToVectorTy(Type *Scalar, unsigned VF);
bool isConsecutiveLoadOrStore(Instruction *I);
Loop *TheLoop;
ScalarEvolution *SE;
LoopInfo *LI;
LoopVectorizationLegality *Legal;
const TargetTransformInfo &TTI;
DataLayout *DL;
const TargetLibraryInfo *TLI;
};
struct LoopVectorize : public LoopPass {
static char ID;
explicit LoopVectorize() : LoopPass(ID) {
initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
}
ScalarEvolution *SE;
DataLayout *DL;
LoopInfo *LI;
TargetTransformInfo *TTI;
DominatorTree *DT;
AliasAnalysis *AA;
TargetLibraryInfo *TLI;
virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
if (!L->empty())
return false;
SE = &getAnalysis<ScalarEvolution>();
DL = getAnalysisIfAvailable<DataLayout>();
LI = &getAnalysis<LoopInfo>();
TTI = &getAnalysis<TargetTransformInfo>();
DT = &getAnalysis<DominatorTree>();
AA = getAnalysisIfAvailable<AliasAnalysis>();
TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
if (DL == NULL) {
DEBUG(dbgs() << "LV: Not vectorizing because of missing data layout");
return false;
}
DEBUG(dbgs() << "LV: Checking a loop in \"" <<
L->getHeader()->getParent()->getName() << "\"\n");
LoopVectorizationLegality LVL(L, SE, DL, DT, TTI, AA, TLI);
if (!LVL.canVectorize()) {
DEBUG(dbgs() << "LV: Not vectorizing.\n");
return false;
}
LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, DL, TLI);
Function *F = L->getHeader()->getParent();
Attribute::AttrKind SzAttr = Attribute::OptimizeForSize;
Attribute::AttrKind FlAttr = Attribute::NoImplicitFloat;
unsigned FnIndex = AttributeSet::FunctionIndex;
bool OptForSize = F->getAttributes().hasAttribute(FnIndex, SzAttr);
bool NoFloat = F->getAttributes().hasAttribute(FnIndex, FlAttr);
if (NoFloat) {
DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
"attribute is used.\n");
return false;
}
LoopVectorizationCostModel::VectorizationFactor VF;
VF = CM.selectVectorizationFactor(OptForSize, VectorizationFactor);
unsigned UF = CM.selectUnrollFactor(OptForSize, VectorizationUnroll,
VF.Width, VF.Cost);
if (VF.Width == 1) {
DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
return false;
}
DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF.Width << ") in "<<
F->getParent()->getModuleIdentifier()<<"\n");
DEBUG(dbgs() << "LV: Unroll Factor is " << UF << "\n");
InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF);
LB.vectorize(&LVL);
DEBUG(verifyFunction(*L->getHeader()->getParent()));
return true;
}
virtual void getAnalysisUsage(AnalysisUsage &AU) const {
LoopPass::getAnalysisUsage(AU);
AU.addRequiredID(LoopSimplifyID);
AU.addRequiredID(LCSSAID);
AU.addRequired<DominatorTree>();
AU.addRequired<LoopInfo>();
AU.addRequired<ScalarEvolution>();
AU.addRequired<TargetTransformInfo>();
AU.addPreserved<LoopInfo>();
AU.addPreserved<DominatorTree>();
}
};
}
void
LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE,
Loop *Lp, Value *Ptr,
bool WritePtr) {
const SCEV *Sc = SE->getSCEV(Ptr);
const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
assert(AR && "Invalid addrec expression");
const SCEV *Ex = SE->getExitCount(Lp, Lp->getLoopLatch());
const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE);
Pointers.push_back(Ptr);
Starts.push_back(AR->getStart());
Ends.push_back(ScEnd);
IsWritePtr.push_back(WritePtr);
}
Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
Instruction *Loc = Builder.GetInsertPoint();
Instruction *Instr = dyn_cast<Instruction>(V);
bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody);
bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;
if (Invariant)
Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
if (Invariant)
Builder.SetInsertPoint(Loc);
return Shuf;
}
Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, int StartIdx,
bool Negate) {
assert(Val->getType()->isVectorTy() && "Must be a vector");
assert(Val->getType()->getScalarType()->isIntegerTy() &&
"Elem must be an integer");
Type *ITy = Val->getType()->getScalarType();
VectorType *Ty = cast<VectorType>(Val->getType());
int VLen = Ty->getNumElements();
SmallVector<Constant*, 8> Indices;
for (int i = 0; i < VLen; ++i) {
int64_t Idx = Negate ? (-i) : i;
Indices.push_back(ConstantInt::get(ITy, StartIdx + Idx, Negate));
}
Constant *Cv = ConstantVector::get(Indices);
assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
return Builder.CreateAdd(Val, Cv, "induction");
}
int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
assert(Ptr->getType()->isPointerTy() && "Unexpected non ptr");
if (cast<PointerType>(Ptr->getType())->getElementType()->isAggregateType())
return 0;
PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr);
if (Phi && Inductions.count(Phi)) {
InductionInfo II = Inductions[Phi];
if (IK_PtrInduction == II.IK)
return 1;
else if (IK_ReversePtrInduction == II.IK)
return -1;
}
GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr);
if (!Gep)
return 0;
unsigned NumOperands = Gep->getNumOperands();
Value *LastIndex = Gep->getOperand(NumOperands - 1);
Value *GpPtr = Gep->getPointerOperand();
Phi = dyn_cast<PHINode>(GpPtr);
if (Phi && Inductions.count(Phi)) {
PointerType *GepPtrType = cast<PointerType>(GpPtr->getType());
if (GepPtrType->getElementType()->isAggregateType())
return 0;
for (unsigned i = 1; i < NumOperands; ++i)
if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
return 0;
InductionInfo II = Inductions[Phi];
if (IK_PtrInduction == II.IK)
return 1;
else if (IK_ReversePtrInduction == II.IK)
return -1;
}
for (unsigned i = 0; i < NumOperands - 1; ++i)
if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
return 0;
const SCEV *Last = SE->getSCEV(LastIndex);
if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) {
const SCEV *Step = AR->getStepRecurrence(*SE);
if (Step->isOne())
return 1;
if (Step->isAllOnesValue())
return -1;
}
return 0;
}
bool LoopVectorizationLegality::isUniform(Value *V) {
return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop));
}
InnerLoopVectorizer::VectorParts&
InnerLoopVectorizer::getVectorValue(Value *V) {
assert(V != Induction && "The new induction variable should not be used.");
assert(!V->getType()->isVectorTy() && "Can't widen a vector");
if (WidenMap.has(V))
return WidenMap.get(V);
Value *B = getBroadcastInstrs(V);
WidenMap.splat(V, B);
return WidenMap.get(V);
}
Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
assert(Vec->getType()->isVectorTy() && "Invalid type");
SmallVector<Constant*, 8> ShuffleMask;
for (unsigned i = 0; i < VF; ++i)
ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
ConstantVector::get(ShuffleMask),
"reverse");
}
void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
LoopVectorizationLegality *Legal) {
LoadInst *LI = dyn_cast<LoadInst>(Instr);
StoreInst *SI = dyn_cast<StoreInst>(Instr);
assert((LI || SI) && "Invalid Load/Store instruction");
Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType();
Type *DataTy = VectorType::get(ScalarDataTy, VF);
Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment();
unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy);
unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF;
if (ScalarAllocatedSize != VectorElementSize)
return scalarizeInstruction(Instr);
int Stride = Legal->isConsecutivePtr(Ptr);
bool Reverse = Stride < 0;
bool UniformLoad = LI && Legal->isUniform(Ptr);
if (Stride == 0 || UniformLoad)
return scalarizeInstruction(Instr);
Constant *Zero = Builder.getInt32(0);
VectorParts &Entry = WidenMap.get(Instr);
GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) {
Value *PtrOperand = Gep->getPointerOperand();
Value *FirstBasePtr = getVectorValue(PtrOperand)[0];
FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero);
GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
Gep2->setOperand(0, FirstBasePtr);
Gep2->setName("gep.indvar.base");
Ptr = Builder.Insert(Gep2);
} else if (Gep) {
assert(SE->isLoopInvariant(SE->getSCEV(Gep->getPointerOperand()),
OrigLoop) && "Base ptr must be invariant");
unsigned NumOperands = Gep->getNumOperands();
Value *LastGepOperand = Gep->getOperand(NumOperands - 1);
VectorParts &GEPParts = getVectorValue(LastGepOperand);
Value *LastIndex = GEPParts[0];
LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
Gep2->setOperand(NumOperands - 1, LastIndex);
Gep2->setName("gep.indvar.idx");
Ptr = Builder.Insert(Gep2);
} else {
assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
VectorParts &PtrVal = getVectorValue(Ptr);
Ptr = Builder.CreateExtractElement(PtrVal[0], Zero);
}
if (SI) {
assert(!Legal->isUniform(SI->getPointerOperand()) &&
"We do not allow storing to uniform addresses");
VectorParts &StoredVal = getVectorValue(SI->getValueOperand());
for (unsigned Part = 0; Part < UF; ++Part) {
Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF));
if (Reverse) {
StoredVal[Part] = reverseVector(StoredVal[Part]);
PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
}
Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo());
Builder.CreateStore(StoredVal[Part], VecPtr)->setAlignment(Alignment);
}
}
for (unsigned Part = 0; Part < UF; ++Part) {
Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF));
if (Reverse) {
PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
}
Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo());
Value *LI = Builder.CreateLoad(VecPtr, "wide.load");
cast<LoadInst>(LI)->setAlignment(Alignment);
Entry[Part] = Reverse ? reverseVector(LI) : LI;
}
}
void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
SmallVector<VectorParts, 4> Params;
for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
Value *SrcOp = Instr->getOperand(op);
if (SrcOp == OldInduction) {
Params.push_back(getVectorValue(SrcOp));
continue;
}
Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);
if (SrcInst && OrigLoop->contains(SrcInst)) {
assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
Params.push_back(WidenMap.get(SrcInst));
} else {
VectorParts Scalars;
Scalars.append(UF, SrcOp);
Params.push_back(Scalars);
}
}
assert(Params.size() == Instr->getNumOperands() &&
"Invalid number of operands");
bool IsVoidRetTy = Instr->getType()->isVoidTy();
Value *UndefVec = IsVoidRetTy ? 0 :
UndefValue::get(VectorType::get(Instr->getType(), VF));
VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
for (unsigned Part = 0; Part < UF; ++Part) {
for (unsigned Width = 0; Width < VF; ++Width) {
Instruction *Cloned = Instr->clone();
if (!IsVoidRetTy)
Cloned->setName(Instr->getName() + ".cloned");
for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
Value *Op = Params[op][Part];
if (Op->getType()->isVectorTy())
Op = Builder.CreateExtractElement(Op, Builder.getInt32(Width));
Cloned->setOperand(op, Op);
}
Builder.Insert(Cloned);
if (!IsVoidRetTy)
VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned,
Builder.getInt32(Width));
}
}
}
Instruction *
InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
Instruction *Loc) {
LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck =
Legal->getRuntimePointerCheck();
if (!PtrRtCheck->Need)
return NULL;
Instruction *MemoryRuntimeCheck = 0;
unsigned NumPointers = PtrRtCheck->Pointers.size();
SmallVector<Value* , 2> Starts;
SmallVector<Value* , 2> Ends;
SCEVExpander Exp(*SE, "induction");
Type* PtrArithTy = Type::getInt8PtrTy(Loc->getContext(), 0);
for (unsigned i = 0; i < NumPointers; ++i) {
Value *Ptr = PtrRtCheck->Pointers[i];
const SCEV *Sc = SE->getSCEV(Ptr);
if (SE->isLoopInvariant(Sc, OrigLoop)) {
DEBUG(dbgs() << "LV: Adding RT check for a loop invariant ptr:" <<
*Ptr <<"\n");
Starts.push_back(Ptr);
Ends.push_back(Ptr);
} else {
DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n");
Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], PtrArithTy, Loc);
Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc);
Starts.push_back(Start);
Ends.push_back(End);
}
}
IRBuilder<> ChkBuilder(Loc);
for (unsigned i = 0; i < NumPointers; ++i) {
for (unsigned j = i+1; j < NumPointers; ++j) {
if (!PtrRtCheck->IsWritePtr[i] && !PtrRtCheck->IsWritePtr[j])
continue;
Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy, "bc");
Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy, "bc");
Value *End0 = ChkBuilder.CreateBitCast(Ends[i], PtrArithTy, "bc");
Value *End1 = ChkBuilder.CreateBitCast(Ends[j], PtrArithTy, "bc");
Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0");
Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1");
Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict");
if (MemoryRuntimeCheck)
IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict,
"conflict.rdx");
MemoryRuntimeCheck = cast<Instruction>(IsConflict);
}
}
return MemoryRuntimeCheck;
}
void
InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
BasicBlock *OldBasicBlock = OrigLoop->getHeader();
BasicBlock *BypassBlock = OrigLoop->getLoopPreheader();
BasicBlock *ExitBlock = OrigLoop->getExitBlock();
assert(ExitBlock && "Must have an exit block");
MDNode *MD = MDNode::get(OldBasicBlock->getContext(), ArrayRef<Value*>());
OldBasicBlock->getTerminator()->setMetadata(AlreadyVectorizedMDName, MD);
OldInduction = Legal->getInduction();
Type *IdxTy = Legal->getWidestInductionType();
const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getLoopLatch());
assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");
ExitCount = SE->getAddExpr(ExitCount,
SE->getConstant(ExitCount->getType(), 1));
SCEVExpander Exp(*SE, "induction");
Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
BypassBlock->getTerminator());
Builder.SetInsertPoint(BypassBlock->getTerminator());
Value *StartIdx = ExtendedIdx = OldInduction ?
Builder.CreateZExt(OldInduction->getIncomingValueForBlock(BypassBlock),
IdxTy):
ConstantInt::get(IdxTy, 0);
assert(BypassBlock && "Invalid loop structure");
LoopBypassBlocks.push_back(BypassBlock);
BasicBlock *VectorPH =
BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
BasicBlock *VecBody =
VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
BasicBlock *MiddleBlock =
VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
BasicBlock *ScalarPH =
MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
Induction = Builder.CreatePHI(IdxTy, 2, "index");
Constant *Step = ConstantInt::get(IdxTy, VF * UF);
IRBuilder<> BypassBuilder(BypassBlock->getTerminator());
if (Count->getType() != IdxTy) {
if (ExitCount->getType()->isPointerTy())
Count = BypassBuilder.CreatePointerCast(Count, IdxTy, "ptrcnt.to.int");
else
Count = BypassBuilder.CreateZExtOrTrunc(Count, IdxTy, "cnt.cast");
}
Value *IdxEnd = BypassBuilder.CreateAdd(Count, StartIdx, "end.idx");
Value *R = BypassBuilder.CreateURem(Count, Step, "n.mod.vf");
Value *CountRoundDown = BypassBuilder.CreateSub(Count, R, "n.vec");
Value *IdxEndRoundDown = BypassBuilder.CreateAdd(CountRoundDown, StartIdx,
"end.idx.rnd.down");
Value *Cmp = BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx,
"cmp.zero");
BasicBlock *LastBypassBlock = BypassBlock;
Instruction *MemRuntimeCheck = addRuntimeCheck(Legal,
BypassBlock->getTerminator());
if (MemRuntimeCheck) {
BasicBlock *CheckBlock = BypassBlock->splitBasicBlock(MemRuntimeCheck,
"vector.memcheck");
LoopBypassBlocks.push_back(CheckBlock);
Instruction *OldTerm = BypassBlock->getTerminator();
BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm);
OldTerm->eraseFromParent();
Cmp = MemRuntimeCheck;
LastBypassBlock = CheckBlock;
}
LastBypassBlock->getTerminator()->eraseFromParent();
BranchInst::Create(MiddleBlock, VectorPH, Cmp,
LastBypassBlock);
PHINode *ResumeIndex = 0;
LoopVectorizationLegality::InductionList::iterator I, E;
LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
BypassBuilder.SetInsertPoint(LoopBypassBlocks.back()->getTerminator());
for (I = List->begin(), E = List->end(); I != E; ++I) {
PHINode *OrigPhi = I->first;
LoopVectorizationLegality::InductionInfo II = I->second;
Type *ResumeValTy = (OrigPhi == OldInduction) ? IdxTy : OrigPhi->getType();
PHINode *ResumeVal = PHINode::Create(ResumeValTy, 2, "resume.val",
MiddleBlock->getTerminator());
PHINode *TruncResumeVal = (OrigPhi == OldInduction) ?
PHINode::Create(OrigPhi->getType(), 2, "trunc.resume.val",
MiddleBlock->getTerminator()) : 0;
Value *EndValue = 0;
switch (II.IK) {
case LoopVectorizationLegality::IK_NoInduction:
llvm_unreachable("Unknown induction");
case LoopVectorizationLegality::IK_IntInduction: {
assert(OrigPhi->getType()->isIntegerTy() && "Invalid type");
if (OrigPhi == OldInduction) {
EndValue =
BypassBuilder.CreateTrunc(IdxEndRoundDown, OrigPhi->getType());
for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
TruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);
TruncResumeVal->addIncoming(EndValue, VecBody);
EndValue = IdxEndRoundDown;
ResumeIndex = ResumeVal;
break;
}
Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown,
II.StartValue->getType(),
"cast.crd");
EndValue = BypassBuilder.CreateAdd(CRD, II.StartValue , "ind.end");
break;
}
case LoopVectorizationLegality::IK_ReverseIntInduction: {
Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown,
II.StartValue->getType(),
"cast.crd");
EndValue = BypassBuilder.CreateSub(II.StartValue, CRD, "rev.ind.end");
break;
}
case LoopVectorizationLegality::IK_PtrInduction: {
EndValue = BypassBuilder.CreateGEP(II.StartValue, CountRoundDown,
"ptr.ind.end");
break;
}
case LoopVectorizationLegality::IK_ReversePtrInduction: {
Value *Zero = ConstantInt::get(CountRoundDown->getType(), 0);
Value *NegIdx = BypassBuilder.CreateSub(Zero, CountRoundDown,
"rev.ind.end");
EndValue = BypassBuilder.CreateGEP(II.StartValue, NegIdx,
"rev.ptr.ind.end");
break;
}
}
for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) {
if (OrigPhi == OldInduction)
ResumeVal->addIncoming(StartIdx, LoopBypassBlocks[I]);
else
ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);
}
ResumeVal->addIncoming(EndValue, VecBody);
unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
if (OrigPhi == OldInduction)
OrigPhi->setIncomingValue(BlockIdx, TruncResumeVal);
else
OrigPhi->setIncomingValue(BlockIdx, ResumeVal);
}
if (!OldInduction){
assert(!ResumeIndex && "Unexpected resume value found");
ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val",
MiddleBlock->getTerminator());
for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]);
ResumeIndex->addIncoming(IdxEndRoundDown, VecBody);
}
assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() &&
"Invalid resume Index");
Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd,
ResumeIndex, "cmp.n",
MiddleBlock->getTerminator());
BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator());
MiddleBlock->getTerminator()->eraseFromParent();
Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next");
Induction->addIncoming(StartIdx, VectorPH);
Induction->addIncoming(NextIdx, VecBody);
Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown);
Builder.CreateCondBr(ICmp, MiddleBlock, VecBody);
VecBody->getTerminator()->eraseFromParent();
Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
Loop* Lp = new Loop();
Loop *ParentLoop = OrigLoop->getParentLoop();
if (ParentLoop) {
ParentLoop->addChildLoop(Lp);
for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
ParentLoop->addBasicBlockToLoop(LoopBypassBlocks[I], LI->getBase());
ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase());
ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase());
ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase());
} else {
LI->addTopLevelLoop(Lp);
}
Lp->addBasicBlockToLoop(VecBody, LI->getBase());
LoopVectorPreHeader = VectorPH;
LoopScalarPreHeader = ScalarPH;
LoopMiddleBlock = MiddleBlock;
LoopExitBlock = ExitBlock;
LoopVectorBody = VecBody;
LoopScalarBody = OldBasicBlock;
}
Constant*
LoopVectorizationLegality::getReductionIdentity(ReductionKind K, Type *Tp,
MinMaxReductionKind MinMaxK) {
switch (K) {
case RK_IntegerXor:
case RK_IntegerAdd:
case RK_IntegerOr:
return ConstantInt::get(Tp, 0);
case RK_IntegerMult:
return ConstantInt::get(Tp, 1);
case RK_IntegerAnd:
return ConstantInt::get(Tp, -1, true);
case RK_FloatMult:
return ConstantFP::get(Tp, 1.0L);
case RK_FloatAdd:
return ConstantFP::get(Tp, 0.0L);
case RK_IntegerMinMax:
switch(MinMaxK) {
default: llvm_unreachable("Unknown min/max predicate");
case MRK_UIntMin:
return ConstantInt::getAllOnesValue(Tp);
case MRK_UIntMax:
return ConstantInt::get(Tp, 0);
case MRK_SIntMin: {
unsigned BitWidth = Tp->getPrimitiveSizeInBits();
return ConstantInt::get(Tp->getContext(),
APInt::getSignedMaxValue(BitWidth));
}
case LoopVectorizationLegality::MRK_SIntMax: {
unsigned BitWidth = Tp->getPrimitiveSizeInBits();
return ConstantInt::get(Tp->getContext(),
APInt::getSignedMinValue(BitWidth));
}
}
default:
llvm_unreachable("Unknown reduction kind");
}
}
static Intrinsic::ID
getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) {
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) {
switch (II->getIntrinsicID()) {
case Intrinsic::sqrt:
case Intrinsic::sin:
case Intrinsic::cos:
case Intrinsic::exp:
case Intrinsic::exp2:
case Intrinsic::log:
case Intrinsic::log10:
case Intrinsic::log2:
case Intrinsic::fabs:
case Intrinsic::floor:
case Intrinsic::ceil:
case Intrinsic::trunc:
case Intrinsic::rint:
case Intrinsic::nearbyint:
case Intrinsic::pow:
case Intrinsic::fma:
case Intrinsic::fmuladd:
return II->getIntrinsicID();
default:
return Intrinsic::not_intrinsic;
}
}
if (!TLI)
return Intrinsic::not_intrinsic;
LibFunc::Func Func;
Function *F = CI->getCalledFunction();
if (!F || !TLI->getLibFunc(F->getName(), Func))
return Intrinsic::not_intrinsic;
switch (Func) {
default:
break;
case LibFunc::sin:
case LibFunc::sinf:
case LibFunc::sinl:
return Intrinsic::sin;
case LibFunc::cos:
case LibFunc::cosf:
case LibFunc::cosl:
return Intrinsic::cos;
case LibFunc::exp:
case LibFunc::expf:
case LibFunc::expl:
return Intrinsic::exp;
case LibFunc::exp2:
case LibFunc::exp2f:
case LibFunc::exp2l:
return Intrinsic::exp2;
case LibFunc::log:
case LibFunc::logf:
case LibFunc::logl:
return Intrinsic::log;
case LibFunc::log10:
case LibFunc::log10f:
case LibFunc::log10l:
return Intrinsic::log10;
case LibFunc::log2:
case LibFunc::log2f:
case LibFunc::log2l:
return Intrinsic::log2;
case LibFunc::fabs:
case LibFunc::fabsf:
case LibFunc::fabsl:
return Intrinsic::fabs;
case LibFunc::floor:
case LibFunc::floorf:
case LibFunc::floorl:
return Intrinsic::floor;
case LibFunc::ceil:
case LibFunc::ceilf:
case LibFunc::ceill:
return Intrinsic::ceil;
case LibFunc::trunc:
case LibFunc::truncf:
case LibFunc::truncl:
return Intrinsic::trunc;
case LibFunc::rint:
case LibFunc::rintf:
case LibFunc::rintl:
return Intrinsic::rint;
case LibFunc::nearbyint:
case LibFunc::nearbyintf:
case LibFunc::nearbyintl:
return Intrinsic::nearbyint;
case LibFunc::pow:
case LibFunc::powf:
case LibFunc::powl:
return Intrinsic::pow;
}
return Intrinsic::not_intrinsic;
}
static unsigned
getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) {
switch (Kind) {
case LoopVectorizationLegality::RK_IntegerAdd:
return Instruction::Add;
case LoopVectorizationLegality::RK_IntegerMult:
return Instruction::Mul;
case LoopVectorizationLegality::RK_IntegerOr:
return Instruction::Or;
case LoopVectorizationLegality::RK_IntegerAnd:
return Instruction::And;
case LoopVectorizationLegality::RK_IntegerXor:
return Instruction::Xor;
case LoopVectorizationLegality::RK_FloatMult:
return Instruction::FMul;
case LoopVectorizationLegality::RK_FloatAdd:
return Instruction::FAdd;
case LoopVectorizationLegality::RK_IntegerMinMax:
return Instruction::ICmp;
default:
llvm_unreachable("Unknown reduction operation");
}
}
Value *createMinMaxOp(IRBuilder<> &Builder,
LoopVectorizationLegality::MinMaxReductionKind RK,
Value *Left,
Value *Right) {
CmpInst::Predicate P = CmpInst::ICMP_NE;
switch (RK) {
default:
llvm_unreachable("Unknown min/max reduction kind");
case LoopVectorizationLegality::MRK_UIntMin:
P = CmpInst::ICMP_ULT;
break;
case LoopVectorizationLegality::MRK_UIntMax:
P = CmpInst::ICMP_UGT;
break;
case LoopVectorizationLegality::MRK_SIntMin:
P = CmpInst::ICMP_SLT;
break;
case LoopVectorizationLegality::MRK_SIntMax:
P = CmpInst::ICMP_SGT;
}
Value *Cmp = Builder.CreateICmp(P, Left, Right, "rdx.minmax.cmp");
Value *Select = Builder.CreateSelect(Cmp, Left, Right, "rdx.minmax.select");
return Select;
}
void
InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
Constant *Zero = Builder.getInt32(0);
PhiVector RdxPHIsToFix;
LoopBlocksDFS DFS(OrigLoop);
DFS.perform(LI);
for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
be = DFS.endRPO(); bb != be; ++bb)
vectorizeBlockInLoop(Legal, *bb, &RdxPHIsToFix);
for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end();
it != e; ++it) {
PHINode *RdxPhi = *it;
assert(RdxPhi && "Unable to recover vectorized PHI");
assert(Legal->getReductionVars()->count(RdxPhi) &&
"Unable to find the reduction variable");
LoopVectorizationLegality::ReductionDescriptor RdxDesc =
(*Legal->getReductionVars())[RdxPhi];
Builder.SetInsertPoint(LoopBypassBlocks.front()->getTerminator());
VectorParts &VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
Type *VecTy = VectorExit[0]->getType();
Constant *Iden =
LoopVectorizationLegality::getReductionIdentity(RdxDesc.Kind,
VecTy->getScalarType(),
RdxDesc.MinMaxKind);
Constant *Identity = ConstantVector::getSplat(VF, Iden);
Value *VectorStart = Builder.CreateInsertElement(Identity,
RdxDesc.StartValue, Zero);
BasicBlock *VecPreheader = Induction->getIncomingBlock(0);
VectorParts &VecRdxPhi = WidenMap.get(RdxPhi);
BasicBlock *Latch = OrigLoop->getLoopLatch();
Value *LoopVal = RdxPhi->getIncomingValueForBlock(Latch);
VectorParts &Val = getVectorValue(LoopVal);
for (unsigned part = 0; part < UF; ++part) {
Value *StartVal = (part == 0) ? VectorStart : Identity;
cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal, VecPreheader);
cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part], LoopVectorBody);
}
Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());
VectorParts RdxParts;
for (unsigned part = 0; part < UF; ++part) {
VectorParts &RdxExitVal = getVectorValue(RdxDesc.LoopExitInstr);
PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
Value *StartVal = (part == 0) ? VectorStart : Identity;
for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]);
NewPhi->addIncoming(RdxExitVal[part], LoopVectorBody);
RdxParts.push_back(NewPhi);
}
Value *ReducedPartRdx = RdxParts[0];
unsigned Op = getReductionBinOp(RdxDesc.Kind);
for (unsigned part = 1; part < UF; ++part) {
if (Op != Instruction::ICmp)
ReducedPartRdx = Builder.CreateBinOp((Instruction::BinaryOps)Op,
RdxParts[part], ReducedPartRdx,
"bin.rdx");
else
ReducedPartRdx = createMinMaxOp(Builder, RdxDesc.MinMaxKind,
ReducedPartRdx, RdxParts[part]);
}
assert(isPowerOf2_32(VF) &&
"Reduction emission only supported for pow2 vectors!");
Value *TmpVec = ReducedPartRdx;
SmallVector<Constant*, 32> ShuffleMask(VF, 0);
for (unsigned i = VF; i != 1; i >>= 1) {
for (unsigned j = 0; j != i/2; ++j)
ShuffleMask[j] = Builder.getInt32(i/2 + j);
std::fill(&ShuffleMask[i/2], ShuffleMask.end(),
UndefValue::get(Builder.getInt32Ty()));
Value *Shuf =
Builder.CreateShuffleVector(TmpVec,
UndefValue::get(TmpVec->getType()),
ConstantVector::get(ShuffleMask),
"rdx.shuf");
if (Op != Instruction::ICmp)
TmpVec = Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf,
"bin.rdx");
else
TmpVec = createMinMaxOp(Builder, RdxDesc.MinMaxKind, TmpVec, Shuf);
}
Value *Scalar0 = Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
if (!LCSSAPhi) continue;
assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) {
LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock);
break;
}
}
int IncomingEdgeBlockIdx =
(RdxPhi)->getBasicBlockIndex(OrigLoop->getLoopLatch());
assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
(RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0);
(RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr);
}
for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
if (!LCSSAPhi) continue;
if (LCSSAPhi->getNumIncomingValues() == 1)
LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()),
LoopMiddleBlock);
}
}
InnerLoopVectorizer::VectorParts
InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) &&
"Invalid edge");
VectorParts SrcMask = createBlockInMask(Src);
BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
assert(BI && "Unexpected terminator found");
if (BI->isConditional()) {
VectorParts EdgeMask = getVectorValue(BI->getCondition());
if (BI->getSuccessor(0) != Dst)
for (unsigned part = 0; part < UF; ++part)
EdgeMask[part] = Builder.CreateNot(EdgeMask[part]);
for (unsigned part = 0; part < UF; ++part)
EdgeMask[part] = Builder.CreateAnd(EdgeMask[part], SrcMask[part]);
return EdgeMask;
}
return SrcMask;
}
InnerLoopVectorizer::VectorParts
InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
if (OrigLoop->getHeader() == BB) {
Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1);
return getVectorValue(C);
}
Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0);
VectorParts BlockMask = getVectorValue(Zero);
for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it) {
VectorParts EM = createEdgeMask(*it, BB);
for (unsigned part = 0; part < UF; ++part)
BlockMask[part] = Builder.CreateOr(BlockMask[part], EM[part]);
}
return BlockMask;
}
void
InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
BasicBlock *BB, PhiVector *PV) {
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
VectorParts &Entry = WidenMap.get(it);
switch (it->getOpcode()) {
case Instruction::Br:
continue;
case Instruction::PHI:{
PHINode* P = cast<PHINode>(it);
if (Legal->getReductionVars()->count(P)) {
for (unsigned part = 0; part < UF; ++part) {
Type *VecTy = VectorType::get(it->getType(), VF);
Entry[part] = PHINode::Create(VecTy, 2, "vec.phi",
LoopVectorBody-> getFirstInsertionPt());
}
PV->push_back(P);
continue;
}
if (P->getParent() != OrigLoop->getHeader()) {
unsigned NumIncoming = P->getNumIncomingValues();
for (unsigned In = 0; In < NumIncoming; In++) {
VectorParts Cond = createEdgeMask(P->getIncomingBlock(In),
P->getParent());
VectorParts &In0 = getVectorValue(P->getIncomingValue(In));
for (unsigned part = 0; part < UF; ++part) {
if (In == 0)
Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
In0[part]);
else
Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
Entry[part], "predphi");
}
}
continue;
}
assert(Legal->getInductionVars()->count(P) &&
"Not an induction variable");
LoopVectorizationLegality::InductionInfo II =
Legal->getInductionVars()->lookup(P);
switch (II.IK) {
case LoopVectorizationLegality::IK_NoInduction:
llvm_unreachable("Unknown induction");
case LoopVectorizationLegality::IK_IntInduction: {
assert(P->getType() == II.StartValue->getType() && "Types must match");
Type *PhiTy = P->getType();
Value *Broadcasted;
if (P == OldInduction) {
Broadcasted = Builder.CreateTrunc(Induction, PhiTy);
} else {
Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx,
"normalized.idx");
NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy);
Broadcasted = Builder.CreateAdd(II.StartValue, NormalizedIdx,
"offset.idx");
}
Broadcasted = getBroadcastInstrs(Broadcasted);
for (unsigned part = 0; part < UF; ++part)
Entry[part] = getConsecutiveVector(Broadcasted, VF * part, false);
continue;
}
case LoopVectorizationLegality::IK_ReverseIntInduction:
case LoopVectorizationLegality::IK_PtrInduction:
case LoopVectorizationLegality::IK_ReversePtrInduction:
Value *StartIdx = ExtendedIdx;
Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx,
"normalized.idx");
if (LoopVectorizationLegality::IK_ReverseIntInduction == II.IK) {
IntegerType *DstTy = cast<IntegerType>(II.StartValue->getType());
Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy,
"resize.norm.idx");
Value *ReverseInd = Builder.CreateSub(II.StartValue, CNI,
"reverse.idx");
Value *Broadcasted = getBroadcastInstrs(ReverseInd);
for (unsigned part = 0; part < UF; ++part)
Entry[part] = getConsecutiveVector(Broadcasted, -(int)VF * part,
true);
continue;
}
assert(P->getType()->isPointerTy() && "Unexpected type.");
bool Reverse = (LoopVectorizationLegality::IK_ReversePtrInduction ==
II.IK);
for (unsigned part = 0; part < UF; ++part) {
Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
for (unsigned int i = 0; i < VF; ++i) {
int EltIndex = (i + part * VF) * (Reverse ? -1 : 1);
Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex);
Value *GlobalIdx;
if (!Reverse)
GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx");
else
GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx");
Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx,
"next.gep");
VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
Builder.getInt32(i),
"insert.gep");
}
Entry[part] = VecVal;
}
continue;
}
}
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
case Instruction::FSub:
case Instruction::Mul:
case Instruction::FMul:
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::FDiv:
case Instruction::URem:
case Instruction::SRem:
case Instruction::FRem:
case Instruction::Shl:
case Instruction::LShr:
case Instruction::AShr:
case Instruction::And:
case Instruction::Or:
case Instruction::Xor: {
BinaryOperator *BinOp = dyn_cast<BinaryOperator>(it);
VectorParts &A = getVectorValue(it->getOperand(0));
VectorParts &B = getVectorValue(it->getOperand(1));
for (unsigned Part = 0; Part < UF; ++Part) {
Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]);
BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V);
if (VecOp && isa<OverflowingBinaryOperator>(BinOp)) {
VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap());
VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap());
}
if (VecOp && isa<PossiblyExactOperator>(VecOp))
VecOp->setIsExact(BinOp->isExact());
Entry[Part] = V;
}
break;
}
case Instruction::Select: {
bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(it->getOperand(0)),
OrigLoop);
VectorParts &Cond = getVectorValue(it->getOperand(0));
VectorParts &Op0 = getVectorValue(it->getOperand(1));
VectorParts &Op1 = getVectorValue(it->getOperand(2));
Value *ScalarCond = Builder.CreateExtractElement(Cond[0],
Builder.getInt32(0));
for (unsigned Part = 0; Part < UF; ++Part) {
Entry[Part] = Builder.CreateSelect(
InvariantCond ? ScalarCond : Cond[Part],
Op0[Part],
Op1[Part]);
}
break;
}
case Instruction::ICmp:
case Instruction::FCmp: {
bool FCmp = (it->getOpcode() == Instruction::FCmp);
CmpInst *Cmp = dyn_cast<CmpInst>(it);
VectorParts &A = getVectorValue(it->getOperand(0));
VectorParts &B = getVectorValue(it->getOperand(1));
for (unsigned Part = 0; Part < UF; ++Part) {
Value *C = 0;
if (FCmp)
C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]);
else
C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]);
Entry[Part] = C;
}
break;
}
case Instruction::Store:
case Instruction::Load:
vectorizeMemoryInstruction(it, Legal);
break;
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
case Instruction::FPToSI:
case Instruction::FPExt:
case Instruction::PtrToInt:
case Instruction::IntToPtr:
case Instruction::SIToFP:
case Instruction::UIToFP:
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::BitCast: {
CastInst *CI = dyn_cast<CastInst>(it);
if (CI->getOperand(0) == OldInduction &&
it->getOpcode() == Instruction::Trunc) {
Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction,
CI->getType());
Value *Broadcasted = getBroadcastInstrs(ScalarCast);
for (unsigned Part = 0; Part < UF; ++Part)
Entry[Part] = getConsecutiveVector(Broadcasted, VF * Part, false);
break;
}
Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF);
VectorParts &A = getVectorValue(it->getOperand(0));
for (unsigned Part = 0; Part < UF; ++Part)
Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy);
break;
}
case Instruction::Call: {
if (isa<DbgInfoIntrinsic>(it))
break;
Module *M = BB->getParent()->getParent();
CallInst *CI = cast<CallInst>(it);
Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
assert(ID && "Not an intrinsic call!");
for (unsigned Part = 0; Part < UF; ++Part) {
SmallVector<Value*, 4> Args;
for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
VectorParts &Arg = getVectorValue(CI->getArgOperand(i));
Args.push_back(Arg[Part]);
}
Type *Tys[] = { VectorType::get(CI->getType()->getScalarType(), VF) };
Function *F = Intrinsic::getDeclaration(M, ID, Tys);
Entry[Part] = Builder.CreateCall(F, Args);
}
break;
}
default:
scalarizeInstruction(it);
break;
} }}
void InnerLoopVectorizer::updateAnalysis() {
SE->forgetLoop(OrigLoop);
assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
"Entry does not dominate exit.");
for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]);
DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back());
DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader);
DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks.front());
DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock);
DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
DEBUG(DT->verifyAnalysis());
}
bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
if (!EnableIfConversion)
return false;
assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
std::vector<BasicBlock*> &LoopBlocks = TheLoop->getBlocksVector();
for (unsigned i = 0, e = LoopBlocks.size(); i < e; ++i) {
BasicBlock *BB = LoopBlocks[i];
if (!isa<BranchInst>(BB->getTerminator()))
return false;
if (blockNeedsPredication(BB) && !blockCanBePredicated(BB))
return false;
}
return true;
}
bool LoopVectorizationLegality::canVectorize() {
if (!TheLoop->getLoopPreheader())
return false;
if (TheLoop->getSubLoopsVector().size())
return false;
if (TheLoop->getNumBackEdges() != 1)
return false;
if (!TheLoop->getExitingBlock())
return false;
unsigned NumBlocks = TheLoop->getNumBlocks();
if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
return false;
}
BasicBlock *Latch = TheLoop->getLoopLatch();
DEBUG(dbgs() << "LV: Found a loop: " <<
TheLoop->getHeader()->getName() << "\n");
const SCEV *ExitCount = SE->getExitCount(TheLoop, Latch);
if (ExitCount == SE->getCouldNotCompute()) {
DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
return false;
}
unsigned TC = SE->getSmallConstantTripCount(TheLoop, Latch);
if (TC > 0u && TC < TinyTripCountVectorThreshold) {
DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " <<
"This loop is not worth vectorizing.\n");
return false;
}
if (!canVectorizeInstrs()) {
DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
return false;
}
if (!canVectorizeMemory()) {
DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
return false;
}
collectLoopUniforms();
DEBUG(dbgs() << "LV: We can vectorize this loop" <<
(PtrRtCheck.Need ? " (with a runtime bound check)" : "")
<<"!\n");
return true;
}
static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
SmallPtrSet<Value *, 4> &Reductions) {
if (!Reductions.count(Inst))
for (Value::use_iterator I = Inst->use_begin(), E = Inst->use_end();
I != E; ++I) {
Instruction *U = cast<Instruction>(*I);
if (!TheLoop->contains(U)) {
DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n");
return true;
}
}
return false;
}
static Type *convertPointerToIntegerType(DataLayout &DL, Type *Ty) {
if (Ty->isPointerTy())
return DL.getIntPtrType(Ty->getContext());
return Ty;
}
static Type* getWiderType(DataLayout &DL, Type *Ty0, Type *Ty1) {
Ty0 = convertPointerToIntegerType(DL, Ty0);
Ty1 = convertPointerToIntegerType(DL, Ty1);
if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
return Ty0;
return Ty1;
}
bool LoopVectorizationLegality::canVectorizeInstrs() {
BasicBlock *PreHeader = TheLoop->getLoopPreheader();
BasicBlock *Header = TheLoop->getHeader();
if (Header->getTerminator()->getMetadata(AlreadyVectorizedMDName)) {
DEBUG(dbgs() << "LV: This loop was vectorized before\n");
return false;
}
for (Loop::block_iterator bb = TheLoop->block_begin(),
be = TheLoop->block_end(); bb != be; ++bb) {
for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
++it) {
if (PHINode *Phi = dyn_cast<PHINode>(it)) {
Type *PhiTy = Phi->getType();
if (!PhiTy->isIntegerTy() &&
!PhiTy->isFloatingPointTy() &&
!PhiTy->isPointerTy()) {
DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n");
return false;
}
if (*bb != Header) {
if(!hasOutsideLoopUser(TheLoop, it, AllowedExit))
continue;
return false;
}
if (Phi->getNumIncomingValues() != 2) {
DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
return false;
}
Value *StartValue = Phi->getIncomingValueForBlock(PreHeader);
InductionKind IK = isInductionVariable(Phi);
if (IK_NoInduction != IK) {
if (!WidestIndTy)
WidestIndTy = convertPointerToIntegerType(*DL, PhiTy);
else
WidestIndTy = getWiderType(*DL, PhiTy, WidestIndTy);
if (IK == IK_IntInduction) {
if (!Induction || PhiTy == WidestIndTy)
Induction = Phi;
}
DEBUG(dbgs() << "LV: Found an induction variable.\n");
Inductions[Phi] = InductionInfo(StartValue, IK);
continue;
}
if (AddReductionVar(Phi, RK_IntegerAdd)) {
DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n");
continue;
}
if (AddReductionVar(Phi, RK_IntegerMult)) {
DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n");
continue;
}
if (AddReductionVar(Phi, RK_IntegerOr)) {
DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n");
continue;
}
if (AddReductionVar(Phi, RK_IntegerAnd)) {
DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n");
continue;
}
if (AddReductionVar(Phi, RK_IntegerXor)) {
DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n");
continue;
}
if (AddReductionVar(Phi, RK_IntegerMinMax)) {
DEBUG(dbgs() << "LV: Found a MINMAX reduction PHI."<< *Phi <<"\n");
continue;
}
if (AddReductionVar(Phi, RK_FloatMult)) {
DEBUG(dbgs() << "LV: Found an FMult reduction PHI."<< *Phi <<"\n");
continue;
}
if (AddReductionVar(Phi, RK_FloatAdd)) {
DEBUG(dbgs() << "LV: Found an FAdd reduction PHI."<< *Phi <<"\n");
continue;
}
DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
return false;
}
CallInst *CI = dyn_cast<CallInst>(it);
if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI)) {
DEBUG(dbgs() << "LV: Found a call site.\n");
return false;
}
if (!VectorType::isValidElementType(it->getType()) &&
!it->getType()->isVoidTy()) {
DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n");
return false;
}
if (StoreInst *ST = dyn_cast<StoreInst>(it)) {
Type *T = ST->getValueOperand()->getType();
if (!VectorType::isValidElementType(T))
return false;
}
if (hasOutsideLoopUser(TheLoop, it, AllowedExit))
return false;
}
}
if (!Induction) {
DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
assert(getInductionVars()->size() && "No induction variables");
}
return true;
}
void LoopVectorizationLegality::collectLoopUniforms() {
std::vector<Value*> Worklist;
BasicBlock *Latch = TheLoop->getLoopLatch();
Worklist.push_back(Latch->getTerminator()->getOperand(0));
while (Worklist.size()) {
Instruction *I = dyn_cast<Instruction>(Worklist.back());
Worklist.pop_back();
if (!I || !TheLoop->contains(I) || isa<PHINode>(I))
continue;
Uniforms.insert(I);
for (int i = 0, Op = I->getNumOperands(); i < Op; ++i) {
Worklist.push_back(I->getOperand(i));
}
}
}
AliasAnalysis::Location
LoopVectorizationLegality::getLoadStoreLocation(Instruction *Inst) {
if (StoreInst *Store = dyn_cast<StoreInst>(Inst))
return AA->getLocation(Store);
else if (LoadInst *Load = dyn_cast<LoadInst>(Inst))
return AA->getLocation(Load);
llvm_unreachable("Should be either load or store instruction");
}
bool
LoopVectorizationLegality::hasPossibleGlobalWriteReorder(
Value *Object,
Instruction *Inst,
AliasMultiMap& WriteObjects,
unsigned MaxByteWidth) {
AliasAnalysis::Location ThisLoc = getLoadStoreLocation(Inst);
std::vector<Instruction*>::iterator
it = WriteObjects[Object].begin(),
end = WriteObjects[Object].end();
for (; it != end; ++it) {
Instruction* I = *it;
if (I == Inst)
continue;
AliasAnalysis::Location ThatLoc = getLoadStoreLocation(I);
if (AA->alias(ThisLoc.getWithNewSize(MaxByteWidth),
ThatLoc.getWithNewSize(MaxByteWidth)))
return true;
}
return false;
}
bool LoopVectorizationLegality::canVectorizeMemory() {
typedef SmallVector<Value*, 16> ValueVector;
typedef SmallPtrSet<Value*, 16> ValueSet;
ValueVector Loads;
ValueVector Stores;
PtrRtCheck.Pointers.clear();
PtrRtCheck.Need = false;
for (Loop::block_iterator bb = TheLoop->block_begin(),
be = TheLoop->block_end(); bb != be; ++bb) {
for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
++it) {
if (it->mayReadFromMemory()) {
LoadInst *Ld = dyn_cast<LoadInst>(it);
if (!Ld) return false;
if (!Ld->isSimple()) {
DEBUG(dbgs() << "LV: Found a non-simple load.\n");
return false;
}
Loads.push_back(Ld);
continue;
}
if (it->mayWriteToMemory()) {
StoreInst *St = dyn_cast<StoreInst>(it);
if (!St) return false;
if (!St->isSimple()) {
DEBUG(dbgs() << "LV: Found a non-simple store.\n");
return false;
}
Stores.push_back(St);
}
} }
if (!Stores.size()) {
DEBUG(dbgs() << "LV: Found a read-only loop!\n");
return true;
}
AliasMap Reads;
AliasMap ReadWrites;
ValueSet Seen;
ValueVector::iterator I, IE;
for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) {
StoreInst *ST = cast<StoreInst>(*I);
Value* Ptr = ST->getPointerOperand();
if (isUniform(Ptr)) {
DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
return false;
}
if (Seen.insert(Ptr))
ReadWrites.insert(std::make_pair(Ptr, ST));
}
for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
LoadInst *LD = cast<LoadInst>(*I);
Value* Ptr = LD->getPointerOperand();
if (Seen.insert(Ptr) || 0 == isConsecutivePtr(Ptr))
Reads.insert(std::make_pair(Ptr, LD));
}
if (ReadWrites.size() == 1 && Reads.size() == 0) {
DEBUG(dbgs() << "LV: Found a write-only loop!\n");
return true;
}
unsigned NumReadPtrs = 0;
unsigned NumWritePtrs = 0;
bool CanDoRT = true;
AliasMap::iterator MI, ME;
for (MI = ReadWrites.begin(), ME = ReadWrites.end(); MI != ME; ++MI) {
Value *V = (*MI).first;
if (hasComputableBounds(V)) {
PtrRtCheck.insert(SE, TheLoop, V, true);
NumWritePtrs++;
DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *V <<"\n");
} else {
CanDoRT = false;
break;
}
}
for (MI = Reads.begin(), ME = Reads.end(); MI != ME; ++MI) {
Value *V = (*MI).first;
if (hasComputableBounds(V)) {
PtrRtCheck.insert(SE, TheLoop, V, false);
NumReadPtrs++;
DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *V <<"\n");
} else {
CanDoRT = false;
break;
}
}
unsigned NumComparisons = (NumWritePtrs * (NumReadPtrs + NumWritePtrs - 1));
DEBUG(dbgs() << "LV: We need to compare " << NumComparisons << " ptrs.\n");
if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) {
PtrRtCheck.reset();
CanDoRT = false;
}
if (CanDoRT) {
DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n");
}
bool NeedRTCheck = false;
unsigned MaxByteWidth = (TTI->getRegisterBitWidth(true) / 8) *
TTI->getMaximumUnrollFactor();
AliasMultiMap WriteObjects;
ValueVector TempObjects;
bool AllWritesIdentified = true;
for (MI = ReadWrites.begin(), ME = ReadWrites.end(); MI != ME; ++MI) {
Value *Val = (*MI).first;
Instruction *Inst = (*MI).second;
GetUnderlyingObjects(Val, TempObjects, DL);
for (ValueVector::iterator UI=TempObjects.begin(), UE=TempObjects.end();
UI != UE; ++UI) {
if (!isIdentifiedObject(*UI)) {
DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **UI <<"\n");
NeedRTCheck = true;
AllWritesIdentified = false;
}
if (WriteObjects[*UI].empty()) {
DEBUG(dbgs() << "LV: Adding Underlying value:" << **UI <<"\n");
WriteObjects[*UI].push_back(Inst);
continue;
}
if (!AA || dyn_cast<GlobalValue>(*UI) == NULL) {
DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
<< **UI <<"\n");
return false;
}
DEBUG(dbgs() << "LV: Found a conflicting global value:"
<< **UI <<"\n");
DEBUG(dbgs() << "LV: While examining store:" << *Inst <<"\n");
DEBUG(dbgs() << "LV: On value:" << *Val <<"\n");
if (hasPossibleGlobalWriteReorder(*UI,
Inst,
WriteObjects,
MaxByteWidth)) {
DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
<< *UI <<"\n");
return false;
}
WriteObjects[*UI].push_back(Inst);
}
TempObjects.clear();
}
for (MI = Reads.begin(), ME = Reads.end(); MI != ME; ++MI) {
Value *Val = (*MI).first;
GetUnderlyingObjects(Val, TempObjects, DL);
for (ValueVector::iterator UI=TempObjects.begin(), UE=TempObjects.end();
UI != UE; ++UI) {
if (!AllWritesIdentified && !isIdentifiedObject(*UI)) {
DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **UI <<"\n");
NeedRTCheck = true;
}
if (WriteObjects[*UI].empty())
continue;
if (!AA || dyn_cast<GlobalValue>(*UI) == NULL) {
DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
<< **UI <<"\n");
return false;
}
DEBUG(dbgs() << "LV: Found a global value: "
<< **UI <<"\n");
Instruction *Inst = (*MI).second;
DEBUG(dbgs() << "LV: While examining load:" << *Inst <<"\n");
DEBUG(dbgs() << "LV: On value:" << *Val <<"\n");
if (hasPossibleGlobalWriteReorder(*UI,
Inst,
WriteObjects,
MaxByteWidth)) {
DEBUG(dbgs() << "LV: Found a possible read-write reorder:"
<< *UI <<"\n");
return false;
}
}
TempObjects.clear();
}
PtrRtCheck.Need = NeedRTCheck;
if (NeedRTCheck && !CanDoRT) {
DEBUG(dbgs() << "LV: We can't vectorize because we can't find " <<
"the array bounds.\n");
PtrRtCheck.reset();
return false;
}
DEBUG(dbgs() << "LV: We "<< (NeedRTCheck ? "" : "don't") <<
" need a runtime memory check.\n");
return true;
}
bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
ReductionKind Kind) {
if (Phi->getNumIncomingValues() != 2)
return false;
if (Phi->getParent() != TheLoop->getHeader())
return false;
Value *RdxStart = Phi->getIncomingValueForBlock(TheLoop->getLoopPreheader());
Instruction *ExitInstruction = 0;
bool FoundBinOp = false;
Instruction *Iter = Phi;
unsigned NumICmpSelectPatternInst = 0;
ReductionInstDesc ReduxDesc(false, 0);
SmallPtrSet<Instruction *, 8> VisitedInsts;
while (VisitedInsts.insert(Iter)) {
if (Iter->use_empty())
return false;
bool FoundInBlockUser = false;
bool FoundStartPHI = false;
FoundBinOp |= !isa<PHINode>(Iter);
for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end();
it != e; ++it) {
Instruction *U = cast<Instruction>(*it);
if (U == Phi) {
FoundStartPHI = true;
continue;
}
BasicBlock *Parent = U->getParent();
if (!TheLoop->contains(Parent)) {
if (ExitInstruction != 0)
return false;
ExitInstruction = Iter;
}
if (isa<PHINode>(Iter) && isa<PHINode>(U) &&
U->getParent() != TheLoop->getHeader() &&
TheLoop->contains(U) &&
Iter->hasNUsesOrMore(2))
continue;
if (FoundInBlockUser && !NumICmpSelectPatternInst)
return false;
FoundInBlockUser = true;
ReduxDesc = isReductionInstr(U, Kind, ReduxDesc);
if (!ReduxDesc.IsReduction)
return false;
if (Kind == RK_IntegerMinMax && (isa<ICmpInst>(U) ||
isa<SelectInst>(U)))
++NumICmpSelectPatternInst;
if (!U->isCommutative() && !isa<PHINode>(U) && !isa<SelectInst>(U) &&
!isa<ICmpInst>(U) && U->getOperand(0) != Iter)
return false;
Iter = ReduxDesc.PatternLastInst;
}
if (Kind == RK_IntegerMinMax && NumICmpSelectPatternInst != 2)
return false;
if (FoundStartPHI) {
AllowedExit.insert(ExitInstruction);
ReductionDescriptor RD(RdxStart, ExitInstruction, Kind,
ReduxDesc.MinMaxKind);
Reductions[Phi] = RD;
return FoundBinOp && ExitInstruction;
}
}
return false;
}
LoopVectorizationLegality::ReductionInstDesc
LoopVectorizationLegality::isMinMaxSelectCmpPattern(Instruction *I, ReductionInstDesc &Prev) {
assert((isa<ICmpInst>(I) || isa<SelectInst>(I)) &&
"Expect a select instruction");
ICmpInst *Cmp = 0;
SelectInst *Select = 0;
if ((Cmp = dyn_cast<ICmpInst>(I))) {
if (!Cmp->hasOneUse() || !(Select = dyn_cast<SelectInst>(*I->use_begin())))
return ReductionInstDesc(false, I);
return ReductionInstDesc(Select, Prev.MinMaxKind);
}
if (!(Select = dyn_cast<SelectInst>(I)))
return ReductionInstDesc(false, I);
if (!(Cmp = dyn_cast<ICmpInst>(I->getOperand(0))))
return ReductionInstDesc(false, I);
if (!Cmp->hasOneUse())
return ReductionInstDesc(false, I);
Value *CmpLeft = Cmp->getOperand(0);
Value *CmpRight = Cmp->getOperand(1);
if (m_UMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
return ReductionInstDesc(Select, MRK_UIntMin);
else if (m_UMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
return ReductionInstDesc(Select, MRK_UIntMax);
else if (m_SMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
return ReductionInstDesc(Select, MRK_SIntMax);
else if (m_SMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
return ReductionInstDesc(Select, MRK_SIntMin);
return ReductionInstDesc(false, I);
}
LoopVectorizationLegality::ReductionInstDesc
LoopVectorizationLegality::isReductionInstr(Instruction *I,
ReductionKind Kind,
ReductionInstDesc &Prev) {
bool FP = I->getType()->isFloatingPointTy();
bool FastMath = (FP && I->isCommutative() && I->isAssociative());
switch (I->getOpcode()) {
default:
return ReductionInstDesc(false, I);
case Instruction::PHI:
if (FP && (Kind != RK_FloatMult && Kind != RK_FloatAdd))
return ReductionInstDesc(false, I);
return ReductionInstDesc(I, Prev.MinMaxKind);
case Instruction::Sub:
case Instruction::Add:
return ReductionInstDesc(Kind == RK_IntegerAdd, I);
case Instruction::Mul:
return ReductionInstDesc(Kind == RK_IntegerMult, I);
case Instruction::And:
return ReductionInstDesc(Kind == RK_IntegerAnd, I);
case Instruction::Or:
return ReductionInstDesc(Kind == RK_IntegerOr, I);
case Instruction::Xor:
return ReductionInstDesc(Kind == RK_IntegerXor, I);
case Instruction::FMul:
return ReductionInstDesc(Kind == RK_FloatMult && FastMath, I);
case Instruction::FAdd:
return ReductionInstDesc(Kind == RK_FloatAdd && FastMath, I);
case Instruction::ICmp:
case Instruction::Select:
if (Kind != RK_IntegerMinMax)
return ReductionInstDesc(false, I);
return isMinMaxSelectCmpPattern(I, Prev);
}
}
LoopVectorizationLegality::InductionKind
LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
Type *PhiTy = Phi->getType();
if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy())
return IK_NoInduction;
const SCEV *PhiScev = SE->getSCEV(Phi);
const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
if (!AR) {
DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
return IK_NoInduction;
}
const SCEV *Step = AR->getStepRecurrence(*SE);
if (PhiTy->isIntegerTy()) {
if (Step->isOne())
return IK_IntInduction;
if (Step->isAllOnesValue())
return IK_ReverseIntInduction;
return IK_NoInduction;
}
const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
if (!C)
return IK_NoInduction;
assert(PhiTy->isPointerTy() && "The PHI must be a pointer");
uint64_t Size = DL->getTypeAllocSize(PhiTy->getPointerElementType());
if (C->getValue()->equalsInt(Size))
return IK_PtrInduction;
else if (C->getValue()->equalsInt(0 - Size))
return IK_ReversePtrInduction;
return IK_NoInduction;
}
bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
Value *In0 = const_cast<Value*>(V);
PHINode *PN = dyn_cast_or_null<PHINode>(In0);
if (!PN)
return false;
return Inductions.count(PN);
}
bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
assert(TheLoop->contains(BB) && "Unknown block used");
BasicBlock* Latch = TheLoop->getLoopLatch();
return !DT->dominates(BB, Latch);
}
bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB) {
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
if (it->mayReadFromMemory() || it->mayWriteToMemory() || it->mayThrow())
return false;
switch (it->getOpcode()) {
default: continue;
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::URem:
case Instruction::SRem:
return false;
}
}
return true;
}
bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) {
const SCEV *PhiScev = SE->getSCEV(Ptr);
const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
if (!AR)
return false;
return AR->isAffine();
}
LoopVectorizationCostModel::VectorizationFactor
LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
unsigned UserVF) {
VectorizationFactor Factor = { 1U, 0U };
if (OptForSize && Legal->getRuntimePointerCheck()->Need) {
DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n");
return Factor;
}
unsigned TC = SE->getSmallConstantTripCount(TheLoop, TheLoop->getLoopLatch());
DEBUG(dbgs() << "LV: Found trip count:"<<TC<<"\n");
unsigned WidestType = getWidestType();
unsigned WidestRegister = TTI.getRegisterBitWidth(true);
unsigned MaxVectorSize = WidestRegister / WidestType;
DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n");
DEBUG(dbgs() << "LV: The Widest register is:" << WidestRegister << "bits.\n");
if (MaxVectorSize == 0) {
DEBUG(dbgs() << "LV: The target has no vector registers.\n");
MaxVectorSize = 1;
}
assert(MaxVectorSize <= 32 && "Did not expect to pack so many elements"
" into one vector!");
unsigned VF = MaxVectorSize;
if (OptForSize) {
if (TC < 2) {
DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n");
return Factor;
}
VF = TC % MaxVectorSize;
if (VF == 0)
VF = MaxVectorSize;
if (VF < 2) {
DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n");
return Factor;
}
}
if (UserVF != 0) {
assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
DEBUG(dbgs() << "LV: Using user VF "<<UserVF<<".\n");
Factor.Width = UserVF;
return Factor;
}
float Cost = expectedCost(1);
unsigned Width = 1;
DEBUG(dbgs() << "LV: Scalar loop costs: "<< (int)Cost << ".\n");
for (unsigned i=2; i <= VF; i*=2) {
float VectorCost = expectedCost(i) / (float)i;
DEBUG(dbgs() << "LV: Vector loop of width "<< i << " costs: " <<
(int)VectorCost << ".\n");
if (VectorCost < Cost) {
Cost = VectorCost;
Width = i;
}
}
DEBUG(dbgs() << "LV: Selecting VF = : "<< Width << ".\n");
Factor.Width = Width;
Factor.Cost = Width * Cost;
return Factor;
}
unsigned LoopVectorizationCostModel::getWidestType() {
unsigned MaxWidth = 8;
for (Loop::block_iterator bb = TheLoop->block_begin(),
be = TheLoop->block_end(); bb != be; ++bb) {
BasicBlock *BB = *bb;
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
Type *T = it->getType();
if (!isa<LoadInst>(it) && !isa<StoreInst>(it) && !isa<PHINode>(it))
continue;
if (PHINode *PN = dyn_cast<PHINode>(it))
if (!Legal->getReductionVars()->count(PN))
continue;
StoreInst *ST = 0;
if ((ST = dyn_cast<StoreInst>(it)))
T = ST->getValueOperand()->getType();
if (T->isPointerTy() && isConsecutiveLoadOrStore(it))
MaxWidth = std::max(MaxWidth, DL->getPointerSizeInBits());
else
MaxWidth = std::max(MaxWidth, T->getScalarSizeInBits());
}
}
return MaxWidth;
}
unsigned
LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
unsigned UserUF,
unsigned VF,
unsigned LoopCost) {
if (UserUF != 0)
return UserUF;
if (OptForSize)
return 1;
unsigned TC = SE->getSmallConstantTripCount(TheLoop,
TheLoop->getLoopLatch());
if (TC > 1 && TC < TinyTripCountUnrollThreshold)
return 1;
unsigned TargetVectorRegisters = TTI.getNumberOfRegisters(true);
DEBUG(dbgs() << "LV: The target has " << TargetVectorRegisters <<
" vector registers\n");
LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage();
R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
R.NumInstructions = std::max(R.NumInstructions, 1U);
unsigned UF = (TargetVectorRegisters - R.LoopInvariantRegs) / R.MaxLocalUsers;
unsigned MaxUnrollSize = TTI.getMaximumUnrollFactor();
if (LoopCost == 0)
LoopCost = expectedCost(VF);
if (UF > MaxUnrollSize)
UF = MaxUnrollSize;
else if (UF < 1)
UF = 1;
if (Legal->getReductionVars()->size()) {
DEBUG(dbgs() << "LV: Unrolling because of reductions. \n");
return UF;
}
DEBUG(dbgs() << "LV: Loop cost is "<< LoopCost <<" \n");
if (LoopCost < 20) {
DEBUG(dbgs() << "LV: Unrolling to reduce branch cost. \n");
unsigned NewUF = 20/LoopCost + 1;
return std::min(NewUF, UF);
}
DEBUG(dbgs() << "LV: Not Unrolling. \n");
return 1;
}
LoopVectorizationCostModel::RegisterUsage
LoopVectorizationCostModel::calculateRegisterUsage() {
LoopBlocksDFS DFS(TheLoop);
DFS.perform(LI);
RegisterUsage R;
R.NumInstructions = 0;
typedef DenseMap<Instruction*, unsigned> IntervalMap;
DenseMap<unsigned, Instruction*> IdxToInstr;
IntervalMap EndPoint;
SmallSet<Instruction*, 8> Ends;
SmallPtrSet<Value*, 8> LoopInvariants;
unsigned Index = 0;
for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
be = DFS.endRPO(); bb != be; ++bb) {
R.NumInstructions += (*bb)->size();
for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
++it) {
Instruction *I = it;
IdxToInstr[Index++] = I;
for (unsigned i = 0; i < I->getNumOperands(); ++i) {
Value *U = I->getOperand(i);
Instruction *Instr = dyn_cast<Instruction>(U);
if (!Instr) continue;
if (!TheLoop->contains(Instr)) {
LoopInvariants.insert(Instr);
continue;
}
EndPoint[Instr] = Index;
Ends.insert(Instr);
}
}
}
typedef SmallVector<Instruction*, 2> InstrList;
DenseMap<unsigned, InstrList> TransposeEnds;
for (IntervalMap::iterator it = EndPoint.begin(), e = EndPoint.end();
it != e; ++it)
TransposeEnds[it->second].push_back(it->first);
SmallSet<Instruction*, 8> OpenIntervals;
unsigned MaxUsage = 0;
DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
for (unsigned int i = 0; i < Index; ++i) {
Instruction *I = IdxToInstr[i];
if (!Ends.count(I)) continue;
InstrList &List = TransposeEnds[i];
for (unsigned int j=0, e = List.size(); j < e; ++j)
OpenIntervals.erase(List[j]);
MaxUsage = std::max(MaxUsage, OpenIntervals.size());
DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " <<
OpenIntervals.size() <<"\n");
OpenIntervals.insert(I);
}
unsigned Invariant = LoopInvariants.size();
DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << " \n");
DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << " \n");
DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << " \n");
R.LoopInvariantRegs = Invariant;
R.MaxLocalUsers = MaxUsage;
return R;
}
unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
unsigned Cost = 0;
for (Loop::block_iterator bb = TheLoop->block_begin(),
be = TheLoop->block_end(); bb != be; ++bb) {
unsigned BlockCost = 0;
BasicBlock *BB = *bb;
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
if (isa<DbgInfoIntrinsic>(it))
continue;
unsigned C = getInstructionCost(it, VF);
Cost += C;
DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF " <<
VF << " For instruction: "<< *it << "\n");
}
if (Legal->blockNeedsPredication(*bb) && VF == 1)
BlockCost /= 2;
Cost += BlockCost;
}
return Cost;
}
unsigned
LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
if (Legal->isUniformAfterVectorization(I))
VF = 1;
Type *RetTy = I->getType();
Type *VectorTy = ToVectorTy(RetTy, VF);
switch (I->getOpcode()) {
case Instruction::GetElementPtr:
return 0;
case Instruction::Br: {
return TTI.getCFInstrCost(I->getOpcode());
}
case Instruction::PHI:
return 0;
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
case Instruction::FSub:
case Instruction::Mul:
case Instruction::FMul:
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::FDiv:
case Instruction::URem:
case Instruction::SRem:
case Instruction::FRem:
case Instruction::Shl:
case Instruction::LShr:
case Instruction::AShr:
case Instruction::And:
case Instruction::Or:
case Instruction::Xor: {
TargetTransformInfo::OperandValueKind Op1VK =
TargetTransformInfo::OK_AnyValue;
TargetTransformInfo::OperandValueKind Op2VK =
TargetTransformInfo::OK_AnyValue;
if (isa<ConstantInt>(I->getOperand(1)))
Op2VK = TargetTransformInfo::OK_UniformConstantValue;
return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK);
}
case Instruction::Select: {
SelectInst *SI = cast<SelectInst>(I);
const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
Type *CondTy = SI->getCondition()->getType();
if (!ScalarCond)
CondTy = VectorType::get(CondTy, VF);
return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
}
case Instruction::ICmp:
case Instruction::FCmp: {
Type *ValTy = I->getOperand(0)->getType();
VectorTy = ToVectorTy(ValTy, VF);
return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy);
}
case Instruction::Store:
case Instruction::Load: {
StoreInst *SI = dyn_cast<StoreInst>(I);
LoadInst *LI = dyn_cast<LoadInst>(I);
Type *ValTy = (SI ? SI->getValueOperand()->getType() :
LI->getType());
VectorTy = ToVectorTy(ValTy, VF);
unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment();
unsigned AS = SI ? SI->getPointerAddressSpace() :
LI->getPointerAddressSpace();
Value *Ptr = SI ? SI->getPointerOperand() : LI->getPointerOperand();
if (VF == 1)
return TTI.getAddressComputationCost(VectorTy) +
TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
int Stride = Legal->isConsecutivePtr(Ptr);
bool Reverse = Stride < 0;
unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ValTy);
unsigned VectorElementSize = DL->getTypeStoreSize(VectorTy)/VF;
if (0 == Stride || ScalarAllocatedSize != VectorElementSize) {
unsigned Cost = 0;
Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
for (unsigned i = 0; i < VF; ++i) {
Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i);
Cost += TTI.getVectorInstrCost(SI ? Instruction::ExtractElement :
Instruction::InsertElement,
VectorTy, i);
}
Cost += VF * 20;
Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
Alignment, AS);
return Cost;
}
unsigned Cost = TTI.getAddressComputationCost(VectorTy);
Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
if (Reverse)
Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
VectorTy, 0);
return Cost;
}
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
case Instruction::FPToSI:
case Instruction::FPExt:
case Instruction::PtrToInt:
case Instruction::IntToPtr:
case Instruction::SIToFP:
case Instruction::UIToFP:
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::BitCast: {
if (I->getOpcode() == Instruction::Trunc &&
Legal->isInductionVariable(I->getOperand(0)))
return TTI.getCastInstrCost(I->getOpcode(), I->getType(),
I->getOperand(0)->getType());
Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
}
case Instruction::Call: {
CallInst *CI = cast<CallInst>(I);
Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
assert(ID && "Not an intrinsic call!");
Type *RetTy = ToVectorTy(CI->getType(), VF);
SmallVector<Type*, 4> Tys;
for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF));
return TTI.getIntrinsicInstrCost(ID, RetTy, Tys);
}
default: {
unsigned Cost = 0;
if (!RetTy->isVoidTy() && VF != 1) {
unsigned InsCost = TTI.getVectorInstrCost(Instruction::InsertElement,
VectorTy);
unsigned ExtCost = TTI.getVectorInstrCost(Instruction::ExtractElement,
VectorTy);
Cost += VF * (InsCost + ExtCost * I->getNumOperands());
}
Cost += VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy);
return Cost;
}
}}
Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) {
if (Scalar->isVoidTy() || VF == 1)
return Scalar;
return VectorType::get(Scalar, VF);
}
char LoopVectorize::ID = 0;
static const char lv_name[] = "Loop Vectorization";
INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
namespace llvm {
Pass *createLoopVectorizePass() {
return new LoopVectorize();
}
}
bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
StoreInst *ST = dyn_cast<StoreInst>(Inst);
if (ST)
return Legal->isConsecutivePtr(ST->getPointerOperand()) != 0;
LoadInst *LI = dyn_cast<LoadInst>(Inst);
if (LI)
return Legal->isConsecutivePtr(LI->getPointerOperand()) != 0;
return false;
}