[VPlan] Compute cost for predicated loads/stores to invariant address. #181572
Conversation
Update VPReplicateRecipe::computeCost to compute the cost of predicated loads and stores to invariant addresses that are masked only by the header mask. This matches the legacy cost model's logic, but it is slightly odd that the legacy cost model only seems to do this for stores predicated by the header mask (i.e. under tail folding, not when the store is executed conditionally for other reasons). This is probably something we want to re-evaluate eventually.
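For context, a minimal sketch of a loop that hits this case (illustrative only; the function and variable names are assumptions, not taken from the patch): the store address is loop-invariant, so when the loop is tail-folded the only mask on the store is the header mask, and it can be costed as a uniform memory operation rather than a fully scalarized, predicated one.

// Illustrative sketch, not part of the patch: every iteration stores to the
// same loop-invariant address. Under tail folding this store is predicated
// solely by the header mask, which is the case the new cost path handles.
void storeToInvariantAddress(int *Dst, const int *Src, int N) {
  for (int I = 0; I < N; ++I)
    *Dst = Src[I]; // store to a loop-invariant address
}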
@llvm/pr-subscribers-llvm-transforms Author: Florian Hahn (fhahn) Changes: Update VPReplicateRecipe::computeCost to compute the cost of predicated loads and stores to invariant addresses that are masked only by the header mask. This matches the legacy cost model's logic, but it is slightly odd that the legacy cost model only seems to do this for stores predicated by the header mask (i.e. under tail folding, not when the store is executed conditionally for other reasons). This is probably something we want to re-evaluate eventually. Full diff: https://github.com/llvm/llvm-project/pull/181572.diff (1 file affected):
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index bab1fc773fa4b..50e564a97fb6c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3379,6 +3379,48 @@ static bool isUsedByLoadStoreAddress(const VPUser *V) {
   return false;
 }
 
+/// Return the cost for a predicated load/store with loop-invariant address only
+/// masked by the header mask.
+static InstructionCost
+getPredicatedUniformLoadStoreCost(const VPReplicateRecipe *RepR,
+                                  const SCEV *PtrSCEV, ElementCount VF,
+                                  VPCostContext &Ctx) {
+  if (!Ctx.PSE.getSE()->isLoopInvariant(PtrSCEV, Ctx.L))
+    return InstructionCost::getInvalid();
+
+  const VPRegionBlock *ParentRegion = RepR->getParent()->getParent();
+  auto *BOM =
+      cast<VPBranchOnMaskRecipe>(&ParentRegion->getEntryBasicBlock()->front());
+  if (!vputils::isHeaderMask(BOM->getOperand(0), *ParentRegion->getPlan()))
+    return InstructionCost::getInvalid();
+
+  bool IsLoad = RepR->getOpcode() == Instruction::Load;
+  Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? RepR : RepR->getOperand(0));
+  const VPValue *PtrOp = RepR->getOperand(!IsLoad);
+  Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp);
+  const Align Alignment = getLoadStoreAlignment(RepR->getUnderlyingInstr());
+  unsigned AS = cast<PointerType>(ScalarPtrTy)->getAddressSpace();
+
+  // Uniform mem op cost, matches getUniformMemOpCost.
+  InstructionCost UniformCost =
+      Ctx.TTI.getAddressComputationCost(ScalarPtrTy, nullptr, nullptr,
+                                        Ctx.CostKind) +
+      Ctx.TTI.getMemoryOpCost(RepR->getOpcode(), ValTy, Alignment, AS,
+                              Ctx.CostKind);
+  auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
+  if (IsLoad) {
+    // Load: scalar load + broadcast.
+    UniformCost += Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast,
+                                          VectorTy, VectorTy, {}, Ctx.CostKind);
+  } else {
+    VPValue *StoredVal = RepR->getOperand(0);
+    if (!StoredVal->isDefinedOutsideLoopRegions())
+      UniformCost += Ctx.TTI.getIndexedVectorInstrCostFromEnd(
+          Instruction::ExtractElement, VectorTy, Ctx.CostKind, 0);
+  }
+  return UniformCost;
+}
+
 InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
                                                VPCostContext &Ctx) const {
   Instruction *UI = cast<Instruction>(getUnderlyingValue());
@@ -3557,10 +3599,13 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
 
     const VPRegionBlock *ParentRegion = getRegion();
     if (ParentRegion && ParentRegion->isReplicator()) {
-      // TODO: Handle loop-invariant pointers in predicated blocks. For now,
-      // fall back to the legacy cost model.
-      if (!PtrSCEV || Ctx.PSE.getSE()->isLoopInvariant(PtrSCEV, Ctx.L))
+      if (!PtrSCEV)
         break;
+      InstructionCost UniformCost =
+          getPredicatedUniformLoadStoreCost(this, PtrSCEV, VF, Ctx);
+      if (UniformCost.isValid())
+        return UniformCost;
+
       Cost /= Ctx.getPredBlockCostDivisor(UI->getParent());
       Cost += Ctx.TTI.getCFInstrCost(Instruction::Br, Ctx.CostKind);
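Taken together, the helper above prices the uniform case as: address computation plus one scalar memory op, plus a broadcast for loads (scalar load splatted to a vector) or, for stores whose value is defined inside the loop region, an extract of the last lane. A conceptual sketch of the store case follows; it is plain C++ for illustration only, all names are assumptions, and none of it is LLVM API.

#include <array>
#include <cstddef>

// Conceptual sketch (illustrative assumption, not from the patch): per vector
// iteration of a tail-folded loop, a store to an invariant address reduces to
// taking the last active lane of the stored vector and issuing one scalar
// store, which is what the new cost path charges for.
template <std::size_t VF>
void uniformPredicatedStore(int *InvariantDst,
                            const std::array<int, VF> &StoredVec,
                            std::size_t ActiveLanes) {
  if (ActiveLanes == 0)
    return; // header mask all-false: nothing to store this iteration
  // Extract from the end of the active lanes, then one scalar store.
  *InvariantDst = StoredVec[ActiveLanes - 1];
}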