llvm
diff --git a/‎llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Lines changed: 15 additions & 19 deletions b/‎llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Lines changed: 15 additions & 19 deletions
@@ -23589,17 +23589,16 @@ static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
   return DAG.getMergeValues({Extract, TokenFactor}, DL);
 }
 
-// Replace scalable loads with fixed loads when vscale_range(1, 1).
+// Replace packed scalable loads with fixed loads when vscale_range(1, 1).
 // This enables further optimisations such as LDP folds.
 static SDValue combineVScale1Load(LoadSDNode *LD, SelectionDAG &DAG,
+                                  TargetLowering::DAGCombinerInfo &DCI,
                                   const AArch64Subtarget *Subtarget) {
   EVT MemVT = LD->getMemoryVT();
-  if (!Subtarget->isNeonAvailable() || !MemVT.isScalableVector() ||
-      Subtarget->getMaxSVEVectorSizeInBits() != AArch64::SVEBitsPerBlock)
-    return SDValue();
-
-  // Skip unpacked types given their different layouts between Neon and SVE.
-  if (MemVT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
+  if (!DCI.isBeforeLegalize() || !Subtarget->hasNEON() ||
+      !MemVT.isScalableVector() || LD->getExtensionType() != ISD::NON_EXTLOAD ||
+      MemVT.getSizeInBits().getKnownMinValue() != 128 ||
+      Subtarget->getMaxSVEVectorSizeInBits() != 128)
     return SDValue();
 
   SDLoc DL(LD);
@@ -23609,9 +23608,7 @@ static SDValue combineVScale1Load(LoadSDNode *LD, SelectionDAG &DAG,
       NewVT, DL, LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
       LD->getOriginalAlign(), LD->getMemOperand()->getFlags(), LD->getAAInfo());
   SDValue Insert = convertToScalableVector(DAG, MemVT, NewLoad);
-  SDValue TokenFactor = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
-                                    {SDValue(cast<SDNode>(NewLoad), 1)});
-  return DAG.getMergeValues({Insert, TokenFactor}, DL);
+  return DAG.getMergeValues({Insert, SDValue(cast<SDNode>(NewLoad), 1)}, DL);
 }
 
 // Perform TBI simplification if supported by the target and try to break up
@@ -23651,7 +23648,7 @@ static SDValue performLOADCombine(SDNode *N,
   if (SDValue Res = combineV3I8LoadExt(LD, DAG))
     return Res;
 
-  if (SDValue Res = combineVScale1Load(LD, DAG, Subtarget))
+  if (SDValue Res = combineVScale1Load(LD, DAG, DCI, Subtarget))
     return Res;
 
   if (!LD->isNonTemporal())
@@ -23912,18 +23909,17 @@ static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
   return Chain;
 }
 
-// Replace scalable stores with fixed stores when vscale_range(1, 1).
+// Replace packed scalable stores with fixed stores when vscale_range(1, 1).
 static SDValue combineVScale1Store(StoreSDNode *ST, SelectionDAG &DAG,
+                                   TargetLowering::DAGCombinerInfo &DCI,
                                    const AArch64Subtarget *Subtarget) {
   SDValue Value = ST->getValue();
   EVT ValueVT = Value.getValueType();
   if (ST->isVolatile() || !Subtarget->isLittleEndian() ||
-      !Subtarget->isNeonAvailable() || !ValueVT.isScalableVector() ||
-      Subtarget->getMaxSVEVectorSizeInBits() != AArch64::SVEBitsPerBlock)
-    return SDValue();
-
-  // Skip unpacked types given their different layouts between Neon and SVE.
-  if (ValueVT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
+      !DCI.isBeforeLegalize() || !Subtarget->hasNEON() ||
+      !ValueVT.isScalableVector() || ST->isTruncatingStore() ||
+      ValueVT.getSizeInBits().getKnownMinValue() != 128 ||
+      Subtarget->getMaxSVEVectorSizeInBits() != 128)
     return SDValue();
 
   SDLoc DL(ST);
@@ -23970,7 +23966,7 @@ static SDValue performSTORECombine(SDNode *N,
   if (SDValue Res = combineI8TruncStore(ST, DAG, Subtarget))
     return Res;
 
-  if (SDValue Res = combineVScale1Store(ST, DAG, Subtarget))
+  if (SDValue Res = combineVScale1Store(ST, DAG, DCI, Subtarget))
     return Res;
 
   // If this is an FP_ROUND followed by a store, fold this into a truncating