From ad00fd02442cf9c0f00c4562635738042b521efa Mon Sep 17 00:00:00 2001
From: LDj3SNuD <35856442+LDj3SNuD@users.noreply.github.com>
Date: Sun, 2 Dec 2018 01:34:43 +0100
Subject: [PATCH] Fix Sshl_V; Add S/Uqrshl_V, S/Uqshl_V, S/Urshl_V; Add Tests.
 (#516)

* Update OpCodeTable.cs

* Update InstEmitSimdShift.cs

* Update SoftFallback.cs

* Update CpuTestSimdReg.cs

* Nit.

* Update SoftFallback.cs

* Update Optimizations.cs

* Update InstEmitSimdLogical.cs

* Update InstEmitSimdArithmetic.cs
---
 ChocolArm64/Decoders/OpCodeSimdFcond64.cs     |   4 +-
 .../Instructions/InstEmitSimdArithmetic.cs    |  29 +-
 .../Instructions/InstEmitSimdLogical.cs       | 255 ++++++++++++----
 ChocolArm64/Instructions/InstEmitSimdShift.cs | 273 +++++++++++++----
 ChocolArm64/Instructions/SoftFallback.cs      | 279 +++++++++++++++++-
 ChocolArm64/OpCodeTable.cs                    |   6 +
 ChocolArm64/Optimizations.cs                  |   2 +
 Ryujinx.Tests/Cpu/CpuTestSimdReg.cs           |  74 +++++
 8 files changed, 790 insertions(+), 132 deletions(-)

diff --git a/ChocolArm64/Decoders/OpCodeSimdFcond64.cs b/ChocolArm64/Decoders/OpCodeSimdFcond64.cs
index b0f1c0eb..f805b3c1 100644
--- a/ChocolArm64/Decoders/OpCodeSimdFcond64.cs
+++ b/ChocolArm64/Decoders/OpCodeSimdFcond64.cs
@@ -10,8 +10,8 @@ namespace ChocolArm64.Decoders
 
         public OpCodeSimdFcond64(Inst inst, long position, int opCode) : base(inst, position, opCode)
         {
-            Nzcv =         (opCode >>  0) & 0xf;
+            Nzcv =        (opCode >>  0) & 0xf;
             Cond = (Cond)((opCode >> 12) & 0xf);
         }
     }
-}
\ No newline at end of file
+}
diff --git a/ChocolArm64/Instructions/InstEmitSimdArithmetic.cs b/ChocolArm64/Instructions/InstEmitSimdArithmetic.cs
index c05e9f94..df84596b 100644
--- a/ChocolArm64/Instructions/InstEmitSimdArithmetic.cs
+++ b/ChocolArm64/Instructions/InstEmitSimdArithmetic.cs
@@ -1638,7 +1638,34 @@ namespace ChocolArm64.Instructions
 
         public static void Neg_V(ILEmitterCtx context)
         {
-            EmitVectorUnaryOpSx(context, () => context.Emit(OpCodes.Neg));
+            if (Optimizations.UseSse2) // SSE2 path: negate by emitting Sse2.Subtract(zero, Rn).
+            {
+                OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+                Type[] typesSub = new Type[] { VectorIntTypesPerSizeLog2[op.Size], VectorIntTypesPerSizeLog2[op.Size] };
+
+                string[] namesSzv = new string[] { nameof(VectorHelper.VectorSByteZero),
+                                                   nameof(VectorHelper.VectorInt16Zero),
+                                                   nameof(VectorHelper.VectorInt32Zero),
+                                                   nameof(VectorHelper.VectorInt64Zero) }; // zero-vector helper names, indexed by op.Size
+
+                VectorHelper.EmitCall(context, namesSzv[op.Size]); // first Subtract operand: the zero vector
+
+                EmitLdvecWithSignedCast(context, op.Rn, op.Size); // second Subtract operand: Rn
+
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Subtract), typesSub)); // zero - Rn
+
+                EmitStvecWithSignedCast(context, op.Rd, op.Size);
+
+                if (op.RegisterSize == RegisterSize.Simd64)
+                {
+                    EmitVectorZeroUpper(context, op.Rd);
+                }
+            }
+            else // SSE2 disabled: keep the previous EmitVectorUnaryOpSx + IL Neg implementation.
+            {
+                EmitVectorUnaryOpSx(context, () => context.Emit(OpCodes.Neg));
+            }
         }
 
         public static void Raddhn_V(ILEmitterCtx context)
diff --git a/ChocolArm64/Instructions/InstEmitSimdLogical.cs b/ChocolArm64/Instructions/InstEmitSimdLogical.cs
index f51568eb..3473fc5d 100644
--- a/ChocolArm64/Instructions/InstEmitSimdLogical.cs
+++ b/ChocolArm64/Instructions/InstEmitSimdLogical.cs
@@ -3,6 +3,7 @@ using ChocolArm64.State;
 using ChocolArm64.Translation;
 using System;
 using System.Reflection.Emit;
+using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.X86;
 
 using static ChocolArm64.Instructions.InstEmitSimdHelper;
@@ -29,18 +30,14 @@ namespace ChocolArm64.Instructions
             {
                 OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
 
-                EmitLdvecWithUnsignedCast(context, op.Rm, op.Size);
-                EmitLdvecWithUnsignedCast(context, op.Rn, op.Size);
+                Type[] typesAndNot = new Type[] { typeof(Vector128<byte>), typeof(Vector128<byte>) };
 
-                Type[] types = new Type[]
-                {
-                    VectorUIntTypesPerSizeLog2[op.Size],
-                    VectorUIntTypesPerSizeLog2[op.Size]
-                };
+                EmitLdvecWithUnsignedCast(context, op.Rm, 0);
+                EmitLdvecWithUnsignedCast(context, op.Rn, 0);
 
-                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.AndNot), types));
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.AndNot), typesAndNot));
 
-                EmitStvecWithUnsignedCast(context, op.Rd, op.Size);
+                EmitStvecWithUnsignedCast(context, op.Rd, 0);
 
                 if (op.RegisterSize == RegisterSize.Simd64)
                 {
@@ -68,41 +65,34 @@ namespace ChocolArm64.Instructions
 
         public static void Bif_V(ILEmitterCtx context)
         {
-            EmitBitBif(context, true);
+            EmitBifBit(context, notRm: true);
         }
 
         public static void Bit_V(ILEmitterCtx context)
         {
-            EmitBitBif(context, false);
+            EmitBifBit(context, notRm: false);
         }
 
-        private static void EmitBitBif(ILEmitterCtx context, bool notRm)
+        private static void EmitBifBit(ILEmitterCtx context, bool notRm)
         {
             OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
 
             if (Optimizations.UseSse2)
             {
-                Type[] types = new Type[]
-                {
-                    VectorUIntTypesPerSizeLog2[op.Size],
-                    VectorUIntTypesPerSizeLog2[op.Size]
-                };
+                Type[] typesXorAndNot = new Type[] { typeof(Vector128<byte>), typeof(Vector128<byte>) };
 
-                EmitLdvecWithUnsignedCast(context, op.Rm, op.Size);
-                EmitLdvecWithUnsignedCast(context, op.Rd, op.Size);
-                EmitLdvecWithUnsignedCast(context, op.Rn, op.Size);
+                string nameAndNot = notRm ? nameof(Sse2.AndNot) : nameof(Sse2.And);
 
-                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Xor), types));
+                EmitLdvecWithUnsignedCast(context, op.Rd, 0);
+                EmitLdvecWithUnsignedCast(context, op.Rm, 0);
+                EmitLdvecWithUnsignedCast(context, op.Rn, 0);
+                EmitLdvecWithUnsignedCast(context, op.Rd, 0);
 
-                string name = notRm ? nameof(Sse2.AndNot) : nameof(Sse2.And);
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Xor), typesXorAndNot));
+                context.EmitCall(typeof(Sse2).GetMethod(nameAndNot,       typesXorAndNot));
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Xor), typesXorAndNot));
 
-                context.EmitCall(typeof(Sse2).GetMethod(name, types));
-
-                EmitLdvecWithUnsignedCast(context, op.Rd, op.Size);
-
-                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Xor), types));
-
-                EmitStvecWithUnsignedCast(context, op.Rd, op.Size);
+                EmitStvecWithUnsignedCast(context, op.Rd, 0);
 
                 if (op.RegisterSize == RegisterSize.Simd64)
                 {
@@ -111,17 +101,18 @@ namespace ChocolArm64.Instructions
             }
             else
             {
-                int bytes = op.GetBitsCount() >> 3;
-                int elems = bytes >> op.Size;
+                int elems = op.RegisterSize == RegisterSize.Simd128 ? 2 : 1;
 
                 for (int index = 0; index < elems; index++)
                 {
-                    EmitVectorExtractZx(context, op.Rd, index, op.Size);
-                    EmitVectorExtractZx(context, op.Rn, index, op.Size);
+                    EmitVectorExtractZx(context, op.Rd, index, 3);
+                    context.Emit(OpCodes.Dup);
+
+                    EmitVectorExtractZx(context, op.Rn, index, 3);
 
                     context.Emit(OpCodes.Xor);
 
-                    EmitVectorExtractZx(context, op.Rm, index, op.Size);
+                    EmitVectorExtractZx(context, op.Rm, index, 3);
 
                     if (notRm)
                     {
@@ -130,11 +121,9 @@ namespace ChocolArm64.Instructions
 
                     context.Emit(OpCodes.And);
 
-                    EmitVectorExtractZx(context, op.Rd, index, op.Size);
-
                     context.Emit(OpCodes.Xor);
 
-                    EmitVectorInsert(context, op.Rd, index, op.Size);
+                    EmitVectorInsert(context, op.Rd, index, 3);
                 }
 
                 if (op.RegisterSize == RegisterSize.Simd64)
@@ -150,26 +139,22 @@ namespace ChocolArm64.Instructions
             {
                 OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
 
-                Type[] types = new Type[]
-                {
-                    VectorUIntTypesPerSizeLog2[op.Size],
-                    VectorUIntTypesPerSizeLog2[op.Size]
-                };
+                Type[] typesXorAnd = new Type[] { typeof(Vector128<byte>), typeof(Vector128<byte>) };
 
-                EmitLdvecWithUnsignedCast(context, op.Rn, op.Size);
-                EmitLdvecWithUnsignedCast(context, op.Rm, op.Size);
+                EmitLdvecWithUnsignedCast(context, op.Rm, 0);
+                context.Emit(OpCodes.Dup);
 
-                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Xor), types));
+                EmitLdvecWithUnsignedCast(context, op.Rn, 0);
 
-                EmitLdvecWithUnsignedCast(context, op.Rd, op.Size);
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Xor), typesXorAnd));
 
-                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.And), types));
+                EmitLdvecWithUnsignedCast(context, op.Rd, 0);
 
-                EmitLdvecWithUnsignedCast(context, op.Rm, op.Size);
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.And), typesXorAnd));
 
-                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Xor), types));
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Xor), typesXorAnd));
 
-                EmitStvecWithUnsignedCast(context, op.Rd, op.Size);
+                EmitStvecWithUnsignedCast(context, op.Rd, 0);
 
                 if (op.RegisterSize == RegisterSize.Simd64)
                 {
@@ -207,16 +192,66 @@ namespace ChocolArm64.Instructions
 
         public static void Not_V(ILEmitterCtx context)
         {
-            EmitVectorUnaryOpZx(context, () => context.Emit(OpCodes.Not));
+            if (Optimizations.UseSse2) // SSE2 path: bitwise NOT built from AndNot against an all-ones vector.
+            {
+                OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+                Type[] typesSav    = new Type[] { typeof(byte) };
+                Type[] typesAndNot = new Type[] { typeof(Vector128<byte>), typeof(Vector128<byte>) };
+
+                EmitLdvecWithUnsignedCast(context, op.Rn, 0); // first AndNot operand (the one AndNot complements)
+
+                context.EmitLdc_I4(byte.MaxValue);
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), typesSav)); // second operand: every byte set to 0xFF
+
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.AndNot), typesAndNot)); // ~Rn & all-ones == ~Rn
+
+                EmitStvecWithUnsignedCast(context, op.Rd, 0);
+
+                if (op.RegisterSize == RegisterSize.Simd64)
+                {
+                    EmitVectorZeroUpper(context, op.Rd);
+                }
+            }
+            else // SSE2 disabled: keep the previous EmitVectorUnaryOpZx + IL Not implementation.
+            {
+                EmitVectorUnaryOpZx(context, () => context.Emit(OpCodes.Not));
+            }
         }
 
         public static void Orn_V(ILEmitterCtx context)
         {
-            EmitVectorBinaryOpZx(context, () =>
+            if (Optimizations.UseSse2) // SSE2 path: Rn | ~Rm, with ~Rm built from AndNot against an all-ones vector.
             {
-                context.Emit(OpCodes.Not);
-                context.Emit(OpCodes.Or);
-            });
+                OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+                Type[] typesSav      = new Type[] { typeof(byte) };
+                Type[] typesAndNotOr = new Type[] { typeof(Vector128<byte>), typeof(Vector128<byte>) };
+
+                EmitLdvecWithUnsignedCast(context, op.Rn, 0); // stays on the stack as the first Or operand
+                EmitLdvecWithUnsignedCast(context, op.Rm, 0); // first AndNot operand (the one AndNot complements)
+
+                context.EmitLdc_I4(byte.MaxValue);
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), typesSav)); // second AndNot operand: every byte set to 0xFF
+
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.AndNot), typesAndNotOr)); // ~Rm & all-ones == ~Rm
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Or),     typesAndNotOr)); // Rn | ~Rm
+
+                EmitStvecWithUnsignedCast(context, op.Rd, 0);
+
+                if (op.RegisterSize == RegisterSize.Simd64)
+                {
+                    EmitVectorZeroUpper(context, op.Rd);
+                }
+            }
+            else // SSE2 disabled: keep the previous EmitVectorBinaryOpZx + IL Not/Or implementation.
+            {
+                EmitVectorBinaryOpZx(context, () =>
+                {
+                    context.Emit(OpCodes.Not);
+                    context.Emit(OpCodes.Or);
+                });
+            }
         }
 
         public static void Orr_V(ILEmitterCtx context)
@@ -263,28 +298,122 @@ namespace ChocolArm64.Instructions
 
         public static void Rev16_V(ILEmitterCtx context)
         {
-            EmitRev_V(context, containerSize: 1);
+            if (Optimizations.UseSsse3) // SSSE3 path: a single Ssse3.Shuffle with a constant byte-index mask.
+            {
+                OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+                Type[] typesSve = new Type[] { typeof(long), typeof(long) };
+                Type[] typesSfl = new Type[] { typeof(Vector128<sbyte>), typeof(Vector128<sbyte>) };
+
+                EmitLdvecWithSignedCast(context, op.Rn, 0); // value
+
+                context.EmitLdc_I8(14L << 56 | 15L << 48 | 12L << 40 | 13L << 32 | 10L << 24 | 11L << 16 | 08L << 8 | 09L << 0); // maskE1: source indices for result bytes 8..15 (each byte pair swapped)
+                context.EmitLdc_I8(06L << 56 | 07L << 48 | 04L << 40 | 05L << 32 | 02L << 24 | 03L << 16 | 00L << 8 | 01L << 0); // maskE0: source indices for result bytes 0..7 (each byte pair swapped)
+
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetVector128), typesSve)); // combine maskE1 (upper) and maskE0 (lower) into the shuffle mask
+
+                context.EmitCall(typeof(Ssse3).GetMethod(nameof(Ssse3.Shuffle), typesSfl));
+
+                EmitStvecWithSignedCast(context, op.Rd, 0);
+
+                if (op.RegisterSize == RegisterSize.Simd64)
+                {
+                    EmitVectorZeroUpper(context, op.Rd);
+                }
+            }
+            else // SSSE3 disabled: keep the previous EmitRev_V implementation.
+            {
+                EmitRev_V(context, containerSize: 1);
+            }
         }
 
         public static void Rev32_V(ILEmitterCtx context)
         {
-            EmitRev_V(context, containerSize: 2);
+            if (Optimizations.UseSsse3) // SSSE3 path: a single Ssse3.Shuffle with a constant byte-index mask chosen by op.Size.
+            {
+                OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+                Type[] typesSve = new Type[] { typeof(long), typeof(long) };
+                Type[] typesSfl = new Type[] { typeof(Vector128<sbyte>), typeof(Vector128<sbyte>) };
+
+                EmitLdvecWithSignedCast(context, op.Rn, 0); // value; cast with size 0 (as Rev16_V does) so it matches the Vector128<sbyte> overload in typesSfl
+
+                if (op.Size == 0) // byte elements: reverse the 4 bytes inside every 32-bit container
+                {
+                    context.EmitLdc_I8(12L << 56 | 13L << 48 | 14L << 40 | 15L << 32 | 08L << 24 | 09L << 16 | 10L << 8 | 11L << 0); // maskE1
+                    context.EmitLdc_I8(04L << 56 | 05L << 48 | 06L << 40 | 07L << 32 | 00L << 24 | 01L << 16 | 02L << 8 | 03L << 0); // maskE0
+                }
+                else /* if (op.Size == 1) */ // halfword elements: swap the 2 halfwords inside every 32-bit container
+                {
+                    context.EmitLdc_I8(13L << 56 | 12L << 48 | 15L << 40 | 14L << 32 | 09L << 24 | 08L << 16 | 11L << 8 | 10L << 0); // maskE1
+                    context.EmitLdc_I8(05L << 56 | 04L << 48 | 07L << 40 | 06L << 32 | 01L << 24 | 00L << 16 | 03L << 8 | 02L << 0); // maskE0
+                }
+
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetVector128), typesSve)); // combine maskE1 (upper) and maskE0 (lower) into the shuffle mask
+
+                context.EmitCall(typeof(Ssse3).GetMethod(nameof(Ssse3.Shuffle), typesSfl));
+
+                EmitStvecWithSignedCast(context, op.Rd, 0); // the shuffle result is a byte vector, so store it with the matching size-0 cast
+
+                if (op.RegisterSize == RegisterSize.Simd64)
+                {
+                    EmitVectorZeroUpper(context, op.Rd);
+                }
+            }
+            else // SSSE3 disabled: keep the previous EmitRev_V implementation.
+            {
+                EmitRev_V(context, containerSize: 2);
+            }
         }
 
         public static void Rev64_V(ILEmitterCtx context)
         {
-            EmitRev_V(context, containerSize: 3);
+            if (Optimizations.UseSsse3) // SSSE3 path: a single Ssse3.Shuffle with a constant byte-index mask chosen by op.Size.
+            {
+                OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
+
+                Type[] typesSve = new Type[] { typeof(long), typeof(long) };
+                Type[] typesSfl = new Type[] { typeof(Vector128<sbyte>), typeof(Vector128<sbyte>) };
+
+                EmitLdvecWithSignedCast(context, op.Rn, 0); // value; cast with size 0 (as Rev16_V does) so it matches the Vector128<sbyte> overload in typesSfl
+
+                if (op.Size == 0) // byte elements: reverse the 8 bytes inside every 64-bit container
+                {
+                    context.EmitLdc_I8(08L << 56 | 09L << 48 | 10L << 40 | 11L << 32 | 12L << 24 | 13L << 16 | 14L << 8 | 15L << 0); // maskE1
+                    context.EmitLdc_I8(00L << 56 | 01L << 48 | 02L << 40 | 03L << 32 | 04L << 24 | 05L << 16 | 06L << 8 | 07L << 0); // maskE0
+                }
+                else if (op.Size == 1) // halfword elements: reverse the 4 halfwords inside every 64-bit container
+                {
+                    context.EmitLdc_I8(09L << 56 | 08L << 48 | 11L << 40 | 10L << 32 | 13L << 24 | 12L << 16 | 15L << 8 | 14L << 0); // maskE1
+                    context.EmitLdc_I8(01L << 56 | 00L << 48 | 03L << 40 | 02L << 32 | 05L << 24 | 04L << 16 | 07L << 8 | 06L << 0); // maskE0
+                }
+                else /* if (op.Size == 2) */ // word elements: swap the 2 words inside every 64-bit container
+                {
+                    context.EmitLdc_I8(11L << 56 | 10L << 48 | 09L << 40 | 08L << 32 | 15L << 24 | 14L << 16 | 13L << 8 | 12L << 0); // maskE1
+                    context.EmitLdc_I8(03L << 56 | 02L << 48 | 01L << 40 | 00L << 32 | 07L << 24 | 06L << 16 | 05L << 8 | 04L << 0); // maskE0
+                }
+
+                context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetVector128), typesSve)); // combine maskE1 (upper) and maskE0 (lower) into the shuffle mask
+
+                context.EmitCall(typeof(Ssse3).GetMethod(nameof(Ssse3.Shuffle), typesSfl));
+
+                EmitStvecWithSignedCast(context, op.Rd, 0); // the shuffle result is a byte vector, so store it with the matching size-0 cast
+
+                if (op.RegisterSize == RegisterSize.Simd64)
+                {
+                    EmitVectorZeroUpper(context, op.Rd);
+                }
+            }
+            else // SSSE3 disabled: keep the previous EmitRev_V implementation.
+            {
+                EmitRev_V(context, containerSize: 3);
+            }
         }
 
         private static void EmitRev_V(ILEmitterCtx context, int containerSize)
         {
             OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
 
-            if (op.Size >= containerSize)
-            {
-                throw new InvalidOperationException();
-            }
-
             int bytes = op.GetBitsCount() >> 3;
             int elems = bytes >> op.Size;
 
diff --git a/ChocolArm64/Instructions/InstEmitSimdShift.cs b/ChocolArm64/Instructions/InstEmitSimdShift.cs
index b183e8aa..5b606167 100644
--- a/ChocolArm64/Instructions/InstEmitSimdShift.cs
+++ b/ChocolArm64/Instructions/InstEmitSimdShift.cs
@@ -110,6 +110,34 @@ namespace ChocolArm64.Instructions
             }
         }
 
+        public static void Sqrshl_V(ILEmitterCtx context) // Emits one SoftFallback.SignedShlRegSatQ call per element, with round = true.
+        {
+            OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+            int bytes = op.GetBitsCount() >> 3;
+            int elems = bytes >> op.Size; // element count = register bytes / element bytes
+
+            for (int index = 0; index < elems; index++)
+            {
+                EmitVectorExtractSx(context, op.Rn, index, op.Size); // helper argument 'value'
+                EmitVectorExtractSx(context, op.Rm, index, op.Size); // helper argument 'shift'
+
+                context.Emit(OpCodes.Ldc_I4_1); // helper argument 'round' = true
+                context.EmitLdc_I4(op.Size); // helper argument 'size' (the helper derives the element width as 8 << size)
+
+                context.EmitLdarg(TranslatedSub.StateArgIdx); // helper argument 'state' (CpuThreadState)
+
+                SoftFallback.EmitCall(context, nameof(SoftFallback.SignedShlRegSatQ));
+
+                EmitVectorInsert(context, op.Rd, index, op.Size);
+            }
+
+            if (op.RegisterSize == RegisterSize.Simd64)
+            {
+                EmitVectorZeroUpper(context, op.Rd);
+            }
+        }
+
         public static void Sqrshrn_S(ILEmitterCtx context)
         {
             EmitRoundShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.ScalarSxSx);
@@ -130,6 +158,34 @@ namespace ChocolArm64.Instructions
             EmitRoundShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.VectorSxZx);
         }
 
+        public static void Sqshl_V(ILEmitterCtx context) // Emits one SoftFallback.SignedShlRegSatQ call per element, with round = false.
+        {
+            OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+            int bytes = op.GetBitsCount() >> 3;
+            int elems = bytes >> op.Size; // element count = register bytes / element bytes
+
+            for (int index = 0; index < elems; index++)
+            {
+                EmitVectorExtractSx(context, op.Rn, index, op.Size); // helper argument 'value'
+                EmitVectorExtractSx(context, op.Rm, index, op.Size); // helper argument 'shift'
+
+                context.Emit(OpCodes.Ldc_I4_0); // helper argument 'round' = false
+                context.EmitLdc_I4(op.Size); // helper argument 'size' (the helper derives the element width as 8 << size)
+
+                context.EmitLdarg(TranslatedSub.StateArgIdx); // helper argument 'state' (CpuThreadState)
+
+                SoftFallback.EmitCall(context, nameof(SoftFallback.SignedShlRegSatQ));
+
+                EmitVectorInsert(context, op.Rd, index, op.Size);
+            }
+
+            if (op.RegisterSize == RegisterSize.Simd64)
+            {
+                EmitVectorZeroUpper(context, op.Rd);
+            }
+        }
+
         public static void Sqshrn_S(ILEmitterCtx context)
         {
             EmitShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.ScalarSxSx);
@@ -150,6 +206,32 @@ namespace ChocolArm64.Instructions
             EmitShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.VectorSxZx);
         }
 
+        public static void Srshl_V(ILEmitterCtx context) // Emits one SoftFallback.SignedShlReg call per element, with round = true.
+        {
+            OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+            int bytes = op.GetBitsCount() >> 3;
+            int elems = bytes >> op.Size; // element count = register bytes / element bytes
+
+            for (int index = 0; index < elems; index++)
+            {
+                EmitVectorExtractSx(context, op.Rn, index, op.Size); // helper argument 'value'
+                EmitVectorExtractSx(context, op.Rm, index, op.Size); // helper argument 'shift'
+
+                context.Emit(OpCodes.Ldc_I4_1); // helper argument 'round' = true
+                context.EmitLdc_I4(op.Size); // helper argument 'size' (the helper derives the element width as 8 << size)
+
+                SoftFallback.EmitCall(context, nameof(SoftFallback.SignedShlReg));
+
+                EmitVectorInsert(context, op.Rd, index, op.Size);
+            }
+
+            if (op.RegisterSize == RegisterSize.Simd64)
+            {
+                EmitVectorZeroUpper(context, op.Rd);
+            }
+        }
+
         public static void Srshr_S(ILEmitterCtx context)
         {
             EmitScalarShrImmOpSx(context, ShrImmFlags.Round);
@@ -252,7 +334,28 @@ namespace ChocolArm64.Instructions
 
         public static void Sshl_V(ILEmitterCtx context)
         {
-            EmitVectorShl(context, signed: true);
+            OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp; // Replaces EmitVectorShl: one SoftFallback.SignedShlReg call per element, round = false.
+
+            int bytes = op.GetBitsCount() >> 3;
+            int elems = bytes >> op.Size; // element count = register bytes / element bytes
+
+            for (int index = 0; index < elems; index++)
+            {
+                EmitVectorExtractSx(context, op.Rn, index, op.Size); // helper argument 'value'
+                EmitVectorExtractSx(context, op.Rm, index, op.Size); // helper argument 'shift'
+
+                context.Emit(OpCodes.Ldc_I4_0); // helper argument 'round' = false
+                context.EmitLdc_I4(op.Size); // helper argument 'size' (the helper derives the element width as 8 << size)
+
+                SoftFallback.EmitCall(context, nameof(SoftFallback.SignedShlReg));
+
+                EmitVectorInsert(context, op.Rd, index, op.Size);
+            }
+
+            if (op.RegisterSize == RegisterSize.Simd64)
+            {
+                EmitVectorZeroUpper(context, op.Rd);
+            }
         }
 
         public static void Sshll_V(ILEmitterCtx context)
@@ -330,6 +433,34 @@ namespace ChocolArm64.Instructions
             }
         }
 
+        public static void Uqrshl_V(ILEmitterCtx context) // Emits one SoftFallback.UnsignedShlRegSatQ call per element, with the third argument set to 1.
+        {
+            OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+            int bytes = op.GetBitsCount() >> 3;
+            int elems = bytes >> op.Size; // element count = register bytes / element bytes
+
+            for (int index = 0; index < elems; index++)
+            {
+                EmitVectorExtractZx(context, op.Rn, index, op.Size); // first helper argument, taken from Rn
+                EmitVectorExtractZx(context, op.Rm, index, op.Size); // second helper argument, taken from Rm
+
+                context.Emit(OpCodes.Ldc_I4_1); // presumably 'round' = true, mirroring SignedShlRegSatQ - confirm against UnsignedShlRegSatQ
+                context.EmitLdc_I4(op.Size); // presumably the 'size' argument, mirroring SignedShlRegSatQ - confirm
+
+                context.EmitLdarg(TranslatedSub.StateArgIdx); // presumably the CpuThreadState argument, mirroring SignedShlRegSatQ - confirm
+
+                SoftFallback.EmitCall(context, nameof(SoftFallback.UnsignedShlRegSatQ));
+
+                EmitVectorInsert(context, op.Rd, index, op.Size);
+            }
+
+            if (op.RegisterSize == RegisterSize.Simd64)
+            {
+                EmitVectorZeroUpper(context, op.Rd);
+            }
+        }
+
         public static void Uqrshrn_S(ILEmitterCtx context)
         {
             EmitRoundShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.ScalarZxZx);
@@ -340,6 +471,34 @@ namespace ChocolArm64.Instructions
             EmitRoundShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.VectorZxZx);
         }
 
+        public static void Uqshl_V(ILEmitterCtx context) // Emits one SoftFallback.UnsignedShlRegSatQ call per element, with the third argument set to 0.
+        {
+            OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+            int bytes = op.GetBitsCount() >> 3;
+            int elems = bytes >> op.Size; // element count = register bytes / element bytes
+
+            for (int index = 0; index < elems; index++)
+            {
+                EmitVectorExtractZx(context, op.Rn, index, op.Size); // first helper argument, taken from Rn
+                EmitVectorExtractZx(context, op.Rm, index, op.Size); // second helper argument, taken from Rm
+
+                context.Emit(OpCodes.Ldc_I4_0); // presumably 'round' = false, mirroring SignedShlRegSatQ - confirm against UnsignedShlRegSatQ
+                context.EmitLdc_I4(op.Size); // presumably the 'size' argument, mirroring SignedShlRegSatQ - confirm
+
+                context.EmitLdarg(TranslatedSub.StateArgIdx); // presumably the CpuThreadState argument, mirroring SignedShlRegSatQ - confirm
+
+                SoftFallback.EmitCall(context, nameof(SoftFallback.UnsignedShlRegSatQ));
+
+                EmitVectorInsert(context, op.Rd, index, op.Size);
+            }
+
+            if (op.RegisterSize == RegisterSize.Simd64)
+            {
+                EmitVectorZeroUpper(context, op.Rd);
+            }
+        }
+
         public static void Uqshrn_S(ILEmitterCtx context)
         {
             EmitShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.ScalarZxZx);
@@ -350,6 +509,32 @@ namespace ChocolArm64.Instructions
             EmitShrImmSaturatingNarrowOp(context, ShrImmSaturatingNarrowFlags.VectorZxZx);
         }
 
+        public static void Urshl_V(ILEmitterCtx context) // Emits one SoftFallback.UnsignedShlReg call per element, with round = true.
+        {
+            OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp;
+
+            int bytes = op.GetBitsCount() >> 3;
+            int elems = bytes >> op.Size; // element count = register bytes / element bytes
+
+            for (int index = 0; index < elems; index++)
+            {
+                EmitVectorExtractZx(context, op.Rn, index, op.Size); // helper argument 'value'
+                EmitVectorExtractZx(context, op.Rm, index, op.Size); // helper argument 'shift'
+
+                context.Emit(OpCodes.Ldc_I4_1); // helper argument 'round' = true
+                context.EmitLdc_I4(op.Size); // helper argument 'size' (the helper derives the element width as 8 << size)
+
+                SoftFallback.EmitCall(context, nameof(SoftFallback.UnsignedShlReg));
+
+                EmitVectorInsert(context, op.Rd, index, op.Size);
+            }
+
+            if (op.RegisterSize == RegisterSize.Simd64)
+            {
+                EmitVectorZeroUpper(context, op.Rd);
+            }
+        }
+
         public static void Urshr_S(ILEmitterCtx context)
         {
             EmitScalarShrImmOpZx(context, ShrImmFlags.Round);
@@ -450,7 +635,28 @@ namespace ChocolArm64.Instructions
 
         public static void Ushl_V(ILEmitterCtx context)
         {
-            EmitVectorShl(context, signed: false);
+            OpCodeSimdReg64 op = (OpCodeSimdReg64)context.CurrOp; // Replaces EmitVectorShl: one SoftFallback.UnsignedShlReg call per element, round = false.
+
+            int bytes = op.GetBitsCount() >> 3;
+            int elems = bytes >> op.Size; // element count = register bytes / element bytes
+
+            for (int index = 0; index < elems; index++)
+            {
+                EmitVectorExtractZx(context, op.Rn, index, op.Size); // helper argument 'value'
+                EmitVectorExtractZx(context, op.Rm, index, op.Size); // helper argument 'shift'
+
+                context.Emit(OpCodes.Ldc_I4_0); // helper argument 'round' = false
+                context.EmitLdc_I4(op.Size); // helper argument 'size' (the helper derives the element width as 8 << size)
+
+                SoftFallback.EmitCall(context, nameof(SoftFallback.UnsignedShlReg));
+
+                EmitVectorInsert(context, op.Rd, index, op.Size);
+            }
+
+            if (op.RegisterSize == RegisterSize.Simd64)
+            {
+                EmitVectorZeroUpper(context, op.Rd);
+            }
         }
 
         public static void Ushll_V(ILEmitterCtx context)
@@ -526,69 +732,6 @@ namespace ChocolArm64.Instructions
             }
         }
 
-        private static void EmitVectorShl(ILEmitterCtx context, bool signed)
-        {
-            //This instruction shifts the value on vector A by the number of bits
-            //specified on the signed, lower 8 bits of vector B. If the shift value
-            //is greater or equal to the data size of each lane, then the result is zero.
-            //Additionally, negative shifts produces right shifts by the negated shift value.
-            OpCodeSimd64 op = (OpCodeSimd64)context.CurrOp;
-
-            int maxShift = 8 << op.Size;
-
-            Action emit = () =>
-            {
-                ILLabel lblShl  = new ILLabel();
-                ILLabel lblZero = new ILLabel();
-                ILLabel lblEnd  = new ILLabel();
-
-                void EmitShift(OpCode ilOp)
-                {
-                    context.Emit(OpCodes.Dup);
-
-                    context.EmitLdc_I4(maxShift);
-
-                    context.Emit(OpCodes.Bge_S, lblZero);
-                    context.Emit(ilOp);
-                    context.Emit(OpCodes.Br_S, lblEnd);
-                }
-
-                context.Emit(OpCodes.Conv_I1);
-                context.Emit(OpCodes.Dup);
-
-                context.EmitLdc_I4(0);
-
-                context.Emit(OpCodes.Bge_S, lblShl);
-                context.Emit(OpCodes.Neg);
-
-                EmitShift(signed
-                    ? OpCodes.Shr
-                    : OpCodes.Shr_Un);
-
-                context.MarkLabel(lblShl);
-
-                EmitShift(OpCodes.Shl);
-
-                context.MarkLabel(lblZero);
-
-                context.Emit(OpCodes.Pop);
-                context.Emit(OpCodes.Pop);
-
-                context.EmitLdc_I8(0);
-
-                context.MarkLabel(lblEnd);
-            };
-
-            if (signed)
-            {
-                EmitVectorBinaryOpSx(context, emit);
-            }
-            else
-            {
-                EmitVectorBinaryOpZx(context, emit);
-            }
-        }
-
         [Flags]
         private enum ShrImmFlags
         {
diff --git a/ChocolArm64/Instructions/SoftFallback.cs b/ChocolArm64/Instructions/SoftFallback.cs
index 8315395a..def95343 100644
--- a/ChocolArm64/Instructions/SoftFallback.cs
+++ b/ChocolArm64/Instructions/SoftFallback.cs
@@ -16,6 +16,283 @@ namespace ChocolArm64.Instructions
             context.EmitCall(typeof(SoftFallback), mthdName);
         }
 
+#region "ShlReg"
+        public static long SignedShlReg(long value, long shift, bool round, int size) // Shifts value by the signed low byte of shift: positive = left, negative = right (via SignedShrReg).
+        {
+            int eSize = 8 << size; // Element width in bits, derived from the size field.
+
+            int shiftLsB = (sbyte)shift; // Only the least significant byte of shift is used, read as signed: [-128, 127].
+
+            if (shiftLsB < 0)
+            {
+                return SignedShrReg(value, -shiftLsB, round, eSize); // Negated amount is in [1, 128]; round is only consumed on this path.
+            }
+            else if (shiftLsB > 0)
+            {
+                if (shiftLsB >= eSize)
+                {
+                    return 0L; // Explicit guard: the whole element is shifted out (C# would otherwise mask a count of 64 to 0).
+                }
+
+                return value << shiftLsB; // Bits at or above eSize are not masked off here.
+            }
+            else /* if (shiftLsB == 0) */
+            {
+                return value;
+            }
+        }
+
+        public static ulong UnsignedShlReg(ulong value, ulong shift, bool round, int size) // Unsigned counterpart of SignedShlReg: positive amount = left shift, negative = right shift (via UnsignedShrReg).
+        {
+            int eSize = 8 << size; // Element width in bits, derived from the size field.
+
+            int shiftLsB = (sbyte)shift; // The shift amount is still signed: low byte of shift, in [-128, 127].
+
+            if (shiftLsB < 0)
+            {
+                return UnsignedShrReg(value, -shiftLsB, round, eSize); // Negated amount is in [1, 128]; round is only consumed on this path.
+            }
+            else if (shiftLsB > 0)
+            {
+                if (shiftLsB >= eSize)
+                {
+                    return 0UL; // Explicit guard: the whole element is shifted out (C# would otherwise mask a count of 64 to 0).
+                }
+
+                return value << shiftLsB; // Bits at or above eSize are not masked off here.
+            }
+            else /* if (shiftLsB == 0) */
+            {
+                return value;
+            }
+        }
+
+        public static long SignedShlRegSatQ(long value, long shift, bool round, int size, CpuThreadState state) // Saturating variant of SignedShlReg; left-shift overflow clamps the result and sets Fpsr.Qc on state.
+        {
+            int eSize = 8 << size; // Element width in bits, derived from the size field.
+
+            int shiftLsB = (sbyte)shift; // Signed low byte of shift, in [-128, 127].
+
+            if (shiftLsB < 0)
+            {
+                return SignedShrReg(value, -shiftLsB, round, eSize); // Right-shift path: no saturation is applied and state is untouched.
+            }
+            else if (shiftLsB > 0)
+            {
+                if (shiftLsB >= eSize)
+                {
+                    return SignedSignSatQ(value, eSize, state); // Everything is shifted out: any non-zero value saturates according to its sign.
+                }
+
+                if (eSize == 64)
+                {
+                    long shl = value << shiftLsB;
+                    long shr = shl   >> shiftLsB; // Arithmetic shift back; a mismatch with value means significant bits were lost.
+
+                    if (shr != value)
+                    {
+                        return SignedSignSatQ(value, eSize, state);
+                    }
+                    else /* if (shr == value) */
+                    {
+                        return shl;
+                    }
+                }
+                else /* if (eSize != 64) */
+                {
+                    return SignedSrcSignedDstSatQ(value << shiftLsB, size, state); // Helper defined outside this hunk; presumably clamps to the eSize signed range - confirm.
+                }
+            }
+            else /* if (shiftLsB == 0) */
+            {
+                return value;
+            }
+        }
+
+        public static ulong UnsignedShlRegSatQ(ulong value, ulong shift, bool round, int size, CpuThreadState state) // Saturating variant of UnsignedShlReg; left-shift overflow clamps the result and sets Fpsr.Qc on state.
+        {
+            int eSize = 8 << size; // Element width in bits, derived from the size field.
+
+            int shiftLsB = (sbyte)shift; // Signed low byte of shift, in [-128, 127].
+
+            if (shiftLsB < 0)
+            {
+                return UnsignedShrReg(value, -shiftLsB, round, eSize); // Right-shift path: no saturation is applied and state is untouched.
+            }
+            else if (shiftLsB > 0)
+            {
+                if (shiftLsB >= eSize)
+                {
+                    return UnsignedSignSatQ(value, eSize, state); // Everything is shifted out: any non-zero value saturates to the maximum.
+                }
+
+                if (eSize == 64)
+                {
+                    ulong shl = value << shiftLsB;
+                    ulong shr = shl   >> shiftLsB; // Logical shift back; a mismatch with value means set bits were shifted out.
+
+                    if (shr != value)
+                    {
+                        return UnsignedSignSatQ(value, eSize, state);
+                    }
+                    else /* if (shr == value) */
+                    {
+                        return shl;
+                    }
+                }
+                else /* if (eSize != 64) */
+                {
+                    return UnsignedSrcUnsignedDstSatQ(value << shiftLsB, size, state); // Helper defined outside this hunk; presumably clamps to the eSize unsigned range - confirm.
+                }
+            }
+            else /* if (shiftLsB == 0) */
+            {
+                return value;
+            }
+        }
+
+        private static long SignedShrReg(long value, int shift, bool round, int eSize) // shift := [1, 128]; eSize := {8, 16, 32, 64}. Right shift of a signed element, optionally rounded.
+        {
+            if (round)
+            {
+                if (shift >= eSize)
+                {
+                    return 0L; // Rounded result is defined as zero once the shift reaches the element width.
+                }
+
+                long roundConst = 1L << (shift - 1); // Half of the weight of the lowest bit that survives the shift.
+
+                long add = value + roundConst; // May wrap when eSize == 64; handled below.
+
+                if (eSize == 64)
+                {
+                    if ((~value & (value ^ add)) < 0L) // True when value is non-negative but add came out negative, i.e. the addition overflowed.
+                    {
+                        return (long)((ulong)add >> shift); // Treat the wrapped sum as unsigned so the overflowed top bit is kept as magnitude.
+                    }
+                    else
+                    {
+                        return add >> shift;
+                    }
+                }
+                else /* if (eSize != 64) */
+                {
+                    return add >> shift; // No wrap handling on this path; the caller-supplied value is assumed to fit the narrower element - confirm.
+                }
+            }
+            else /* if (!round) */
+            {
+                if (shift >= eSize)
+                {
+                    if (value < 0L) // Shifting out every bit leaves only copies of the sign bit.
+                    {
+                        return -1L;
+                    }
+                    else /* if (value >= 0L) */
+                    {
+                        return 0L;
+                    }
+                }
+
+                return value >> shift; // Arithmetic (sign-propagating) shift.
+            }
+        }
+
+        private static ulong UnsignedShrReg(ulong value, int shift, bool round, int eSize) // shift := [1, 128]; eSize := {8, 16, 32, 64}. Right shift of an unsigned element, optionally rounded.
+        {
+            if (round)
+            {
+                if (shift > 64)
+                {
+                    return 0UL; // Beyond 64 even the rounding carry cannot reach bit 0.
+                }
+
+                ulong roundConst = 1UL << (shift - 1); // Half of the weight of the lowest surviving bit; 1 << 63 when shift == 64.
+
+                ulong add = value + roundConst; // May wrap past 64 bits; only the eSize == 64 path below checks for it.
+
+                if (eSize == 64)
+                {
+                    if ((add < value) && (add < roundConst)) // Unsigned wraparound: the sum is smaller than both operands.
+                    {
+                        if (shift == 64)
+                        {
+                            return 1UL; // Only the carry out of bit 63 survives a shift by 64.
+                        }
+
+                        return (add >> shift) | (0x8000000000000000UL >> (shift - 1)); // Re-insert the lost carry (bit 64) at position 64 - shift.
+                    }
+                    else
+                    {
+                        if (shift == 64)
+                        {
+                            return 0UL; // C# masks the count to 6 bits, so a shift by 64 has to be special-cased.
+                        }
+
+                        return add >> shift;
+                    }
+                }
+                else /* if (eSize != 64) */
+                {
+                    if (shift == 64)
+                    {
+                        return 0UL; // Same count-masking special case as above.
+                    }
+
+                    return add >> shift; // Also covers eSize <= shift < 64 without an explicit guard.
+                }
+            }
+            else /* if (!round) */
+            {
+                if (shift >= eSize)
+                {
+                    return 0UL; // Every bit of the element is shifted out.
+                }
+
+                return value >> shift; // Logical (zero-filling) shift.
+            }
+        }
+
+        private static long SignedSignSatQ(long op, int eSize, CpuThreadState state) // eSize := {8, 16, 32, 64}. Saturates by sign alone: positive -> max, negative -> min, zero -> zero.
+        {
+            long tMaxValue =  (1L << (eSize - 1)) - 1L; // Largest signed eSize-bit value; for eSize == 64 this relies on wrapping arithmetic (assumes an unchecked context - confirm).
+            long tMinValue = -(1L << (eSize - 1)); // Smallest signed eSize-bit value.
+
+            if (op > 0L)
+            {
+                state.SetFpsrFlag(Fpsr.Qc); // Record that saturation occurred.
+
+                return tMaxValue;
+            }
+            else if (op < 0L)
+            {
+                state.SetFpsrFlag(Fpsr.Qc); // Record that saturation occurred.
+
+                return tMinValue;
+            }
+            else
+            {
+                return 0L; // Zero never saturates, so the flag is left untouched.
+            }
+        }
+
+        private static ulong UnsignedSignSatQ(ulong op, int eSize, CpuThreadState state) // eSize := {8, 16, 32, 64}. Saturates any non-zero operand to the unsigned maximum; zero stays zero.
+        {
+            ulong tMaxValue = ulong.MaxValue >> (64 - eSize); // All-ones value that is eSize bits wide.
+
+            if (op > 0UL)
+            {
+                state.SetFpsrFlag(Fpsr.Qc); // Record that saturation occurred.
+
+                return tMaxValue;
+            }
+            else
+            {
+                return 0UL; // Zero never saturates, so the flag is left untouched.
+            }
+        }
+#endregion
+
 #region "ShrImm64"
         public static long SignedShrImm64(long value, long roundConst, int shift)
         {
@@ -31,7 +308,7 @@ namespace ChocolArm64.Instructions
                     {
                         return -1L;
                     }
-                    else
+                    else /* if (value >= 0L) */
                     {
                         return 0L;
                     }
diff --git a/ChocolArm64/OpCodeTable.cs b/ChocolArm64/OpCodeTable.cs
index 8151718f..9b9b993a 100644
--- a/ChocolArm64/OpCodeTable.cs
+++ b/ChocolArm64/OpCodeTable.cs
@@ -427,10 +427,12 @@ namespace ChocolArm64
             SetA64("01111110101xxxxx101101xxxxxxxxxx", InstEmit.Sqrdmulh_S,    typeof(OpCodeSimdReg64));
             SetA64("0x101110011xxxxx101101xxxxxxxxxx", InstEmit.Sqrdmulh_V,    typeof(OpCodeSimdReg64));
             SetA64("0x101110101xxxxx101101xxxxxxxxxx", InstEmit.Sqrdmulh_V,    typeof(OpCodeSimdReg64));
+            SetA64("0>001110<<1xxxxx010111xxxxxxxxxx", InstEmit.Sqrshl_V,      typeof(OpCodeSimdReg64));
             SetA64("0101111100>>>xxx100111xxxxxxxxxx", InstEmit.Sqrshrn_S,     typeof(OpCodeSimdShImm64));
             SetA64("0x00111100>>>xxx100111xxxxxxxxxx", InstEmit.Sqrshrn_V,     typeof(OpCodeSimdShImm64));
             SetA64("0111111100>>>xxx100011xxxxxxxxxx", InstEmit.Sqrshrun_S,    typeof(OpCodeSimdShImm64));
             SetA64("0x10111100>>>xxx100011xxxxxxxxxx", InstEmit.Sqrshrun_V,    typeof(OpCodeSimdShImm64));
+            SetA64("0>001110<<1xxxxx010011xxxxxxxxxx", InstEmit.Sqshl_V,       typeof(OpCodeSimdReg64));
             SetA64("0101111100>>>xxx100101xxxxxxxxxx", InstEmit.Sqshrn_S,      typeof(OpCodeSimdShImm64));
             SetA64("0x00111100>>>xxx100101xxxxxxxxxx", InstEmit.Sqshrn_V,      typeof(OpCodeSimdShImm64));
             SetA64("0111111100>>>xxx100001xxxxxxxxxx", InstEmit.Sqshrun_S,     typeof(OpCodeSimdShImm64));
@@ -442,6 +444,7 @@ namespace ChocolArm64
             SetA64("01111110<<100001001010xxxxxxxxxx", InstEmit.Sqxtun_S,      typeof(OpCodeSimd64));
             SetA64("0x101110<<100001001010xxxxxxxxxx", InstEmit.Sqxtun_V,      typeof(OpCodeSimd64));
             SetA64("0x001110<<1xxxxx000101xxxxxxxxxx", InstEmit.Srhadd_V,      typeof(OpCodeSimdReg64));
+            SetA64("0>001110<<1xxxxx010101xxxxxxxxxx", InstEmit.Srshl_V,       typeof(OpCodeSimdReg64));
             SetA64("0101111101xxxxxx001001xxxxxxxxxx", InstEmit.Srshr_S,       typeof(OpCodeSimdShImm64));
             SetA64("0x00111100>>>xxx001001xxxxxxxxxx", InstEmit.Srshr_V,       typeof(OpCodeSimdShImm64));
             SetA64("0100111101xxxxxx001001xxxxxxxxxx", InstEmit.Srshr_V,       typeof(OpCodeSimdShImm64));
@@ -501,8 +504,10 @@ namespace ChocolArm64
             SetA64("0x101110<<1xxxxx110000xxxxxxxxxx", InstEmit.Umull_V,       typeof(OpCodeSimdReg64));
             SetA64("01111110xx1xxxxx000011xxxxxxxxxx", InstEmit.Uqadd_S,       typeof(OpCodeSimdReg64));
             SetA64("0>101110<<1xxxxx000011xxxxxxxxxx", InstEmit.Uqadd_V,       typeof(OpCodeSimdReg64));
+            SetA64("0>101110<<1xxxxx010111xxxxxxxxxx", InstEmit.Uqrshl_V,      typeof(OpCodeSimdReg64));
             SetA64("0111111100>>>xxx100111xxxxxxxxxx", InstEmit.Uqrshrn_S,     typeof(OpCodeSimdShImm64));
             SetA64("0x10111100>>>xxx100111xxxxxxxxxx", InstEmit.Uqrshrn_V,     typeof(OpCodeSimdShImm64));
+            SetA64("0>101110<<1xxxxx010011xxxxxxxxxx", InstEmit.Uqshl_V,       typeof(OpCodeSimdReg64));
             SetA64("0111111100>>>xxx100101xxxxxxxxxx", InstEmit.Uqshrn_S,      typeof(OpCodeSimdShImm64));
             SetA64("0x10111100>>>xxx100101xxxxxxxxxx", InstEmit.Uqshrn_V,      typeof(OpCodeSimdShImm64));
             SetA64("01111110xx1xxxxx001011xxxxxxxxxx", InstEmit.Uqsub_S,       typeof(OpCodeSimdReg64));
@@ -510,6 +515,7 @@ namespace ChocolArm64
             SetA64("01111110<<100001010010xxxxxxxxxx", InstEmit.Uqxtn_S,       typeof(OpCodeSimd64));
             SetA64("0x101110<<100001010010xxxxxxxxxx", InstEmit.Uqxtn_V,       typeof(OpCodeSimd64));
             SetA64("0x101110<<1xxxxx000101xxxxxxxxxx", InstEmit.Urhadd_V,      typeof(OpCodeSimdReg64));
+            SetA64("0>101110<<1xxxxx010101xxxxxxxxxx", InstEmit.Urshl_V,       typeof(OpCodeSimdReg64));
             SetA64("0111111101xxxxxx001001xxxxxxxxxx", InstEmit.Urshr_S,       typeof(OpCodeSimdShImm64));
             SetA64("0x10111100>>>xxx001001xxxxxxxxxx", InstEmit.Urshr_V,       typeof(OpCodeSimdShImm64));
             SetA64("0110111101xxxxxx001001xxxxxxxxxx", InstEmit.Urshr_V,       typeof(OpCodeSimdShImm64));
diff --git a/ChocolArm64/Optimizations.cs b/ChocolArm64/Optimizations.cs
index f2b0ffba..aab5eca7 100644
--- a/ChocolArm64/Optimizations.cs
+++ b/ChocolArm64/Optimizations.cs
@@ -8,11 +8,13 @@ public static class Optimizations
 
     private static bool _useSseIfAvailable   = true;
     private static bool _useSse2IfAvailable  = true;
+    private static bool _useSsse3IfAvailable = true;
     private static bool _useSse41IfAvailable = true;
     private static bool _useSse42IfAvailable = true;
 
     internal static bool UseSse   = (_useAllSseIfAvailable && _useSseIfAvailable)   && Sse.IsSupported;
     internal static bool UseSse2  = (_useAllSseIfAvailable && _useSse2IfAvailable)  && Sse2.IsSupported;
+    internal static bool UseSsse3 = (_useAllSseIfAvailable && _useSsse3IfAvailable) && Ssse3.IsSupported;
     internal static bool UseSse41 = (_useAllSseIfAvailable && _useSse41IfAvailable) && Sse41.IsSupported;
     internal static bool UseSse42 = (_useAllSseIfAvailable && _useSse42IfAvailable) && Sse42.IsSupported;
 }
diff --git a/Ryujinx.Tests/Cpu/CpuTestSimdReg.cs b/Ryujinx.Tests/Cpu/CpuTestSimdReg.cs
index d43447a7..cceb8b10 100644
--- a/Ryujinx.Tests/Cpu/CpuTestSimdReg.cs
+++ b/Ryujinx.Tests/Cpu/CpuTestSimdReg.cs
@@ -420,6 +420,36 @@ namespace Ryujinx.Tests.Cpu
             };
         }
 
+        private static uint[] _ShlReg_V_8B_4H_2S_() // Base opcodes for the register-shift tests; ShlReg_V_8B_4H_2S ORs the register and size fields into them.
+        {
+            return new uint[]
+            {
+                0x0E205C00u, // SQRSHL V0.8B, V0.8B, V0.8B
+                0x0E204C00u, // SQSHL  V0.8B, V0.8B, V0.8B
+                0x0E205400u, // SRSHL  V0.8B, V0.8B, V0.8B
+                0x0E204400u, // SSHL   V0.8B, V0.8B, V0.8B
+                0x2E205C00u, // UQRSHL V0.8B, V0.8B, V0.8B
+                0x2E204C00u, // UQSHL  V0.8B, V0.8B, V0.8B
+                0x2E205400u, // URSHL  V0.8B, V0.8B, V0.8B
+                0x2E204400u  // USHL   V0.8B, V0.8B, V0.8B
+            };
+        }
+
+        private static uint[] _ShlReg_V_16B_8H_4S_2D_() // Same eight encodings as _ShlReg_V_8B_4H_2S_ with bit 30 set; consumed by ShlReg_V_16B_8H_4S_2D.
+        {
+            return new uint[]
+            {
+                0x4E205C00u, // SQRSHL V0.16B, V0.16B, V0.16B
+                0x4E204C00u, // SQSHL  V0.16B, V0.16B, V0.16B
+                0x4E205400u, // SRSHL  V0.16B, V0.16B, V0.16B
+                0x4E204400u, // SSHL   V0.16B, V0.16B, V0.16B
+                0x6E205C00u, // UQRSHL V0.16B, V0.16B, V0.16B
+                0x6E204C00u, // UQSHL  V0.16B, V0.16B, V0.16B
+                0x6E205400u, // URSHL  V0.16B, V0.16B, V0.16B
+                0x6E204400u  // USHL   V0.16B, V0.16B, V0.16B
+            };
+        }
+
         private static uint[] _U_Max_Min_P_V_()
         {
             return new uint[]
@@ -2602,6 +2632,50 @@ namespace Ryujinx.Tests.Cpu
             CompareAgainstUnicorn();
         }
 
+        [Test, Pairwise] // One case per pairwise combination of the parameter sources below.
+        public void ShlReg_V_8B_4H_2S([ValueSource("_ShlReg_V_8B_4H_2S_")] uint opcodes,
+                                      [Values(0u)]     uint rd,
+                                      [Values(1u, 0u)] uint rn,
+                                      [Values(2u, 0u)] uint rm,
+                                      [ValueSource("_8B4H2S_")] [Random(RndCnt)] ulong z,
+                                      [ValueSource("_8B4H2S_")] [Random(RndCnt)] ulong a,
+                                      [ValueSource("_8B4H2S_")] [Random(0ul, 255ul, RndCnt)] ulong b,
+                                      [Values(0b00u, 0b01u, 0b10u)] uint size) // <8B, 4H, 2S>
+        {
+            opcodes |= ((rm & 31) << 16) | ((rn & 31) << 5) | ((rd & 31) << 0); // Rm -> bits 16-20, Rn -> bits 5-9, Rd -> bits 0-4.
+            opcodes |= ((size & 3) << 22); // Size field -> bits 22-23.
+
+            Vector128<float> v0 = MakeVectorE0E1(z, z); // Initial contents of V0 (the destination, rd is always 0).
+            Vector128<float> v1 = MakeVectorE0(a); // Value operand; presumably only the low 64-bit element is populated - confirm in MakeVectorE0.
+            Vector128<float> v2 = MakeVectorE0(b); // Shift-amount operand.
+
+            SingleOpcode(opcodes, v0: v0, v1: v1, v2: v2);
+
+            CompareAgainstUnicorn(fpsrMask: Fpsr.Qc); // Passes Fpsr.Qc as the FPSR mask; presumably so the saturation flag is compared too - confirm.
+        }
+
+        [Test, Pairwise] // One case per pairwise combination of the parameter sources below.
+        public void ShlReg_V_16B_8H_4S_2D([ValueSource("_ShlReg_V_16B_8H_4S_2D_")] uint opcodes,
+                                          [Values(0u)]     uint rd,
+                                          [Values(1u, 0u)] uint rn,
+                                          [Values(2u, 0u)] uint rm,
+                                          [ValueSource("_8B4H2S1D_")] [Random(RndCnt)] ulong z,
+                                          [ValueSource("_8B4H2S1D_")] [Random(RndCnt)] ulong a,
+                                          [ValueSource("_8B4H2S1D_")] [Random(0ul, 255ul, RndCnt)] ulong b,
+                                          [Values(0b00u, 0b01u, 0b10u, 0b11u)] uint size) // <16B, 8H, 4S, 2D>
+        {
+            opcodes |= ((rm & 31) << 16) | ((rn & 31) << 5) | ((rd & 31) << 0); // Rm -> bits 16-20, Rn -> bits 5-9, Rd -> bits 0-4.
+            opcodes |= ((size & 3) << 22); // Size field -> bits 22-23.
+
+            Vector128<float> v0 = MakeVectorE0E1(z, z); // Initial contents of V0 (the destination, rd is always 0).
+            Vector128<float> v1 = MakeVectorE0E1(a, a); // Value operand, same 64-bit pattern passed for both elements.
+            Vector128<float> v2 = MakeVectorE0E1(b, b); // Shift-amount operand, same 64-bit pattern passed for both elements.
+
+            SingleOpcode(opcodes, v0: v0, v1: v1, v2: v2);
+
+            CompareAgainstUnicorn(fpsrMask: Fpsr.Qc); // Passes Fpsr.Qc as the FPSR mask; presumably so the saturation flag is compared too - confirm.
+        }
+
         [Test, Pairwise, Description("SSUBL{2} <Vd>.<Ta>, <Vn>.<Tb>, <Vm>.<Tb>")]
         public void Ssubl_V_8B8H_4H4S_2S2D([Values(0u)]     uint rd,
                                            [Values(1u, 0u)] uint rn,