From acc22c769d24c4daef349e8ae654677a2ab7fc0b Mon Sep 17 00:00:00 2001
From: Gabriel A <gab.dark.100@gmail.com>
Date: Tue, 26 Dec 2023 23:12:44 -0300
Subject: [PATCH] New JIT cache for platforms that enforce W^X, currently
 unused

---
 src/ARMeilleure/Memory/IJitMemoryBlock.cs     |   1 +
 src/Ryujinx.Cpu/Jit/JitCpuContext.cs          |   2 +-
 src/Ryujinx.Cpu/Jit/JitMemoryAllocator.cs     |   9 +-
 src/Ryujinx.Cpu/Jit/JitMemoryBlock.cs         |   1 +
 .../Cache/CacheMemoryAllocator.cs             |  43 ++-
 .../LightningJit/Cache/JitSupportDarwin.cs    |   3 +
 .../LightningJit/Cache/NoWxCache.cs           | 340 ++++++++++++++++++
 .../Cache/PageAlignedRangeList.cs             | 218 +++++++++++
 .../LightningJit/CodeGen/Arm64/StackWalker.cs |  30 ++
 src/Ryujinx.Cpu/LightningJit/IStackWalker.cs  |  10 +
 .../LightningJit/LightningJitCpuContext.cs    |   4 +-
 .../LightningJit/NativeInterface.cs           |   6 +-
 src/Ryujinx.Cpu/LightningJit/Translator.cs    |  63 +++-
 .../LightningJit/TranslatorStubs.cs           |  50 ++-
 14 files changed, 749 insertions(+), 31 deletions(-)
 create mode 100644 src/Ryujinx.Cpu/LightningJit/Cache/NoWxCache.cs
 create mode 100644 src/Ryujinx.Cpu/LightningJit/Cache/PageAlignedRangeList.cs
 create mode 100644 src/Ryujinx.Cpu/LightningJit/CodeGen/Arm64/StackWalker.cs
 create mode 100644 src/Ryujinx.Cpu/LightningJit/IStackWalker.cs

diff --git a/src/ARMeilleure/Memory/IJitMemoryBlock.cs b/src/ARMeilleure/Memory/IJitMemoryBlock.cs
index cd49f314a..c103fe8d1 100644
--- a/src/ARMeilleure/Memory/IJitMemoryBlock.cs
+++ b/src/ARMeilleure/Memory/IJitMemoryBlock.cs
@@ -8,6 +8,7 @@ namespace ARMeilleure.Memory
 
         void Commit(ulong offset, ulong size);
 
+        void MapAsRw(ulong offset, ulong size);
         void MapAsRx(ulong offset, ulong size);
         void MapAsRwx(ulong offset, ulong size);
     }
diff --git a/src/Ryujinx.Cpu/Jit/JitCpuContext.cs b/src/Ryujinx.Cpu/Jit/JitCpuContext.cs
index 6563d699a..a5944097d 100644
--- a/src/Ryujinx.Cpu/Jit/JitCpuContext.cs
+++ b/src/Ryujinx.Cpu/Jit/JitCpuContext.cs
@@ -11,7 +11,7 @@ namespace Ryujinx.Cpu.Jit
         public JitCpuContext(ITickSource tickSource, IMemoryManager memory, bool for64Bit)
         {
             _tickSource = tickSource;
-            _translator = new Translator(new JitMemoryAllocator(), memory, for64Bit);
+            _translator = new Translator(new JitMemoryAllocator(forJit: true), memory, for64Bit);
             memory.UnmapEvent += UnmapHandler;
         }
 
diff --git a/src/Ryujinx.Cpu/Jit/JitMemoryAllocator.cs b/src/Ryujinx.Cpu/Jit/JitMemoryAllocator.cs
index 529a1a808..926dd8a0c 100644
--- a/src/Ryujinx.Cpu/Jit/JitMemoryAllocator.cs
+++ b/src/Ryujinx.Cpu/Jit/JitMemoryAllocator.cs
@@ -5,8 +5,15 @@ namespace Ryujinx.Cpu.Jit
 {
     public class JitMemoryAllocator : IJitMemoryAllocator
     {
+        private readonly MemoryAllocationFlags _jitFlag;
+
+        public JitMemoryAllocator(bool forJit = false)
+        {
+            _jitFlag = forJit ? MemoryAllocationFlags.Jit : MemoryAllocationFlags.None;
+        }
+
         public IJitMemoryBlock Allocate(ulong size) => new JitMemoryBlock(size, MemoryAllocationFlags.None);
-        public IJitMemoryBlock Reserve(ulong size) => new JitMemoryBlock(size, MemoryAllocationFlags.Reserve | MemoryAllocationFlags.Jit);
+        public IJitMemoryBlock Reserve(ulong size) => new JitMemoryBlock(size, MemoryAllocationFlags.Reserve | _jitFlag);
 
         public ulong GetPageSize() => MemoryBlock.GetPageSize();
     }
diff --git a/src/Ryujinx.Cpu/Jit/JitMemoryBlock.cs b/src/Ryujinx.Cpu/Jit/JitMemoryBlock.cs
index bcacd116a..bd07d349c 100644
--- a/src/Ryujinx.Cpu/Jit/JitMemoryBlock.cs
+++ b/src/Ryujinx.Cpu/Jit/JitMemoryBlock.cs
@@ -16,6 +16,7 @@ namespace Ryujinx.Cpu.Jit
         }
 
         public void Commit(ulong offset, ulong size) => _impl.Commit(offset, size);
+        public void MapAsRw(ulong offset, ulong size) => _impl.Reprotect(offset, size, MemoryPermission.ReadAndWrite);
         public void MapAsRx(ulong offset, ulong size) => _impl.Reprotect(offset, size, MemoryPermission.ReadAndExecute);
         public void MapAsRwx(ulong offset, ulong size) => _impl.Reprotect(offset, size, MemoryPermission.ReadWriteExecute);
 
diff --git a/src/Ryujinx.Cpu/LightningJit/Cache/CacheMemoryAllocator.cs b/src/Ryujinx.Cpu/LightningJit/Cache/CacheMemoryAllocator.cs
index 64583cc3e..3837824f3 100644
--- a/src/Ryujinx.Cpu/LightningJit/Cache/CacheMemoryAllocator.cs
+++ b/src/Ryujinx.Cpu/LightningJit/Cache/CacheMemoryAllocator.cs
@@ -1,5 +1,7 @@
+using Ryujinx.Cpu.LightningJit.Arm32;
 using System;
 using System.Collections.Generic;
+using System.Diagnostics;
 using System.Diagnostics.CodeAnalysis;
 
 namespace Ryujinx.Cpu.LightningJit.Cache
@@ -38,7 +40,7 @@ namespace Ryujinx.Cpu.LightningJit.Cache
 
                 if (block.Size > size)
                 {
-                    _blocks[i] = new MemoryBlock(block.Offset + size, block.Size - size);
+                    _blocks[i] = new(block.Offset + size, block.Size - size);
                     return block.Offset;
                 }
                 else if (block.Size == size)
@@ -52,6 +54,40 @@ namespace Ryujinx.Cpu.LightningJit.Cache
             return -1;
         }
 
+        public void ForceAllocation(int offset, int size)
+        {
+            int index = _blocks.BinarySearch(new(offset, size));
+
+            if (index < 0)
+            {
+                index = ~index;
+            }
+
+            int endOffset = offset + size;
+
+            MemoryBlock block = _blocks[index];
+
+            Debug.Assert(block.Offset <= offset && block.Offset + block.Size >= endOffset);
+
+            if (offset > block.Offset && endOffset < block.Offset + block.Size)
+            {
+                _blocks[index] = new(block.Offset, offset - block.Offset);
+                _blocks.Insert(index + 1, new(endOffset, (block.Offset + block.Size) - endOffset));
+            }
+            else if (offset > block.Offset)
+            {
+                _blocks[index] = new(block.Offset, offset - block.Offset);
+            }
+            else if (endOffset < block.Offset + block.Size)
+            {
+                _blocks[index] = new(endOffset, (block.Offset + block.Size) - endOffset);
+            }
+            else
+            {
+                _blocks.RemoveAt(index);
+            }
+        }
+
         public void Free(int offset, int size)
         {
             Insert(new MemoryBlock(offset, size));
@@ -92,5 +128,10 @@ namespace Ryujinx.Cpu.LightningJit.Cache
 
             _blocks.Insert(index, block);
         }
+
+        public void Clear()
+        {
+            _blocks.Clear();
+        }
     }
 }
diff --git a/src/Ryujinx.Cpu/LightningJit/Cache/JitSupportDarwin.cs b/src/Ryujinx.Cpu/LightningJit/Cache/JitSupportDarwin.cs
index 52297b435..06c81045d 100644
--- a/src/Ryujinx.Cpu/LightningJit/Cache/JitSupportDarwin.cs
+++ b/src/Ryujinx.Cpu/LightningJit/Cache/JitSupportDarwin.cs
@@ -9,5 +9,8 @@ namespace Ryujinx.Cpu.LightningJit.Cache
     {
         [LibraryImport("libarmeilleure-jitsupport", EntryPoint = "armeilleure_jit_memcpy")]
         public static partial void Copy(IntPtr dst, IntPtr src, ulong n);
+
+        [LibraryImport("libc", EntryPoint = "sys_icache_invalidate", SetLastError = true)]
+        public static partial void SysIcacheInvalidate(IntPtr start, IntPtr len);
     }
 }
diff --git a/src/Ryujinx.Cpu/LightningJit/Cache/NoWxCache.cs b/src/Ryujinx.Cpu/LightningJit/Cache/NoWxCache.cs
new file mode 100644
index 000000000..0a78d7a5f
--- /dev/null
+++ b/src/Ryujinx.Cpu/LightningJit/Cache/NoWxCache.cs
@@ -0,0 +1,340 @@
+using ARMeilleure.Memory;
+using Ryujinx.Common;
+using Ryujinx.Memory;
+using System;
+using System.Collections.Generic;
+using System.Diagnostics;
+
+namespace Ryujinx.Cpu.LightningJit.Cache
+{
+    class NoWxCache : IDisposable
+    {
+        private const int CodeAlignment = 4; // Bytes.
+        private const int SharedCacheSize = 2047 * 1024 * 1024;
+        private const int LocalCacheSize = 256 * 1024 * 1024;
+
+        // How many calls to the same function we allow until we pad the shared cache to force the function to become available there
+        // and allow the guest to take the fast path.
+        private const int MinCallsForPad = 8;
+
+        private class MemoryCache : IDisposable
+        {
+            private readonly ReservedRegion _region;
+            private readonly CacheMemoryAllocator _cacheAllocator;
+
+            public CacheMemoryAllocator Allocator => _cacheAllocator;
+            public IntPtr Pointer => _region.Block.Pointer;
+
+            public MemoryCache(IJitMemoryAllocator allocator, ulong size)
+            {
+                _region = new(allocator, size);
+                _cacheAllocator = new((int)size);
+            }
+
+            public int Allocate(int codeSize)
+            {
+                codeSize = AlignCodeSize(codeSize);
+
+                int allocOffset = _cacheAllocator.Allocate(codeSize);
+
+                if (allocOffset < 0)
+                {
+                    throw new OutOfMemoryException("JIT Cache exhausted.");
+                }
+
+                _region.ExpandIfNeeded((ulong)allocOffset + (ulong)codeSize);
+
+                return allocOffset;
+            }
+
+            public void Free(int offset, int size)
+            {
+                _cacheAllocator.Free(offset, size);
+            }
+
+            public void ReprotectAsRw(int offset, int size)
+            {
+                Debug.Assert(offset >= 0 && (offset & (int)(MemoryBlock.GetPageSize() - 1)) == 0);
+                Debug.Assert(size > 0 && (size & (int)(MemoryBlock.GetPageSize() - 1)) == 0);
+
+                _region.Block.MapAsRw((ulong)offset, (ulong)size);
+            }
+
+            public void ReprotectAsRx(int offset, int size)
+            {
+                Debug.Assert(offset >= 0 && (offset & (int)(MemoryBlock.GetPageSize() - 1)) == 0);
+                Debug.Assert(size > 0 && (size & (int)(MemoryBlock.GetPageSize() - 1)) == 0);
+
+                _region.Block.MapAsRx((ulong)offset, (ulong)size);
+
+                if (OperatingSystem.IsMacOS() || OperatingSystem.IsIOS())
+                {
+                    JitSupportDarwin.SysIcacheInvalidate(_region.Block.Pointer + offset, size);
+                }
+                else
+                {
+                    throw new PlatformNotSupportedException();
+                }
+            }
+
+            private static int AlignCodeSize(int codeSize)
+            {
+                return checked(codeSize + (CodeAlignment - 1)) & ~(CodeAlignment - 1);
+            }
+
+            protected virtual void Dispose(bool disposing)
+            {
+                if (disposing)
+                {
+                    _region.Dispose();
+                    _cacheAllocator.Clear();
+                }
+            }
+
+            public void Dispose()
+            {
+                // Do not change this code. Put cleanup code in 'Dispose(bool disposing)' method
+                Dispose(disposing: true);
+                GC.SuppressFinalize(this);
+            }
+        }
+
+        private readonly IStackWalker _stackWalker;
+        private readonly Translator _translator;
+        private readonly MemoryCache _sharedCache;
+        private readonly MemoryCache _localCache;
+        private readonly PageAlignedRangeList _pendingMap;
+        private readonly object _lock;
+
+        class ThreadLocalCacheEntry
+        {
+            public readonly int Offset;
+            public readonly int Size;
+            public readonly IntPtr FuncPtr;
+            private int _useCount;
+
+            public ThreadLocalCacheEntry(int offset, int size, IntPtr funcPtr)
+            {
+                Offset = offset;
+                Size = size;
+                FuncPtr = funcPtr;
+                _useCount = 0;
+            }
+
+            public int IncrementUseCount()
+            {
+                return ++_useCount;
+            }
+        }
+
+        [ThreadStatic]
+        private static Dictionary<ulong, ThreadLocalCacheEntry> _threadLocalCache;
+
+        public NoWxCache(IJitMemoryAllocator allocator, IStackWalker stackWalker, Translator translator)
+        {
+            _stackWalker = stackWalker;
+            _translator = translator;
+            _sharedCache = new(allocator, SharedCacheSize);
+            _localCache = new(allocator, LocalCacheSize);
+            _pendingMap = new(_sharedCache.ReprotectAsRx, RegisterFunction);
+            _lock = new();
+        }
+
+        public unsafe IntPtr Map(IntPtr framePointer, ReadOnlySpan<byte> code, ulong guestAddress, ulong guestSize)
+        {
+            if (TryGetThreadLocalFunction(guestAddress, out IntPtr funcPtr))
+            {
+                return funcPtr;
+            }
+
+            lock (_lock)
+            {
+                if (!_pendingMap.Has(guestAddress))
+                {
+                    int funcOffset = _sharedCache.Allocate(code.Length);
+
+                    funcPtr = _sharedCache.Pointer + funcOffset;
+                    code.CopyTo(new Span<byte>((void*)funcPtr, code.Length));
+
+                    TranslatedFunction function = new(funcPtr, guestSize);
+
+                    _pendingMap.Add(funcOffset, code.Length, guestAddress, function);
+                }
+
+                ClearThreadLocalCache(framePointer);
+
+                return AddThreadLocalFunction(code, guestAddress);
+            }
+        }
+
+        public unsafe IntPtr MapPageAligned(ReadOnlySpan<byte> code)
+        {
+            lock (_lock)
+            {
+                // Ensure we will get an aligned offset from the allocator.
+                _pendingMap.Pad(_sharedCache.Allocator);
+
+                int sizeAligned = BitUtils.AlignUp(code.Length, (int)MemoryBlock.GetPageSize());
+                int funcOffset = _sharedCache.Allocate(sizeAligned);
+
+                Debug.Assert((funcOffset & ((int)MemoryBlock.GetPageSize() - 1)) == 0);
+
+                IntPtr funcPtr = _sharedCache.Pointer + funcOffset;
+                code.CopyTo(new Span<byte>((void*)funcPtr, code.Length));
+
+                _sharedCache.ReprotectAsRx(funcOffset, sizeAligned);
+
+                return funcPtr;
+            }
+        }
+
+        private bool TryGetThreadLocalFunction(ulong guestAddress, out IntPtr funcPtr)
+        {
+            if ((_threadLocalCache ??= new()).TryGetValue(guestAddress, out var entry))
+            {
+                if (entry.IncrementUseCount() >= MinCallsForPad)
+                {
+                    // Function is being called often, let's make it available in the shared cache so that the guest code
+                    // can take the fast path and stop calling the emulator to get the function from the thread local cache.
+                    // To do that, we pad all "pending" functions until they complete a page of memory, allowing us to reprotect them as RX.
+
+                    lock (_lock)
+                    {
+                        _pendingMap.Pad(_sharedCache.Allocator);
+                    }
+                }
+
+                funcPtr = entry.FuncPtr;
+
+                return true;
+            }
+
+            funcPtr = IntPtr.Zero;
+
+            return false;
+        }
+
+        private void ClearThreadLocalCache(IntPtr framePointer)
+        {
+            // Try to delete functions that are already on the shared cache
+            // and no longer being executed.
+
+            if (_threadLocalCache == null)
+            {
+                return;
+            }
+
+            IEnumerable<ulong> callStack = _stackWalker.GetCallStack(
+                framePointer,
+                _localCache.Pointer,
+                LocalCacheSize,
+                _sharedCache.Pointer,
+                SharedCacheSize);
+
+            List<(ulong, ThreadLocalCacheEntry)> toDelete = new();
+
+            foreach ((ulong address, ThreadLocalCacheEntry entry) in _threadLocalCache)
+            {
+                // We only want to delete if the function is already on the shared cache,
+                // otherwise we will keep translating the same function over and over again.
+                bool canDelete = !_pendingMap.Has(address);
+                if (!canDelete)
+                {
+                    continue;
+                }
+
+                // We can only delete if the function is not part of the current thread call stack,
+                // otherwise we will crash the program when the thread returns to it.
+                foreach (ulong funcAddress in callStack)
+                {
+                    if (funcAddress >= (ulong)entry.FuncPtr && funcAddress < (ulong)entry.FuncPtr + (ulong)entry.Size)
+                    {
+                        canDelete = false;
+                        break;
+                    }
+                }
+
+                if (canDelete)
+                {
+                    toDelete.Add((address, entry));
+                }
+            }
+
+            int pageSize = (int)MemoryBlock.GetPageSize();
+
+            foreach ((ulong address, ThreadLocalCacheEntry entry) in toDelete)
+            {
+                _threadLocalCache.Remove(address);
+
+                int sizeAligned = BitUtils.AlignUp(entry.Size, pageSize);
+
+                _localCache.Free(entry.Offset, sizeAligned);
+                _localCache.ReprotectAsRw(entry.Offset, sizeAligned);
+            }
+        }
+
+        public void ClearEntireThreadLocalCache()
+        {
+            // Thread is exiting, delete everything.
+
+            if (_threadLocalCache == null)
+            {
+                return;
+            }
+
+            int pageSize = (int)MemoryBlock.GetPageSize();
+
+            foreach ((_, ThreadLocalCacheEntry entry) in _threadLocalCache)
+            {
+                int sizeAligned = BitUtils.AlignUp(entry.Size, pageSize);
+
+                _localCache.Free(entry.Offset, sizeAligned);
+                _localCache.ReprotectAsRw(entry.Offset, sizeAligned);
+            }
+
+            _threadLocalCache.Clear();
+            _threadLocalCache = null;
+        }
+
+        private unsafe IntPtr AddThreadLocalFunction(ReadOnlySpan<byte> code, ulong guestAddress)
+        {
+            int alignedSize = BitUtils.AlignUp(code.Length, (int)MemoryBlock.GetPageSize());
+            int funcOffset = _localCache.Allocate(alignedSize);
+
+            Debug.Assert((funcOffset & (int)(MemoryBlock.GetPageSize() - 1)) == 0);
+
+            IntPtr funcPtr = _localCache.Pointer + funcOffset;
+            code.CopyTo(new Span<byte>((void*)funcPtr, code.Length));
+
+            (_threadLocalCache ??= new()).Add(guestAddress, new(funcOffset, code.Length, funcPtr));
+
+            _localCache.ReprotectAsRx(funcOffset, alignedSize);
+
+            return funcPtr;
+        }
+
+        private void RegisterFunction(ulong address, TranslatedFunction func)
+        {
+            TranslatedFunction oldFunc = _translator.Functions.GetOrAdd(address, func.GuestSize, func);
+
+            Debug.Assert(oldFunc == func);
+
+            _translator.RegisterFunction(address, func);
+        }
+
+        protected virtual void Dispose(bool disposing)
+        {
+            if (disposing)
+            {
+                _localCache.Dispose();
+                _sharedCache.Dispose();
+            }
+        }
+
+        public void Dispose()
+        {
+            Dispose(disposing: true);
+            GC.SuppressFinalize(this);
+        }
+    }
+}
diff --git a/src/Ryujinx.Cpu/LightningJit/Cache/PageAlignedRangeList.cs b/src/Ryujinx.Cpu/LightningJit/Cache/PageAlignedRangeList.cs
new file mode 100644
index 000000000..b6b386714
--- /dev/null
+++ b/src/Ryujinx.Cpu/LightningJit/Cache/PageAlignedRangeList.cs
@@ -0,0 +1,218 @@
+using Ryujinx.Common;
+using Ryujinx.Memory;
+using System;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.Diagnostics.CodeAnalysis;
+
+namespace Ryujinx.Cpu.LightningJit.Cache
+{
+    class PageAlignedRangeList
+    {
+        private readonly struct Range : IComparable<Range>
+        {
+            public int Offset { get; }
+            public int Size { get; }
+
+            public Range(int offset, int size)
+            {
+                Offset = offset;
+                Size = size;
+            }
+
+            public int CompareTo([AllowNull] Range other)
+            {
+                return Offset.CompareTo(other.Offset);
+            }
+        }
+
+        private readonly Action<int, int> _alignedRangeAction;
+        private readonly Action<ulong, TranslatedFunction> _alignedFunctionAction;
+        private readonly List<(Range, ulong, TranslatedFunction)> _pendingFunctions;
+        private readonly List<Range> _ranges;
+
+        public PageAlignedRangeList(Action<int, int> alignedRangeAction, Action<ulong, TranslatedFunction> alignedFunctionAction)
+        {
+            _alignedRangeAction = alignedRangeAction;
+            _alignedFunctionAction = alignedFunctionAction;
+            _pendingFunctions = new();
+            _ranges = new();
+        }
+
+        public bool Has(ulong address)
+        {
+            foreach ((_, ulong guestAddress, _) in _pendingFunctions)
+            {
+                if (guestAddress == address)
+                {
+                    return true;
+                }
+            }
+
+            return false;
+        }
+
+        public void Add(int offset, int size, ulong address, TranslatedFunction function)
+        {
+            Range range = new(offset, size);
+
+            Insert(range);
+            _pendingFunctions.Add((range, address, function));
+            ProcessAlignedRanges();
+        }
+
+        public void Pad(CacheMemoryAllocator allocator)
+        {
+            int pageSize = (int)MemoryBlock.GetPageSize();
+
+            for (int index = 0; index < _ranges.Count; index++)
+            {
+                Range range = _ranges[index];
+
+                int endOffset = range.Offset + range.Size;
+
+                int alignedStart = BitUtils.AlignDown(range.Offset, pageSize);
+                int alignedEnd = BitUtils.AlignUp(endOffset, pageSize);
+                int alignedSize = alignedEnd - alignedStart;
+
+                if (alignedStart < range.Offset)
+                {
+                    allocator.ForceAllocation(alignedStart, range.Offset - alignedStart);
+                }
+
+                if (alignedEnd > endOffset)
+                {
+                    allocator.ForceAllocation(endOffset, alignedEnd - endOffset);
+                }
+
+                _alignedRangeAction(alignedStart, alignedSize);
+                _ranges.RemoveAt(index--);
+                ProcessPendingFunctions(index, alignedEnd);
+            }
+        }
+
+        private void ProcessAlignedRanges()
+        {
+            int pageSize = (int)MemoryBlock.GetPageSize();
+
+            for (int index = 0; index < _ranges.Count; index++)
+            {
+                Range range = _ranges[index];
+
+                int alignedStart = BitUtils.AlignUp(range.Offset, pageSize);
+                int alignedEnd = BitUtils.AlignDown(range.Offset + range.Size, pageSize);
+                int alignedSize = alignedEnd - alignedStart;
+
+                if (alignedSize <= 0)
+                {
+                    continue;
+                }
+
+                _alignedRangeAction(alignedStart, alignedSize);
+                SplitAt(ref index, alignedStart, alignedEnd);
+                ProcessPendingFunctions(index, alignedEnd);
+            }
+        }
+
+        private void ProcessPendingFunctions(int rangeIndex, int alignedEnd)
+        {
+            if ((rangeIndex > 0 && rangeIndex == _ranges.Count) ||
+                (rangeIndex >= 0 && rangeIndex < _ranges.Count && _ranges[rangeIndex].Offset >= alignedEnd))
+            {
+                rangeIndex--;
+            }
+
+            int alignedStart;
+
+            if (rangeIndex >= 0)
+            {
+                alignedStart = _ranges[rangeIndex].Offset + _ranges[rangeIndex].Size;
+            }
+            else
+            {
+                alignedStart = 0;
+            }
+
+            if (rangeIndex < _ranges.Count - 1)
+            {
+                alignedEnd = _ranges[rangeIndex + 1].Offset;
+            }
+            else
+            {
+                alignedEnd = int.MaxValue;
+            }
+
+            for (int index = 0; index < _pendingFunctions.Count; index++)
+            {
+                (Range range, ulong address, TranslatedFunction function) = _pendingFunctions[index];
+
+                if (range.Offset >= alignedStart && range.Offset + range.Size <= alignedEnd)
+                {
+                    _alignedFunctionAction(address, function);
+                    _pendingFunctions.RemoveAt(index--);
+                }
+            }
+        }
+
+        private void Insert(Range range)
+        {
+            int index = _ranges.BinarySearch(range);
+
+            if (index < 0)
+            {
+                index = ~index;
+            }
+
+            if (index < _ranges.Count)
+            {
+                Range next = _ranges[index];
+
+                int endOffs = range.Offset + range.Size;
+
+                if (next.Offset == endOffs)
+                {
+                    range = new Range(range.Offset, range.Size + next.Size);
+                    _ranges.RemoveAt(index);
+                }
+            }
+
+            if (index > 0)
+            {
+                Range prev = _ranges[index - 1];
+
+                if (prev.Offset + prev.Size == range.Offset)
+                {
+                    range = new Range(range.Offset - prev.Size, range.Size + prev.Size);
+                    _ranges.RemoveAt(--index);
+                }
+            }
+
+            _ranges.Insert(index, range);
+        }
+
+        private void SplitAt(ref int index, int alignedStart, int alignedEnd)
+        {
+            Range range = _ranges[index];
+
+            if (range.Offset < alignedStart)
+            {
+                _ranges[index++] = new(range.Offset, alignedStart - range.Offset);
+
+                if (range.Offset + range.Size > alignedEnd)
+                {
+                    _ranges.Insert(index, new(alignedEnd, (range.Offset + range.Size) - alignedEnd));
+                }
+            }
+            else if (range.Offset + range.Size > alignedEnd)
+            {
+                _ranges[index] = new(alignedEnd, (range.Offset + range.Size) - alignedEnd);
+            }
+            else if (range.Offset == alignedStart && range.Offset + range.Size == alignedEnd)
+            {
+                Debug.Assert(range.Offset == alignedStart && range.Offset + range.Size == alignedEnd);
+
+                _ranges.RemoveAt(index--);
+            }
+        }
+    }
+}
diff --git a/src/Ryujinx.Cpu/LightningJit/CodeGen/Arm64/StackWalker.cs b/src/Ryujinx.Cpu/LightningJit/CodeGen/Arm64/StackWalker.cs
new file mode 100644
index 000000000..ffed56a65
--- /dev/null
+++ b/src/Ryujinx.Cpu/LightningJit/CodeGen/Arm64/StackWalker.cs
@@ -0,0 +1,30 @@
+using System;
+using System.Collections.Generic;
+using System.Runtime.InteropServices;
+
+namespace Ryujinx.Cpu.LightningJit.CodeGen.Arm64
+{
+    class StackWalker : IStackWalker
+    {
+        public IEnumerable<ulong> GetCallStack(IntPtr framePointer, IntPtr codeRegionStart, int codeRegionSize, IntPtr codeRegion2Start, int codeRegion2Size)
+        {
+            List<ulong> functionPointers = new();
+
+            while (true)
+            {
+                IntPtr functionPointer = Marshal.ReadIntPtr(framePointer, IntPtr.Size);
+
+                if ((functionPointer < codeRegionStart || functionPointer >= codeRegionStart + codeRegionSize) &&
+                    (functionPointer < codeRegion2Start || functionPointer >= codeRegion2Start + codeRegion2Size))
+                {
+                    break;
+                }
+
+                functionPointers.Add((ulong)functionPointer - 4);
+                framePointer = Marshal.ReadIntPtr(framePointer);
+            }
+
+            return functionPointers;
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/Ryujinx.Cpu/LightningJit/IStackWalker.cs b/src/Ryujinx.Cpu/LightningJit/IStackWalker.cs
new file mode 100644
index 000000000..d330ef788
--- /dev/null
+++ b/src/Ryujinx.Cpu/LightningJit/IStackWalker.cs
@@ -0,0 +1,10 @@
+using System;
+using System.Collections.Generic;
+
+namespace Ryujinx.Cpu.LightningJit
+{
+    interface IStackWalker
+    {
+        IEnumerable<ulong> GetCallStack(IntPtr framePointer, IntPtr codeRegionStart, int codeRegionSize, IntPtr codeRegion2Start, int codeRegion2Size);
+    }
+}
\ No newline at end of file
diff --git a/src/Ryujinx.Cpu/LightningJit/LightningJitCpuContext.cs b/src/Ryujinx.Cpu/LightningJit/LightningJitCpuContext.cs
index efe60c018..0f07abc25 100644
--- a/src/Ryujinx.Cpu/LightningJit/LightningJitCpuContext.cs
+++ b/src/Ryujinx.Cpu/LightningJit/LightningJitCpuContext.cs
@@ -1,4 +1,4 @@
-using ARMeilleure.Memory;
+using ARMeilleure.Memory;
 using Ryujinx.Cpu.Jit;
 using Ryujinx.Cpu.LightningJit.State;
 
@@ -12,7 +12,7 @@ namespace Ryujinx.Cpu.LightningJit
         public LightningJitCpuContext(ITickSource tickSource, IMemoryManager memory, bool for64Bit)
         {
             _tickSource = tickSource;
-            _translator = new Translator(new JitMemoryAllocator(), memory, for64Bit);
+            _translator = new Translator(new JitMemoryAllocator(forJit: true), memory, for64Bit);
             memory.UnmapEvent += UnmapHandler;
         }
 
diff --git a/src/Ryujinx.Cpu/LightningJit/NativeInterface.cs b/src/Ryujinx.Cpu/LightningJit/NativeInterface.cs
index d67a45d4c..da3ad9832 100644
--- a/src/Ryujinx.Cpu/LightningJit/NativeInterface.cs
+++ b/src/Ryujinx.Cpu/LightningJit/NativeInterface.cs
@@ -61,11 +61,9 @@ namespace Ryujinx.Cpu.LightningJit
             return GetContext().CntpctEl0;
         }
 
-        public static ulong GetFunctionAddress(ulong address)
+        public static ulong GetFunctionAddress(IntPtr framePointer, ulong address)
         {
-            TranslatedFunction function = Context.Translator.GetOrTranslate(address, GetContext().ExecutionMode);
-
-            return (ulong)function.FuncPointer.ToInt64();
+            return (ulong)Context.Translator.GetOrTranslatePointer(framePointer, address, GetContext().ExecutionMode);
         }
 
         public static void InvalidateCacheLine(ulong address)
diff --git a/src/Ryujinx.Cpu/LightningJit/Translator.cs b/src/Ryujinx.Cpu/LightningJit/Translator.cs
index fc5df0a8e..a1ff448de 100644
--- a/src/Ryujinx.Cpu/LightningJit/Translator.cs
+++ b/src/Ryujinx.Cpu/LightningJit/Translator.cs
@@ -1,7 +1,9 @@
 using ARMeilleure.Common;
 using ARMeilleure.Memory;
 using ARMeilleure.Signal;
+using Ryujinx.Cpu.Jit;
 using Ryujinx.Cpu.LightningJit.Cache;
+using Ryujinx.Cpu.LightningJit.CodeGen.Arm64;
 using Ryujinx.Cpu.LightningJit.State;
 using System;
 using System.Collections.Concurrent;
@@ -13,6 +15,9 @@ namespace Ryujinx.Cpu.LightningJit
 {
     class Translator : IDisposable
     {
+        // Should be enabled on platforms that enforce W^X.
+        private static bool IsNoWxPlatform => false;
+
         private static readonly AddressTable<ulong>.Level[] _levels64Bit =
             new AddressTable<ulong>.Level[]
             {
@@ -33,6 +38,7 @@ namespace Ryujinx.Cpu.LightningJit
             };
 
         private readonly ConcurrentQueue<KeyValuePair<ulong, TranslatedFunction>> _oldFuncs;
+        private readonly NoWxCache _noWxCache;
         private bool _disposed;
 
         internal TranslatorCache<TranslatedFunction> Functions { get; }
@@ -46,12 +52,20 @@ namespace Ryujinx.Cpu.LightningJit
 
             _oldFuncs = new ConcurrentQueue<KeyValuePair<ulong, TranslatedFunction>>();
 
-            JitCache.Initialize(allocator);
+            if (IsNoWxPlatform)
+            {
+                _noWxCache = new(new JitMemoryAllocator(), CreateStackWalker(), this);
+            }
+            else
+            {
+                JitCache.Initialize(allocator);
+            }
+
             NativeSignalHandler.Initialize(allocator);
 
             Functions = new TranslatorCache<TranslatedFunction>();
             FunctionTable = new AddressTable<ulong>(for64Bits ? _levels64Bit : _levels32Bit);
-            Stubs = new TranslatorStubs(FunctionTable);
+            Stubs = new TranslatorStubs(FunctionTable, _noWxCache);
 
             FunctionTable.Fill = (ulong)Stubs.SlowDispatchStub;
 
@@ -61,6 +75,18 @@ namespace Ryujinx.Cpu.LightningJit
             }
         }
 
+        private static IStackWalker CreateStackWalker()
+        {
+            if (RuntimeInformation.ProcessArchitecture == Architecture.Arm64)
+            {
+                return new StackWalker();
+            }
+            else
+            {
+                throw new PlatformNotSupportedException();
+            }
+        }
+
         public void Execute(State.ExecutionContext context, ulong address)
         {
             ObjectDisposedException.ThrowIf(_disposed, this);
@@ -70,9 +96,22 @@ namespace Ryujinx.Cpu.LightningJit
             Stubs.DispatchLoop(context.NativeContextPtr, address);
 
             NativeInterface.UnregisterThread();
+            _noWxCache?.ClearEntireThreadLocalCache();
         }
 
-        internal TranslatedFunction GetOrTranslate(ulong address, ExecutionMode mode)
+        internal IntPtr GetOrTranslatePointer(IntPtr framePointer, ulong address, ExecutionMode mode)
+        {
+            if (_noWxCache != null)
+            {
+                CompiledFunction func = Compile(address, mode);
+
+                return _noWxCache.Map(framePointer, func.Code, address, (ulong)func.GuestCodeLength);
+            }
+
+            return GetOrTranslate(address, mode).FuncPointer;
+        }
+
+        private TranslatedFunction GetOrTranslate(ulong address, ExecutionMode mode)
         {
             if (!Functions.TryGetValue(address, out TranslatedFunction func))
             {
@@ -86,7 +125,6 @@ namespace Ryujinx.Cpu.LightningJit
                     func = oldFunc;
                 }
 
-
                 RegisterFunction(address, func);
             }
 
@@ -103,13 +141,17 @@ namespace Ryujinx.Cpu.LightningJit
 
         internal TranslatedFunction Translate(ulong address, ExecutionMode mode)
         {
-            CompiledFunction func = AarchCompiler.Compile(CpuPresets.CortexA57, Memory, address, FunctionTable, Stubs.DispatchStub, mode, RuntimeInformation.ProcessArchitecture);
-
+            CompiledFunction func = Compile(address, mode);
             IntPtr funcPointer = JitCache.Map(func.Code);
 
             return new TranslatedFunction(funcPointer, (ulong)func.GuestCodeLength);
         }
 
+        internal CompiledFunction Compile(ulong address, ExecutionMode mode)
+        {
+            return AarchCompiler.Compile(CpuPresets.CortexA57, Memory, address, FunctionTable, Stubs.DispatchStub, mode, RuntimeInformation.ProcessArchitecture);
+        }
+
         public void InvalidateJitCacheRegion(ulong address, ulong size)
         {
             ulong[] overlapAddresses = Array.Empty<ulong>();
@@ -160,7 +202,14 @@ namespace Ryujinx.Cpu.LightningJit
             {
                 if (disposing)
                 {
-                    ClearJitCache();
+                    if (_noWxCache != null)
+                    {
+                        _noWxCache.Dispose();
+                    }
+                    else
+                    {
+                        ClearJitCache();
+                    }
 
                     Stubs.Dispose();
                     FunctionTable.Dispose();
diff --git a/src/Ryujinx.Cpu/LightningJit/TranslatorStubs.cs b/src/Ryujinx.Cpu/LightningJit/TranslatorStubs.cs
index 3a0b78982..914712bb1 100644
--- a/src/Ryujinx.Cpu/LightningJit/TranslatorStubs.cs
+++ b/src/Ryujinx.Cpu/LightningJit/TranslatorStubs.cs
@@ -17,13 +17,14 @@ namespace Ryujinx.Cpu.LightningJit
     /// </summary>
     class TranslatorStubs : IDisposable
     {
-        private delegate ulong GetFunctionAddressDelegate(ulong address);
+        private delegate ulong GetFunctionAddressDelegate(IntPtr framePointer, ulong address);
 
         private readonly Lazy<IntPtr> _slowDispatchStub;
 
         private bool _disposed;
 
         private readonly AddressTable<ulong> _functionTable;
+        private readonly NoWxCache _noWxCache;
         private readonly GetFunctionAddressDelegate _getFunctionAddressRef;
         private readonly IntPtr _getFunctionAddress;
         private readonly Lazy<IntPtr> _dispatchStub;
@@ -76,12 +77,14 @@ namespace Ryujinx.Cpu.LightningJit
         /// <see cref="Translator"/> instance.
         /// </summary>
         /// <param name="functionTable">Function table used to store pointers to the functions that the guest code will call</param>
+        /// <param name="noWxCache">Cache to use on platforms that enforce W^X; should be null on other platforms</param>
         /// <exception cref="ArgumentNullException"><paramref name="translator"/> is null</exception>
-        public TranslatorStubs(AddressTable<ulong> functionTable)
+        public TranslatorStubs(AddressTable<ulong> functionTable, NoWxCache noWxCache)
         {
             ArgumentNullException.ThrowIfNull(functionTable);
 
             _functionTable = functionTable;
+            _noWxCache = noWxCache;
             _getFunctionAddressRef = NativeInterface.GetFunctionAddress;
             _getFunctionAddress = Marshal.GetFunctionPointerForDelegate(_getFunctionAddressRef);
             _slowDispatchStub = new(GenerateSlowDispatchStub, isThreadSafe: true);
@@ -106,14 +109,17 @@ namespace Ryujinx.Cpu.LightningJit
         {
             if (!_disposed)
             {
-                if (_dispatchStub.IsValueCreated)
+                if (_noWxCache == null)
                 {
-                    JitCache.Unmap(_dispatchStub.Value);
-                }
+                    if (_dispatchStub.IsValueCreated)
+                    {
+                        JitCache.Unmap(_dispatchStub.Value);
+                    }
 
-                if (_dispatchLoop.IsValueCreated)
-                {
-                    JitCache.Unmap(Marshal.GetFunctionPointerForDelegate(_dispatchLoop.Value));
+                    if (_dispatchLoop.IsValueCreated)
+                    {
+                        JitCache.Unmap(Marshal.GetFunctionPointerForDelegate(_dispatchLoop.Value));
+                    }
                 }
 
                 _disposed = true;
@@ -197,7 +203,8 @@ namespace Ryujinx.Cpu.LightningJit
                 }
 
                 // Fallback.
-                asm.Mov(Register(0), guestAddress);
+                asm.Mov(Register(0), Register(29));
+                asm.Mov(Register(1), guestAddress);
                 asm.Mov(Register(16), (ulong)_getFunctionAddress);
                 asm.Blr(Register(16));
                 asm.Mov(Register(16), Register(0));
@@ -212,7 +219,7 @@ namespace Ryujinx.Cpu.LightningJit
                 throw new PlatformNotSupportedException();
             }
 
-            return JitCache.Map(writer.AsByteSpan());
+            return Map(writer.AsByteSpan());
         }
 
         /// <summary>
@@ -234,7 +241,8 @@ namespace Ryujinx.Cpu.LightningJit
                 asm.Mov(context, Register(0));
 
                 // Load the target guest address from the native context.
-                asm.LdrRiUn(Register(0), context, NativeContext.GetDispatchAddressOffset());
+                asm.Mov(Register(0), Register(29));
+                asm.LdrRiUn(Register(1), context, NativeContext.GetDispatchAddressOffset());
                 asm.Mov(Register(16), (ulong)_getFunctionAddress);
                 asm.Blr(Register(16));
                 asm.Mov(Register(16), Register(0));
@@ -249,7 +257,7 @@ namespace Ryujinx.Cpu.LightningJit
                 throw new PlatformNotSupportedException();
             }
 
-            return JitCache.Map(writer.AsByteSpan());
+            return Map(writer.AsByteSpan());
         }
 
         /// <summary>
@@ -312,7 +320,7 @@ namespace Ryujinx.Cpu.LightningJit
                 Operand context = Register(19);
                 asm.Mov(context, Register(0));
 
-                EmitSyncFpContext(ref asm, context, Register(16), Register(17), true);
+                EmitSyncFpContext(ref asm, context, Register(16, OperandType.I32), Register(17, OperandType.I32), true);
 
                 // Load the target guest address from the native context.
                 Operand guestAddress = Register(16);
@@ -331,7 +339,7 @@ namespace Ryujinx.Cpu.LightningJit
                 asm.Cbz(Register(17), 8);
                 asm.B((loopStartIndex - writer.InstructionPointer) * 4);
 
-                EmitSyncFpContext(ref asm, context, Register(16), Register(17), false);
+                EmitSyncFpContext(ref asm, context, Register(16, OperandType.I32), Register(17, OperandType.I32), false);
 
                 rsr.WriteEpilogue(ref asm);
 
@@ -342,11 +350,23 @@ namespace Ryujinx.Cpu.LightningJit
                 throw new PlatformNotSupportedException();
             }
 
-            IntPtr pointer = JitCache.Map(writer.AsByteSpan());
+            IntPtr pointer = Map(writer.AsByteSpan());
 
             return Marshal.GetDelegateForFunctionPointer<DispatcherFunction>(pointer);
         }
 
+        private IntPtr Map(ReadOnlySpan<byte> code)
+        {
+            if (_noWxCache != null)
+            {
+                return _noWxCache.MapPageAligned(code);
+            }
+            else
+            {
+                return JitCache.Map(code);
+            }
+        }
+
         private static Operand Register(int register, OperandType type = OperandType.I64)
         {
             return new Operand(register, RegisterType.Integer, type);