From 5a7df48975bcb04b1805031a26f5007211fe4c62 Mon Sep 17 00:00:00 2001
From: gdkchan <gab.dark.100@gmail.com>
Date: Thu, 23 Jul 2020 23:53:25 -0300
Subject: [PATCH] New GPFifo and fast guest constant buffer updates (#1400)

* Add new structures from official docs, start migrating GPFifo

* Finish migration to new GPFifo processor

* Implement fast constant buffer data upload

* Migrate to new GPFifo class

* XML docs
---
 Ryujinx.Graphics.Device/DeviceState.cs        |   6 +-
 Ryujinx.Graphics.Gpu/DmaPusher.cs             | 316 ------------------
 .../Engine/GPFifo/CompressedMethod.cs         |  39 +++
 Ryujinx.Graphics.Gpu/Engine/GPFifo/GPEntry.cs |  51 +++
 .../Engine/GPFifo/GPFifoClass.cs              | 214 ++++++++++++
 .../Engine/GPFifo/GPFifoClassState.cs         | 186 +++++++++++
 .../Engine/GPFifo/GPFifoDevice.cs             | 188 +++++++++++
 .../Engine/GPFifo/GPFifoProcessor.cs          | 179 ++++++++++
 Ryujinx.Graphics.Gpu/Engine/MME/Macro.cs      |  69 ++++
 Ryujinx.Graphics.Gpu/Engine/MethodFifo.cs     | 103 ------
 .../Engine/MethodUniformBufferUpdate.cs       |  18 +
 Ryujinx.Graphics.Gpu/Engine/Methods.cs        |  14 -
 Ryujinx.Graphics.Gpu/GpuContext.cs            |  15 +-
 Ryujinx.Graphics.Gpu/NvGpuFifo.cs             | 220 ------------
 .../Ryujinx.Graphics.Gpu.csproj               |   1 +
 .../State/FenceActionOperation.cs             |  11 -
 .../State/FifoSemaphoreOperation.cs           |   9 -
 Ryujinx.Graphics.Gpu/State/MethodOffset.cs    |   9 -
 .../NvHostChannel/NvHostChannelDeviceFile.cs  |   8 +-
 Ryujinx.HLE/Switch.cs                         |   4 +-
 20 files changed, 958 insertions(+), 702 deletions(-)
 delete mode 100644 Ryujinx.Graphics.Gpu/DmaPusher.cs
 create mode 100644 Ryujinx.Graphics.Gpu/Engine/GPFifo/CompressedMethod.cs
 create mode 100644 Ryujinx.Graphics.Gpu/Engine/GPFifo/GPEntry.cs
 create mode 100644 Ryujinx.Graphics.Gpu/Engine/GPFifo/GPFifoClass.cs
 create mode 100644 Ryujinx.Graphics.Gpu/Engine/GPFifo/GPFifoClassState.cs
 create mode 100644 Ryujinx.Graphics.Gpu/Engine/GPFifo/GPFifoDevice.cs
 create mode 100644 Ryujinx.Graphics.Gpu/Engine/GPFifo/GPFifoProcessor.cs
 create mode 100644 Ryujinx.Graphics.Gpu/Engine/MME/Macro.cs
 delete mode 100644 Ryujinx.Graphics.Gpu/Engine/MethodFifo.cs
 delete mode 100644 Ryujinx.Graphics.Gpu/NvGpuFifo.cs
 delete mode 100644 Ryujinx.Graphics.Gpu/State/FenceActionOperation.cs
 delete mode 100644 Ryujinx.Graphics.Gpu/State/FifoSemaphoreOperation.cs

diff --git a/Ryujinx.Graphics.Device/DeviceState.cs b/Ryujinx.Graphics.Device/DeviceState.cs
index ea6942ec..740d8589 100644
--- a/Ryujinx.Graphics.Device/DeviceState.cs
+++ b/Ryujinx.Graphics.Device/DeviceState.cs
@@ -90,14 +90,12 @@ namespace Ryujinx.Graphics.Device
             {
                 int alignedOffset = Align(offset);
 
+                GetRef<int>(alignedOffset) = data;
+
                 if (_writeCallbacks.TryGetValue(alignedOffset, out Action<int> write))
                 {
                     write(data);
                 }
-                else
-                {
-                    GetRef<int>(alignedOffset) = data;
-                }
             }
         }
 
diff --git a/Ryujinx.Graphics.Gpu/DmaPusher.cs b/Ryujinx.Graphics.Gpu/DmaPusher.cs
deleted file mode 100644
index 3b5ac830..00000000
--- a/Ryujinx.Graphics.Gpu/DmaPusher.cs
+++ /dev/null
@@ -1,316 +0,0 @@
-using System;
-using System.Collections.Concurrent;
-using System.Runtime.InteropServices;
-using System.Threading;
-
-namespace Ryujinx.Graphics.Gpu
-{
-    /// <summary>
-    /// GPU DMA pusher, used to push commands to the GPU.
-    /// </summary>
-    public class DmaPusher
-    {
-        private ConcurrentQueue<CommandBuffer> _commandBufferQueue;
-
-        private enum CommandBufferType
-        {
-            Prefetch,
-            NoPrefetch,
-        }
-
-        private struct CommandBuffer
-        {
-            /// <summary>
-            /// The type of the command buffer.
-            /// </summary>
-            public CommandBufferType Type;
-
-            /// <summary>
-            /// Fetched data.
-            /// </summary>
-            public int[] Words;
-
-            /// <summary>
-            /// The GPFIFO entry address. (used in NoPrefetch mode)
-            /// </summary>
-            public ulong EntryAddress;
-
-            /// <summary>
-            /// The count of entries inside this GPFIFO entry.
-            /// </summary>
-            public uint EntryCount;
-
-            /// <summary>
-            /// Fetch the command buffer.
-            /// </summary>
-            public void Fetch(GpuContext context)
-            {
-                if (Words == null)
-                {
-                    Words = MemoryMarshal.Cast<byte, int>(context.MemoryAccessor.GetSpan(EntryAddress, (int)EntryCount * 4)).ToArray();
-                }
-            }
-
-            /// <summary>
-            /// Read inside the command buffer.
-            /// </summary>
-            /// <param name="context">The GPU context</param>
-            /// <param name="index">The index inside the command buffer</param>
-            /// <returns>The value read</returns>
-            public int ReadAt(GpuContext context, int index)
-            {
-                return Words[index];
-            }
-        }
-
-        private CommandBuffer _currentCommandBuffer;
-        private int           _wordsPosition;
-
-        /// <summary>
-        /// Internal GPFIFO state.
-        /// </summary>
-        private struct DmaState
-        {
-            public int  Method;
-            public int  SubChannel;
-            public int  MethodCount;
-            public bool NonIncrementing;
-            public bool IncrementOnce;
-            public int  LengthPending;
-        }
-
-        private DmaState _state;
-
-        private bool _ibEnable;
-
-        private GpuContext _context;
-
-        private AutoResetEvent _event;
-
-        /// <summary>
-        /// Creates a new instance of the GPU DMA pusher.
-        /// </summary>
-        /// <param name="context">GPU context that the pusher belongs to</param>
-        internal DmaPusher(GpuContext context)
-        {
-            _context = context;
-
-            _ibEnable = true;
-
-            _commandBufferQueue = new ConcurrentQueue<CommandBuffer>();
-
-            _event = new AutoResetEvent(false);
-        }
-
-        /// <summary>
-        /// Signal the pusher that there are new entries to process.
-        /// </summary>
-        public void SignalNewEntries()
-        {
-            _event.Set();
-        }
-
-        /// <summary>
-        /// Push a GPFIFO entry in the form of a prefetched command buffer.
-        /// It is intended to be used by nvservices to handle special cases.
-        /// </summary>
-        /// <param name="commandBuffer">The command buffer containing the prefetched commands</param>
-        public void PushHostCommandBuffer(int[] commandBuffer)
-        {
-            _commandBufferQueue.Enqueue(new CommandBuffer
-            {
-                Type         = CommandBufferType.Prefetch,
-                Words        = commandBuffer,
-                EntryAddress = ulong.MaxValue,
-                EntryCount   = (uint)commandBuffer.Length
-            });
-        }
-
-        /// <summary>
-        /// Create a CommandBuffer from a GPFIFO entry.
-        /// </summary>
-        /// <param name="entry">The GPFIFO entry</param>
-        /// <returns>A new CommandBuffer based on the GPFIFO entry</returns>
-        private CommandBuffer CreateCommandBuffer(ulong entry)
-        {
-            ulong length       = (entry >> 42) & 0x1fffff;
-            ulong startAddress = entry & 0xfffffffffc;
-
-            bool noPrefetch = (entry & (1UL << 63)) != 0;
-
-            CommandBufferType type = CommandBufferType.Prefetch;
-
-            if (noPrefetch)
-            {
-                type = CommandBufferType.NoPrefetch;
-            }
-
-            return new CommandBuffer
-            {
-                Type         = type,
-                Words        = null,
-                EntryAddress = startAddress,
-                EntryCount   = (uint)length
-            };
-        }
-
-        /// <summary>
-        /// Pushes GPFIFO entries.
-        /// </summary>
-        /// <param name="entries">GPFIFO entries</param>
-        public void PushEntries(ReadOnlySpan<ulong> entries)
-        {
-            bool beforeBarrier = true;
-
-            foreach (ulong entry in entries)
-            {
-                CommandBuffer commandBuffer = CreateCommandBuffer(entry);
-
-                if (beforeBarrier && commandBuffer.Type == CommandBufferType.Prefetch)
-                {
-                    commandBuffer.Fetch(_context);
-                }
-
-                if (commandBuffer.Type == CommandBufferType.NoPrefetch)
-                {
-                    beforeBarrier = false;
-                }
-
-                _commandBufferQueue.Enqueue(commandBuffer);
-            }
-        }
-
-        /// <summary>
-        /// Waits until commands are pushed to the FIFO.
-        /// </summary>
-        /// <returns>True if commands were received, false if wait timed out</returns>
-        public bool WaitForCommands()
-        {
-            return _event.WaitOne(8);
-        }
-
-        /// <summary>
-        /// Processes commands pushed to the FIFO.
-        /// </summary>
-        public void DispatchCalls()
-        {
-            while (Step());
-        }
-
-        /// <summary>
-        /// Processes a single command on the FIFO.
-        /// </summary>
-        /// <returns>True if the FIFO still has commands to be processed, false otherwise</returns>
-        private bool Step()
-        {
-            if (_wordsPosition != _currentCommandBuffer.EntryCount)
-            {
-                int word = _currentCommandBuffer.ReadAt(_context, _wordsPosition++);
-
-                if (_state.LengthPending != 0)
-                {
-                    _state.LengthPending = 0;
-                    _state.MethodCount   = word & 0xffffff;
-                }
-                else if (_state.MethodCount != 0)
-                {
-                    CallMethod(word);
-
-                    if (!_state.NonIncrementing)
-                    {
-                        _state.Method++;
-                    }
-
-                    if (_state.IncrementOnce)
-                    {
-                        _state.NonIncrementing = true;
-                    }
-
-                    _state.MethodCount--;
-                }
-                else
-                {
-                    int submissionMode = (word >> 29) & 7;
-
-                    switch (submissionMode)
-                    {
-                        case 1:
-                            // Incrementing.
-                            SetNonImmediateState(word);
-
-                            _state.NonIncrementing = false;
-                            _state.IncrementOnce   = false;
-
-                            break;
-
-                        case 3:
-                            // Non-incrementing.
-                            SetNonImmediateState(word);
-
-                            _state.NonIncrementing = true;
-                            _state.IncrementOnce   = false;
-
-                            break;
-
-                        case 4:
-                            // Immediate.
-                            _state.Method          = (word >> 0)  & 0x1fff;
-                            _state.SubChannel      = (word >> 13) & 7;
-                            _state.NonIncrementing = true;
-                            _state.IncrementOnce   = false;
-
-                            CallMethod((word >> 16) & 0x1fff);
-
-                            break;
-
-                        case 5:
-                            // Increment-once.
-                            SetNonImmediateState(word);
-
-                            _state.NonIncrementing = false;
-                            _state.IncrementOnce   = true;
-
-                            break;
-                    }
-                }
-            }
-            else if (_ibEnable && _commandBufferQueue.TryDequeue(out CommandBuffer entry))
-            {
-                _currentCommandBuffer = entry;
-                _wordsPosition        = 0;
-
-                _currentCommandBuffer.Fetch(_context);
-            }
-            else
-            {
-                return false;
-            }
-
-            return true;
-        }
-
-        /// <summary>
-        /// Sets current non-immediate method call state.
-        /// </summary>
-        /// <param name="word">Compressed method word</param>
-        private void SetNonImmediateState(int word)
-        {
-            _state.Method      = (word >> 0)  & 0x1fff;
-            _state.SubChannel  = (word >> 13) & 7;
-            _state.MethodCount = (word >> 16) & 0x1fff;
-        }
-
-        /// <summary>
-        /// Forwards the method call to GPU engines.
-        /// </summary>
-        /// <param name="argument">Call argument</param>
-        private void CallMethod(int argument)
-        {
-            _context.Fifo.CallMethod(new MethodParams(
-                _state.Method,
-                argument,
-                _state.SubChannel,
-                _state.MethodCount));
-        }
-    }
-}
\ No newline at end of file
diff --git a/Ryujinx.Graphics.Gpu/Engine/GPFifo/CompressedMethod.cs b/Ryujinx.Graphics.Gpu/Engine/GPFifo/CompressedMethod.cs
new file mode 100644
index 00000000..9a213489
--- /dev/null
+++ b/Ryujinx.Graphics.Gpu/Engine/GPFifo/CompressedMethod.cs
@@ -0,0 +1,39 @@
+// This file was auto-generated from NVIDIA official Maxwell definitions.
+
+namespace Ryujinx.Graphics.Gpu.Engine.GPFifo
+{
+    enum TertOp
+    {
+        Grp0IncMethod = 0,
+        Grp0SetSubDevMask = 1,
+        Grp0StoreSubDevMask = 2,
+        Grp0UseSubDevMask = 3,
+        Grp2NonIncMethod = 0
+    }
+
+    enum SecOp
+    {
+        Grp0UseTert = 0,
+        IncMethod = 1,
+        Grp2UseTert = 2,
+        NonIncMethod = 3,
+        ImmdDataMethod = 4,
+        OneInc = 5,
+        Reserved6 = 6,
+        EndPbSegment = 7
+    }
+
+    struct CompressedMethod
+    {
+        public uint Method;
+        public int MethodAddressOld => (int)((Method >> 2) & 0x7FF);
+        public int MethodAddress => (int)((Method >> 0) & 0xFFF);
+        public int SubdeviceMask => (int)((Method >> 4) & 0xFFF);
+        public int MethodSubchannel => (int)((Method >> 13) & 0x7);
+        public TertOp TertOp => (TertOp)((Method >> 16) & 0x3);
+        public int MethodCountOld => (int)((Method >> 18) & 0x7FF);
+        public int MethodCount => (int)((Method >> 16) & 0x1FFF);
+        public int ImmdData => (int)((Method >> 16) & 0x1FFF);
+        public SecOp SecOp => (SecOp)((Method >> 29) & 0x7);
+    }
+}
diff --git a/Ryujinx.Graphics.Gpu/Engine/GPFifo/GPEntry.cs b/Ryujinx.Graphics.Gpu/Engine/GPFifo/GPEntry.cs
new file mode 100644
index 00000000..9866cd2e
--- /dev/null
+++ b/Ryujinx.Graphics.Gpu/Engine/GPFifo/GPEntry.cs
@@ -0,0 +1,51 @@
+// This file was auto-generated from NVIDIA official Maxwell definitions.
+
+namespace Ryujinx.Graphics.Gpu.Engine.GPFifo
+{
+    enum Entry0Fetch
+    {
+        Unconditional = 0,
+        Conditional = 1,
+    }
+
+    enum Entry1Priv
+    {
+        User = 0,
+        Kernel = 1,
+    }
+
+    enum Entry1Level
+    {
+        Main = 0,
+        Subroutine = 1,
+    }
+
+    enum Entry1Sync
+    {
+        Proceed = 0,
+        Wait = 1,
+    }
+
+    enum Entry1Opcode
+    {
+        Nop = 0,
+        Illegal = 1,
+        Crc = 2,
+        PbCrc = 3,
+    }
+
+    struct GPEntry
+    {
+        public uint Entry0;
+        public Entry0Fetch Entry0Fetch => (Entry0Fetch)((Entry0 >> 0) & 0x1);
+        public int Entry0Get => (int)((Entry0 >> 2) & 0x3FFFFFFF);
+        public int Entry0Operand => (int)(Entry0);
+        public uint Entry1;
+        public int Entry1GetHi => (int)((Entry1 >> 0) & 0xFF);
+        public Entry1Priv Entry1Priv => (Entry1Priv)((Entry1 >> 8) & 0x1);
+        public Entry1Level Entry1Level => (Entry1Level)((Entry1 >> 9) & 0x1);
+        public int Entry1Length => (int)((Entry1 >> 10) & 0x1FFFFF);
+        public Entry1Sync Entry1Sync => (Entry1Sync)((Entry1 >> 31) & 0x1);
+        public Entry1Opcode Entry1Opcode => (Entry1Opcode)((Entry1 >> 0) & 0xFF);
+    }
+}
diff --git a/Ryujinx.Graphics.Gpu/Engine/GPFifo/GPFifoClass.cs b/Ryujinx.Graphics.Gpu/Engine/GPFifo/GPFifoClass.cs
new file mode 100644
index 00000000..ec2e4bdc
--- /dev/null
+++ b/Ryujinx.Graphics.Gpu/Engine/GPFifo/GPFifoClass.cs
@@ -0,0 +1,214 @@
+using Ryujinx.Graphics.Device;
+using Ryujinx.Graphics.Gpu.Engine.MME;
+using Ryujinx.Graphics.Gpu.State;
+using System;
+using System.Collections.Generic;
+using System.Threading;
+
+namespace Ryujinx.Graphics.Gpu.Engine.GPFifo
+{
+    /// <summary>
+    /// Represents a GPU General Purpose FIFO class.
+    /// </summary>
+    class GPFifoClass : IDeviceState
+    {
+        private readonly GpuContext _context;
+        private readonly DeviceState<GPFifoClassState> _state;
+
+        private const int MacrosCount = 0x80;
+
+        // Note: The size of the macro memory is unknown, we just make
+        // a guess here and use 256 KiB (64K 32-bit words) as the size. Increase if needed.
+        private const int MacroCodeSize = 256 * 256;
+
+        private readonly Macro[] _macros;
+        private readonly int[] _macroCode;
+
+        /// <summary>
+        /// MME Shadow RAM Control.
+        /// </summary>
+        public ShadowRamControl ShadowCtrl { get; private set; }
+
+        /// <summary>
+        /// Creates a new instance of the GPU General Purpose FIFO class.
+        /// </summary>
+        /// <param name="context">GPU context</param>
+        public GPFifoClass(GpuContext context)
+        {
+            _context = context;
+            _state = new DeviceState<GPFifoClassState>(new Dictionary<string, RwCallback>
+            {
+                { nameof(GPFifoClassState.Semaphored), new RwCallback(Semaphored, null) },
+                { nameof(GPFifoClassState.Syncpointb), new RwCallback(Syncpointb, null) },
+                { nameof(GPFifoClassState.WaitForIdle), new RwCallback(WaitForIdle, null) },
+                { nameof(GPFifoClassState.LoadMmeInstructionRam), new RwCallback(LoadMmeInstructionRam, null) },
+                { nameof(GPFifoClassState.LoadMmeStartAddressRam), new RwCallback(LoadMmeStartAddressRam, null) },
+                { nameof(GPFifoClassState.SetMmeShadowRamControl), new RwCallback(SetMmeShadowRamControl, null) }
+            });
+
+            _macros = new Macro[MacrosCount];
+            _macroCode = new int[MacroCodeSize];
+        }
+
+        /// <summary>
+        /// Reads data from the class registers.
+        /// </summary>
+        /// <param name="offset">Register byte offset</param>
+        /// <returns>Data at the specified offset</returns>
+        public int Read(int offset) => _state.Read(offset);
+
+        /// <summary>
+        /// Writes data to the class registers.
+        /// </summary>
+        /// <param name="offset">Register byte offset</param>
+        /// <param name="data">Data to be written</param>
+        public void Write(int offset, int data) => _state.Write(offset, data);
+
+        /// <summary>
+        /// Performs a semaphore operation: release writes the payload to memory, reduction merges the payload with the value in memory and writes the result back.
+        /// </summary>
+        /// <param name="argument">Method call argument (not read here, the operation is decoded from the register state)</param>
+        public void Semaphored(int argument)
+        {
+            ulong address = ((ulong)_state.State.SemaphorebOffsetLower << 2) |
+                            ((ulong)_state.State.SemaphoreaOffsetUpper << 32);
+
+            int value = _state.State.SemaphorecPayload;
+
+            SemaphoredOperation operation = _state.State.SemaphoredOperation;
+
+            // TODO: Acquire operations (Wait), interrupts for invalid combinations (both are currently ignored).
+            if (operation == SemaphoredOperation.Release)
+            {
+                _context.MemoryAccessor.Write(address, value);
+            }
+            else if (operation == SemaphoredOperation.Reduction)
+            {
+                bool signed = _state.State.SemaphoredFormat == SemaphoredFormat.Signed;
+
+                int mem = _context.MemoryAccessor.Read<int>(address);
+
+                switch (_state.State.SemaphoredReduction)
+                {
+                    case SemaphoredReduction.Min:
+                        value = signed ? Math.Min(mem, value) : (int)Math.Min((uint)mem, (uint)value);
+                        break;
+                    case SemaphoredReduction.Max:
+                        value = signed ? Math.Max(mem, value) : (int)Math.Max((uint)mem, (uint)value);
+                        break;
+                    case SemaphoredReduction.Xor:
+                        value ^= mem;
+                        break;
+                    case SemaphoredReduction.And:
+                        value &= mem;
+                        break;
+                    case SemaphoredReduction.Or:
+                        value |= mem;
+                        break;
+                    case SemaphoredReduction.Add:
+                        value += mem;
+                        break;
+                    case SemaphoredReduction.Inc:
+                        value = (uint)mem < (uint)value ? mem + 1 : 0;
+                        break;
+                    case SemaphoredReduction.Dec:
+                        value = (uint)mem > 0 && (uint)mem <= (uint)value ? mem - 1 : value;
+                        break;
+                }
+
+                _context.MemoryAccessor.Write(address, value);
+            }
+        }
+
+        /// <summary>
+        /// Applies a fence operation to a syncpoint: either waits on it using the Syncpointa payload as threshold, or increments it.
+        /// </summary>
+        /// <param name="argument">Method call argument (not read here, the operation is decoded from the register state)</param>
+        public void Syncpointb(int argument)
+        {
+            SyncpointbOperation operation = _state.State.SyncpointbOperation;
+
+            uint syncpointId = (uint)_state.State.SyncpointbSyncptIndex;
+
+            if (operation == SyncpointbOperation.Wait)
+            {
+                uint threshold = (uint)_state.State.SyncpointaPayload;
+
+                _context.Synchronization.WaitOnSyncpoint(syncpointId, threshold, Timeout.InfiniteTimeSpan);
+            }
+            else if (operation == SyncpointbOperation.Incr)
+            {
+                _context.Synchronization.IncrementSyncpoint(syncpointId);
+            }
+
+            _context.AdvanceSequence();
+        }
+
+        /// <summary>
+        /// Waits for the GPU to be idle.
+        /// </summary>
+        /// <param name="argument">Method call argument</param>
+        public void WaitForIdle(int argument)
+        {
+            _context.Methods.PerformDeferredDraws();
+            _context.Renderer.Pipeline.Barrier();
+        }
+
+        /// <summary>
+        /// Stores a word of macro code at the current MME instruction RAM pointer, then advances the pointer.
+        /// </summary>
+        /// <param name="argument">Method call argument, the word to be stored</param>
+        public void LoadMmeInstructionRam(int argument)
+        {
+            _macroCode[_state.State.LoadMmeInstructionRamPointer++] = argument;
+        }
+
+        /// <summary>
+        /// Creates the macro for the slot at the current MME start address RAM pointer, then advances the pointer.
+        /// </summary>
+        /// <param name="argument">Method call argument, passed to the new macro</param>
+        public void LoadMmeStartAddressRam(int argument)
+        {
+            _macros[_state.State.LoadMmeStartAddressRamPointer++] = new Macro(argument);
+        }
+
+        /// <summary>
+        /// Changes the MME shadow RAM control setting.
+        /// </summary>
+        /// <param name="argument">Method call argument, the new shadow RAM control value</param>
+        public void SetMmeShadowRamControl(int argument)
+        {
+            ShadowCtrl = (ShadowRamControl)argument;
+        }
+
+        /// <summary>
+        /// Pushes an argument to a macro.
+        /// </summary>
+        /// <param name="index">Index of the macro</param>
+        /// <param name="argument">Argument to be pushed to the macro</param>
+        public void MmePushArgument(int index, int argument)
+        {
+            _macros[index].PushArgument(argument);
+        }
+
+        /// <summary>
+        /// Prepares a macro for execution.
+        /// </summary>
+        /// <param name="index">Index of the macro</param>
+        /// <param name="argument">Initial argument passed to the macro</param>
+        public void MmeStart(int index, int argument)
+        {
+            _macros[index].StartExecution(argument);
+        }
+
+        /// <summary>
+        /// Executes a macro.
+        /// </summary>
+        /// <param name="index">Index of the macro</param>
+        /// <param name="state">Current GPU state</param>
+        public void CallMme(int index, GpuState state)
+        {
+            _macros[index].Execute(_macroCode, ShadowCtrl, state);
+        }
+    }
+}
diff --git a/Ryujinx.Graphics.Gpu/Engine/GPFifo/GPFifoClassState.cs b/Ryujinx.Graphics.Gpu/Engine/GPFifo/GPFifoClassState.cs
new file mode 100644
index 00000000..3b282668
--- /dev/null
+++ b/Ryujinx.Graphics.Gpu/Engine/GPFifo/GPFifoClassState.cs
@@ -0,0 +1,186 @@
+// This file was auto-generated from NVIDIA official Maxwell definitions.
+
+using Ryujinx.Common.Memory;
+
+namespace Ryujinx.Graphics.Gpu.Engine.GPFifo
+{
+    enum SemaphoredOperation
+    {
+        Acquire = 1,
+        Release = 2,
+        AcqGeq = 4,
+        AcqAnd = 8,
+        Reduction = 16
+    }
+
+    enum SemaphoredAcquireSwitch
+    {
+        Disabled = 0,
+        Enabled = 1
+    }
+
+    enum SemaphoredReleaseWfi
+    {
+        En = 0,
+        Dis = 1
+    }
+
+    enum SemaphoredReleaseSize
+    {
+        SixteenBytes = 0,
+        FourBytes = 1
+    }
+
+    enum SemaphoredReduction
+    {
+        Min = 0,
+        Max = 1,
+        Xor = 2,
+        And = 3,
+        Or = 4,
+        Add = 5,
+        Inc = 6,
+        Dec = 7
+    }
+
+    enum SemaphoredFormat
+    {
+        Signed = 0,
+        Unsigned = 1
+    }
+
+    enum MemOpCTlbInvalidatePdb
+    {
+        One = 0,
+        All = 1
+    }
+
+    enum MemOpCTlbInvalidateGpc
+    {
+        Enable = 0,
+        Disable = 1
+    }
+
+    enum MemOpCTlbInvalidateTarget
+    {
+        VidMem = 0,
+        SysMemCoherent = 2,
+        SysMemNoncoherent = 3
+    }
+
+    enum MemOpDOperation
+    {
+        Membar = 5,
+        MmuTlbInvalidate = 9,
+        L2PeermemInvalidate = 13,
+        L2SysmemInvalidate = 14,
+        L2CleanComptags = 15,
+        L2FlushDirty = 16
+    }
+
+    enum SyncpointbOperation
+    {
+        Wait = 0,
+        Incr = 1
+    }
+
+    enum SyncpointbWaitSwitch
+    {
+        Dis = 0,
+        En = 1
+    }
+
+    enum WfiScope
+    {
+        CurrentScgType = 0,
+        All = 1
+    }
+
+    enum YieldOp
+    {
+        Nop = 0,
+        PbdmaTimeslice = 1,
+        RunlistTimeslice = 2,
+        Tsg = 3
+    }
+
+    struct GPFifoClassState
+    {
+        public uint SetObject;
+        public int SetObjectNvclass => (int)((SetObject >> 0) & 0xFFFF);
+        public int SetObjectEngine => (int)((SetObject >> 16) & 0x1F);
+        public uint Illegal;
+        public int IllegalHandle => (int)(Illegal);
+        public uint Nop;
+        public int NopHandle => (int)(Nop);
+        public uint Reserved0C;
+        public uint Semaphorea;
+        public int SemaphoreaOffsetUpper => (int)((Semaphorea >> 0) & 0xFF);
+        public uint Semaphoreb;
+        public int SemaphorebOffsetLower => (int)((Semaphoreb >> 2) & 0x3FFFFFFF);
+        public uint Semaphorec;
+        public int SemaphorecPayload => (int)(Semaphorec);
+        public uint Semaphored;
+        public SemaphoredOperation SemaphoredOperation => (SemaphoredOperation)((Semaphored >> 0) & 0x1F);
+        public SemaphoredAcquireSwitch SemaphoredAcquireSwitch => (SemaphoredAcquireSwitch)((Semaphored >> 12) & 0x1);
+        public SemaphoredReleaseWfi SemaphoredReleaseWfi => (SemaphoredReleaseWfi)((Semaphored >> 20) & 0x1);
+        public SemaphoredReleaseSize SemaphoredReleaseSize => (SemaphoredReleaseSize)((Semaphored >> 24) & 0x1);
+        public SemaphoredReduction SemaphoredReduction => (SemaphoredReduction)((Semaphored >> 27) & 0xF);
+        public SemaphoredFormat SemaphoredFormat => (SemaphoredFormat)((Semaphored >> 31) & 0x1);
+        public uint NonStallInterrupt;
+        public int NonStallInterruptHandle => (int)(NonStallInterrupt);
+        public uint FbFlush;
+        public int FbFlushHandle => (int)(FbFlush);
+        public uint Reserved28;
+        public uint Reserved2C;
+        public uint MemOpC;
+        public int MemOpCOperandLow => (int)((MemOpC >> 2) & 0x3FFFFFFF);
+        public MemOpCTlbInvalidatePdb MemOpCTlbInvalidatePdb => (MemOpCTlbInvalidatePdb)((MemOpC >> 0) & 0x1);
+        public MemOpCTlbInvalidateGpc MemOpCTlbInvalidateGpc => (MemOpCTlbInvalidateGpc)((MemOpC >> 1) & 0x1);
+        public MemOpCTlbInvalidateTarget MemOpCTlbInvalidateTarget => (MemOpCTlbInvalidateTarget)((MemOpC >> 10) & 0x3);
+        public int MemOpCTlbInvalidateAddrLo => (int)((MemOpC >> 12) & 0xFFFFF);
+        public uint MemOpD;
+        public int MemOpDOperandHigh => (int)((MemOpD >> 0) & 0xFF);
+        public MemOpDOperation MemOpDOperation => (MemOpDOperation)((MemOpD >> 27) & 0x1F);
+        public int MemOpDTlbInvalidateAddrHi => (int)((MemOpD >> 0) & 0xFF);
+        public uint Reserved38;
+        public uint Reserved3C;
+        public uint Reserved40;
+        public uint Reserved44;
+        public uint Reserved48;
+        public uint Reserved4C;
+        public uint SetReference;
+        public int SetReferenceCount => (int)(SetReference);
+        public uint Reserved54;
+        public uint Reserved58;
+        public uint Reserved5C;
+        public uint Reserved60;
+        public uint Reserved64;
+        public uint Reserved68;
+        public uint Reserved6C;
+        public uint Syncpointa;
+        public int SyncpointaPayload => (int)(Syncpointa);
+        public uint Syncpointb;
+        public SyncpointbOperation SyncpointbOperation => (SyncpointbOperation)((Syncpointb >> 0) & 0x1);
+        public SyncpointbWaitSwitch SyncpointbWaitSwitch => (SyncpointbWaitSwitch)((Syncpointb >> 4) & 0x1);
+        public int SyncpointbSyncptIndex => (int)((Syncpointb >> 8) & 0xFFF);
+        public uint Wfi;
+        public WfiScope WfiScope => (WfiScope)((Wfi >> 0) & 0x1);
+        public uint CrcCheck;
+        public int CrcCheckValue => (int)(CrcCheck);
+        public uint Yield;
+        public YieldOp YieldOp => (YieldOp)((Yield >> 0) & 0x3);
+        // TODO: Eventually move this to per-engine state.
+        public Array31<uint> Reserved84;
+        public uint NoOperation;
+        public uint SetNotifyA;
+        public uint SetNotifyB;
+        public uint Notify;
+        public uint WaitForIdle;
+        public uint LoadMmeInstructionRamPointer;
+        public uint LoadMmeInstructionRam;
+        public uint LoadMmeStartAddressRamPointer;
+        public uint LoadMmeStartAddressRam;
+        public uint SetMmeShadowRamControl;
+    }
+}
diff --git a/Ryujinx.Graphics.Gpu/Engine/GPFifo/GPFifoDevice.cs b/Ryujinx.Graphics.Gpu/Engine/GPFifo/GPFifoDevice.cs
new file mode 100644
index 00000000..466bff8f
--- /dev/null
+++ b/Ryujinx.Graphics.Gpu/Engine/GPFifo/GPFifoDevice.cs
@@ -0,0 +1,188 @@
+using System;
+using System.Collections.Concurrent;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Threading;
+
+namespace Ryujinx.Graphics.Gpu.Engine.GPFifo
+{
+    /// <summary>
+    /// Represents a GPU General Purpose FIFO device.
+    /// </summary>
+    public sealed class GPFifoDevice : IDisposable
+    {
+        /// <summary>
+        /// Indicates if the command buffer has pre-fetch enabled.
+        /// </summary>
+        private enum CommandBufferType
+        {
+            Prefetch,  // Eligible to be fetched as soon as it is pushed (see PushEntries)
+            NoPrefetch // Only fetched once it is dequeued for processing (see DispatchCalls)
+        }
+
+        /// <summary>
+        /// A queued GPFIFO entry, along with its command words once they have been fetched.
+        /// </summary>
+        private struct CommandBuffer
+        {
+            /// <summary>Tells whether the buffer may be fetched ahead of its processing.</summary>
+            public CommandBufferType Type;
+
+            /// <summary>Command words, or null while the buffer has not been fetched.</summary>
+            public int[] Words;
+
+            /// <summary>Address that the command words are read from by <see cref="Fetch"/>.</summary>
+            public ulong EntryAddress;
+
+            /// <summary>Number of 32-bit words that make up this GPFIFO entry.</summary>
+            public uint EntryCount;
+
+            /// <summary>
+            /// Reads the command words through the memory accessor of <paramref name="context"/>.
+            /// Does nothing when the words are already available.
+            /// </summary>
+            /// <param name="context">GPU context used to access the memory holding the commands</param>
+            public void Fetch(GpuContext context)
+            {
+                if (Words != null)
+                {
+                    return;
+                }
+
+                // EntryCount is expressed in 32-bit words.
+                int byteCount = (int)EntryCount * 4;
+                var rawData = context.MemoryAccessor.GetSpan(EntryAddress, byteCount);
+
+                Words = MemoryMarshal.Cast<byte, int>(rawData).ToArray();
+            }
+        }
+
+        private readonly ConcurrentQueue<CommandBuffer> _commandBufferQueue; // Pushed buffers awaiting DispatchCalls
+
+        private CommandBuffer _currentCommandBuffer; // Buffer most recently dequeued by DispatchCalls
+
+        private readonly bool _ibEnable;             // Gates DispatchCalls; only ever set to true (constructor)
+        private readonly GpuContext _context;        // Context used to fetch command buffer data
+        private readonly AutoResetEvent _event;      // Set by SignalNewEntries, awaited by WaitForCommands
+        private readonly GPFifoProcessor _processor; // Consumes the words of each dequeued buffer
+
+        /// <summary>
+        /// Creates the GPFIFO device with an empty queue, a non-signaled event and its own command processor.
+        /// </summary>
+        /// <param name="context">GPU context that the GPFIFO belongs to</param>
+        internal GPFifoDevice(GpuContext context)
+        {
+            _context = context;
+            _ibEnable = true;
+
+            _event = new AutoResetEvent(initialState: false);
+            _commandBufferQueue = new ConcurrentQueue<CommandBuffer>();
+            _processor = new GPFifoProcessor(context);
+        }
+
+        /// <summary>
+        /// Signal the FIFO that there are new entries to process.
+        /// This sets the auto-reset event that <see cref="WaitForCommands"/> waits on,
+        /// so a pending or upcoming wait returns true.
+        /// </summary>
+        public void SignalNewEntries()
+            => _event.Set();
+
+        /// <summary>
+        /// Pushes a host-built, already fetched command buffer as a GPFIFO entry (used by nvservices for special cases).
+        /// </summary>
+        /// <param name="commandBuffer">The command buffer containing the prefetched commands</param>
+        /// <exception cref="ArgumentNullException"><paramref name="commandBuffer"/> is null</exception>
+        public void PushHostCommandBuffer(int[] commandBuffer)
+        {
+            _commandBufferQueue.Enqueue(new CommandBuffer
+            {
+                Type = CommandBufferType.Prefetch,
+                Words = commandBuffer ?? throw new ArgumentNullException(nameof(commandBuffer)),
+                EntryAddress = ulong.MaxValue,
+                EntryCount = (uint)commandBuffer.Length
+            });
+        }
+
+        /// <summary>
+        /// Builds an unfetched <see cref="CommandBuffer"/> describing a guest GPFIFO entry.
+        /// </summary>
+        /// <remarks>
+        /// Entries whose sync field is <see cref="Entry1Sync.Wait"/> are marked as no-prefetch.
+        /// </remarks>
+        /// <param name="entry">The GPFIFO entry</param>
+        /// <returns>A new CommandBuffer based on the GPFIFO entry, with no words fetched yet</returns>
+        private CommandBuffer CreateCommandBuffer(GPEntry entry)
+        {
+            // The address comes in two pieces: a low part stored shifted right by 2, and the bits from 32 up.
+            ulong addressLow = (ulong)entry.Entry0Get << 2;
+            ulong addressHigh = (ulong)entry.Entry1GetHi << 32;
+
+            bool isWaitEntry = entry.Entry1Sync == Entry1Sync.Wait;
+
+            return new CommandBuffer
+            {
+                Type = isWaitEntry ? CommandBufferType.NoPrefetch : CommandBufferType.Prefetch,
+                Words = null,
+                EntryAddress = addressLow | addressHigh,
+                EntryCount = (uint)entry.Entry1Length
+            };
+        }
+
+        /// <summary>
+        /// Queues guest GPFIFO entries for processing.
+        /// Prefetch entries are fetched right away, up to the first no-prefetch entry of the batch;
+        /// everything from that entry on is only fetched later, when it is dispatched.
+        /// </summary>
+        /// <param name="entries">GPFIFO entries</param>
+        public void PushEntries(ReadOnlySpan<ulong> entries)
+        {
+            bool canPrefetch = true;
+
+            foreach (ulong rawEntry in entries)
+            {
+                ulong entryBits = rawEntry;
+                CommandBuffer commandBuffer = CreateCommandBuffer(Unsafe.As<ulong, GPEntry>(ref entryBits));
+
+                if (commandBuffer.Type == CommandBufferType.NoPrefetch)
+                {
+                    canPrefetch = false;
+                }
+                else if (canPrefetch && commandBuffer.Type == CommandBufferType.Prefetch)
+                {
+                    commandBuffer.Fetch(_context);
+                }
+
+                _commandBufferQueue.Enqueue(commandBuffer);
+            }
+        }
+
+        /// <summary>
+        /// Waits until commands are pushed to the FIFO.
+        /// More precisely, this blocks until <see cref="SignalNewEntries"/> is called,
+        /// or until 8 milliseconds have elapsed, whichever happens first.
+        /// </summary>
+        /// <returns>True if commands were received, false if wait timed out</returns>
+        public bool WaitForCommands()
+            => _event.WaitOne(millisecondsTimeout: 8);
+
+        /// <summary>
+        /// Drains the queue: every pending command buffer is fetched (when needed) and handed to the processor.
+        /// </summary>
+        public void DispatchCalls()
+        {
+            while (_ibEnable && _commandBufferQueue.TryDequeue(out CommandBuffer pending))
+            {
+                _currentCommandBuffer = pending;
+                _currentCommandBuffer.Fetch(_context);
+                ReadOnlySpan<int> words = _currentCommandBuffer.Words;
+                _processor.Process(words);
+            }
+        }
+
+        /// <summary>
+        /// Disposes of the event used by SignalNewEntries and WaitForCommands for GPFifo command processing.
+        /// </summary>
+        public void Dispose() => _event.Dispose();
+    }
+}
diff --git a/Ryujinx.Graphics.Gpu/Engine/GPFifo/GPFifoProcessor.cs b/Ryujinx.Graphics.Gpu/Engine/GPFifo/GPFifoProcessor.cs
new file mode 100644
index 00000000..115361f3
--- /dev/null
+++ b/Ryujinx.Graphics.Gpu/Engine/GPFifo/GPFifoProcessor.cs
@@ -0,0 +1,179 @@
+using Ryujinx.Graphics.Gpu.State;
+using System;
+using System.Runtime.CompilerServices;
+
+namespace Ryujinx.Graphics.Gpu.Engine.GPFifo
+{
+    /// <summary>
+    /// Represents a GPU General Purpose FIFO command processor.
+    /// </summary>
+    class GPFifoProcessor
+    {
+        private const int MacrosCount = 0x80;               // Macro index range; a power of two so the mask below works
+        private const int MacroIndexMask = MacrosCount - 1; // Wraps a macro index into the 0..MacrosCount-1 range
+
+        private readonly GpuContext _context;
+
+        /// <summary>
+        /// Internal GPFIFO state.
+        /// </summary>
+        private struct DmaState
+        {
+            public int Method;           // Method that receives the next argument word
+            public int SubChannel;       // Sub-channel that the current method targets
+            public int MethodCount;      // Argument words still expected; 0 means the next word is a header
+            public bool NonIncrementing; // When set, Method is not advanced after an argument is sent
+            public bool IncrementOnce;   // When set, NonIncrementing gets set once the first argument is sent
+        }
+
+        private DmaState _state; // Kept across Process calls, so a method can span command buffers
+
+        private readonly GpuState[] _subChannels; // One state per sub-channel, replaced on channel bind
+        private readonly GPFifoClass _fifoClass;  // Receives methods below 0x60 and the macro calls
+
+        /// <summary>
+        /// Sets up the command processor: the GPFIFO class plus 8 sub-channels, each one with
+        /// a fresh <see cref="GpuState"/> that has its callbacks registered.
+        /// </summary>
+        /// <param name="context">GPU context</param>
+        public GPFifoProcessor(GpuContext context)
+        {
+            _context = context;
+            _fifoClass = new GPFifoClass(context);
+            _subChannels = new GpuState[8];
+
+            for (int channel = 0; channel < _subChannels.Length; channel++)
+            {
+                var channelState = new GpuState();
+
+                _subChannels[channel] = channelState;
+                context.Methods.RegisterCallbacks(channelState);
+            }
+        }
+
+        /// <summary>
+        /// Processes a command buffer word by word. The decoding state is kept between calls,
+        /// so the arguments of a method may continue on the next command buffer.
+        /// </summary>
+        /// <param name="commandBuffer">Command buffer</param>
+        public void Process(ReadOnlySpan<int> commandBuffer)
+        {
+            for (int position = 0; position < commandBuffer.Length; position++)
+            {
+                int word = commandBuffer[position];
+
+                if (_state.MethodCount == 0)
+                {
+                    // No method is expecting arguments, so this word is a compressed method header.
+                    CompressedMethod header = Unsafe.As<int, CompressedMethod>(ref word);
+
+                    if (TryFastUniformBufferUpdate(header, commandBuffer, position))
+                    {
+                        // Skip the arguments consumed by the fast path (the loop itself skips the header).
+                        position += header.MethodCount;
+                    }
+                    else if (header.SecOp == SecOp.ImmdDataMethod)
+                    {
+                        Send(new MethodParams(header.MethodAddress, header.ImmdData, header.MethodSubchannel, 1));
+                    }
+                    else if (header.SecOp == SecOp.IncMethod || header.SecOp == SecOp.NonIncMethod ||
+                             header.SecOp == SecOp.OneInc)
+                    {
+                        _state.Method = header.MethodAddress;
+                        _state.SubChannel = header.MethodSubchannel;
+                        _state.MethodCount = header.MethodCount;
+                        _state.IncrementOnce = header.SecOp == SecOp.OneInc;
+                        _state.NonIncrementing = header.SecOp == SecOp.NonIncMethod;
+                    }
+
+                    continue;
+                }
+
+                // A method is still expecting arguments, and this word is the next one.
+                Send(new MethodParams(_state.Method, word, _state.SubChannel, _state.MethodCount));
+
+                if (!_state.NonIncrementing)
+                {
+                    _state.Method++;
+                }
+
+                if (_state.IncrementOnce)
+                {
+                    _state.NonIncrementing = true;
+                }
+
+                _state.MethodCount--;
+            }
+        }
+
+        /// <summary>
+        /// Attempts the fast path for constant buffer data uploads: a non-incrementing write to the uniform buffer
+        /// data method whose arguments are all inside <paramref name="commandBuffer"/> is copied in a single call.
+        /// On success, <see cref="CompressedMethod.MethodCount"/> + 1 command buffer entries have been consumed.
+        /// </summary>
+        /// <param name="meth">Compressed method to be checked</param>
+        /// <param name="commandBuffer">Command buffer where <paramref name="meth"/> is contained</param>
+        /// <param name="offset">Offset at <paramref name="commandBuffer"/> where <paramref name="meth"/> is located</param>
+        /// <returns>True if the fast copy was performed, false otherwise</returns>
+        private bool TryFastUniformBufferUpdate(CompressedMethod meth, ReadOnlySpan<int> commandBuffer, int offset)
+        {
+            bool isUniformUpdate = meth.SecOp == SecOp.NonIncMethod &&
+                                   meth.MethodAddress == (int)MethodOffset.UniformBufferUpdateData;
+            int wordsAfterHeader = commandBuffer.Length - offset - 1;
+
+            if (!isUniformUpdate || meth.MethodCount > wordsAfterHeader)
+            {
+                return false;
+            }
+
+            GpuState target = _subChannels[meth.MethodSubchannel];
+            ReadOnlySpan<int> data = commandBuffer.Slice(offset + 1, meth.MethodCount);
+
+            _context.Methods.UniformBufferUpdate(target, data);
+            return true;
+        }
+
+        /// <summary>
+        /// Sends an uncompressed method to its handler: channel bind, GPFIFO class, sub-channel state or macro.
+        /// </summary>
+        /// <param name="meth">Method to be processed</param>
+        private void Send(MethodParams meth)
+        {
+            if ((MethodOffset)meth.Method == MethodOffset.BindChannel)
+            {
+                _subChannels[meth.SubChannel] = new GpuState();
+                _context.Methods.RegisterCallbacks(_subChannels[meth.SubChannel]);
+                return;
+            }
+
+            if (meth.Method < 0x60)
+            {
+                // TODO: check if macros are shared between subchannels or not. For now let's assume they are.
+                _fifoClass.Write(meth.Method * 4, meth.Argument);
+                return;
+            }
+
+            if (meth.Method < 0xe00)
+            {
+                _subChannels[meth.SubChannel].CallMethod(meth, _fifoClass.ShadowCtrl);
+                return;
+            }
+
+            int macroIndex = (meth.Method >> 1) & MacroIndexMask;
+            if ((meth.Method & 1) != 0)
+            {
+                _fifoClass.MmePushArgument(macroIndex, meth.Argument);
+            }
+            else
+            {
+                _fifoClass.MmeStart(macroIndex, meth.Argument);
+            }
+
+            if (meth.IsLastCall)
+            {
+                _fifoClass.CallMme(macroIndex, _subChannels[meth.SubChannel]);
+                _context.Methods.PerformDeferredDraws();
+            }
+        }
+    }
+}
diff --git a/Ryujinx.Graphics.Gpu/Engine/MME/Macro.cs b/Ryujinx.Graphics.Gpu/Engine/MME/Macro.cs
new file mode 100644
index 00000000..10127d11
--- /dev/null
+++ b/Ryujinx.Graphics.Gpu/Engine/MME/Macro.cs
@@ -0,0 +1,69 @@
+using Ryujinx.Graphics.Gpu.State;
+
+namespace Ryujinx.Graphics.Gpu.Engine.MME
+{
+    /// <summary>
+    /// GPU macro program: the position of its code, the pending call state and the interpreter that runs it.
+    /// </summary>
+    struct Macro
+    {
+        /// <summary>
+        /// Word offset of the code on the code memory.
+        /// </summary>
+        public int Position { get; }
+
+        private bool _executionPending;
+        private int _argument;
+
+        private readonly MacroInterpreter _interpreter;
+
+        /// <summary>
+        /// Creates a macro program that starts at the given position, with no execution pending.
+        /// </summary>
+        /// <param name="position">Macro code start position</param>
+        public Macro(int position)
+        {
+            _interpreter = new MacroInterpreter();
+            _executionPending = false;
+            _argument = 0;
+            Position = position;
+        }
+
+        /// <summary>
+        /// Stores the first argument of the macro call, and marks the macro as pending execution.
+        /// </summary>
+        /// <param name="argument">First argument</param>
+        public void StartExecution(int argument)
+        {
+            _executionPending = true;
+            _argument = argument;
+        }
+
+        /// <summary>
+        /// Runs the macro program code if an execution is pending, clearing the pending flag first.
+        /// </summary>
+        /// <param name="mme">Program code</param>
+        /// <param name="shadowCtrl">Shadow RAM control value that is passed on to the interpreter</param>
+        /// <param name="state">Current GPU state</param>
+        public void Execute(int[] mme, ShadowRamControl shadowCtrl, GpuState state)
+        {
+            if (!_executionPending)
+            {
+                return;
+            }
+
+            _executionPending = false;
+            _interpreter?.Execute(mme, Position, _argument, shadowCtrl, state);
+        }
+
+        /// <summary>
+        /// Pushes an argument to the macro call argument FIFO.
+        /// This is a no-op on a default-initialized macro, as it has no interpreter.
+        /// </summary>
+        /// <param name="argument">Argument to be pushed</param>
+        public void PushArgument(int argument)
+        {
+            _interpreter?.Fifo.Enqueue(argument);
+        }
+    }
+}
diff --git a/Ryujinx.Graphics.Gpu/Engine/MethodFifo.cs b/Ryujinx.Graphics.Gpu/Engine/MethodFifo.cs
deleted file mode 100644
index c1f45941..00000000
--- a/Ryujinx.Graphics.Gpu/Engine/MethodFifo.cs
+++ /dev/null
@@ -1,103 +0,0 @@
-using Ryujinx.Graphics.Gpu.State;
-using System.Threading;
-
-namespace Ryujinx.Graphics.Gpu.Engine
-{
-    partial class Methods
-    {
-        /// <summary>
-        /// Writes a GPU counter to guest memory.
-        /// </summary>
-        /// <param name="state">Current GPU state</param>
-        /// <param name="argument">Method call argument</param>
-        public void Semaphore(GpuState state, int argument)
-        {
-            FifoSemaphoreOperation op = (FifoSemaphoreOperation)(argument & 3);
-
-            var semaphore = state.Get<SemaphoreState>(MethodOffset.Semaphore);
-
-            int value = semaphore.Payload;
-
-            if (op == FifoSemaphoreOperation.Counter)
-            {
-                // TODO: There's much more that should be done here.
-                // NVN only supports the "Accumulate" mode, so we
-                // can't currently guess which bits specify the
-                // reduction operation.
-                value += _context.MemoryAccessor.Read<int>(semaphore.Address.Pack());
-            }
-
-            _context.MemoryAccessor.Write(semaphore.Address.Pack(), value);
-
-            _context.AdvanceSequence();
-        }
-
-        /// <summary>
-        /// Waits for the GPU to be idle.
-        /// </summary>
-        /// <param name="state">Current GPU state</param>
-        /// <param name="argument">Method call argument</param>
-        public void WaitForIdle(GpuState state, int argument)
-        {
-            PerformDeferredDraws();
-
-            _context.Renderer.Pipeline.Barrier();
-        }
-
-        /// <summary>
-        /// Send macro code/data to the MME.
-        /// </summary>
-        /// <param name="state">Current GPU state</param>
-        /// <param name="argument">Method call argument</param>
-        public void SendMacroCodeData(GpuState state, int argument)
-        {
-            int macroUploadAddress = state.Get<int>(MethodOffset.MacroUploadAddress);
-
-            _context.Fifo.SendMacroCodeData(macroUploadAddress++, argument);
-
-            state.Write((int)MethodOffset.MacroUploadAddress, macroUploadAddress);
-        }
-
-        /// <summary>
-        /// Bind a macro index to a position for the MME.
-        /// </summary>
-        /// <param name="state">Current GPU state</param>
-        /// <param name="argument">Method call argument</param>
-        public void BindMacro(GpuState state, int argument)
-        {
-            int macroBindingIndex = state.Get<int>(MethodOffset.MacroBindingIndex);
-
-            _context.Fifo.BindMacro(macroBindingIndex++, argument);
-
-            state.Write((int)MethodOffset.MacroBindingIndex, macroBindingIndex);
-        }
-
-        public void SetMmeShadowRamControl(GpuState state, int argument)
-        {
-            _context.Fifo.SetMmeShadowRamControl((ShadowRamControl)argument);
-        }
-
-        /// <summary>
-        /// Apply a fence operation on a syncpoint.
-        /// </summary>
-        /// <param name="state">Current GPU state</param>
-        /// <param name="argument">Method call argument</param>
-        public void FenceAction(GpuState state, int argument)
-        {
-            uint threshold = state.Get<uint>(MethodOffset.FenceValue);
-
-            FenceActionOperation operation = (FenceActionOperation)(argument & 1);
-
-            uint syncpointId = (uint)(argument >> 8) & 0xFF;
-
-            if (operation == FenceActionOperation.Acquire)
-            {
-                _context.Synchronization.WaitOnSyncpoint(syncpointId, threshold, Timeout.InfiniteTimeSpan);
-            }
-            else if (operation == FenceActionOperation.Increment)
-            {
-                _context.Synchronization.IncrementSyncpoint(syncpointId);
-            }
-        }
-    }
-}
diff --git a/Ryujinx.Graphics.Gpu/Engine/MethodUniformBufferUpdate.cs b/Ryujinx.Graphics.Gpu/Engine/MethodUniformBufferUpdate.cs
index 524f5e03..032a5868 100644
--- a/Ryujinx.Graphics.Gpu/Engine/MethodUniformBufferUpdate.cs
+++ b/Ryujinx.Graphics.Gpu/Engine/MethodUniformBufferUpdate.cs
@@ -1,4 +1,6 @@
 using Ryujinx.Graphics.Gpu.State;
+using System;
+using System.Runtime.InteropServices;
 
 namespace Ryujinx.Graphics.Gpu.Engine
 {
@@ -19,5 +21,21 @@ namespace Ryujinx.Graphics.Gpu.Engine
 
             _context.AdvanceSequence();
         }
+
+        /// <summary>
+        /// Writes inline data to the uniform buffer at its current offset, then advances the offset past the data.
+        /// </summary>
+        /// <param name="state">Current GPU state</param>
+        /// <param name="data">Data to be written to the uniform buffer</param>
+        public void UniformBufferUpdate(GpuState state, ReadOnlySpan<int> data)
+        {
+            var uniformBuffer = state.Get<UniformBufferState>(MethodOffset.UniformBufferState);
+            var destination = uniformBuffer.Address.Pack() + (uint)uniformBuffer.Offset;
+            ReadOnlySpan<byte> bytes = MemoryMarshal.Cast<int, byte>(data);
+
+            _context.MemoryAccessor.Write(destination, bytes);
+            state.SetUniformBufferOffset(uniformBuffer.Offset + data.Length * 4);
+            _context.AdvanceSequence();
+        }
     }
 }
\ No newline at end of file
diff --git a/Ryujinx.Graphics.Gpu/Engine/Methods.cs b/Ryujinx.Graphics.Gpu/Engine/Methods.cs
index df0e713d..e84687ef 100644
--- a/Ryujinx.Graphics.Gpu/Engine/Methods.cs
+++ b/Ryujinx.Graphics.Gpu/Engine/Methods.cs
@@ -106,20 +106,6 @@ namespace Ryujinx.Graphics.Gpu.Engine
             state.RegisterCallback(MethodOffset.UniformBufferBindFragment,       UniformBufferBindFragment);
         }
 
-        /// <summary>
-        /// Register callback for Fifo method calls that triggers an action on the GPFIFO.
-        /// </summary>
-        /// <param name="state">GPU state where the triggers will be registered</param>
-        public void RegisterCallbacksForFifo(GpuState state)
-        {
-            state.RegisterCallback(MethodOffset.Semaphore,              Semaphore);
-            state.RegisterCallback(MethodOffset.FenceAction,            FenceAction);
-            state.RegisterCallback(MethodOffset.WaitForIdle,            WaitForIdle);
-            state.RegisterCallback(MethodOffset.SendMacroCodeData,      SendMacroCodeData);
-            state.RegisterCallback(MethodOffset.BindMacro,              BindMacro);
-            state.RegisterCallback(MethodOffset.SetMmeShadowRamControl, SetMmeShadowRamControl);
-        }
-
         /// <summary>
         /// Updates host state based on the current guest GPU state.
         /// </summary>
diff --git a/Ryujinx.Graphics.Gpu/GpuContext.cs b/Ryujinx.Graphics.Gpu/GpuContext.cs
index b07694b9..8e9f2732 100644
--- a/Ryujinx.Graphics.Gpu/GpuContext.cs
+++ b/Ryujinx.Graphics.Gpu/GpuContext.cs
@@ -1,5 +1,6 @@
 using Ryujinx.Graphics.GAL;
 using Ryujinx.Graphics.Gpu.Engine;
+using Ryujinx.Graphics.Gpu.Engine.GPFifo;
 using Ryujinx.Graphics.Gpu.Memory;
 using Ryujinx.Graphics.Gpu.Synchronization;
 using System;
@@ -37,14 +38,9 @@ namespace Ryujinx.Graphics.Gpu
         internal Methods Methods { get; }
 
         /// <summary>
-        /// GPU commands FIFO.
+        /// GPU General Purpose FIFO queue.
         /// </summary>
-        internal NvGpuFifo Fifo { get; }
-
-        /// <summary>
-        /// DMA pusher.
-        /// </summary>
-        public DmaPusher DmaPusher { get; }
+        public GPFifoDevice GPFifo { get; }
 
         /// <summary>
         /// GPU synchronization manager.
@@ -83,9 +79,7 @@ namespace Ryujinx.Graphics.Gpu
 
             Methods = new Methods(this);
 
-            Fifo = new NvGpuFifo(this);
-
-            DmaPusher = new DmaPusher(this);
+            GPFifo = new GPFifoDevice(this);
 
             Synchronization = new SynchronizationManager();
 
@@ -125,6 +119,7 @@ namespace Ryujinx.Graphics.Gpu
             Methods.BufferManager.Dispose();
             Methods.TextureManager.Dispose();
             Renderer.Dispose();
+            GPFifo.Dispose();
         }
     }
 }
\ No newline at end of file
diff --git a/Ryujinx.Graphics.Gpu/NvGpuFifo.cs b/Ryujinx.Graphics.Gpu/NvGpuFifo.cs
deleted file mode 100644
index 36a275e2..00000000
--- a/Ryujinx.Graphics.Gpu/NvGpuFifo.cs
+++ /dev/null
@@ -1,220 +0,0 @@
-using Ryujinx.Graphics.Gpu.State;
-using System.IO;
-
-namespace Ryujinx.Graphics.Gpu
-{
-    /// <summary>
-    /// GPU commands FIFO.
-    /// </summary>
-    class NvGpuFifo
-    {
-        private const int MacrosCount    = 0x80;
-        private const int MacroIndexMask = MacrosCount - 1;
-
-        // Note: The size of the macro memory is unknown, we just make
-        // a guess here and use 256kb as the size. Increase if needed.
-        private const int MmeWords = 256 * 256;
-
-        private GpuContext _context;
-
-        /// <summary>
-        /// Cached GPU macro program.
-        /// </summary>
-        private struct CachedMacro
-        {
-            /// <summary>
-            /// Word offset of the code on the code memory.
-            /// </summary>
-            public int Position { get; }
-
-            private bool _executionPending;
-            private int  _argument;
-
-            private MacroInterpreter _interpreter;
-
-            /// <summary>
-            /// Creates a new instance of the GPU cached macro program.
-            /// </summary>
-            /// <param name="position">Macro code start position</param>
-            public CachedMacro(int position)
-            {
-                Position = position;
-
-                _executionPending = false;
-                _argument         = 0;
-
-                _interpreter = new MacroInterpreter();
-            }
-
-            /// <summary>
-            /// Sets the first argument for the macro call.
-            /// </summary>
-            /// <param name="argument">First argument</param>
-            public void StartExecution(int argument)
-            {
-                _argument = argument;
-
-                _executionPending = true;
-            }
-
-            /// <summary>
-            /// Starts executing the macro program code.
-            /// </summary>
-            /// <param name="mme">Program code</param>
-            /// <param name="state">Current GPU state</param>
-            public void Execute(int[] mme, ShadowRamControl shadowCtrl, GpuState state)
-            {
-                if (_executionPending)
-                {
-                    _executionPending = false;
-
-                    _interpreter?.Execute(mme, Position, _argument, shadowCtrl, state);
-                }
-            }
-
-            /// <summary>
-            /// Pushes an argument to the macro call argument FIFO.
-            /// </summary>
-            /// <param name="argument">Argument to be pushed</param>
-            public void PushArgument(int argument)
-            {
-                _interpreter?.Fifo.Enqueue(argument);
-            }
-        }
-
-        private ShadowRamControl _shadowCtrl;
-
-        private CachedMacro[] _macros;
-
-        private int[] _mme;
-
-        /// <summary>
-        /// GPU sub-channel information.
-        /// </summary>
-        private class SubChannel
-        {
-            /// <summary>
-            /// Sub-channel GPU state.
-            /// </summary>
-            public GpuState State { get; }
-
-            /// <summary>
-            /// Engine bound to the sub-channel.
-            /// </summary>
-            public ClassId  Class { get; set; }
-
-            /// <summary>
-            /// Creates a new instance of the GPU sub-channel.
-            /// </summary>
-            public SubChannel()
-            {
-                State = new GpuState();
-            }
-        }
-
-        private SubChannel[] _subChannels;
-
-        private SubChannel _fifoChannel;
-
-        /// <summary>
-        /// Creates a new instance of the GPU commands FIFO.
-        /// </summary>
-        /// <param name="context">GPU emulation context</param>
-        public NvGpuFifo(GpuContext context)
-        {
-            _context = context;
-
-            _macros = new CachedMacro[MacrosCount];
-
-            _mme = new int[MmeWords];
-
-            _fifoChannel = new SubChannel();
-
-            _context.Methods.RegisterCallbacksForFifo(_fifoChannel.State);
-
-            _subChannels = new SubChannel[8];
-
-            for (int index = 0; index < _subChannels.Length; index++)
-            {
-                _subChannels[index] = new SubChannel();
-
-                _context.Methods.RegisterCallbacks(_subChannels[index].State);
-            }
-        }
-
-        /// <summary>
-        /// Send macro code/data to the MME
-        /// </summary>
-        /// <param name="index">The index in the MME</param>
-        /// <param name="data">The data to use</param>
-        public void SendMacroCodeData(int index, int data)
-        {
-            _mme[index] = data;
-        }
-
-        /// <summary>
-        /// Bind a macro index to a position for the MME
-        /// </summary>
-        /// <param name="index">The macro index</param>
-        /// <param name="position">The position of the macro</param>
-        public void BindMacro(int index, int position)
-        {
-            _macros[index] = new CachedMacro(position);
-        }
-
-        /// <summary>
-        /// Change the shadow RAM setting
-        /// </summary>
-        /// <param name="shadowCtrl">The new Shadow RAM setting</param>
-        public void SetMmeShadowRamControl(ShadowRamControl shadowCtrl)
-        {
-            _shadowCtrl = shadowCtrl;
-        }
-
-        /// <summary>
-        /// Calls a GPU method.
-        /// </summary>
-        /// <param name="meth">GPU method call parameters</param>
-        public void CallMethod(MethodParams meth)
-        {
-            if ((MethodOffset)meth.Method == MethodOffset.BindChannel)
-            {
-                _subChannels[meth.SubChannel] = new SubChannel
-                {
-                    Class = (ClassId)meth.Argument
-                };
-
-                _context.Methods.RegisterCallbacks(_subChannels[meth.SubChannel].State);
-            }
-            else if (meth.Method < 0x60)
-            {
-                // TODO: check if macros are shared between subchannels or not. For now let's assume they are.
-                _fifoChannel.State.CallMethod(meth, _shadowCtrl);
-            }
-            else if (meth.Method < 0xe00)
-            {
-                _subChannels[meth.SubChannel].State.CallMethod(meth, _shadowCtrl);
-            }
-            else
-            {
-                int macroIndex = (meth.Method >> 1) & MacroIndexMask;
-
-                if ((meth.Method & 1) != 0)
-                {
-                    _macros[macroIndex].PushArgument(meth.Argument);
-                }
-                else
-                {
-                    _macros[macroIndex].StartExecution(meth.Argument);
-                }
-
-                if (meth.IsLastCall)
-                {
-                    _macros[macroIndex].Execute(_mme, _shadowCtrl, _subChannels[meth.SubChannel].State);
-
-                    _context.Methods.PerformDeferredDraws();
-                }
-            }
-        }
-    }
-}
\ No newline at end of file
diff --git a/Ryujinx.Graphics.Gpu/Ryujinx.Graphics.Gpu.csproj b/Ryujinx.Graphics.Gpu/Ryujinx.Graphics.Gpu.csproj
index 9348d04b..a9e81be3 100644
--- a/Ryujinx.Graphics.Gpu/Ryujinx.Graphics.Gpu.csproj
+++ b/Ryujinx.Graphics.Gpu/Ryujinx.Graphics.Gpu.csproj
@@ -2,6 +2,7 @@
 
   <ItemGroup>
     <ProjectReference Include="..\Ryujinx.Cpu\Ryujinx.Cpu.csproj" />
+    <ProjectReference Include="..\Ryujinx.Graphics.Device\Ryujinx.Graphics.Device.csproj" />
     <ProjectReference Include="..\Ryujinx.Graphics.GAL\Ryujinx.Graphics.GAL.csproj" />
     <ProjectReference Include="..\Ryujinx.Common\Ryujinx.Common.csproj" />
     <ProjectReference Include="..\Ryujinx.Graphics.Texture\Ryujinx.Graphics.Texture.csproj" />
diff --git a/Ryujinx.Graphics.Gpu/State/FenceActionOperation.cs b/Ryujinx.Graphics.Gpu/State/FenceActionOperation.cs
deleted file mode 100644
index c03443a8..00000000
--- a/Ryujinx.Graphics.Gpu/State/FenceActionOperation.cs
+++ /dev/null
@@ -1,11 +0,0 @@
-namespace Ryujinx.Graphics.Gpu.State
-{
-    /// <summary>
-    /// Fence action operations.
-    /// </summary>
-    enum FenceActionOperation
-    {
-        Acquire   = 0,
-        Increment = 1
-    }
-}
diff --git a/Ryujinx.Graphics.Gpu/State/FifoSemaphoreOperation.cs b/Ryujinx.Graphics.Gpu/State/FifoSemaphoreOperation.cs
deleted file mode 100644
index a6ccdcfe..00000000
--- a/Ryujinx.Graphics.Gpu/State/FifoSemaphoreOperation.cs
+++ /dev/null
@@ -1,9 +0,0 @@
-namespace Ryujinx.Graphics.Gpu.State
-{
-    enum FifoSemaphoreOperation
-    {
-        Counter = 0,
-        Acquire = 1,
-        Release = 2
-    }
-}
diff --git a/Ryujinx.Graphics.Gpu/State/MethodOffset.cs b/Ryujinx.Graphics.Gpu/State/MethodOffset.cs
index d9e2ce93..505e3d89 100644
--- a/Ryujinx.Graphics.Gpu/State/MethodOffset.cs
+++ b/Ryujinx.Graphics.Gpu/State/MethodOffset.cs
@@ -9,15 +9,6 @@ namespace Ryujinx.Graphics.Gpu.State
     enum MethodOffset
     {
         BindChannel                     = 0x0,
-        Semaphore                       = 0x4,
-        FenceValue                      = 0x1c,
-        FenceAction                     = 0x1d,
-        WaitForIdle                     = 0x44,
-        MacroUploadAddress              = 0x45,
-        SendMacroCodeData               = 0x46,
-        MacroBindingIndex               = 0x47,
-        BindMacro                       = 0x48,
-        SetMmeShadowRamControl          = 0x49,
         I2mParams                       = 0x60,
         LaunchDma                       = 0x6c,
         LoadInlineData                  = 0x6d,
diff --git a/Ryujinx.HLE/HOS/Services/Nv/NvDrvServices/NvHostChannel/NvHostChannelDeviceFile.cs b/Ryujinx.HLE/HOS/Services/Nv/NvDrvServices/NvHostChannel/NvHostChannelDeviceFile.cs
index 70c9a47b..b45d8401 100644
--- a/Ryujinx.HLE/HOS/Services/Nv/NvDrvServices/NvHostChannel/NvHostChannelDeviceFile.cs
+++ b/Ryujinx.HLE/HOS/Services/Nv/NvDrvServices/NvHostChannel/NvHostChannelDeviceFile.cs
@@ -414,10 +414,10 @@ namespace Ryujinx.HLE.HOS.Services.Nv.NvDrvServices.NvHostChannel
 
             if (header.Flags.HasFlag(SubmitGpfifoFlags.FenceWait) && !_device.System.HostSyncpoint.IsSyncpointExpired(header.Fence.Id, header.Fence.Value))
             {
-                _device.Gpu.DmaPusher.PushHostCommandBuffer(CreateWaitCommandBuffer(header.Fence));
+                _device.Gpu.GPFifo.PushHostCommandBuffer(CreateWaitCommandBuffer(header.Fence));
             }
 
-            _device.Gpu.DmaPusher.PushEntries(entries);
+            _device.Gpu.GPFifo.PushEntries(entries);
 
             header.Fence.Id = _channelSyncpoint.Id;
 
@@ -439,12 +439,12 @@ namespace Ryujinx.HLE.HOS.Services.Nv.NvDrvServices.NvHostChannel
 
             if (header.Flags.HasFlag(SubmitGpfifoFlags.FenceIncrement))
             {
-                _device.Gpu.DmaPusher.PushHostCommandBuffer(CreateIncrementCommandBuffer(ref header.Fence, header.Flags));
+                _device.Gpu.GPFifo.PushHostCommandBuffer(CreateIncrementCommandBuffer(ref header.Fence, header.Flags));
             }
 
             header.Flags = SubmitGpfifoFlags.None;
 
-            _device.Gpu.DmaPusher.SignalNewEntries();
+            _device.Gpu.GPFifo.SignalNewEntries();
 
             return NvInternalResult.Success;
         }
diff --git a/Ryujinx.HLE/Switch.cs b/Ryujinx.HLE/Switch.cs
index 9defe25d..2e1a4b66 100644
--- a/Ryujinx.HLE/Switch.cs
+++ b/Ryujinx.HLE/Switch.cs
@@ -148,12 +148,12 @@ namespace Ryujinx.HLE
 
         public bool WaitFifo()
         {
-            return Gpu.DmaPusher.WaitForCommands();
+            return Gpu.GPFifo.WaitForCommands();
         }
 
         public void ProcessFrame()
         {
-            Gpu.DmaPusher.DispatchCalls();
+            Gpu.GPFifo.DispatchCalls();
         }
 
         public void PresentFrame(Action swapBuffersCallback)