diff --git a/ChocolArm64/CpuThread.cs b/ChocolArm64/CpuThread.cs
index 87b21395..6cd34f81 100644
--- a/ChocolArm64/CpuThread.cs
+++ b/ChocolArm64/CpuThread.cs
@@ -1,5 +1,6 @@
 using ChocolArm64.Memory;
 using ChocolArm64.State;
+using ChocolArm64.Translation;
 using System;
 using System.Threading;
 
diff --git a/ChocolArm64/Decoders/Decoder.cs b/ChocolArm64/Decoders/Decoder.cs
index 2b195412..6b5d79f0 100644
--- a/ChocolArm64/Decoders/Decoder.cs
+++ b/ChocolArm64/Decoders/Decoder.cs
@@ -25,14 +25,53 @@ namespace ChocolArm64.Decoders
 
             FillBlock(memory, mode, block);
 
+            OpCode64 lastOp = block.GetLastOp();
+
+            if (IsBranch(lastOp) && !IsCall(lastOp) && lastOp is IOpCodeBImm op)
+            {
+                //It's possible that the branch on this block lands in the middle of the block.
+                //This is more common on tight loops. In this case, we can improve the codegen
+                //a bit by changing the CFG. There are two possible cases: either the branch
+                //targets the start of the block (the block is a loop that jumps back to its
+                //start), and we make it point to the same block, or it targets somewhere in the
+                //middle of the block, which is also a loop, but then we need to split the block.
+                if (op.Imm == start)
+                {
+                    block.Branch = block;
+                }
+                else if ((ulong)op.Imm > (ulong)start &&
+                         (ulong)op.Imm < (ulong)block.EndPosition)
+                {
+                    Block botBlock = new Block(op.Imm);
+
+                    int botBlockIndex = 0;
+
+                    long currPosition = start;
+
+                    while ((ulong)currPosition < (ulong)op.Imm)
+                    {
+                        currPosition += block.OpCodes[botBlockIndex++].OpCodeSizeInBytes;
+                    }
+
+                    botBlock.OpCodes.AddRange(block.OpCodes);
+
+                    botBlock.OpCodes.RemoveRange(0, botBlockIndex);
+
+                    block.OpCodes.RemoveRange(botBlockIndex, block.OpCodes.Count - botBlockIndex);
+
+                    botBlock.EndPosition = block.EndPosition;
+
+                    block.EndPosition = op.Imm;
+
+                    botBlock.Branch = botBlock;
+                    block.Next      = botBlock;
+                }
+            }
+
             return block;
         }
 
-        public static Block DecodeSubroutine(
-            TranslatorCache cache,
-            MemoryManager   memory,
-            long            start,
-            ExecutionMode   mode)
+        public static Block DecodeSubroutine(MemoryManager memory, long start, ExecutionMode mode)
         {
             Dictionary<long, Block> visited    = new Dictionary<long, Block>();
             Dictionary<long, Block> visitedEnd = new Dictionary<long, Block>();
@@ -67,23 +106,16 @@ namespace ChocolArm64.Decoders
                 //(except BL/BLR that are sub calls) or end of executable, Next is null.
                 if (current.OpCodes.Count > 0)
                 {
-                    bool hasCachedSub = false;
-
                     OpCode64 lastOp = current.GetLastOp();
 
-                    if (lastOp is IOpCodeBImm op)
+                    bool isCall = IsCall(lastOp);
+
+                    if (lastOp is IOpCodeBImm op && !isCall)
                     {
-                        if (op.Emitter == InstEmit.Bl)
-                        {
-                            hasCachedSub = cache.HasSubroutine(op.Imm);
-                        }
-                        else
-                        {
-                            current.Branch = Enqueue(op.Imm);
-                        }
+                        current.Branch = Enqueue(op.Imm);
                     }
 
-                    if (!IsUnconditionalBranch(lastOp) || hasCachedSub)
+                    if (!IsUnconditionalBranch(lastOp) || isCall)
                     {
                         current.Next = Enqueue(current.EndPosition);
                     }
@@ -223,6 +255,13 @@ namespace ChocolArm64.Decoders
                    opCode is IOpCode32BReg;
         }
 
+        private static bool IsCall(OpCode64 opCode)
+        {
+            //TODO (CQ): ARM32 support.
+            return opCode.Emitter == InstEmit.Bl ||
+                   opCode.Emitter == InstEmit.Blr;
+        }
+
         private static bool IsException(OpCode64 opCode)
         {
             return opCode.Emitter == InstEmit.Brk ||
diff --git a/ChocolArm64/Instructions/InstEmitFlow.cs b/ChocolArm64/Instructions/InstEmitFlow.cs
index 181c6a04..a842dca9 100644
--- a/ChocolArm64/Instructions/InstEmitFlow.cs
+++ b/ChocolArm64/Instructions/InstEmitFlow.cs
@@ -3,6 +3,8 @@ using ChocolArm64.State;
 using ChocolArm64.Translation;
 using System.Reflection.Emit;
 
+using static ChocolArm64.Instructions.InstEmitFlowHelper;
+
 namespace ChocolArm64.Instructions
 {
     static partial class InstEmit
@@ -39,7 +41,7 @@ namespace ChocolArm64.Instructions
             context.EmitStint(RegisterAlias.Lr);
             context.EmitStoreState();
 
-            InstEmitFlowHelper.EmitCall(context, op.Imm);
+            EmitCall(context, op.Imm);
         }
 
         public static void Blr(ILEmitterCtx context)
@@ -51,7 +53,7 @@ namespace ChocolArm64.Instructions
             context.EmitStint(RegisterAlias.Lr);
             context.EmitStoreState();
 
-            context.Emit(OpCodes.Ret);
+            EmitVirtualCall(context);
         }
 
         public static void Br(ILEmitterCtx context)
@@ -61,7 +63,7 @@ namespace ChocolArm64.Instructions
             context.EmitStoreState();
             context.EmitLdintzr(op.Rn);
 
-            context.Emit(OpCodes.Ret);
+            EmitVirtualJump(context);
         }
 
         public static void Cbnz(ILEmitterCtx context) => EmitCb(context, OpCodes.Bne_Un);
@@ -106,10 +108,17 @@ namespace ChocolArm64.Instructions
         {
             OpCodeBImm64 op = (OpCodeBImm64)context.CurrOp;
 
-            if (context.CurrBlock.Next   != null &&
-                context.CurrBlock.Branch != null)
+            if (context.CurrBlock.Branch != null)
             {
                 context.EmitCondBranch(context.GetLabel(op.Imm), cond);
+
+                if (context.CurrBlock.Next == null)
+                {
+                    context.EmitStoreState();
+                    context.EmitLdc_I8(op.Position + 4);
+
+                    context.Emit(OpCodes.Ret);
+                }
             }
             else
             {
@@ -135,10 +144,17 @@ namespace ChocolArm64.Instructions
         {
             OpCodeBImm64 op = (OpCodeBImm64)context.CurrOp;
 
-            if (context.CurrBlock.Next   != null &&
-                context.CurrBlock.Branch != null)
+            if (context.CurrBlock.Branch != null)
             {
                 context.Emit(ilOp, context.GetLabel(op.Imm));
+
+                if (context.CurrBlock.Next == null)
+                {
+                    context.EmitStoreState();
+                    context.EmitLdc_I8(op.Position + 4);
+
+                    context.Emit(OpCodes.Ret);
+                }
             }
             else
             {
diff --git a/ChocolArm64/Instructions/InstEmitFlowHelper.cs b/ChocolArm64/Instructions/InstEmitFlowHelper.cs
index cf093bb3..e93ef426 100644
--- a/ChocolArm64/Instructions/InstEmitFlowHelper.cs
+++ b/ChocolArm64/Instructions/InstEmitFlowHelper.cs
@@ -1,4 +1,6 @@
+using ChocolArm64.State;
 using ChocolArm64.Translation;
+using System.Reflection;
 using System.Reflection.Emit;
 
 namespace ChocolArm64.Instructions
@@ -7,12 +9,120 @@ namespace ChocolArm64.Instructions
     {
         public static void EmitCall(ILEmitterCtx context, long imm)
         {
-            if (context.TryOptEmitSubroutineCall())
+            if (context.Tier == TranslationTier.Tier0)
+            {
+                context.TranslateAhead(imm);
+
+                context.EmitLdc_I8(imm);
+
+                context.Emit(OpCodes.Ret);
+
+                return;
+            }
+
+            if (!context.TryOptEmitSubroutineCall())
+            {
+                context.TranslateAhead(imm);
+
+                context.EmitLdarg(TranslatedSub.StateArgIdx);
+
+                context.EmitFieldLoad(typeof(CpuThreadState).GetField(nameof(CpuThreadState.CurrentTranslator),
+                    BindingFlags.Instance |
+                    BindingFlags.NonPublic));
+
+                context.EmitLdarg(TranslatedSub.StateArgIdx);
+                context.EmitLdc_I8(imm);
+
+                context.EmitPrivateCall(typeof(Translator), nameof(Translator.GetOrTranslateSubroutine));
+
+                context.EmitLdarg(TranslatedSub.StateArgIdx);
+                context.EmitLdarg(TranslatedSub.MemoryArgIdx);
+
+                context.EmitCall(typeof(TranslatedSub), nameof(TranslatedSub.Execute));
+            }
+
+            EmitContinueOrReturnCheck(context);
+        }
+
+        public static void EmitVirtualCall(ILEmitterCtx context)
+        {
+            EmitVirtualCallOrJump(context, isJump: false);
+        }
+
+        public static void EmitVirtualJump(ILEmitterCtx context)
+        {
+            EmitVirtualCallOrJump(context, isJump: true);
+        }
+
+        private static void EmitVirtualCallOrJump(ILEmitterCtx context, bool isJump)
+        {
+            if (context.Tier == TranslationTier.Tier0)
+            {
+                context.Emit(OpCodes.Dup);
+
+                context.EmitSttmp();
+                context.EmitLdarg(TranslatedSub.StateArgIdx);
+
+                context.EmitFieldLoad(typeof(CpuThreadState).GetField(nameof(CpuThreadState.CurrentTranslator),
+                    BindingFlags.Instance |
+                    BindingFlags.NonPublic));
+
+                context.EmitLdarg(TranslatedSub.StateArgIdx);
+                context.EmitLdtmp();
+
+                context.EmitPrivateCall(typeof(Translator), nameof(Translator.TranslateVirtualSubroutine));
+
+                context.Emit(OpCodes.Ret);
+            }
+            else
+            {
+                context.EmitSttmp();
+                context.EmitLdarg(TranslatedSub.StateArgIdx);
+
+                context.EmitFieldLoad(typeof(CpuThreadState).GetField(nameof(CpuThreadState.CurrentTranslator),
+                    BindingFlags.Instance |
+                    BindingFlags.NonPublic));
+
+                context.EmitLdarg(TranslatedSub.StateArgIdx);
+                context.EmitLdtmp();
+
+                context.EmitPrivateCall(typeof(Translator), nameof(Translator.GetOrTranslateVirtualSubroutine));
+
+                context.EmitLdarg(TranslatedSub.StateArgIdx);
+                context.EmitLdarg(TranslatedSub.MemoryArgIdx);
+
+                if (isJump)
+                {
+                    //The tail prefix allows the JIT to jump to the next function,
+                    //while releasing the stack space used by the current one.
+                    //This is ideal for BR ARM instructions, which are
+                    //basically indirect tail calls.
+                    context.Emit(OpCodes.Tailcall);
+                }
+
+                MethodInfo mthdInfo = typeof(ArmSubroutine).GetMethod("Invoke");
+
+                context.EmitCall(mthdInfo, isVirtual: true);
+
+                if (!isJump)
+                {
+                    EmitContinueOrReturnCheck(context);
+                }
+                else
+                {
+                    context.Emit(OpCodes.Ret);
+                }
+            }
+        }
+
+        private static void EmitContinueOrReturnCheck(ILEmitterCtx context)
+        {
+            //Note: The return value of the called method will be placed
+            //on the stack, the return value is always an Int64 with the
+            //return address of the function. We check if the address is
+            //correct, if it isn't we keep returning until we reach the dispatcher.
+            if (context.CurrBlock.Next != null)
             {
-                //Note: the return value of the called method will be placed
-                //at the Stack, the return value is always a Int64 with the
-                //return address of the function. We check if the address is
-                //correct, if it isn't we keep returning until we reach the dispatcher.
                 context.Emit(OpCodes.Dup);
 
                 context.EmitLdc_I8(context.CurrOp.Position + 4);
@@ -30,8 +140,6 @@ namespace ChocolArm64.Instructions
             }
             else
             {
-                context.EmitLdc_I8(imm);
-
                 context.Emit(OpCodes.Ret);
             }
         }
diff --git a/ChocolArm64/State/CpuThreadState.cs b/ChocolArm64/State/CpuThreadState.cs
index 12edc429..abec60bb 100644
--- a/ChocolArm64/State/CpuThreadState.cs
+++ b/ChocolArm64/State/CpuThreadState.cs
@@ -1,4 +1,5 @@
 using ChocolArm64.Events;
+using ChocolArm64.Translation;
 using System;
 using System.Diagnostics;
 using System.Runtime.CompilerServices;
@@ -82,6 +83,8 @@ namespace ChocolArm64.State
 
         private static double _hostTickFreq;
 
+        internal Translator CurrentTranslator;
+
         static CpuThreadState()
         {
             _hostTickFreq = 1.0 / Stopwatch.Frequency;
diff --git a/ChocolArm64/TranslatedSub.cs b/ChocolArm64/TranslatedSub.cs
deleted file mode 100644
index 653abcca..00000000
--- a/ChocolArm64/TranslatedSub.cs
+++ /dev/null
@@ -1,140 +0,0 @@
-using ChocolArm64.Memory;
-using ChocolArm64.State;
-using System;
-using System.Collections.Generic;
-using System.Collections.ObjectModel;
-using System.Linq;
-using System.Reflection;
-using System.Reflection.Emit;
-
-namespace ChocolArm64
-{
-    class TranslatedSub
-    {
-        private delegate long Aa64Subroutine(CpuThreadState register, MemoryManager memory);
-
-        private const int MinCallCountForReJit = 250;
-
-        private Aa64Subroutine _execDelegate;
-
-        public static int StateArgIdx  { get; private set; }
-        public static int MemoryArgIdx { get; private set; }
-
-        public static Type[] FixedArgTypes { get; private set; }
-
-        public DynamicMethod Method { get; private set; }
-
-        public ReadOnlyCollection<Register> SubArgs { get; private set; }
-
-        private HashSet<long> _callers;
-
-        private TranslatedSubType _type;
-
-        private int _callCount;
-
-        private bool _needsReJit;
-
-        public TranslatedSub(DynamicMethod method, List<Register> subArgs)
-        {
-            Method  = method                ?? throw new ArgumentNullException(nameof(method));;
-            SubArgs = subArgs?.AsReadOnly() ?? throw new ArgumentNullException(nameof(subArgs));
-
-            _callers = new HashSet<long>();
-
-            PrepareDelegate();
-        }
-
-        static TranslatedSub()
-        {
-            MethodInfo mthdInfo = typeof(Aa64Subroutine).GetMethod("Invoke");
-
-            ParameterInfo[] Params = mthdInfo.GetParameters();
-
-            FixedArgTypes = new Type[Params.Length];
-
-            for (int index = 0; index < Params.Length; index++)
-            {
-                Type paramType = Params[index].ParameterType;
-
-                FixedArgTypes[index] = paramType;
-
-                if (paramType == typeof(CpuThreadState))
-                {
-                    StateArgIdx = index;
-                }
-                else if (paramType == typeof(MemoryManager))
-                {
-                    MemoryArgIdx = index;
-                }
-            }
-        }
-
-        private void PrepareDelegate()
-        {
-            string name = $"{Method.Name}_Dispatch";
-
-            DynamicMethod mthd = new DynamicMethod(name, typeof(long), FixedArgTypes);
-
-            ILGenerator generator = mthd.GetILGenerator();
-
-            generator.EmitLdargSeq(FixedArgTypes.Length);
-
-            foreach (Register reg in SubArgs)
-            {
-                generator.EmitLdarg(StateArgIdx);
-
-                generator.Emit(OpCodes.Ldfld, reg.GetField());
-            }
-
-            generator.Emit(OpCodes.Call, Method);
-            generator.Emit(OpCodes.Ret);
-
-            _execDelegate = (Aa64Subroutine)mthd.CreateDelegate(typeof(Aa64Subroutine));
-        }
-
-        public bool ShouldReJit()
-        {
-            if (_needsReJit && _callCount < MinCallCountForReJit)
-            {
-                _callCount++;
-
-                return false;
-            }
-
-            return _needsReJit;
-        }
-
-        public long Execute(CpuThreadState threadState, MemoryManager memory)
-        {
-            return _execDelegate(threadState, memory);
-        }
-
-        public void AddCaller(long position)
-        {
-            lock (_callers)
-            {
-                _callers.Add(position);
-            }
-        }
-
-        public long[] GetCallerPositions()
-        {
-            lock (_callers)
-            {
-                return _callers.ToArray();
-            }
-        }
-
-        public void SetType(TranslatedSubType type)
-        {
-            _type = type;
-
-            if (type == TranslatedSubType.SubTier0)
-            {
-                _needsReJit = true;
-            }
-        }
-
-        public void MarkForReJit() => _needsReJit = true;
-    }
-}
\ No newline at end of file
diff --git a/ChocolArm64/TranslatedSubType.cs b/ChocolArm64/TranslatedSubType.cs
deleted file mode 100644
index f57aea94..00000000
--- a/ChocolArm64/TranslatedSubType.cs
+++ /dev/null
@@ -1,8 +0,0 @@
-namespace ChocolArm64
-{
-    enum TranslatedSubType
-    {
-        SubTier0,
-        SubTier1
-    }
-}
\ No newline at end of file
diff --git a/ChocolArm64/Translation/ILEmitterCtx.cs b/ChocolArm64/Translation/ILEmitterCtx.cs
index b5ebff75..ef63e60c 100644
--- a/ChocolArm64/Translation/ILEmitterCtx.cs
+++ b/ChocolArm64/Translation/ILEmitterCtx.cs
@@ -11,6 +11,7 @@ namespace ChocolArm64.Translation
     class ILEmitterCtx
     {
         private TranslatorCache _cache;
+        private TranslatorQueue _queue;
 
         private Dictionary<long, ILLabel> _labels;
 
@@ -23,6 +24,8 @@ namespace ChocolArm64.Translation
         public Block    CurrBlock => _currBlock;
         public OpCode64 CurrOp    => _currBlock?.OpCodes[_opcIndex];
 
+        public TranslationTier Tier { get; }
+
         public Aarch32Mode Mode { get; } = Aarch32Mode.User; //TODO
 
         private Dictionary<Block, ILBlock> _visitedBlocks;
@@ -47,11 +50,14 @@ namespace ChocolArm64.Translation
         private const int VecTmp1Index    = -5;
         private const int VecTmp2Index    = -6;
 
-        public ILEmitterCtx(TranslatorCache cache, Block graph)
+        public ILEmitterCtx(TranslatorCache cache, TranslatorQueue queue, TranslationTier tier, Block graph)
         {
             _cache     = cache ?? throw new ArgumentNullException(nameof(cache));
+            _queue     = queue ?? throw new ArgumentNullException(nameof(queue));
             _currBlock = graph ?? throw new ArgumentNullException(nameof(graph));
 
+            Tier = tier;
+
             _labels = new Dictionary<long, ILLabel>();
 
             _visitedBlocks = new Dictionary<Block, ILBlock>();
@@ -243,6 +249,16 @@ namespace ChocolArm64.Translation
             return new ILBlock();
         }
 
+        public void TranslateAhead(long position, ExecutionMode mode = ExecutionMode.Aarch64)
+        {
+            if (_cache.TryGetSubroutine(position, out TranslatedSub sub) && sub.Tier != TranslationTier.Tier0)
+            {
+                return;
+            }
+
+            _queue.Enqueue(new TranslatorQueueItem(position, mode, TranslationTier.Tier1));
+        }
+
         public bool TryOptEmitSubroutineCall()
         {
             if (_currBlock.Next == null)
@@ -265,20 +281,8 @@ namespace ChocolArm64.Translation
                 EmitLdarg(index);
             }
 
-            foreach (Register reg in subroutine.SubArgs)
-            {
-                switch (reg.Type)
-                {
-                    case RegisterType.Flag:   Ldloc(reg.Index, IoType.Flag);   break;
-                    case RegisterType.Int:    Ldloc(reg.Index, IoType.Int);    break;
-                    case RegisterType.Vector: Ldloc(reg.Index, IoType.Vector); break;
-                }
-            }
-
             EmitCall(subroutine.Method);
 
-            subroutine.AddCaller(_subPosition);
-
             return true;
         }
 
@@ -463,7 +467,12 @@ namespace ChocolArm64.Translation
             _ilBlock.Add(new ILOpCodeBranch(ilOp, label));
         }
 
-        public void Emit(string text)
+        public void EmitFieldLoad(FieldInfo info)
+        {
+            _ilBlock.Add(new ILOpCodeLoadField(info));
+        }
+
+        public void EmitPrint(string text)
         {
             _ilBlock.Add(new ILOpCodeLog(text));
         }
@@ -618,14 +627,9 @@ namespace ChocolArm64.Translation
             EmitCall(objType.GetMethod(mthdName, BindingFlags.Instance | BindingFlags.NonPublic));
         }
 
-        public void EmitCall(MethodInfo mthdInfo)
+        public void EmitCall(MethodInfo mthdInfo, bool isVirtual = false)
         {
-            if (mthdInfo == null)
-            {
-                throw new ArgumentNullException(nameof(mthdInfo));
-            }
-
-            _ilBlock.Add(new ILOpCodeCall(mthdInfo));
+            _ilBlock.Add(new ILOpCodeCall(mthdInfo ?? throw new ArgumentNullException(nameof(mthdInfo)), isVirtual));
         }
 
         public void EmitLdc_I(long value)
diff --git a/ChocolArm64/Translation/ILMethodBuilder.cs b/ChocolArm64/Translation/ILMethodBuilder.cs
index 70d9a2db..892f831b 100644
--- a/ChocolArm64/Translation/ILMethodBuilder.cs
+++ b/ChocolArm64/Translation/ILMethodBuilder.cs
@@ -26,74 +26,32 @@ namespace ChocolArm64.Translation
             _subName  = subName;
         }
 
-        public TranslatedSub GetSubroutine()
+        public TranslatedSub GetSubroutine(TranslationTier tier)
         {
             LocalAlloc = new LocalAlloc(_ilBlocks, _ilBlocks[0]);
 
-            List<Register> subArgs = new List<Register>();
-
-            void SetArgs(long inputs, RegisterType baseType)
-            {
-                for (int bit = 0; bit < 64; bit++)
-                {
-                    long mask = 1L << bit;
-
-                    if ((inputs & mask) != 0)
-                    {
-                        subArgs.Add(GetRegFromBit(bit, baseType));
-                    }
-                }
-            }
-
-            SetArgs(LocalAlloc.GetIntInputs(_ilBlocks[0]), RegisterType.Int);
-            SetArgs(LocalAlloc.GetVecInputs(_ilBlocks[0]), RegisterType.Vector);
-
-            DynamicMethod method = new DynamicMethod(_subName, typeof(long), GetArgumentTypes(subArgs));
+            DynamicMethod method = new DynamicMethod(_subName, typeof(long), TranslatedSub.FixedArgTypes);
 
             Generator = method.GetILGenerator();
 
-            TranslatedSub subroutine = new TranslatedSub(method, subArgs);
-
-            int argsStart = TranslatedSub.FixedArgTypes.Length;
+            TranslatedSub subroutine = new TranslatedSub(method, tier);
 
             _locals = new Dictionary<Register, int>();
 
             _localsCount = 0;
 
-            for (int index = 0; index < subroutine.SubArgs.Count; index++)
-            {
-                Register reg = subroutine.SubArgs[index];
-
-                Generator.EmitLdarg(index + argsStart);
-                Generator.EmitStloc(GetLocalIndex(reg));
-            }
+            new ILOpCodeLoadState(_ilBlocks[0]).Emit(this);
 
             foreach (ILBlock ilBlock in _ilBlocks)
             {
                 ilBlock.Emit(this);
             }
 
+            subroutine.PrepareMethod();
+
             return subroutine;
         }
 
-        private Type[] GetArgumentTypes(IList<Register> Params)
-        {
-            Type[] fixedArgs = TranslatedSub.FixedArgTypes;
-
-            Type[] output = new Type[Params.Count + fixedArgs.Length];
-
-            fixedArgs.CopyTo(output, 0);
-
-            int typeIdx = fixedArgs.Length;
-
-            for (int index = 0; index < Params.Count; index++)
-            {
-                output[typeIdx++] = GetFieldType(Params[index].Type);
-            }
-
-            return output;
-        }
-
         public int GetLocalIndex(Register reg)
         {
             if (!_locals.TryGetValue(reg, out int index))
diff --git a/ChocolArm64/Translation/ILOpCodeCall.cs b/ChocolArm64/Translation/ILOpCodeCall.cs
index 8486a791..c046aeeb 100644
--- a/ChocolArm64/Translation/ILOpCodeCall.cs
+++ b/ChocolArm64/Translation/ILOpCodeCall.cs
@@ -5,16 +5,19 @@ namespace ChocolArm64.Translation
 {
     struct ILOpCodeCall : IILEmit
     {
-        private MethodInfo _mthdInfo;
+        public MethodInfo Info { get; private set; }
 
-        public ILOpCodeCall(MethodInfo mthdInfo)
+        public bool IsVirtual { get; private set; }
+
+        public ILOpCodeCall(MethodInfo info, bool isVirtual)
         {
-            _mthdInfo = mthdInfo;
+            Info      = info;
+            IsVirtual = isVirtual;
         }
 
         public void Emit(ILMethodBuilder context)
         {
-            context.Generator.Emit(OpCodes.Call, _mthdInfo);
+            context.Generator.Emit(IsVirtual ? OpCodes.Callvirt : OpCodes.Call, Info);
         }
     }
 }
\ No newline at end of file
diff --git a/ChocolArm64/Translation/ILOpCodeLoadField.cs b/ChocolArm64/Translation/ILOpCodeLoadField.cs
new file mode 100644
index 00000000..abcd37c3
--- /dev/null
+++ b/ChocolArm64/Translation/ILOpCodeLoadField.cs
@@ -0,0 +1,24 @@
+using System.Reflection;
+using System.Reflection.Emit;
+
+namespace ChocolArm64.Translation
+{
+    //IL operation that emits a ldfld for the given field, loading the
+    //field value from the object reference at the top of the stack.
+    struct ILOpCodeLoadField : IILEmit
+    {
+        public FieldInfo Info { get; }
+
+        public ILOpCodeLoadField(FieldInfo info)
+        {
+            Info = info;
+        }
+
+        public void Emit(ILMethodBuilder context)
+        {
+            ILGenerator generator = context.Generator;
+
+            generator.Emit(OpCodes.Ldfld, Info);
+        }
+    }
+}
\ No newline at end of file
diff --git a/ChocolArm64/Translation/TranslatedSub.cs b/ChocolArm64/Translation/TranslatedSub.cs
new file mode 100644
index 00000000..65d70351
--- /dev/null
+++ b/ChocolArm64/Translation/TranslatedSub.cs
@@ -0,0 +1,75 @@
+using ChocolArm64.Memory;
+using ChocolArm64.State;
+using System;
+using System.Reflection;
+using System.Reflection.Emit;
+
+namespace ChocolArm64.Translation
+{
+    //Signature of the translated methods. The returned value is
+    //the guest position where the execution should continue.
+    delegate long ArmSubroutine(CpuThreadState state, MemoryManager memory);
+
+    //Holds a translated method, along with its translation tier.
+    class TranslatedSub
+    {
+        //Not set until PrepareMethod is called.
+        public ArmSubroutine Delegate { get; private set; }
+
+        //Indices of the state and memory arguments on ArmSubroutine.
+        public static int StateArgIdx  { get; }
+        public static int MemoryArgIdx { get; }
+
+        public static Type[] FixedArgTypes { get; }
+
+        public DynamicMethod Method { get; }
+
+        public TranslationTier Tier { get; }
+
+        public TranslatedSub(DynamicMethod method, TranslationTier tier)
+        {
+            Method = method ?? throw new ArgumentNullException(nameof(method));
+            Tier   = tier;
+        }
+
+        static TranslatedSub()
+        {
+            //The argument types and indices are taken from the delegate itself,
+            //so they always agree with the ArmSubroutine signature.
+            MethodInfo mthdInfo = typeof(ArmSubroutine).GetMethod("Invoke");
+
+            ParameterInfo[] parameters = mthdInfo.GetParameters();
+
+            FixedArgTypes = new Type[parameters.Length];
+
+            for (int index = 0; index < parameters.Length; index++)
+            {
+                Type argType = parameters[index].ParameterType;
+
+                FixedArgTypes[index] = argType;
+
+                if (argType == typeof(CpuThreadState))
+                {
+                    StateArgIdx = index;
+                }
+                else if (argType == typeof(MemoryManager))
+                {
+                    MemoryArgIdx = index;
+                }
+            }
+        }
+
+        //Creates the delegate that Execute calls. NOTE(review): presumably
+        //the IL of the method must be fully emitted by now, confirm.
+        public void PrepareMethod()
+        {
+            Delegate = (ArmSubroutine)Method.CreateDelegate(typeof(ArmSubroutine));
+        }
+
+        //Runs the translated code. PrepareMethod must have been called before.
+        public long Execute(CpuThreadState threadState, MemoryManager memory)
+        {
+            return Delegate(threadState, memory);
+        }
+    }
+}
\ No newline at end of file
diff --git a/ChocolArm64/Translation/TranslationTier.cs b/ChocolArm64/Translation/TranslationTier.cs
new file mode 100644
index 00000000..13afd9c5
--- /dev/null
+++ b/ChocolArm64/Translation/TranslationTier.cs
@@ -0,0 +1,13 @@
+namespace ChocolArm64.Translation
+{
+    //Levels of translation quality, from the lowest to the highest.
+    enum TranslationTier
+    {
+        Tier0 = 0,
+        Tier1,
+        Tier2,
+
+        //Number of tiers. This is not a tier, and must remain the last member.
+        Count
+    }
+}
\ No newline at end of file
diff --git a/ChocolArm64/Translation/Translator.cs b/ChocolArm64/Translation/Translator.cs
new file mode 100644
index 00000000..7f7df6e5
--- /dev/null
+++ b/ChocolArm64/Translation/Translator.cs
@@ -0,0 +1,232 @@
+using ChocolArm64.Decoders;
+using ChocolArm64.Events;
+using ChocolArm64.Memory;
+using ChocolArm64.State;
+using System;
+using System.Threading;
+
+namespace ChocolArm64.Translation
+{
+    //Translates guest ARM code into host methods and executes them.
+    //Code is first translated one basic block at a time (Tier0), and is later
+    //re-translated as a whole subroutine (Tier1) by a background thread.
+    public class Translator
+    {
+        private readonly MemoryManager _memory;
+
+        private readonly CpuThreadState _dummyThreadState;
+
+        private readonly TranslatorCache _cache;
+        private readonly TranslatorQueue _queue;
+
+        private Thread _backgroundTranslator;
+
+        public event EventHandler<CpuTraceEventArgs> CpuTrace;
+
+        public bool EnableCpuTrace { get; set; }
+
+        //Number of guest threads currently inside ExecuteSubroutine.
+        private volatile int _threadCount;
+
+        public Translator(MemoryManager memory)
+        {
+            _memory = memory;
+
+            //This state is only used by ForceAheadOfTimeCompilation,
+            //and it is created with the Running flag cleared.
+            _dummyThreadState = new CpuThreadState();
+
+            _dummyThreadState.Running = false;
+
+            _cache = new TranslatorCache();
+            _queue = new TranslatorQueue();
+        }
+
+        //Runs the guest code at position on the calling thread, until a
+        //subroutine returns 0 or the thread state is no longer running.
+        internal void ExecuteSubroutine(CpuThread thread, long position)
+        {
+            //The first guest thread to get here starts the background translator.
+            if (Interlocked.Increment(ref _threadCount) == 1)
+            {
+                _backgroundTranslator = new Thread(TranslateQueuedSubs);
+                _backgroundTranslator.Start();
+            }
+
+            try
+            {
+                ExecuteSubroutine(thread.ThreadState, position);
+            }
+            finally
+            {
+                //The count must be given back even when the guest code throws,
+                //otherwise the background translator never sees it reach zero
+                //and its loop (and thread) never ends.
+                if (Interlocked.Decrement(ref _threadCount) == 0)
+                {
+                    //Presumably wakes up the translator waiting on WaitForItems,
+                    //so that it can observe the zero count and exit.
+                    _queue.ForceSignal();
+                }
+            }
+        }
+
+        private void ExecuteSubroutine(CpuThreadState state, long position)
+        {
+            //The generated code loads this field to call back into the translator.
+            state.CurrentTranslator = this;
+
+            try
+            {
+                do
+                {
+                    if (EnableCpuTrace)
+                    {
+                        CpuTrace?.Invoke(this, new CpuTraceEventArgs(position));
+                    }
+
+                    TranslatedSub subroutine = GetOrTranslateSubroutine(state, position);
+
+                    //The returned value is the position where the execution continues.
+                    position = subroutine.Execute(state, _memory);
+                }
+                while (position != 0 && state.Running);
+            }
+            finally
+            {
+                //Don't leave a stale translator reference on the state on failure.
+                state.CurrentTranslator = null;
+            }
+        }
+
+        //Queues position for Tier1 translation, unless a translation
+        //of a tier other than Tier0 is already cached for it.
+        internal void TranslateVirtualSubroutine(CpuThreadState state, long position)
+        {
+            if (!_cache.TryGetSubroutine(position, out TranslatedSub sub) || sub.Tier == TranslationTier.Tier0)
+            {
+                _queue.Enqueue(new TranslatorQueueItem(position, state.GetExecutionMode(), TranslationTier.Tier1));
+            }
+        }
+
+        //Returns the delegate for the code at position, translating it on the spot
+        //as Tier0 when it is not cached. Tier0 code is also queued for Tier1 translation.
+        internal ArmSubroutine GetOrTranslateVirtualSubroutine(CpuThreadState state, long position)
+        {
+            if (!_cache.TryGetSubroutine(position, out TranslatedSub sub))
+            {
+                sub = TranslateLowCq(position, state.GetExecutionMode());
+            }
+
+            if (sub.Tier == TranslationTier.Tier0)
+            {
+                _queue.Enqueue(new TranslatorQueueItem(position, state.GetExecutionMode(), TranslationTier.Tier1));
+            }
+
+            return sub.Delegate;
+        }
+
+        //Returns the cached translation for position, or a new Tier0 translation.
+        internal TranslatedSub GetOrTranslateSubroutine(CpuThreadState state, long position)
+        {
+            if (!_cache.TryGetSubroutine(position, out TranslatedSub subroutine))
+            {
+                subroutine = TranslateLowCq(position, state.GetExecutionMode());
+            }
+
+            return subroutine;
+        }
+
+        //Body of the background translator thread. It translates the queued
+        //items for as long as at least one guest thread is executing.
+        private void TranslateQueuedSubs()
+        {
+            while (_threadCount != 0)
+            {
+                if (_queue.TryDequeue(out TranslatorQueueItem item))
+                {
+                    bool isCached = _cache.TryGetSubroutine(item.Position, out TranslatedSub sub);
+
+                    //Nothing to gain when the cached translation is already
+                    //of the requested tier, or of a higher one.
+                    if (isCached && item.Tier <= sub.Tier)
+                    {
+                        continue;
+                    }
+
+                    if (item.Tier == TranslationTier.Tier0)
+                    {
+                        TranslateLowCq(item.Position, item.Mode);
+                    }
+                    else
+                    {
+                        TranslateHighCq(item.Position, item.Mode);
+                    }
+                }
+                else
+                {
+                    _queue.WaitForItems();
+                }
+            }
+        }
+
+        //Translates the basic block at position as Tier0. The returned value is
+        //whatever GetOrAdd hands back, which may be a previously cached translation.
+        private TranslatedSub TranslateLowCq(long position, ExecutionMode mode)
+        {
+            Block block = Decoder.DecodeBasicBlock(_memory, position, mode);
+
+            ILEmitterCtx context = new ILEmitterCtx(_cache, _queue, TranslationTier.Tier0, block);
+
+            string subName = GetSubroutineName(position);
+
+            ILMethodBuilder ilMthdBuilder = new ILMethodBuilder(context.GetILBlocks(), subName);
+
+            TranslatedSub subroutine = ilMthdBuilder.GetSubroutine(TranslationTier.Tier0);
+
+            return _cache.GetOrAdd(position, subroutine, block.OpCodes.Count);
+        }
+
+        //Translates the whole subroutine at position as Tier1,
+        //and stores the result on the cache with AddOrUpdate.
+        private void TranslateHighCq(long position, ExecutionMode mode)
+        {
+            Block graph = Decoder.DecodeSubroutine(_memory, position, mode);
+
+            ILEmitterCtx context = new ILEmitterCtx(_cache, _queue, TranslationTier.Tier1, graph);
+
+            ILBlock[] ilBlocks = context.GetILBlocks();
+
+            string subName = GetSubroutineName(position);
+
+            ILMethodBuilder ilMthdBuilder = new ILMethodBuilder(ilBlocks, subName);
+
+            TranslatedSub subroutine = ilMthdBuilder.GetSubroutine(TranslationTier.Tier1);
+
+            //The total IL operation count is the size passed to the cache.
+            int ilOpCount = 0;
+
+            foreach (ILBlock ilBlock in ilBlocks)
+            {
+                ilOpCount += ilBlock.Count;
+            }
+
+            _cache.AddOrUpdate(position, subroutine, ilOpCount);
+
+            ForceAheadOfTimeCompilation(subroutine);
+        }
+
+        private static string GetSubroutineName(long position)
+        {
+            return $"Sub{position:x16}";
+        }
+
+        //Calls the subroutine once with the dummy thread state and no memory.
+        //NOTE(review): presumably this forces the runtime to compile the method
+        //on this thread, confirm that the call is expected to exit early.
+        private void ForceAheadOfTimeCompilation(TranslatedSub subroutine)
+        {
+            subroutine.Execute(_dummyThreadState, null);
+        }
+    }
+}
\ No newline at end of file
diff --git a/ChocolArm64/TranslatorCache.cs b/ChocolArm64/Translation/TranslatorCache.cs
similarity index 87%
rename from ChocolArm64/TranslatorCache.cs
rename to ChocolArm64/Translation/TranslatorCache.cs
index 9903ccaa..d10d6757 100644
--- a/ChocolArm64/TranslatorCache.cs
+++ b/ChocolArm64/Translation/TranslatorCache.cs
@@ -4,7 +4,7 @@ using System.Diagnostics;
 using System.Runtime.CompilerServices;
 using System.Threading;
 
-namespace ChocolArm64
+namespace ChocolArm64.Translation
 {
     class TranslatorCache
     {
@@ -58,6 +58,31 @@ namespace ChocolArm64
             _sortedCache = new LinkedList<long>();
         }
 
+        public TranslatedSub GetOrAdd(long position, TranslatedSub subroutine, int size)
+        {
+            //Returns the subroutine cached for position, caching the supplied one if none exists yet.
+            ClearCacheIfNeeded();
+            lock (_sortedCache)
+            {
+                LinkedListNode<long> node = _sortedCache.AddLast(position);
+
+                CacheBucket bucket = new CacheBucket(subroutine, node, size);
+                //From here on, bucket is presumably whatever _cache holds for position (ours or an older one).
+                bucket = _cache.GetOrAdd(position, bucket);
+                //Comparing the nodes tells whether the bucket created above is the one that was stored.
+                if (bucket.Node == node)
+                {
+                    _totalSize += size;
+                }
+                else
+                {
+                    _sortedCache.Remove(node); //Another entry is being used: undo the AddLast above.
+                }
+
+                return bucket.Subroutine;
+            }
+        }
+
         public void AddOrUpdate(long position, TranslatedSub subroutine, int size)
         {
             ClearCacheIfNeeded();
diff --git a/ChocolArm64/Translation/TranslatorQueue.cs b/ChocolArm64/Translation/TranslatorQueue.cs
new file mode 100644
index 00000000..89d665bf
--- /dev/null
+++ b/ChocolArm64/Translation/TranslatorQueue.cs
@@ -0,0 +1,102 @@
+using System.Collections.Concurrent;
+using System.Threading;
+
+namespace ChocolArm64.Translation
+{
+    //Holds the pending translation requests. Each translation tier has its own LIFO stack,
+    //and TryDequeue always serves the stacks with the lowest tier values first.
+    class TranslatorQueue
+    {
+        //This is the maximum number of functions to be translated that the queue can hold.
+        //The value may need some tuning to find the sweet spot.
+        private const int MaxQueueSize = 1024;
+
+        //One stack for each tier, indexed by the tier value.
+        private readonly ConcurrentStack<TranslatorQueueItem>[] _translationQueue;
+
+        //Set by Enqueue and ForceSignal, reset by WaitForItems. Also the lock object for _signaled.
+        private readonly ManualResetEvent _queueDataReceivedEvent;
+
+        //True once ForceSignal was called. Only accessed with the lock held.
+        private bool _signaled;
+
+        public TranslatorQueue()
+        {
+            _translationQueue = new ConcurrentStack<TranslatorQueueItem>[(int)TranslationTier.Count];
+
+            for (int prio = 0; prio < _translationQueue.Length; prio++)
+            {
+                _translationQueue[prio] = new ConcurrentStack<TranslatorQueueItem>();
+            }
+
+            _queueDataReceivedEvent = new ManualResetEvent(false);
+        }
+
+        //Pushes the item on the stack of its tier, then sets the event that WaitForItems waits on.
+        public void Enqueue(TranslatorQueueItem item)
+        {
+            ConcurrentStack<TranslatorQueueItem> queue = _translationQueue[(int)item.Tier];
+
+            //The limit is applied to each stack separately. A full stack loses the entry on
+            //its top, which is the most recently pushed one, to make room for the new item.
+            if (queue.Count >= MaxQueueSize)
+            {
+                queue.TryPop(out _);
+            }
+
+            queue.Push(item);
+
+            _queueDataReceivedEvent.Set();
+        }
+
+        //Pops the most recently pushed item of the first non empty stack, going from the lowest
+        //to the highest tier value. Returns false, with item set to its default value, when all
+        //the stacks are empty.
+        public bool TryDequeue(out TranslatorQueueItem item)
+        {
+            for (int prio = 0; prio < _translationQueue.Length; prio++)
+            {
+                if (_translationQueue[prio].TryPop(out item))
+                {
+                    return true;
+                }
+            }
+
+            item = default(TranslatorQueueItem);
+
+            return false;
+        }
+
+        //Blocks until the event is set by Enqueue or ForceSignal, then resets it (unless
+        //ForceSignal was called). Returning doesn't guarantee that an item is available,
+        //so the caller is expected to call TryDequeue again afterwards.
+        public void WaitForItems()
+        {
+            _queueDataReceivedEvent.WaitOne();
+
+            lock (_queueDataReceivedEvent)
+            {
+                //After ForceSignal the event has to stay set, so that no thread blocks here again.
+                if (!_signaled)
+                {
+                    _queueDataReceivedEvent.Reset();
+                }
+            }
+        }
+
+        //Sets the event for good, releasing the threads blocked on WaitForItems, present and future.
+        public void ForceSignal()
+        {
+            lock (_queueDataReceivedEvent)
+            {
+                _signaled = true;
+
+                //The event is deliberately not closed here. Nothing prevents Enqueue and WaitForItems
+                //from running after (or concurrently with) this method, and Set, WaitOne and Reset
+                //throw ObjectDisposedException on a closed event. The handle is released by the
+                //finalizer of the underlying SafeWaitHandle instead.
+                _queueDataReceivedEvent.Set();
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/ChocolArm64/Translation/TranslatorQueueItem.cs b/ChocolArm64/Translation/TranslatorQueueItem.cs
new file mode 100644
index 00000000..0988414a
--- /dev/null
+++ b/ChocolArm64/Translation/TranslatorQueueItem.cs
@@ -0,0 +1,20 @@
+using ChocolArm64.State;
+
+namespace ChocolArm64.Translation
+{
+    struct TranslatorQueueItem //Describes a translation request stored on the TranslatorQueue.
+    {
+        public long Position { get; } //NOTE(review): presumably the address of the code to translate - confirm.
+        //NOTE(review): presumably the execution mode used to decode the code at Position - confirm.
+        public ExecutionMode Mode { get; }
+        //Translation tier of the request. TranslatorQueue uses it to select the stack for the item.
+        public TranslationTier Tier { get; }
+
+        public TranslatorQueueItem(long position, ExecutionMode mode, TranslationTier tier)
+        {
+            Position = position;
+            Mode     = mode;
+            Tier     = tier;
+        }
+    }
+}
\ No newline at end of file
diff --git a/ChocolArm64/Translator.cs b/ChocolArm64/Translator.cs
deleted file mode 100644
index af2586f4..00000000
--- a/ChocolArm64/Translator.cs
+++ /dev/null
@@ -1,120 +0,0 @@
-using ChocolArm64.Decoders;
-using ChocolArm64.Events;
-using ChocolArm64.Memory;
-using ChocolArm64.State;
-using ChocolArm64.Translation;
-using System;
-
-namespace ChocolArm64
-{
-    public class Translator
-    {
-        private TranslatorCache _cache;
-
-        public event EventHandler<CpuTraceEventArgs> CpuTrace;
-
-        public bool EnableCpuTrace { get; set; }
-
-        public Translator()
-        {
-            _cache = new TranslatorCache();
-        }
-
-        internal void ExecuteSubroutine(CpuThread thread, long position)
-        {
-            ExecuteSubroutine(thread.ThreadState, thread.Memory, position);
-        }
-
-        private void ExecuteSubroutine(CpuThreadState state, MemoryManager memory, long position)
-        {
-            do
-            {
-                if (EnableCpuTrace)
-                {
-                    CpuTrace?.Invoke(this, new CpuTraceEventArgs(position));
-                }
-
-                if (!_cache.TryGetSubroutine(position, out TranslatedSub sub))
-                {
-                    sub = TranslateTier0(memory, position, state.GetExecutionMode());
-                }
-
-                if (sub.ShouldReJit())
-                {
-                    TranslateTier1(memory, position, state.GetExecutionMode());
-                }
-
-                position = sub.Execute(state, memory);
-            }
-            while (position != 0 && state.Running);
-        }
-
-        internal bool HasCachedSub(long position)
-        {
-            return _cache.HasSubroutine(position);
-        }
-
-        private TranslatedSub TranslateTier0(MemoryManager memory, long position, ExecutionMode mode)
-        {
-            Block block = Decoder.DecodeBasicBlock(memory, position, mode);
-
-            ILEmitterCtx context = new ILEmitterCtx(_cache, block);
-
-            string subName = GetSubroutineName(position);
-
-            ILMethodBuilder ilMthdBuilder = new ILMethodBuilder(context.GetILBlocks(), subName);
-
-            TranslatedSub subroutine = ilMthdBuilder.GetSubroutine();
-
-            subroutine.SetType(TranslatedSubType.SubTier0);
-
-            _cache.AddOrUpdate(position, subroutine, block.OpCodes.Count);
-
-            return subroutine;
-        }
-
-        private void TranslateTier1(MemoryManager memory, long position, ExecutionMode mode)
-        {
-            Block graph = Decoder.DecodeSubroutine(_cache, memory, position, mode);
-
-            ILEmitterCtx context = new ILEmitterCtx(_cache, graph);
-
-            ILBlock[] ilBlocks = context.GetILBlocks();
-
-            string subName = GetSubroutineName(position);
-
-            ILMethodBuilder ilMthdBuilder = new ILMethodBuilder(ilBlocks, subName);
-
-            TranslatedSub subroutine = ilMthdBuilder.GetSubroutine();
-
-            subroutine.SetType(TranslatedSubType.SubTier1);
-
-            int ilOpCount = 0;
-
-            foreach (ILBlock ilBlock in ilBlocks)
-            {
-                ilOpCount += ilBlock.Count;
-            }
-
-            _cache.AddOrUpdate(position, subroutine, ilOpCount);
-
-            //Mark all methods that calls this method for ReJiting,
-            //since we can now call it directly which is faster.
-            if (_cache.TryGetSubroutine(position, out TranslatedSub oldSub))
-            {
-                foreach (long callerPos in oldSub.GetCallerPositions())
-                {
-                    if (_cache.TryGetSubroutine(position, out TranslatedSub callerSub))
-                    {
-                        callerSub.MarkForReJit();
-                    }
-                }
-            }
-        }
-
-        private string GetSubroutineName(long position)
-        {
-            return $"Sub{position:x16}";
-        }
-    }
-}
\ No newline at end of file
diff --git a/Ryujinx.Graphics/Graphics3d/NvGpuEngine3d.cs b/Ryujinx.Graphics/Graphics3d/NvGpuEngine3d.cs
index 749f5fdc..9ff3b36a 100644
--- a/Ryujinx.Graphics/Graphics3d/NvGpuEngine3d.cs
+++ b/Ryujinx.Graphics/Graphics3d/NvGpuEngine3d.cs
@@ -789,7 +789,7 @@ namespace Ryujinx.Graphics.Graphics3d
                 GalVertexAttribType Type = (GalVertexAttribType)((Packed >> 27) & 0x7);
 
                 bool IsRgba = ((Packed >> 31) & 1) != 0;
-                
+
                 // Check vertex array is enabled to avoid out of bounds exception when reading bytes
                 bool Enable = (ReadRegister(NvGpuEngine3dReg.VertexArrayNControl + ArrayIndex * 4) & 0x1000) != 0;
 
diff --git a/Ryujinx.HLE/HOS/Kernel/Process/KProcess.cs b/Ryujinx.HLE/HOS/Kernel/Process/KProcess.cs
index fd473014..338e5543 100644
--- a/Ryujinx.HLE/HOS/Kernel/Process/KProcess.cs
+++ b/Ryujinx.HLE/HOS/Kernel/Process/KProcess.cs
@@ -1,6 +1,7 @@
 using ChocolArm64;
 using ChocolArm64.Events;
 using ChocolArm64.Memory;
+using ChocolArm64.Translation;
 using Ryujinx.Common;
 using Ryujinx.Common.Logging;
 using Ryujinx.HLE.Exceptions;
@@ -109,7 +110,7 @@ namespace Ryujinx.HLE.HOS.Kernel.Process
 
             _threads = new LinkedList<KThread>();
 
-            Translator = new Translator();
+            Translator = new Translator(CpuMemory);
 
             Translator.CpuTrace += CpuTraceHandler;
 
diff --git a/Ryujinx.Tests/Cpu/CpuTest.cs b/Ryujinx.Tests/Cpu/CpuTest.cs
index b970e055..47feb573 100644
--- a/Ryujinx.Tests/Cpu/CpuTest.cs
+++ b/Ryujinx.Tests/Cpu/CpuTest.cs
@@ -1,6 +1,7 @@
 using ChocolArm64;
 using ChocolArm64.Memory;
 using ChocolArm64.State;
+using ChocolArm64.Translation;
 
 using NUnit.Framework;
 
@@ -48,10 +49,12 @@ namespace Ryujinx.Tests.Cpu
 
             _entryPoint = Position;
 
-            Translator translator = new Translator();
             _ramPointer = Marshal.AllocHGlobal(new IntPtr(_size));
             _memory = new MemoryManager(_ramPointer);
             _memory.Map(Position, 0, _size);
+
+            Translator translator = new Translator(_memory);
+
             _thread = new CpuThread(translator, _memory, _entryPoint);
 
             if (_unicornAvailable)