From d141a5c48d7ffda9e992d10ba24d9f6ad242eda9 Mon Sep 17 00:00:00 2001
From: vegorov-rbx <75688451+vegorov-rbx@users.noreply.github.com>
Date: Fri, 14 Apr 2023 21:06:22 +0300
Subject: [PATCH] Sync to upstream/release/572 (#899)

* Fixed exported types not being suggested in autocomplete
* `T...` is now convertible to `...any` (Fixes
https://github.com/Roblox/luau/issues/767)
* Fixed `T?` sometimes not being convertible to `T | T` or `T?` (when
internal pointer identity is different)
* Fixed a potential crash when a missing table key error suggests a
similar existing key
* `lua_topointer` now returns a pointer for strings

C++ API Changes:
* `prepareModuleScope` callback has moved from TypeChecker to Frontend
* For LSPs, AstQuery functions (and `isWithinComment`) can be used
without full Frontend data

A lot of changes in our two experimental components as well.

In our work on the new type-solver, the following issues were fixed:
* Fixed table union and intersection indexing
* Correct custom type environments are now used
* Fixed values of type `free & number` not being accepted in numeric
operations

And these are the changes in native code generation (JIT):
* arm64 lowering is almost complete with support for 99% of IR commands
and all fastcalls
* Fixed x64 assembly encoding for extended byte registers
* More external x64 calls are aware of the register allocator
* `math.min`/`math.max` with more than 2 arguments are now lowered to IR
as well
* Fixed correctness issues with `math` library calls with multiple
results in variadic context and with x64 register conflicts
* The x64 register allocator can now restore values from VM memory
instead of always using stack spills
* x64 exception unwind information now supports multiple functions and
fixes function start offset in Dwarf2 info
---
 Analysis/include/Luau/AstQuery.h           |   3 +
 Analysis/include/Luau/Frontend.h           |  20 +-
 Analysis/include/Luau/Module.h             |   1 +
 Analysis/include/Luau/Type.h               |  14 +
 Analysis/include/Luau/Unifier.h            |   4 +-
 Analysis/src/AstQuery.cpp                  |  31 +-
 Analysis/src/ConstraintSolver.cpp          |  18 +-
 Analysis/src/Frontend.cpp                  | 154 +++--
 Analysis/src/Module.cpp                    |  23 +-
 Analysis/src/Type.cpp                      |   9 +
 Analysis/src/TypeChecker2.cpp              |  66 +-
 Analysis/src/TypeInfer.cpp                 |  64 +-
 Analysis/src/Unifier.cpp                   |  86 ++-
 Ast/src/StringUtils.cpp                    |   8 +-
 CodeGen/include/Luau/AddressA64.h          |   4 +-
 CodeGen/include/Luau/AssemblyBuilderA64.h  |  17 +-
 CodeGen/include/Luau/IrCallWrapperX64.h    |   7 +-
 CodeGen/include/Luau/IrData.h              |  38 +-
 CodeGen/include/Luau/IrRegAllocX64.h       |  23 +-
 CodeGen/include/Luau/IrUtils.h             |   2 +-
 CodeGen/include/Luau/RegisterA64.h         |  12 +
 CodeGen/include/Luau/RegisterX64.h         |  12 +
 CodeGen/include/Luau/UnwindBuilder.h       |  12 +-
 CodeGen/include/Luau/UnwindBuilderDwarf2.h |  22 +-
 CodeGen/include/Luau/UnwindBuilderWin.h    |  38 +-
 CodeGen/src/AssemblyBuilderA64.cpp         |  87 ++-
 CodeGen/src/AssemblyBuilderX64.cpp         |  11 +-
 CodeGen/src/BitUtils.h                     |  36 +
 CodeGen/src/CodeBlockUnwind.cpp            |  52 +-
 CodeGen/src/CodeGen.cpp                    |  26 +-
 CodeGen/src/CodeGenA64.cpp                 | 132 +++-
 CodeGen/src/CodeGenA64.h                   |   2 +-
 CodeGen/src/CodeGenUtils.cpp               |  50 +-
 CodeGen/src/CodeGenUtils.h                 |   1 +
 CodeGen/src/CodeGenX64.cpp                 |  50 +-
 CodeGen/src/CodeGenX64.h                   |   2 +-
 CodeGen/src/EmitBuiltinsX64.cpp            |  78 +--
 CodeGen/src/EmitCommon.h                   |   4 +-
 CodeGen/src/EmitCommonA64.cpp              | 130 ----
 CodeGen/src/EmitCommonA64.h                |  19 +-
 CodeGen/src/EmitCommonX64.cpp              |  75 +--
 CodeGen/src/EmitCommonX64.h                |  41 +-
 CodeGen/src/EmitInstructionA64.cpp         |  74 ---
 CodeGen/src/EmitInstructionA64.h           |  24 -
 CodeGen/src/EmitInstructionX64.cpp         |  74 +--
 CodeGen/src/EmitInstructionX64.h           |   6 +-
 CodeGen/src/Fallbacks.cpp                  |  38 ++
 CodeGen/src/Fallbacks.h                    |   1 +
 CodeGen/src/IrAnalysis.cpp                 |   2 +
 CodeGen/src/IrBuilder.cpp                  |   3 +-
 CodeGen/src/IrCallWrapperX64.cpp           |  83 +--
 CodeGen/src/IrLoweringA64.cpp              | 733 ++++++++++++++-------
 CodeGen/src/IrLoweringA64.h                |   4 +-
 CodeGen/src/IrLoweringX64.cpp              | 239 ++++---
 CodeGen/src/IrRegAllocA64.cpp              |  21 +-
 CodeGen/src/IrRegAllocX64.cpp              | 305 +++++----
 CodeGen/src/IrTranslateBuiltins.cpp        |  70 +-
 CodeGen/src/IrUtils.cpp                    |   4 +-
 CodeGen/src/NativeState.cpp                |  26 +-
 CodeGen/src/NativeState.h                  |  13 +-
 CodeGen/src/OptimizeConstProp.cpp          |   2 +
 CodeGen/src/UnwindBuilderDwarf2.cpp        |  47 +-
 CodeGen/src/UnwindBuilderWin.cpp           | 112 +++-
 Sources.cmake                              |   4 +-
 VM/src/lapi.cpp                            |   2 +
 VM/src/ltable.cpp                          |  36 +-
 fuzz/linter.cpp                            |   2 +-
 fuzz/proto.cpp                             |   8 +-
 fuzz/typeck.cpp                            |   2 +-
 tests/AssemblyBuilderA64.test.cpp          |  26 +-
 tests/AssemblyBuilderX64.test.cpp          |  10 +
 tests/Autocomplete.test.cpp                |  30 +
 tests/CodeAllocator.test.cpp               | 190 +++++-
 tests/Conformance.test.cpp                 |  43 +-
 tests/Fixture.cpp                          |  10 +-
 tests/Module.test.cpp                      |  39 +-
 tests/StringUtils.test.cpp                 |  18 +
 tests/TypeInfer.annotations.test.cpp       |  13 +-
 tests/TypeInfer.functions.test.cpp         |  33 +
 tests/TypeInfer.operators.test.cpp         |  78 ++-
 tests/TypeInfer.provisional.test.cpp       |  36 +-
 tests/TypeInfer.test.cpp                   |  15 +
 tests/TypeInfer.unionTypes.test.cpp        |  16 +
 tests/TypeInfer.unknownnever.test.cpp      |   5 -
 tests/TypeVar.test.cpp                     |  12 +-
 tests/conformance/math.lua                 |  10 +
 tests/conformance/tables.lua               |   7 +
 tools/lvmexecute_split.py                  |   2 +-
 88 files changed, 2579 insertions(+), 1433 deletions(-)
 create mode 100644 CodeGen/src/BitUtils.h
 delete mode 100644 CodeGen/src/EmitCommonA64.cpp
 delete mode 100644 CodeGen/src/EmitInstructionA64.cpp
 delete mode 100644 CodeGen/src/EmitInstructionA64.h

diff --git a/Analysis/include/Luau/AstQuery.h b/Analysis/include/Luau/AstQuery.h
index aa7ef8d..e7a018c 100644
--- a/Analysis/include/Luau/AstQuery.h
+++ b/Analysis/include/Luau/AstQuery.h
@@ -64,8 +64,11 @@ private:
 };
 
 std::vector<AstNode*> findAncestryAtPositionForAutocomplete(const SourceModule& source, Position pos);
+std::vector<AstNode*> findAncestryAtPositionForAutocomplete(AstStatBlock* root, Position pos);
 std::vector<AstNode*> findAstAncestryOfPosition(const SourceModule& source, Position pos, bool includeTypes = false);
+std::vector<AstNode*> findAstAncestryOfPosition(AstStatBlock* root, Position pos, bool includeTypes = false);
 AstNode* findNodeAtPosition(const SourceModule& source, Position pos);
+AstNode* findNodeAtPosition(AstStatBlock* root, Position pos);
 AstExpr* findExprAtPosition(const SourceModule& source, Position pos);
 ScopePtr findScopeAtPosition(const Module& module, Position pos);
 std::optional<Binding> findBindingAtPosition(const Module& module, const SourceModule& source, Position pos);
diff --git a/Analysis/include/Luau/Frontend.h b/Analysis/include/Luau/Frontend.h
index 8225137..3f41c14 100644
--- a/Analysis/include/Luau/Frontend.h
+++ b/Analysis/include/Luau/Frontend.h
@@ -165,7 +165,15 @@ struct Frontend
         bool captureComments, bool typeCheckForAutocomplete = false);
 
 private:
-    ModulePtr check(const SourceModule& sourceModule, Mode mode, std::vector<RequireCycle> requireCycles, bool forAutocomplete = false, bool recordJsonLog = false);
+    struct TypeCheckLimits
+    {
+        std::optional<double> finishTime;
+        std::optional<int> instantiationChildLimit;
+        std::optional<int> unifierIterationLimit;
+    };
+
+    ModulePtr check(const SourceModule& sourceModule, Mode mode, std::vector<RequireCycle> requireCycles, std::optional<ScopePtr> environmentScope,
+        bool forAutocomplete, bool recordJsonLog, TypeCheckLimits typeCheckLimits);
 
     std::pair<SourceNode*, SourceModule*> getSourceNode(const ModuleName& name);
     SourceModule parse(const ModuleName& name, std::string_view src, const ParseOptions& parseOptions);
@@ -185,15 +193,21 @@ public:
     const NotNull<BuiltinTypes> builtinTypes;
 
     FileResolver* fileResolver;
+
     FrontendModuleResolver moduleResolver;
     FrontendModuleResolver moduleResolverForAutocomplete;
+
     GlobalTypes globals;
     GlobalTypes globalsForAutocomplete;
-    TypeChecker typeChecker;
-    TypeChecker typeCheckerForAutocomplete;
+
+    // TODO: remove with FFlagLuauOnDemandTypecheckers
+    TypeChecker typeChecker_DEPRECATED;
+    TypeChecker typeCheckerForAutocomplete_DEPRECATED;
+
     ConfigResolver* configResolver;
     FrontendOptions options;
     InternalErrorReporter iceHandler;
+    std::function<void(const ModuleName& name, const ScopePtr& scope, bool forAutocomplete)> prepareModuleScope;
 
     std::unordered_map<ModuleName, SourceNode> sourceNodes;
     std::unordered_map<ModuleName, SourceModule> sourceModules;
diff --git a/Analysis/include/Luau/Module.h b/Analysis/include/Luau/Module.h
index 72f8760..1bca763 100644
--- a/Analysis/include/Luau/Module.h
+++ b/Analysis/include/Luau/Module.h
@@ -51,6 +51,7 @@ struct SourceModule
 };
 
 bool isWithinComment(const SourceModule& sourceModule, Position pos);
+bool isWithinComment(const ParseResult& result, Position pos);
 
 struct RequireCycle
 {
diff --git a/Analysis/include/Luau/Type.h b/Analysis/include/Luau/Type.h
index cff86df..b9544a1 100644
--- a/Analysis/include/Luau/Type.h
+++ b/Analysis/include/Luau/Type.h
@@ -738,6 +738,7 @@ const T* get(TypeId tv)
     return get_if<T>(&tv->ty);
 }
 
+
 template<typename T>
 T* getMutable(TypeId tv)
 {
@@ -897,6 +898,19 @@ bool hasTag(TypeId ty, const std::string& tagName);
 bool hasTag(const Property& prop, const std::string& tagName);
 bool hasTag(const Tags& tags, const std::string& tagName); // Do not use in new work.
 
+template<typename T>
+bool hasTypeInIntersection(TypeId ty)
+{
+    TypeId tf = follow(ty);
+    if (get<T>(tf))
+        return true;
+    for (auto t : flattenIntersection(tf))
+        if (get<T>(follow(t)))
+            return true;
+    return false;
+}
+
+bool hasPrimitiveTypeInIntersection(TypeId ty, PrimitiveType::Type primTy);
 /*
  * Use this to change the kind of a particular type.
  *
diff --git a/Analysis/include/Luau/Unifier.h b/Analysis/include/Luau/Unifier.h
index e7817e5..e3b0a87 100644
--- a/Analysis/include/Luau/Unifier.h
+++ b/Analysis/include/Luau/Unifier.h
@@ -137,9 +137,9 @@ private:
 
 public:
     // Returns true if the type "needle" already occurs within "haystack" and reports an "infinite type error"
-    bool occursCheck(TypeId needle, TypeId haystack);
+    bool occursCheck(TypeId needle, TypeId haystack, bool reversed);
     bool occursCheck(DenseHashSet<TypeId>& seen, TypeId needle, TypeId haystack);
-    bool occursCheck(TypePackId needle, TypePackId haystack);
+    bool occursCheck(TypePackId needle, TypePackId haystack, bool reversed);
     bool occursCheck(DenseHashSet<TypePackId>& seen, TypePackId needle, TypePackId haystack);
 
     Unifier makeChildUnifier();
diff --git a/Analysis/src/AstQuery.cpp b/Analysis/src/AstQuery.cpp
index dc07a35..cb3efe6 100644
--- a/Analysis/src/AstQuery.cpp
+++ b/Analysis/src/AstQuery.cpp
@@ -211,33 +211,48 @@ struct FindFullAncestry final : public AstVisitor
 
 std::vector<AstNode*> findAncestryAtPositionForAutocomplete(const SourceModule& source, Position pos)
 {
-    AutocompleteNodeFinder finder{pos, source.root};
-    source.root->visit(&finder);
+    return findAncestryAtPositionForAutocomplete(source.root, pos);
+}
+
+std::vector<AstNode*> findAncestryAtPositionForAutocomplete(AstStatBlock* root, Position pos)
+{
+    AutocompleteNodeFinder finder{pos, root};
+    root->visit(&finder);
     return finder.ancestry;
 }
 
 std::vector<AstNode*> findAstAncestryOfPosition(const SourceModule& source, Position pos, bool includeTypes)
 {
-    const Position end = source.root->location.end;
+    return findAstAncestryOfPosition(source.root, pos, includeTypes);
+}
+
+std::vector<AstNode*> findAstAncestryOfPosition(AstStatBlock* root, Position pos, bool includeTypes)
+{
+    const Position end = root->location.end;
     if (pos > end)
         pos = end;
 
     FindFullAncestry finder(pos, end, includeTypes);
-    source.root->visit(&finder);
+    root->visit(&finder);
     return finder.nodes;
 }
 
 AstNode* findNodeAtPosition(const SourceModule& source, Position pos)
 {
-    const Position end = source.root->location.end;
-    if (pos < source.root->location.begin)
-        return source.root;
+    return findNodeAtPosition(source.root, pos);
+}
+
+AstNode* findNodeAtPosition(AstStatBlock* root, Position pos)
+{
+    const Position end = root->location.end;
+    if (pos < root->location.begin)
+        return root;
 
     if (pos > end)
         pos = end;
 
     FindNode findNode{pos, end};
-    findNode.visit(source.root);
+    findNode.visit(root);
     return findNode.best;
 }
 
diff --git a/Analysis/src/ConstraintSolver.cpp b/Analysis/src/ConstraintSolver.cpp
index d2bed2d..0fc32c3 100644
--- a/Analysis/src/ConstraintSolver.cpp
+++ b/Analysis/src/ConstraintSolver.cpp
@@ -595,6 +595,11 @@ bool ConstraintSolver::tryDispatch(const BinaryConstraint& c, NotNull<const Cons
      * make any sense to stop and wait for someone else to do it.
      */
 
+    // If any is present, the expression must evaluate to any as well.
+    bool leftAny = get<AnyType>(leftType) || get<ErrorType>(leftType);
+    bool rightAny = get<AnyType>(rightType) || get<ErrorType>(rightType);
+    bool anyPresent = leftAny || rightAny;
+
     if (isBlocked(leftType) && leftType != resultType)
         return block(c.leftType, constraint);
 
@@ -604,12 +609,12 @@ bool ConstraintSolver::tryDispatch(const BinaryConstraint& c, NotNull<const Cons
     if (!force)
     {
         // Logical expressions may proceed if the LHS is free.
-        if (get<FreeType>(leftType) && !isLogical)
+        if (hasTypeInIntersection<FreeType>(leftType) && !isLogical)
             return block(leftType, constraint);
     }
 
     // Logical expressions may proceed if the LHS is free.
-    if (isBlocked(leftType) || (get<FreeType>(leftType) && !isLogical))
+    if (isBlocked(leftType) || (hasTypeInIntersection<FreeType>(leftType) && !isLogical))
     {
         asMutable(resultType)->ty.emplace<BoundType>(errorRecoveryType());
         unblock(resultType);
@@ -696,11 +701,6 @@ bool ConstraintSolver::tryDispatch(const BinaryConstraint& c, NotNull<const Cons
         // If there's no metamethod available, fall back to primitive behavior.
     }
 
-    // If any is present, the expression must evaluate to any as well.
-    bool leftAny = get<AnyType>(leftType) || get<ErrorType>(leftType);
-    bool rightAny = get<AnyType>(rightType) || get<ErrorType>(rightType);
-    bool anyPresent = leftAny || rightAny;
-
     switch (c.op)
     {
     // For arithmetic operators, if the LHS is a number, the RHS must be a
@@ -711,6 +711,8 @@ bool ConstraintSolver::tryDispatch(const BinaryConstraint& c, NotNull<const Cons
     case AstExprBinary::Op::Div:
     case AstExprBinary::Op::Pow:
     case AstExprBinary::Op::Mod:
+        if (hasTypeInIntersection<FreeType>(leftType) && force)
+            asMutable(leftType)->ty.emplace<BoundType>(anyPresent ? builtinTypes->anyType : builtinTypes->numberType);
         if (isNumber(leftType))
         {
             unify(leftType, rightType, constraint->scope);
@@ -723,6 +725,8 @@ bool ConstraintSolver::tryDispatch(const BinaryConstraint& c, NotNull<const Cons
     // For concatenation, if the LHS is a string, the RHS must be a string as
     // well. The result will also be a string.
     case AstExprBinary::Op::Concat:
+        if (hasTypeInIntersection<FreeType>(leftType) && force)
+            asMutable(leftType)->ty.emplace<BoundType>(anyPresent ? builtinTypes->anyType : builtinTypes->stringType);
         if (isString(leftType))
         {
             unify(leftType, rightType, constraint->scope);
diff --git a/Analysis/src/Frontend.cpp b/Analysis/src/Frontend.cpp
index 98022d8..5beb6c4 100644
--- a/Analysis/src/Frontend.cpp
+++ b/Analysis/src/Frontend.cpp
@@ -31,7 +31,8 @@ LUAU_FASTFLAG(LuauInferInNoCheckMode)
 LUAU_FASTFLAGVARIABLE(LuauKnowsTheDataModel3, false)
 LUAU_FASTINTVARIABLE(LuauAutocompleteCheckTimeoutMs, 100)
 LUAU_FASTFLAGVARIABLE(DebugLuauDeferredConstraintResolution, false)
-LUAU_FASTFLAGVARIABLE(DebugLuauLogSolverToJson, false);
+LUAU_FASTFLAGVARIABLE(DebugLuauLogSolverToJson, false)
+LUAU_FASTFLAGVARIABLE(LuauOnDemandTypecheckers, false)
 
 namespace Luau
 {
@@ -131,8 +132,8 @@ static void persistCheckedTypes(ModulePtr checkedModule, GlobalTypes& globals, S
 LoadDefinitionFileResult Frontend::loadDefinitionFile(GlobalTypes& globals, ScopePtr targetScope, std::string_view source,
     const std::string& packageName, bool captureComments, bool typeCheckForAutocomplete)
 {
-    if (!FFlag::DebugLuauDeferredConstraintResolution)
-        return Luau::loadDefinitionFileNoDCR(typeCheckForAutocomplete ? typeCheckerForAutocomplete : typeChecker,
+    if (!FFlag::DebugLuauDeferredConstraintResolution && !FFlag::LuauOnDemandTypecheckers)
+        return Luau::loadDefinitionFileNoDCR(typeCheckForAutocomplete ? typeCheckerForAutocomplete_DEPRECATED : typeChecker_DEPRECATED,
             typeCheckForAutocomplete ? globalsForAutocomplete : globals, targetScope, source, packageName, captureComments);
 
     LUAU_TIMETRACE_SCOPE("loadDefinitionFile", "Frontend");
@@ -142,7 +143,7 @@ LoadDefinitionFileResult Frontend::loadDefinitionFile(GlobalTypes& globals, Scop
     if (parseResult.errors.size() > 0)
         return LoadDefinitionFileResult{false, parseResult, sourceModule, nullptr};
 
-    ModulePtr checkedModule = check(sourceModule, Mode::Definition, {});
+    ModulePtr checkedModule = check(sourceModule, Mode::Definition, {}, std::nullopt, /*forAutocomplete*/ false, /*recordJsonLog*/ false, {});
 
     if (checkedModule->errors.size() > 0)
         return LoadDefinitionFileResult{false, parseResult, sourceModule, checkedModule};
@@ -155,6 +156,7 @@ LoadDefinitionFileResult Frontend::loadDefinitionFile(GlobalTypes& globals, Scop
 LoadDefinitionFileResult loadDefinitionFileNoDCR(TypeChecker& typeChecker, GlobalTypes& globals, ScopePtr targetScope, std::string_view source,
     const std::string& packageName, bool captureComments)
 {
+    LUAU_ASSERT(!FFlag::LuauOnDemandTypecheckers);
     LUAU_TIMETRACE_SCOPE("loadDefinitionFile", "Frontend");
 
     Luau::SourceModule sourceModule;
@@ -406,8 +408,8 @@ Frontend::Frontend(FileResolver* fileResolver, ConfigResolver* configResolver, c
     , moduleResolverForAutocomplete(this)
     , globals(builtinTypes)
     , globalsForAutocomplete(builtinTypes)
-    , typeChecker(globals.globalScope, &moduleResolver, builtinTypes, &iceHandler)
-    , typeCheckerForAutocomplete(globalsForAutocomplete.globalScope, &moduleResolverForAutocomplete, builtinTypes, &iceHandler)
+    , typeChecker_DEPRECATED(globals.globalScope, &moduleResolver, builtinTypes, &iceHandler)
+    , typeCheckerForAutocomplete_DEPRECATED(globalsForAutocomplete.globalScope, &moduleResolverForAutocomplete, builtinTypes, &iceHandler)
     , configResolver(configResolver)
     , options(options)
 {
@@ -491,35 +493,68 @@ CheckResult Frontend::check(const ModuleName& name, std::optional<FrontendOption
 
         if (frontendOptions.forAutocomplete)
         {
-            // The autocomplete typecheck is always in strict mode with DM awareness
-            // to provide better type information for IDE features
-            typeCheckerForAutocomplete.requireCycles = requireCycles;
+            ModulePtr moduleForAutocomplete;
 
             double autocompleteTimeLimit = FInt::LuauAutocompleteCheckTimeoutMs / 1000.0;
 
-            if (autocompleteTimeLimit != 0.0)
-                typeCheckerForAutocomplete.finishTime = TimeTrace::getClock() + autocompleteTimeLimit;
-            else
-                typeCheckerForAutocomplete.finishTime = std::nullopt;
+            if (!FFlag::LuauOnDemandTypecheckers)
+            {
+                // The autocomplete typecheck is always in strict mode with DM awareness
+                // to provide better type information for IDE features
+                typeCheckerForAutocomplete_DEPRECATED.requireCycles = requireCycles;
 
-            // TODO: This is a dirty ad hoc solution for autocomplete timeouts
-            // We are trying to dynamically adjust our existing limits to lower total typechecking time under the limit
-            // so that we'll have type information for the whole file at lower quality instead of a full abort in the middle
-            if (FInt::LuauTarjanChildLimit > 0)
-                typeCheckerForAutocomplete.instantiationChildLimit = std::max(1, int(FInt::LuauTarjanChildLimit * sourceNode.autocompleteLimitsMult));
-            else
-                typeCheckerForAutocomplete.instantiationChildLimit = std::nullopt;
+                if (autocompleteTimeLimit != 0.0)
+                    typeCheckerForAutocomplete_DEPRECATED.finishTime = TimeTrace::getClock() + autocompleteTimeLimit;
+                else
+                    typeCheckerForAutocomplete_DEPRECATED.finishTime = std::nullopt;
 
-            if (FInt::LuauTypeInferIterationLimit > 0)
-                typeCheckerForAutocomplete.unifierIterationLimit =
-                    std::max(1, int(FInt::LuauTypeInferIterationLimit * sourceNode.autocompleteLimitsMult));
-            else
-                typeCheckerForAutocomplete.unifierIterationLimit = std::nullopt;
+                // TODO: This is a dirty ad hoc solution for autocomplete timeouts
+                // We are trying to dynamically adjust our existing limits to lower total typechecking time under the limit
+                // so that we'll have type information for the whole file at lower quality instead of a full abort in the middle
+                if (FInt::LuauTarjanChildLimit > 0)
+                    typeCheckerForAutocomplete_DEPRECATED.instantiationChildLimit =
+                        std::max(1, int(FInt::LuauTarjanChildLimit * sourceNode.autocompleteLimitsMult));
+                else
+                    typeCheckerForAutocomplete_DEPRECATED.instantiationChildLimit = std::nullopt;
 
-            ModulePtr moduleForAutocomplete =
-                FFlag::DebugLuauDeferredConstraintResolution
-                    ? check(sourceModule, Mode::Strict, requireCycles, /*forAutocomplete*/ true, /*recordJsonLog*/ false)
-                    : typeCheckerForAutocomplete.check(sourceModule, Mode::Strict, environmentScope);
+                if (FInt::LuauTypeInferIterationLimit > 0)
+                    typeCheckerForAutocomplete_DEPRECATED.unifierIterationLimit =
+                        std::max(1, int(FInt::LuauTypeInferIterationLimit * sourceNode.autocompleteLimitsMult));
+                else
+                    typeCheckerForAutocomplete_DEPRECATED.unifierIterationLimit = std::nullopt;
+
+                moduleForAutocomplete =
+                    FFlag::DebugLuauDeferredConstraintResolution
+                        ? check(sourceModule, Mode::Strict, requireCycles, environmentScope, /*forAutocomplete*/ true, /*recordJsonLog*/ false, {})
+                        : typeCheckerForAutocomplete_DEPRECATED.check(sourceModule, Mode::Strict, environmentScope);
+            }
+            else
+            {
+                // The autocomplete typecheck is always in strict mode with DM awareness
+                // to provide better type information for IDE features
+                TypeCheckLimits typeCheckLimits;
+
+                if (autocompleteTimeLimit != 0.0)
+                    typeCheckLimits.finishTime = TimeTrace::getClock() + autocompleteTimeLimit;
+                else
+                    typeCheckLimits.finishTime = std::nullopt;
+
+                // TODO: This is a dirty ad hoc solution for autocomplete timeouts
+                // We are trying to dynamically adjust our existing limits to lower total typechecking time under the limit
+                // so that we'll have type information for the whole file at lower quality instead of a full abort in the middle
+                if (FInt::LuauTarjanChildLimit > 0)
+                    typeCheckLimits.instantiationChildLimit = std::max(1, int(FInt::LuauTarjanChildLimit * sourceNode.autocompleteLimitsMult));
+                else
+                    typeCheckLimits.instantiationChildLimit = std::nullopt;
+
+                if (FInt::LuauTypeInferIterationLimit > 0)
+                    typeCheckLimits.unifierIterationLimit = std::max(1, int(FInt::LuauTypeInferIterationLimit * sourceNode.autocompleteLimitsMult));
+                else
+                    typeCheckLimits.unifierIterationLimit = std::nullopt;
+
+                moduleForAutocomplete = check(sourceModule, Mode::Strict, requireCycles, environmentScope, /*forAutocomplete*/ true,
+                    /*recordJsonLog*/ false, typeCheckLimits);
+            }
 
             moduleResolverForAutocomplete.modules[moduleName] = moduleForAutocomplete;
 
@@ -543,13 +578,22 @@ CheckResult Frontend::check(const ModuleName& name, std::optional<FrontendOption
             continue;
         }
 
-        typeChecker.requireCycles = requireCycles;
-
         const bool recordJsonLog = FFlag::DebugLuauLogSolverToJson && moduleName == name;
 
-        ModulePtr module = (FFlag::DebugLuauDeferredConstraintResolution && mode == Mode::Strict)
-                               ? check(sourceModule, mode, requireCycles, /*forAutocomplete*/ false, recordJsonLog)
-                               : typeChecker.check(sourceModule, mode, environmentScope);
+        ModulePtr module;
+
+        if (!FFlag::LuauOnDemandTypecheckers)
+        {
+            typeChecker_DEPRECATED.requireCycles = requireCycles;
+
+            module = (FFlag::DebugLuauDeferredConstraintResolution && mode == Mode::Strict)
+                         ? check(sourceModule, mode, requireCycles, environmentScope, /*forAutocomplete*/ false, recordJsonLog, {})
+                         : typeChecker_DEPRECATED.check(sourceModule, mode, environmentScope);
+        }
+        else
+        {
+            module = check(sourceModule, mode, requireCycles, environmentScope, /*forAutocomplete*/ false, recordJsonLog, {});
+        }
 
         stats.timeCheck += getTimestamp() - timestamp;
         stats.filesStrict += mode == Mode::Strict;
@@ -752,7 +796,7 @@ ScopePtr Frontend::getModuleEnvironment(const SourceModule& module, const Config
             AstName name = module.names->get(global.c_str());
 
             if (name.value)
-                result->bindings[name].typeId = typeChecker.anyType;
+                result->bindings[name].typeId = FFlag::LuauOnDemandTypecheckers ? builtinTypes->anyType : typeChecker_DEPRECATED.anyType;
         }
     }
 
@@ -829,15 +873,15 @@ const SourceModule* Frontend::getSourceModule(const ModuleName& moduleName) cons
 
 ModulePtr check(const SourceModule& sourceModule, const std::vector<RequireCycle>& requireCycles, NotNull<BuiltinTypes> builtinTypes,
     NotNull<InternalErrorReporter> iceHandler, NotNull<ModuleResolver> moduleResolver, NotNull<FileResolver> fileResolver,
-    const ScopePtr& globalScope, FrontendOptions options)
+    const ScopePtr& parentScope, FrontendOptions options)
 {
     const bool recordJsonLog = FFlag::DebugLuauLogSolverToJson;
-    return check(sourceModule, requireCycles, builtinTypes, iceHandler, moduleResolver, fileResolver, globalScope, options, recordJsonLog);
+    return check(sourceModule, requireCycles, builtinTypes, iceHandler, moduleResolver, fileResolver, parentScope, options, recordJsonLog);
 }
 
 ModulePtr check(const SourceModule& sourceModule, const std::vector<RequireCycle>& requireCycles, NotNull<BuiltinTypes> builtinTypes,
     NotNull<InternalErrorReporter> iceHandler, NotNull<ModuleResolver> moduleResolver, NotNull<FileResolver> fileResolver,
-    const ScopePtr& globalScope, FrontendOptions options, bool recordJsonLog)
+    const ScopePtr& parentScope, FrontendOptions options, bool recordJsonLog)
 {
     ModulePtr result = std::make_shared<Module>();
     result->reduction = std::make_unique<TypeReduction>(NotNull{&result->internalTypes}, builtinTypes, iceHandler);
@@ -868,7 +912,7 @@ ModulePtr check(const SourceModule& sourceModule, const std::vector<RequireCycle
         moduleResolver,
         builtinTypes,
         iceHandler,
-        globalScope,
+        parentScope,
         logger.get(),
         NotNull{&dfg},
     };
@@ -911,11 +955,35 @@ ModulePtr check(const SourceModule& sourceModule, const std::vector<RequireCycle
     return result;
 }
 
-ModulePtr Frontend::check(const SourceModule& sourceModule, Mode mode, std::vector<RequireCycle> requireCycles, bool forAutocomplete, bool recordJsonLog)
+ModulePtr Frontend::check(const SourceModule& sourceModule, Mode mode, std::vector<RequireCycle> requireCycles,
+    std::optional<ScopePtr> environmentScope, bool forAutocomplete, bool recordJsonLog, TypeCheckLimits typeCheckLimits)
 {
-    return Luau::check(sourceModule, requireCycles, builtinTypes, NotNull{&iceHandler},
-        NotNull{forAutocomplete ? &moduleResolverForAutocomplete : &moduleResolver}, NotNull{fileResolver},
-        forAutocomplete ? globalsForAutocomplete.globalScope : globals.globalScope, options, recordJsonLog);
+    if (FFlag::DebugLuauDeferredConstraintResolution && mode == Mode::Strict)
+    {
+        return Luau::check(sourceModule, requireCycles, builtinTypes, NotNull{&iceHandler},
+            NotNull{forAutocomplete ? &moduleResolverForAutocomplete : &moduleResolver}, NotNull{fileResolver},
+            environmentScope ? *environmentScope : (forAutocomplete ? globalsForAutocomplete.globalScope : globals.globalScope), options, recordJsonLog);
+    }
+    else
+    {
+        LUAU_ASSERT(FFlag::LuauOnDemandTypecheckers);
+        // An autocomplete pass must resolve globals against globalsForAutocomplete, as typeCheckerForAutocomplete_DEPRECATED does.
+        TypeChecker typeChecker(forAutocomplete ? globalsForAutocomplete.globalScope : globals.globalScope, forAutocomplete ? &moduleResolverForAutocomplete : &moduleResolver, builtinTypes, &iceHandler);
+
+        if (prepareModuleScope)
+        {
+            typeChecker.prepareModuleScope = [this, forAutocomplete](const ModuleName& name, const ScopePtr& scope) {
+                prepareModuleScope(name, scope, forAutocomplete);
+            };
+        }
+
+        typeChecker.requireCycles = requireCycles;
+        typeChecker.finishTime = typeCheckLimits.finishTime;
+        typeChecker.instantiationChildLimit = typeCheckLimits.instantiationChildLimit;
+        typeChecker.unifierIterationLimit = typeCheckLimits.unifierIterationLimit;
+
+        return typeChecker.check(sourceModule, mode, environmentScope);
+    }
 }
 
 // Read AST into sourceModules if necessary.  Trace require()s.  Report parse errors.
diff --git a/Analysis/src/Module.cpp b/Analysis/src/Module.cpp
index fd94840..830aaf7 100644
--- a/Analysis/src/Module.cpp
+++ b/Analysis/src/Module.cpp
@@ -20,6 +20,7 @@ LUAU_FASTFLAGVARIABLE(LuauClonePublicInterfaceLess2, false);
 LUAU_FASTFLAG(LuauSubstitutionReentrant);
 LUAU_FASTFLAG(LuauClassTypeVarsInSubstitution);
 LUAU_FASTFLAG(LuauSubstitutionFixMissingFields);
+LUAU_FASTFLAGVARIABLE(LuauCopyExportedTypes, false);
 
 namespace Luau
 {
@@ -37,14 +38,14 @@ static bool contains(Position pos, Comment comment)
         return false;
 }
 
-bool isWithinComment(const SourceModule& sourceModule, Position pos)
+static bool isWithinComment(const std::vector<Comment>& commentLocations, Position pos)
 {
-    auto iter = std::lower_bound(sourceModule.commentLocations.begin(), sourceModule.commentLocations.end(),
-        Comment{Lexeme::Comment, Location{pos, pos}}, [](const Comment& a, const Comment& b) {
+    auto iter = std::lower_bound(
+        commentLocations.begin(), commentLocations.end(), Comment{Lexeme::Comment, Location{pos, pos}}, [](const Comment& a, const Comment& b) {
             return a.location.end < b.location.end;
         });
 
-    if (iter == sourceModule.commentLocations.end())
+    if (iter == commentLocations.end())
         return false;
 
     if (contains(pos, *iter))
@@ -53,12 +54,22 @@ bool isWithinComment(const SourceModule& sourceModule, Position pos)
     // Due to the nature of std::lower_bound, it is possible that iter points at a comment that ends
     // at pos.  We'll try the next comment, if it exists.
     ++iter;
-    if (iter == sourceModule.commentLocations.end())
+    if (iter == commentLocations.end())
         return false;
 
     return contains(pos, *iter);
 }
 
+bool isWithinComment(const SourceModule& sourceModule, Position pos)
+{
+    return isWithinComment(sourceModule.commentLocations, pos);
+}
+
+bool isWithinComment(const ParseResult& result, Position pos)
+{
+    return isWithinComment(result.commentLocations, pos);
+}
+
 struct ClonePublicInterface : Substitution
 {
     NotNull<BuiltinTypes> builtinTypes;
@@ -227,7 +238,7 @@ void Module::clonePublicInterface(NotNull<BuiltinTypes> builtinTypes, InternalEr
 
     // Copy external stuff over to Module itself
     this->returnType = moduleScope->returnType;
-    if (FFlag::DebugLuauDeferredConstraintResolution)
+    if (FFlag::DebugLuauDeferredConstraintResolution || FFlag::LuauCopyExportedTypes)
         this->exportedTypeBindings = moduleScope->exportedTypeBindings;
     else
         this->exportedTypeBindings = std::move(moduleScope->exportedTypeBindings);
diff --git a/Analysis/src/Type.cpp b/Analysis/src/Type.cpp
index d70f17f..5285410 100644
--- a/Analysis/src/Type.cpp
+++ b/Analysis/src/Type.cpp
@@ -337,7 +337,25 @@ bool isSubset(const UnionType& super, const UnionType& sub)
 
     return true;
 }
+
+// Returns true when `ty` is the primitive `primTy` itself, or when at least one
+// of the parts produced by flattenIntersection for `ty` is that primitive.
+bool hasPrimitiveTypeInIntersection(TypeId ty, PrimitiveType::Type primTy)
+{
+    TypeId tf = follow(ty);
+    if (isPrim(tf, primTy))
+        return true;
+
+    // Inspect every part: returning the verdict of the first part alone would
+    // miss intersections whose matching primitive is not listed first.
+    for (auto t : flattenIntersection(tf))
+    {
+        if (isPrim(follow(t), primTy))
+            return true;
+    }
+    return false;
+}
 
 // When typechecking an assignment `x = e`, we typecheck `x:T` and `e:U`,
 // then instantiate U if `isGeneric(U)` is true, and `maybeGeneric(T)` is false.
 bool isGeneric(TypeId ty)
diff --git a/Analysis/src/TypeChecker2.cpp b/Analysis/src/TypeChecker2.cpp
index c7d30f4..6e76af0 100644
--- a/Analysis/src/TypeChecker2.cpp
+++ b/Analysis/src/TypeChecker2.cpp
@@ -1160,11 +1160,7 @@ struct TypeChecker2
         visit(expr, RValue);
 
         TypeId leftType = stripFromNilAndReport(lookupType(expr), location);
-        const NormalizedType* norm = normalizer.normalize(leftType);
-        if (!norm)
-            reportError(NormalizationTooComplex{}, location);
-
-        checkIndexTypeFromType(leftType, *norm, propName, location, context);
+        checkIndexTypeFromType(leftType, propName, location, context);
     }
 
     void visit(AstExprIndexName* indexName, ValueContext context)
@@ -2033,8 +2029,16 @@ struct TypeChecker2
             reportError(std::move(e));
     }
 
-    void checkIndexTypeFromType(TypeId tableTy, const NormalizedType& norm, const std::string& prop, const Location& location, ValueContext context)
+    // If the provided type does not have the named property, report an error.
+    void checkIndexTypeFromType(TypeId tableTy, const std::string& prop, const Location& location, ValueContext context)
     {
+        const NormalizedType* norm = normalizer.normalize(tableTy);
+        if (!norm)
+        {
+            reportError(NormalizationTooComplex{}, location);
+            return;
+        }
+
         bool foundOneProp = false;
         std::vector<TypeId> typesMissingTheProp;
 
@@ -2042,49 +2046,50 @@ struct TypeChecker2
             if (!normalizer.isInhabited(ty))
                 return;
 
-            bool found = hasIndexTypeFromType(ty, prop, location);
+            std::unordered_set<TypeId> seen;
+            bool found = hasIndexTypeFromType(ty, prop, location, seen);
             foundOneProp |= found;
             if (!found)
                 typesMissingTheProp.push_back(ty);
         };
 
-        fetch(norm.tops);
-        fetch(norm.booleans);
+        fetch(norm->tops);
+        fetch(norm->booleans);
 
         if (FFlag::LuauNegatedClassTypes)
         {
-            for (const auto& [ty, _negations] : norm.classes.classes)
+            for (const auto& [ty, _negations] : norm->classes.classes)
             {
                 fetch(ty);
             }
         }
         else
         {
-            for (TypeId ty : norm.DEPRECATED_classes)
+            for (TypeId ty : norm->DEPRECATED_classes)
                 fetch(ty);
         }
-        fetch(norm.errors);
-        fetch(norm.nils);
-        fetch(norm.numbers);
-        if (!norm.strings.isNever())
+        fetch(norm->errors);
+        fetch(norm->nils);
+        fetch(norm->numbers);
+        if (!norm->strings.isNever())
             fetch(builtinTypes->stringType);
-        fetch(norm.threads);
-        for (TypeId ty : norm.tables)
+        fetch(norm->threads);
+        for (TypeId ty : norm->tables)
             fetch(ty);
-        if (norm.functions.isTop)
+        if (norm->functions.isTop)
             fetch(builtinTypes->functionType);
-        else if (!norm.functions.isNever())
+        else if (!norm->functions.isNever())
         {
-            if (norm.functions.parts.size() == 1)
-                fetch(norm.functions.parts.front());
+            if (norm->functions.parts.size() == 1)
+                fetch(norm->functions.parts.front());
             else
             {
                 std::vector<TypeId> parts;
-                parts.insert(parts.end(), norm.functions.parts.begin(), norm.functions.parts.end());
+                parts.insert(parts.end(), norm->functions.parts.begin(), norm->functions.parts.end());
                 fetch(testArena.addType(IntersectionType{std::move(parts)}));
             }
         }
-        for (const auto& [tyvar, intersect] : norm.tyvars)
+        for (const auto& [tyvar, intersect] : norm->tyvars)
         {
             if (get<NeverType>(intersect->tops))
             {
@@ -2110,8 +2115,15 @@ struct TypeChecker2
         }
     }
 
-    bool hasIndexTypeFromType(TypeId ty, const std::string& prop, const Location& location)
+    bool hasIndexTypeFromType(TypeId ty, const std::string& prop, const Location& location, std::unordered_set<TypeId>& seen)
     {
+        // If we have already encountered this type, we must assume that some
+        // other codepath will do the right thing and signal false if the
+        // property is not present.
+        const bool isUnseen = seen.insert(ty).second;
+        if (!isUnseen)
+            return true;
+
         if (get<ErrorType>(ty) || get<AnyType>(ty) || get<NeverType>(ty))
             return true;
 
@@ -2136,10 +2148,12 @@ struct TypeChecker2
         else if (const ClassType* cls = get<ClassType>(ty))
             return bool(lookupClassProp(cls, prop));
         else if (const UnionType* utv = get<UnionType>(ty))
-            ice.ice("getIndexTypeFromTypeHelper cannot take a UnionType");
+            return std::all_of(begin(utv), end(utv), [&](TypeId part) {
+                return hasIndexTypeFromType(part, prop, location, seen);
+            });
         else if (const IntersectionType* itv = get<IntersectionType>(ty))
             return std::any_of(begin(itv), end(itv), [&](TypeId part) {
-                return hasIndexTypeFromType(part, prop, location);
+                return hasIndexTypeFromType(part, prop, location, seen);
             });
         else
             return false;
diff --git a/Analysis/src/TypeInfer.cpp b/Analysis/src/TypeInfer.cpp
index acf70fe..7f366a2 100644
--- a/Analysis/src/TypeInfer.cpp
+++ b/Analysis/src/TypeInfer.cpp
@@ -35,14 +35,13 @@ LUAU_FASTFLAG(LuauKnowsTheDataModel3)
 LUAU_FASTFLAGVARIABLE(DebugLuauFreezeDuringUnification, false)
 LUAU_FASTFLAGVARIABLE(LuauReturnAnyInsteadOfICE, false) // Eventually removed as false.
 LUAU_FASTFLAGVARIABLE(DebugLuauSharedSelf, false)
-LUAU_FASTFLAGVARIABLE(LuauTryhardAnd, false)
 LUAU_FASTFLAG(LuauInstantiateInSubtyping)
 LUAU_FASTFLAG(LuauNegatedClassTypes)
 LUAU_FASTFLAGVARIABLE(LuauAllowIndexClassParameters, false)
 LUAU_FASTFLAG(LuauUninhabitedSubAnything2)
+LUAU_FASTFLAG(LuauOccursIsntAlwaysFailure)
 LUAU_FASTFLAGVARIABLE(LuauTypecheckTypeguards, false)
 LUAU_FASTFLAGVARIABLE(LuauTinyControlFlowAnalysis, false)
-LUAU_FASTFLAGVARIABLE(LuauReducingAndOr, false)
 
 namespace Luau
 {
@@ -1623,9 +1622,28 @@ ControlFlow TypeChecker::check(const ScopePtr& scope, const AstStatTypeAlias& ty
 
     TypeId& bindingType = bindingsMap[name].type;
 
-    if (unify(ty, bindingType, aliasScope, typealias.location))
-        bindingType = ty;
+    if (!FFlag::LuauOccursIsntAlwaysFailure)
+    {
+        if (unify(ty, bindingType, aliasScope, typealias.location))
+            bindingType = ty;
+        return ControlFlow::None;
+    }
 
+    unify(ty, bindingType, aliasScope, typealias.location);
+
+    // It is possible for this unification to succeed but for
+    // `bindingType` still to be free.  For example, in
+    // `type T = T|T`, we generate a fresh free type `X`, and then
+    // unify `X` with `X|X`, which succeeds without binding `X` to
+    // anything, since `X <: X|X`
+    if (bindingType->ty.get_if<FreeType>())
+    {
+        ty = errorRecoveryType(aliasScope);
+        unify(ty, bindingType, aliasScope, typealias.location);
+        reportError(TypeError{typealias.location, OccursCheckFailed{}});
+    }
+
+    bindingType = ty;
     return ControlFlow::None;
 }
 
@@ -2848,7 +2866,7 @@ TypeId TypeChecker::checkRelationalOperation(
         {
             return lhsType;
         }
-        else if (FFlag::LuauTryhardAnd)
+        else
         {
             // If lhs is free, we can't tell which 'falsy' components it has, if any
             if (get<FreeType>(lhsType))
@@ -2860,14 +2878,11 @@ TypeId TypeChecker::checkRelationalOperation(
             {
                 LUAU_ASSERT(oty);
 
-                if (FFlag::LuauReducingAndOr)
-                {
-                    // Perform a limited form of type reduction for booleans
-                    if (isPrim(*oty, PrimitiveType::Boolean) && get<BooleanSingleton>(get<SingletonType>(follow(rhsType))))
-                        return booleanType;
-                    if (isPrim(rhsType, PrimitiveType::Boolean) && get<BooleanSingleton>(get<SingletonType>(follow(*oty))))
-                        return booleanType;
-                }
+                // Perform a limited form of type reduction for booleans
+                if (isPrim(*oty, PrimitiveType::Boolean) && get<BooleanSingleton>(get<SingletonType>(follow(rhsType))))
+                    return booleanType;
+                if (isPrim(rhsType, PrimitiveType::Boolean) && get<BooleanSingleton>(get<SingletonType>(follow(*oty))))
+                    return booleanType;
 
                 return unionOfTypes(*oty, rhsType, scope, expr.location, false);
             }
@@ -2876,16 +2891,12 @@ TypeId TypeChecker::checkRelationalOperation(
                 return rhsType;
             }
         }
-        else
-        {
-            return unionOfTypes(rhsType, booleanType, scope, expr.location, false);
-        }
     case AstExprBinary::Or:
         if (lhsIsAny)
         {
             return lhsType;
         }
-        else if (FFlag::LuauTryhardAnd)
+        else
         {
             auto [oty, notNever] = pickTypesFromSense(lhsType, true, neverType); // Filter out truthy types
 
@@ -2893,14 +2904,11 @@ TypeId TypeChecker::checkRelationalOperation(
             {
                 LUAU_ASSERT(oty);
 
-                if (FFlag::LuauReducingAndOr)
-                {
-                    // Perform a limited form of type reduction for booleans
-                    if (isPrim(*oty, PrimitiveType::Boolean) && get<BooleanSingleton>(get<SingletonType>(follow(rhsType))))
-                        return booleanType;
-                    if (isPrim(rhsType, PrimitiveType::Boolean) && get<BooleanSingleton>(get<SingletonType>(follow(*oty))))
-                        return booleanType;
-                }
+                // Perform a limited form of type reduction for booleans
+                if (isPrim(*oty, PrimitiveType::Boolean) && get<BooleanSingleton>(get<SingletonType>(follow(rhsType))))
+                    return booleanType;
+                if (isPrim(rhsType, PrimitiveType::Boolean) && get<BooleanSingleton>(get<SingletonType>(follow(*oty))))
+                    return booleanType;
 
                 return unionOfTypes(*oty, rhsType, scope, expr.location);
             }
@@ -2909,10 +2917,6 @@ TypeId TypeChecker::checkRelationalOperation(
                 return rhsType;
             }
         }
-        else
-        {
-            return unionOfTypes(lhsType, rhsType, scope, expr.location);
-        }
     default:
         LUAU_ASSERT(0);
         ice(format("checkRelationalOperation called with incorrect binary expression '%s'", toString(expr.op).c_str()), expr.location);
diff --git a/Analysis/src/Unifier.cpp b/Analysis/src/Unifier.cpp
index 642aa39..3f4e34f 100644
--- a/Analysis/src/Unifier.cpp
+++ b/Analysis/src/Unifier.cpp
@@ -19,8 +19,10 @@ LUAU_FASTINT(LuauTypeInferTypePackLoopLimit)
 LUAU_FASTFLAG(LuauErrorRecoveryType)
 LUAU_FASTFLAGVARIABLE(LuauInstantiateInSubtyping, false)
 LUAU_FASTFLAGVARIABLE(LuauUninhabitedSubAnything2, false)
+LUAU_FASTFLAGVARIABLE(LuauVariadicAnyCanBeGeneric, false)
 LUAU_FASTFLAGVARIABLE(LuauMaintainScopesInUnifier, false)
 LUAU_FASTFLAGVARIABLE(LuauTransitiveSubtyping, false)
+LUAU_FASTFLAGVARIABLE(LuauOccursIsntAlwaysFailure, false)
 LUAU_FASTFLAG(LuauClassTypeVarsInSubstitution)
 LUAU_FASTFLAG(DebugLuauDeferredConstraintResolution)
 LUAU_FASTFLAG(LuauNormalizeBlockedTypes)
@@ -431,14 +433,14 @@ void Unifier::tryUnify_(TypeId subTy, TypeId superTy, bool isFunctionCall, bool
 
     if (superFree && subFree && subsumes(useScopes, superFree, subFree))
     {
-        if (!occursCheck(subTy, superTy))
+        if (!occursCheck(subTy, superTy, /* reversed = */ false))
             log.replace(subTy, BoundType(superTy));
 
         return;
     }
     else if (superFree && subFree)
     {
-        if (!occursCheck(superTy, subTy))
+        if (!occursCheck(superTy, subTy, /* reversed = */ true))
         {
             if (subsumes(useScopes, superFree, subFree))
             {
@@ -461,7 +463,7 @@ void Unifier::tryUnify_(TypeId subTy, TypeId superTy, bool isFunctionCall, bool
             return;
         }
 
-        if (!occursCheck(superTy, subTy))
+        if (!occursCheck(superTy, subTy, /* reversed = */ true))
         {
             promoteTypeLevels(log, types, superFree->level, superFree->scope, useScopes, subTy);
 
@@ -487,7 +489,7 @@ void Unifier::tryUnify_(TypeId subTy, TypeId superTy, bool isFunctionCall, bool
             return;
         }
 
-        if (!occursCheck(subTy, superTy))
+        if (!occursCheck(subTy, superTy, /* reversed = */ false))
         {
             promoteTypeLevels(log, types, subFree->level, subFree->scope, useScopes, superTy);
             log.replace(subTy, BoundType(superTy));
@@ -1593,7 +1595,7 @@ void Unifier::tryUnify_(TypePackId subTp, TypePackId superTp, bool isFunctionCal
 
     if (log.getMutable<FreeTypePack>(superTp))
     {
-        if (!occursCheck(superTp, subTp))
+        if (!occursCheck(superTp, subTp, /* reversed = */ true))
         {
             Widen widen{types, builtinTypes};
             log.replace(superTp, Unifiable::Bound<TypePackId>(widen(subTp)));
@@ -1601,7 +1603,7 @@ void Unifier::tryUnify_(TypePackId subTp, TypePackId superTp, bool isFunctionCal
     }
     else if (log.getMutable<FreeTypePack>(subTp))
     {
-        if (!occursCheck(subTp, superTp))
+        if (!occursCheck(subTp, superTp, /* reversed = */ false))
         {
             log.replace(subTp, Unifiable::Bound<TypePackId>(superTp));
         }
@@ -2585,13 +2587,14 @@ static void queueTypePack(std::vector<TypeId>& queue, DenseHashSet<TypePackId>&
 void Unifier::tryUnifyVariadics(TypePackId subTp, TypePackId superTp, bool reversed, int subOffset)
 {
     const VariadicTypePack* superVariadic = log.getMutable<VariadicTypePack>(superTp);
 
     if (!superVariadic)
         ice("passed non-variadic pack to tryUnifyVariadics");
+    const TypeId variadicTy = follow(superVariadic->ty); // read only after the null check above, so a bad pack hits ice() instead of a null dereference
 
     if (const VariadicTypePack* subVariadic = log.get<VariadicTypePack>(subTp))
     {
-        tryUnify_(reversed ? superVariadic->ty : subVariadic->ty, reversed ? subVariadic->ty : superVariadic->ty);
+        tryUnify_(reversed ? variadicTy : subVariadic->ty, reversed ? subVariadic->ty : variadicTy);
     }
     else if (log.get<TypePack>(subTp))
     {
@@ -2602,7 +2605,7 @@ void Unifier::tryUnifyVariadics(TypePackId subTp, TypePackId superTp, bool rever
 
         while (subIter != subEnd)
         {
-            tryUnify_(reversed ? superVariadic->ty : *subIter, reversed ? *subIter : superVariadic->ty);
+            tryUnify_(reversed ? variadicTy : *subIter, reversed ? *subIter : variadicTy);
             ++subIter;
         }
 
@@ -2615,7 +2618,7 @@ void Unifier::tryUnifyVariadics(TypePackId subTp, TypePackId superTp, bool rever
             }
             else if (const VariadicTypePack* vtp = get<VariadicTypePack>(tail))
             {
-                tryUnify_(vtp->ty, superVariadic->ty);
+                tryUnify_(vtp->ty, variadicTy);
             }
             else if (get<GenericTypePack>(tail))
             {
@@ -2631,6 +2634,10 @@ void Unifier::tryUnifyVariadics(TypePackId subTp, TypePackId superTp, bool rever
             }
         }
     }
+    else if (FFlag::LuauVariadicAnyCanBeGeneric && get<AnyType>(variadicTy) && log.get<GenericTypePack>(subTp))
+    {
+        // Nothing to do.  This is ok.
+    }
     else
     {
         reportError(location, GenericError{"Failed to unify variadic packs"});
@@ -2751,11 +2758,42 @@ TxnLog Unifier::combineLogsIntoUnion(std::vector<TxnLog> logs)
     return result;
 }
 
-bool Unifier::occursCheck(TypeId needle, TypeId haystack)
+bool Unifier::occursCheck(TypeId needle, TypeId haystack, bool reversed)
 {
     sharedState.tempSeenTy.clear();
 
-    return occursCheck(sharedState.tempSeenTy, needle, haystack);
+    bool occurs = occursCheck(sharedState.tempSeenTy, needle, haystack);
+
+    if (occurs && FFlag::LuauOccursIsntAlwaysFailure)
+    {
+        Unifier innerState = makeChildUnifier();
+        if (const UnionType* ut = get<UnionType>(haystack))
+        {
+            if (reversed)
+                innerState.tryUnifyUnionWithType(haystack, ut, needle);
+            else
+                innerState.tryUnifyTypeWithUnion(needle, haystack, ut, /* cacheEnabled = */ false, /* isFunction = */ false);
+        }
+        else if (const IntersectionType* it = get<IntersectionType>(haystack))
+        {
+            if (reversed)
+                innerState.tryUnifyIntersectionWithType(haystack, it, needle, /* cacheEnabled = */ false, /* isFunction = */ false);
+            else
+                innerState.tryUnifyTypeWithIntersection(needle, haystack, it);
+        }
+        else
+        {
+            innerState.failure = true;
+        }
+
+        if (innerState.failure)
+        {
+            reportError(location, OccursCheckFailed{});
+            log.replace(needle, *builtinTypes->errorRecoveryType());
+        }
+    }
+
+    return occurs;
 }
 
 bool Unifier::occursCheck(DenseHashSet<TypeId>& seen, TypeId needle, TypeId haystack)
@@ -2785,8 +2823,11 @@ bool Unifier::occursCheck(DenseHashSet<TypeId>& seen, TypeId needle, TypeId hays
 
     if (needle == haystack)
     {
-        reportError(location, OccursCheckFailed{});
-        log.replace(needle, *builtinTypes->errorRecoveryType());
+        if (!FFlag::LuauOccursIsntAlwaysFailure)
+        {
+            reportError(location, OccursCheckFailed{});
+            log.replace(needle, *builtinTypes->errorRecoveryType());
+        }
 
         return true;
     }
@@ -2807,11 +2848,19 @@ bool Unifier::occursCheck(DenseHashSet<TypeId>& seen, TypeId needle, TypeId hays
     return occurrence;
 }
 
-bool Unifier::occursCheck(TypePackId needle, TypePackId haystack)
+bool Unifier::occursCheck(TypePackId needle, TypePackId haystack, bool reversed)
 {
     sharedState.tempSeenTp.clear();
 
-    return occursCheck(sharedState.tempSeenTp, needle, haystack);
+    bool occurs = occursCheck(sharedState.tempSeenTp, needle, haystack);
+
+    if (occurs && FFlag::LuauOccursIsntAlwaysFailure)
+    {
+        reportError(location, OccursCheckFailed{});
+        log.replace(needle, *builtinTypes->errorRecoveryTypePack());
+    }
+
+    return occurs;
 }
 
 bool Unifier::occursCheck(DenseHashSet<TypePackId>& seen, TypePackId needle, TypePackId haystack)
@@ -2836,8 +2885,11 @@ bool Unifier::occursCheck(DenseHashSet<TypePackId>& seen, TypePackId needle, Typ
     {
         if (needle == haystack)
         {
-            reportError(location, OccursCheckFailed{});
-            log.replace(needle, *builtinTypes->errorRecoveryTypePack());
+            if (!FFlag::LuauOccursIsntAlwaysFailure)
+            {
+                reportError(location, OccursCheckFailed{});
+                log.replace(needle, *builtinTypes->errorRecoveryTypePack());
+            }
 
             return true;
         }
diff --git a/Ast/src/StringUtils.cpp b/Ast/src/StringUtils.cpp
index 11e0076..343c553 100644
--- a/Ast/src/StringUtils.cpp
+++ b/Ast/src/StringUtils.cpp
@@ -167,7 +167,9 @@ size_t editDistance(std::string_view a, std::string_view b)
 
         for (size_t y = 1; y <= b.size(); ++y)
         {
-            size_t x1 = seenCharToRow[b[y - 1]];
+            // The value of b[N] can be negative with unicode characters
+            unsigned char bSeenCharIndex = static_cast<unsigned char>(b[y - 1]);
+            size_t x1 = seenCharToRow[bSeenCharIndex];
             size_t y1 = lastMatchedY;
 
             size_t cost = 1;
@@ -187,7 +189,9 @@ size_t editDistance(std::string_view a, std::string_view b)
             distances[getPos(x + 1, y + 1)] = std::min(std::min(insertion, deletion), std::min(substitution, transposition));
         }
 
-        seenCharToRow[a[x - 1]] = x;
+        // The value of a[N] can be negative with unicode characters
+        unsigned char aSeenCharIndex = static_cast<unsigned char>(a[x - 1]);
+        seenCharToRow[aSeenCharIndex] = x;
     }
 
     return distances[getPos(a.size() + 1, b.size() + 1)];
diff --git a/CodeGen/include/Luau/AddressA64.h b/CodeGen/include/Luau/AddressA64.h
index 2796ef7..acb64e3 100644
--- a/CodeGen/include/Luau/AddressA64.h
+++ b/CodeGen/include/Luau/AddressA64.h
@@ -29,7 +29,7 @@ struct AddressA64
     // For example, ldr x0, [reg+imm] is limited to 8 KB offsets assuming imm is divisible by 8, but loading into w0 reduces the range to 4 KB
     static constexpr size_t kMaxOffset = 1023;
 
-    AddressA64(RegisterA64 base, int off = 0)
+    constexpr AddressA64(RegisterA64 base, int off = 0)
         : kind(AddressKindA64::imm)
         , base(base)
         , offset(xzr)
@@ -38,7 +38,7 @@ struct AddressA64
         LUAU_ASSERT(base.kind == KindA64::x || base == sp);
     }
 
-    AddressA64(RegisterA64 base, RegisterA64 offset)
+    constexpr AddressA64(RegisterA64 base, RegisterA64 offset)
         : kind(AddressKindA64::reg)
         , base(base)
         , offset(offset)
diff --git a/CodeGen/include/Luau/AssemblyBuilderA64.h b/CodeGen/include/Luau/AssemblyBuilderA64.h
index def4d0c..42f5f8a 100644
--- a/CodeGen/include/Luau/AssemblyBuilderA64.h
+++ b/CodeGen/include/Luau/AssemblyBuilderA64.h
@@ -49,17 +49,25 @@ public:
     void cmp(RegisterA64 src1, RegisterA64 src2);
     void cmp(RegisterA64 src1, uint16_t src2);
     void csel(RegisterA64 dst, RegisterA64 src1, RegisterA64 src2, ConditionA64 cond);
+    void cset(RegisterA64 dst, ConditionA64 cond);
 
     // Bitwise
-    // TODO: support immediate arguments (they have odd encoding and forbid many values)
-    // TODO: support bic (andnot)
     // TODO: support shifts
     // TODO: support bitfield ops
     void and_(RegisterA64 dst, RegisterA64 src1, RegisterA64 src2);
     void orr(RegisterA64 dst, RegisterA64 src1, RegisterA64 src2);
     void eor(RegisterA64 dst, RegisterA64 src1, RegisterA64 src2);
+    void bic(RegisterA64 dst, RegisterA64 src1, RegisterA64 src2);
+    void tst(RegisterA64 src1, RegisterA64 src2);
     void mvn(RegisterA64 dst, RegisterA64 src);
 
+    // Bitwise with immediate
+    // Note: immediate must have a single contiguous sequence of 1 bits set of length 1..31
+    void and_(RegisterA64 dst, RegisterA64 src1, uint32_t src2);
+    void orr(RegisterA64 dst, RegisterA64 src1, uint32_t src2);
+    void eor(RegisterA64 dst, RegisterA64 src1, uint32_t src2);
+    void tst(RegisterA64 src1, uint32_t src2);
+
     // Shifts
     void lsl(RegisterA64 dst, RegisterA64 src1, RegisterA64 src2);
     void lsr(RegisterA64 dst, RegisterA64 src1, RegisterA64 src2);
@@ -168,7 +176,7 @@ public:
 private:
     // Instruction archetypes
     void place0(const char* name, uint32_t word);
-    void placeSR3(const char* name, RegisterA64 dst, RegisterA64 src1, RegisterA64 src2, uint8_t op, int shift = 0);
+    void placeSR3(const char* name, RegisterA64 dst, RegisterA64 src1, RegisterA64 src2, uint8_t op, int shift = 0, int N = 0);
     void placeSR2(const char* name, RegisterA64 dst, RegisterA64 src, uint8_t op, uint8_t op2 = 0);
     void placeR3(const char* name, RegisterA64 dst, RegisterA64 src1, RegisterA64 src2, uint8_t op, uint8_t op2);
     void placeR1(const char* name, RegisterA64 dst, RegisterA64 src, uint32_t op);
@@ -181,8 +189,9 @@ private:
     void placeADR(const char* name, RegisterA64 src, uint8_t op);
     void placeADR(const char* name, RegisterA64 src, uint8_t op, Label& label);
     void placeP(const char* name, RegisterA64 dst1, RegisterA64 dst2, AddressA64 src, uint8_t op, uint8_t opc, int sizelog);
-    void placeCS(const char* name, RegisterA64 dst, RegisterA64 src1, RegisterA64 src2, ConditionA64 cond, uint8_t op, uint8_t opc);
+    void placeCS(const char* name, RegisterA64 dst, RegisterA64 src1, RegisterA64 src2, ConditionA64 cond, uint8_t op, uint8_t opc, int invert = 0);
     void placeFCMP(const char* name, RegisterA64 src1, RegisterA64 src2, uint8_t op, uint8_t opc);
+    void placeBM(const char* name, RegisterA64 dst, RegisterA64 src1, uint32_t src2, uint8_t op);
 
     void place(uint32_t word);
 
diff --git a/CodeGen/include/Luau/IrCallWrapperX64.h b/CodeGen/include/Luau/IrCallWrapperX64.h
index 724d462..c403d18 100644
--- a/CodeGen/include/Luau/IrCallWrapperX64.h
+++ b/CodeGen/include/Luau/IrCallWrapperX64.h
@@ -41,12 +41,14 @@ public:
 
     void call(const OperandX64& func);
 
+    RegisterX64 suggestNextArgumentRegister(SizeX64 size) const;
+
     IrRegAllocX64& regs;
     AssemblyBuilderX64& build;
     uint32_t instIdx = ~0u;
 
 private:
-    void assignTargetRegisters();
+    OperandX64 getNextArgumentTarget(SizeX64 size) const;
     void countRegisterUses();
     CallArgument* findNonInterferingArgument();
     bool interferesWithOperand(const OperandX64& op, RegisterX64 reg) const;
@@ -67,6 +69,9 @@ private:
     std::array<CallArgument, kMaxCallArguments> args;
     int argCount = 0;
 
+    int gprPos = 0;
+    int xmmPos = 0;
+
     OperandX64 funcOp;
 
     // Internal counters for remaining register use counts
diff --git a/CodeGen/include/Luau/IrData.h b/CodeGen/include/Luau/IrData.h
index fcf29ad..486a013 100644
--- a/CodeGen/include/Luau/IrData.h
+++ b/CodeGen/include/Luau/IrData.h
@@ -155,7 +155,7 @@ enum class IrCmd : uint8_t
 
     // Compute Luau 'not' operation on destructured TValue
     // A: tag
-    // B: double
+    // B: int (value)
     NOT_ANY, // TODO: boolean specialization will be useful
 
     // Unconditional jump
@@ -233,7 +233,7 @@ enum class IrCmd : uint8_t
 
     // Try to get pointer to tag method TValue inside the table's metatable or jump if there is no such value or metatable
     // A: table
-    // B: int
+    // B: int (TMS enum)
     // C: block
     TRY_CALL_FASTGETTM,
 
@@ -256,8 +256,8 @@ enum class IrCmd : uint8_t
     // B: Rn (result start)
     // C: Rn (argument start)
     // D: Rn or Kn or a boolean that's false (optional second argument)
-    // E: int (argument count or -1 to use all arguments up to stack top)
-    // F: int (result count or -1 to preserve all results and adjust stack top)
+    // E: int (argument count)
+    // F: int (result count)
     FASTCALL,
 
     // Call the fastcall builtin function
@@ -517,8 +517,10 @@ enum class IrCmd : uint8_t
     FALLBACK_FORGPREP,
 
     // Instruction that passes value through, it is produced by constant folding and users substitute it with the value
+    // When operand location is set, updates the tracked location of the value in memory
     SUBSTITUTE,
     // A: operand of any type
+    // B: Rn/Kn/none (location of operand in memory; optional)
 };
 
 enum class IrConstKind : uint8_t
@@ -694,6 +696,9 @@ struct IrFunction
 
     std::vector<BytecodeMapping> bcMapping;
 
+    // For each instruction, an operand that can be used to recompute the value
+    std::vector<IrOp> valueRestoreOps;
+
     Proto* proto = nullptr;
 
     CfgInfo cfg;
@@ -829,19 +834,40 @@ struct IrFunction
         return value.valueDouble;
     }
 
-    uint32_t getBlockIndex(const IrBlock& block)
+    uint32_t getBlockIndex(const IrBlock& block) const
     {
         // Can only be called with blocks from our vector
         LUAU_ASSERT(&block >= blocks.data() && &block <= blocks.data() + blocks.size());
         return uint32_t(&block - blocks.data());
     }
 
-    uint32_t getInstIndex(const IrInst& inst)
+    uint32_t getInstIndex(const IrInst& inst) const
     {
         // Can only be called with instructions from our vector
         LUAU_ASSERT(&inst >= instructions.data() && &inst <= instructions.data() + instructions.size());
         return uint32_t(&inst - instructions.data());
     }
+
+    void recordRestoreOp(uint32_t instIdx, IrOp location)
+    {
+        if (instIdx >= valueRestoreOps.size())
+            valueRestoreOps.resize(instIdx + 1);
+
+        valueRestoreOps[instIdx] = location;
+    }
+
+    IrOp findRestoreOp(uint32_t instIdx) const
+    {
+        if (instIdx >= valueRestoreOps.size())
+            return {};
+
+        return valueRestoreOps[instIdx];
+    }
+
+    IrOp findRestoreOp(const IrInst& inst) const
+    {
+        return findRestoreOp(getInstIndex(inst));
+    }
 };
 
 inline IrCondition conditionOp(IrOp op)
diff --git a/CodeGen/include/Luau/IrRegAllocX64.h b/CodeGen/include/Luau/IrRegAllocX64.h
index dc7b48c..f83cc22 100644
--- a/CodeGen/include/Luau/IrRegAllocX64.h
+++ b/CodeGen/include/Luau/IrRegAllocX64.h
@@ -20,7 +20,9 @@ constexpr uint8_t kNoStackSlot = 0xff;
 struct IrSpillX64
 {
     uint32_t instIdx = 0;
-    bool useDoubleSlot = 0;
+    IrValueKind valueKind = IrValueKind::Unknown;
+
+    unsigned spillId = 0;
 
     // Spill location can be a stack location or be empty
     // When it's empty, it means that instruction value can be rematerialized
@@ -33,12 +35,8 @@ struct IrRegAllocX64
 {
     IrRegAllocX64(AssemblyBuilderX64& build, IrFunction& function);
 
-    RegisterX64 allocGprReg(SizeX64 preferredSize, uint32_t instIdx);
-    RegisterX64 allocXmmReg(uint32_t instIdx);
-
-    RegisterX64 allocGprRegOrReuse(SizeX64 preferredSize, uint32_t instIdx, std::initializer_list<IrOp> oprefs);
-    RegisterX64 allocXmmRegOrReuse(uint32_t instIdx, std::initializer_list<IrOp> oprefs);
-
+    RegisterX64 allocReg(SizeX64 size, uint32_t instIdx);
+    RegisterX64 allocRegOrReuse(SizeX64 size, uint32_t instIdx, std::initializer_list<IrOp> oprefs);
     RegisterX64 takeReg(RegisterX64 reg, uint32_t instIdx);
 
     void freeReg(RegisterX64 reg);
@@ -49,6 +47,12 @@ struct IrRegAllocX64
 
     bool shouldFreeGpr(RegisterX64 reg) const;
 
+    unsigned findSpillStackSlot(IrValueKind valueKind);
+
+    IrOp getRestoreOp(const IrInst& inst) const;
+    bool hasRestoreOp(const IrInst& inst) const;
+    OperandX64 getRestoreAddress(const IrInst& inst, IrOp restoreOp);
+
     // Register used by instruction is about to be freed, have to find a way to restore value later
     void preserve(IrInst& inst);
 
@@ -74,6 +78,7 @@ struct IrRegAllocX64
 
     std::bitset<256> usedSpillSlots;
     unsigned maxUsedSlot = 0;
+    unsigned nextSpillId = 1;
     std::vector<IrSpillX64> spills;
 };
 
@@ -107,10 +112,8 @@ struct ScopedSpills
     ScopedSpills(const ScopedSpills&) = delete;
     ScopedSpills& operator=(const ScopedSpills&) = delete;
 
-    bool wasSpilledBefore(const IrSpillX64& spill) const;
-
     IrRegAllocX64& owner;
-    std::vector<IrSpillX64> snapshot;
+    unsigned startSpillId = 0;
 };
 
 } // namespace X64
diff --git a/CodeGen/include/Luau/IrUtils.h b/CodeGen/include/Luau/IrUtils.h
index 09c55c7..136ce3b 100644
--- a/CodeGen/include/Luau/IrUtils.h
+++ b/CodeGen/include/Luau/IrUtils.h
@@ -200,7 +200,7 @@ void replace(IrFunction& function, IrOp& original, IrOp replacement);
 void replace(IrFunction& function, IrBlock& block, uint32_t instIdx, IrInst replacement);
 
 // Replace instruction with a different value (using IrCmd::SUBSTITUTE)
-void substitute(IrFunction& function, IrInst& inst, IrOp replacement);
+void substitute(IrFunction& function, IrInst& inst, IrOp replacement, IrOp location = {});
 
 // Replace instruction arguments that point to substitutions with target values
 void applySubstitutions(IrFunction& function, IrOp& op);
diff --git a/CodeGen/include/Luau/RegisterA64.h b/CodeGen/include/Luau/RegisterA64.h
index 99e6295..c3a9ae0 100644
--- a/CodeGen/include/Luau/RegisterA64.h
+++ b/CodeGen/include/Luau/RegisterA64.h
@@ -46,6 +46,18 @@ constexpr RegisterA64 castReg(KindA64 kind, RegisterA64 reg)
     return RegisterA64{kind, reg.index};
 }
 
+// This is equivalent to castReg(KindA64::x), but is separate because it implies different semantics
+// Specifically, there are cases when it's useful to treat a wN register as an xN register *after* it has been assigned a value
+// Since all A64 instructions that write to wN implicitly zero the top half, this works when we need zero extension semantics
+// Crucially, this is *not* safe on an ABI boundary - an int parameter in wN register may have anything in its top half in certain cases
+// However, as long as our codegen doesn't use 32-bit truncation by using castReg x=>w, we can safely rely on this.
+constexpr RegisterA64 zextReg(RegisterA64 reg)
+{
+    LUAU_ASSERT(reg.kind == KindA64::w);
+
+    return RegisterA64{KindA64::x, reg.index};
+}
+
 constexpr RegisterA64 noreg{KindA64::none, 0};
 
 constexpr RegisterA64 w0{KindA64::w, 0};
diff --git a/CodeGen/include/Luau/RegisterX64.h b/CodeGen/include/Luau/RegisterX64.h
index 9d76b11..7fa9760 100644
--- a/CodeGen/include/Luau/RegisterX64.h
+++ b/CodeGen/include/Luau/RegisterX64.h
@@ -46,6 +46,18 @@ constexpr RegisterX64 al{SizeX64::byte, 0};
 constexpr RegisterX64 cl{SizeX64::byte, 1};
 constexpr RegisterX64 dl{SizeX64::byte, 2};
 constexpr RegisterX64 bl{SizeX64::byte, 3};
+constexpr RegisterX64 spl{SizeX64::byte, 4};
+constexpr RegisterX64 bpl{SizeX64::byte, 5};
+constexpr RegisterX64 sil{SizeX64::byte, 6};
+constexpr RegisterX64 dil{SizeX64::byte, 7};
+constexpr RegisterX64 r8b{SizeX64::byte, 8};
+constexpr RegisterX64 r9b{SizeX64::byte, 9};
+constexpr RegisterX64 r10b{SizeX64::byte, 10};
+constexpr RegisterX64 r11b{SizeX64::byte, 11};
+constexpr RegisterX64 r12b{SizeX64::byte, 12};
+constexpr RegisterX64 r13b{SizeX64::byte, 13};
+constexpr RegisterX64 r14b{SizeX64::byte, 14};
+constexpr RegisterX64 r15b{SizeX64::byte, 15};
 
 constexpr RegisterX64 eax{SizeX64::dword, 0};
 constexpr RegisterX64 ecx{SizeX64::dword, 1};
diff --git a/CodeGen/include/Luau/UnwindBuilder.h b/CodeGen/include/Luau/UnwindBuilder.h
index 98e6049..8fe55ba 100644
--- a/CodeGen/include/Luau/UnwindBuilder.h
+++ b/CodeGen/include/Luau/UnwindBuilder.h
@@ -11,6 +11,9 @@ namespace Luau
 namespace CodeGen
 {
 
+// This value is used in 'finishFunction' to mark the function that spans to the end of the whole code block
+static constexpr uint32_t kFullBlockFuncton = ~0u;
+
 class UnwindBuilder
 {
 public:
@@ -19,19 +22,22 @@ public:
     virtual void setBeginOffset(size_t beginOffset) = 0;
     virtual size_t getBeginOffset() const = 0;
 
-    virtual void start() = 0;
+    virtual void startInfo() = 0;
 
+    virtual void startFunction() = 0;
     virtual void spill(int espOffset, X64::RegisterX64 reg) = 0;
     virtual void save(X64::RegisterX64 reg) = 0;
     virtual void allocStack(int size) = 0;
     virtual void setupFrameReg(X64::RegisterX64 reg, int espOffset) = 0;
+    virtual void finishFunction(uint32_t beginOffset, uint32_t endOffset) = 0;
 
-    virtual void finish() = 0;
+    virtual void finishInfo() = 0;
 
     virtual size_t getSize() const = 0;
+    virtual size_t getFunctionCount() const = 0;
 
     // This will place the unwinding data at the target address and might update values of some fields
-    virtual void finalize(char* target, void* funcAddress, size_t funcSize) const = 0;
+    virtual void finalize(char* target, size_t offset, void* funcAddress, size_t funcSize) const = 0;
 };
 
 } // namespace CodeGen
diff --git a/CodeGen/include/Luau/UnwindBuilderDwarf2.h b/CodeGen/include/Luau/UnwindBuilderDwarf2.h
index 972f742..9f862d2 100644
--- a/CodeGen/include/Luau/UnwindBuilderDwarf2.h
+++ b/CodeGen/include/Luau/UnwindBuilderDwarf2.h
@@ -4,34 +4,48 @@
 #include "Luau/RegisterX64.h"
 #include "UnwindBuilder.h"
 
+#include <vector>
+
 namespace Luau
 {
 namespace CodeGen
 {
 
+struct UnwindFunctionDwarf2
+{
+    uint32_t beginOffset;
+    uint32_t endOffset;
+    uint32_t fdeEntryStartPos;
+};
+
 class UnwindBuilderDwarf2 : public UnwindBuilder
 {
 public:
     void setBeginOffset(size_t beginOffset) override;
     size_t getBeginOffset() const override;
 
-    void start() override;
+    void startInfo() override;
 
+    void startFunction() override;
     void spill(int espOffset, X64::RegisterX64 reg) override;
     void save(X64::RegisterX64 reg) override;
     void allocStack(int size) override;
     void setupFrameReg(X64::RegisterX64 reg, int espOffset) override;
+    void finishFunction(uint32_t beginOffset, uint32_t endOffset) override;
 
-    void finish() override;
+    void finishInfo() override;
 
     size_t getSize() const override;
+    size_t getFunctionCount() const override;
 
-    void finalize(char* target, void* funcAddress, size_t funcSize) const override;
+    void finalize(char* target, size_t offset, void* funcAddress, size_t funcSize) const override;
 
 private:
     size_t beginOffset = 0;
 
-    static const unsigned kRawDataLimit = 128;
+    std::vector<UnwindFunctionDwarf2> unwindFunctions;
+
+    static const unsigned kRawDataLimit = 1024;
     uint8_t rawData[kRawDataLimit];
     uint8_t* pos = rawData;
 
diff --git a/CodeGen/include/Luau/UnwindBuilderWin.h b/CodeGen/include/Luau/UnwindBuilderWin.h
index 1cd750a..ccd7125 100644
--- a/CodeGen/include/Luau/UnwindBuilderWin.h
+++ b/CodeGen/include/Luau/UnwindBuilderWin.h
@@ -11,6 +11,25 @@ namespace Luau
 namespace CodeGen
 {
 
+// This struct matches the layout of x64 RUNTIME_FUNCTION from winnt.h
+struct UnwindFunctionWin
+{
+    uint32_t beginOffset;
+    uint32_t endOffset;
+    uint32_t unwindInfoOffset;
+};
+
+// This struct matches the layout of x64 UNWIND_INFO from ehdata.h
+struct UnwindInfoWin
+{
+    uint8_t version : 3;
+    uint8_t flags : 5;
+    uint8_t prologsize;
+    uint8_t unwindcodecount;
+    uint8_t framereg : 4;
+    uint8_t frameregoff : 4;
+};
+
 // This struct matches the layout of UNWIND_CODE from ehdata.h
 struct UnwindCodeWin
 {
@@ -25,31 +44,38 @@ public:
     void setBeginOffset(size_t beginOffset) override;
     size_t getBeginOffset() const override;
 
-    void start() override;
+    void startInfo() override;
 
+    void startFunction() override;
     void spill(int espOffset, X64::RegisterX64 reg) override;
     void save(X64::RegisterX64 reg) override;
     void allocStack(int size) override;
     void setupFrameReg(X64::RegisterX64 reg, int espOffset) override;
+    void finishFunction(uint32_t beginOffset, uint32_t endOffset) override;
 
-    void finish() override;
+    void finishInfo() override;
 
     size_t getSize() const override;
+    size_t getFunctionCount() const override;
 
-    void finalize(char* target, void* funcAddress, size_t funcSize) const override;
+    void finalize(char* target, size_t offset, void* funcAddress, size_t funcSize) const override;
 
 private:
     size_t beginOffset = 0;
 
+    static const unsigned kRawDataLimit = 1024;
+    uint8_t rawData[kRawDataLimit];
+    uint8_t* rawDataPos = rawData;
+
+    std::vector<UnwindFunctionWin> unwindFunctions;
+
     // Windows unwind codes are written in reverse, so we have to collect them all first
     std::vector<UnwindCodeWin> unwindCodes;
 
     uint8_t prologSize = 0;
-    X64::RegisterX64 frameReg = X64::rax; // rax means that frame register is not used
+    X64::RegisterX64 frameReg = X64::noreg;
     uint8_t frameRegOffset = 0;
     uint32_t stackOffset = 0;
-
-    size_t infoSize = 0;
 };
 
 } // namespace CodeGen
diff --git a/CodeGen/src/AssemblyBuilderA64.cpp b/CodeGen/src/AssemblyBuilderA64.cpp
index a80003e..bb7c943 100644
--- a/CodeGen/src/AssemblyBuilderA64.cpp
+++ b/CodeGen/src/AssemblyBuilderA64.cpp
@@ -1,6 +1,7 @@
 // This file is part of the Luau programming language and is licensed under MIT License; see LICENSE.txt for details
 #include "Luau/AssemblyBuilderA64.h"
 
+#include "BitUtils.h"
 #include "ByteUtils.h"
 
 #include <stdarg.h>
@@ -126,6 +127,15 @@ void AssemblyBuilderA64::csel(RegisterA64 dst, RegisterA64 src1, RegisterA64 src
     placeCS("csel", dst, src1, src2, cond, 0b11010'10'0, 0b00);
 }
 
+void AssemblyBuilderA64::cset(RegisterA64 dst, ConditionA64 cond)
+{
+    LUAU_ASSERT(dst.kind == KindA64::x || dst.kind == KindA64::w);
+
+    RegisterA64 src = dst.kind == KindA64::x ? xzr : wzr;
+
+    placeCS("cset", dst, src, src, cond, 0b11010'10'0, 0b01, /* invert= */ 1);
+}
+
 void AssemblyBuilderA64::and_(RegisterA64 dst, RegisterA64 src1, RegisterA64 src2)
 {
     placeSR3("and", dst, src1, src2, 0b00'01010);
@@ -141,11 +151,45 @@ void AssemblyBuilderA64::eor(RegisterA64 dst, RegisterA64 src1, RegisterA64 src2
     placeSR3("eor", dst, src1, src2, 0b10'01010);
 }
 
+void AssemblyBuilderA64::bic(RegisterA64 dst, RegisterA64 src1, RegisterA64 src2)
+{
+    placeSR3("bic", dst, src1, src2, 0b00'01010, /* shift= */ 0, /* N= */ 1);
+}
+
+void AssemblyBuilderA64::tst(RegisterA64 src1, RegisterA64 src2)
+{
+    RegisterA64 dst = src1.kind == KindA64::x ? xzr : wzr;
+
+    placeSR3("tst", dst, src1, src2, 0b11'01010);
+}
+
 void AssemblyBuilderA64::mvn(RegisterA64 dst, RegisterA64 src)
 {
     placeSR2("mvn", dst, src, 0b01'01010, 0b1);
 }
 
+void AssemblyBuilderA64::and_(RegisterA64 dst, RegisterA64 src1, uint32_t src2)
+{
+    placeBM("and", dst, src1, src2, 0b00'100100);
+}
+
+void AssemblyBuilderA64::orr(RegisterA64 dst, RegisterA64 src1, uint32_t src2)
+{
+    placeBM("orr", dst, src1, src2, 0b01'100100);
+}
+
+void AssemblyBuilderA64::eor(RegisterA64 dst, RegisterA64 src1, uint32_t src2)
+{
+    placeBM("eor", dst, src1, src2, 0b10'100100);
+}
+
+void AssemblyBuilderA64::tst(RegisterA64 src1, uint32_t src2)
+{
+    RegisterA64 dst = src1.kind == KindA64::x ? xzr : wzr;
+
+    placeBM("tst", dst, src1, src2, 0b11'100100);
+}
+
 void AssemblyBuilderA64::lsl(RegisterA64 dst, RegisterA64 src1, RegisterA64 src2)
 {
     placeR3("lsl", dst, src1, src2, 0b11010110, 0b0010'00);
@@ -583,7 +627,7 @@ void AssemblyBuilderA64::place0(const char* name, uint32_t op)
     commit();
 }
 
-void AssemblyBuilderA64::placeSR3(const char* name, RegisterA64 dst, RegisterA64 src1, RegisterA64 src2, uint8_t op, int shift)
+void AssemblyBuilderA64::placeSR3(const char* name, RegisterA64 dst, RegisterA64 src1, RegisterA64 src2, uint8_t op, int shift, int N)
 {
     if (logText)
         log(name, dst, src1, src2, shift);
@@ -594,7 +638,7 @@ void AssemblyBuilderA64::placeSR3(const char* name, RegisterA64 dst, RegisterA64
 
     uint32_t sf = (dst.kind == KindA64::x) ? 0x80000000 : 0;
 
-    place(dst.index | (src1.index << 5) | (shift << 10) | (src2.index << 16) | (op << 24) | sf);
+    place(dst.index | (src1.index << 5) | (shift << 10) | (src2.index << 16) | (N << 21) | (op << 24) | sf);
     commit();
 }
 
@@ -764,7 +808,8 @@ void AssemblyBuilderA64::placeP(const char* name, RegisterA64 src1, RegisterA64
     commit();
 }
 
-void AssemblyBuilderA64::placeCS(const char* name, RegisterA64 dst, RegisterA64 src1, RegisterA64 src2, ConditionA64 cond, uint8_t op, uint8_t opc)
+void AssemblyBuilderA64::placeCS(
+    const char* name, RegisterA64 dst, RegisterA64 src1, RegisterA64 src2, ConditionA64 cond, uint8_t op, uint8_t opc, int invert)
 {
     if (logText)
         log(name, dst, src1, src2, cond);
@@ -773,7 +818,7 @@ void AssemblyBuilderA64::placeCS(const char* name, RegisterA64 dst, RegisterA64
 
     uint32_t sf = (dst.kind == KindA64::x) ? 0x80000000 : 0;
 
-    place(dst.index | (src1.index << 5) | (opc << 10) | (codeForCondition[int(cond)] << 12) | (src2.index << 16) | (op << 21) | sf);
+    place(dst.index | (src1.index << 5) | (opc << 10) | ((codeForCondition[int(cond)] ^ invert) << 12) | (src2.index << 16) | (op << 21) | sf);
     commit();
 }
 
@@ -793,6 +838,29 @@ void AssemblyBuilderA64::placeFCMP(const char* name, RegisterA64 src1, RegisterA
     commit();
 }
 
+void AssemblyBuilderA64::placeBM(const char* name, RegisterA64 dst, RegisterA64 src1, uint32_t src2, uint8_t op)
+{
+    if (logText)
+        log(name, dst, src1, src2);
+
+    LUAU_ASSERT(dst.kind == KindA64::w || dst.kind == KindA64::x);
+    LUAU_ASSERT(dst.kind == src1.kind);
+
+    uint32_t sf = (dst.kind == KindA64::x) ? 0x80000000 : 0;
+
+    int lz = countlz(src2);
+    int rz = countrz(src2);
+
+    LUAU_ASSERT(lz + rz > 0 && lz + rz < 32);                 // must have at least one 0 and at least one 1
+    LUAU_ASSERT((src2 >> rz) == (1u << (32 - lz - rz)) - 1u); // sequence of 1s must be contiguous
+
+    int imms = 31 - lz - rz;   // count of 1s minus 1
+    int immr = (32 - rz) & 31; // right rotate amount
+
+    place(dst.index | (src1.index << 5) | (imms << 10) | (immr << 16) | (op << 23) | sf);
+    commit();
+}
+
 void AssemblyBuilderA64::place(uint32_t word)
 {
     LUAU_ASSERT(codePos < codeEnd);
@@ -965,10 +1033,13 @@ void AssemblyBuilderA64::log(const char* opcode, RegisterA64 dst, RegisterA64 sr
 {
     logAppend(" %-12s", opcode);
     log(dst);
-    text.append(",");
-    log(src1);
-    text.append(",");
-    log(src2);
+    if ((src1 != wzr && src1 != xzr) || (src2 != wzr && src2 != xzr))
+    {
+        text.append(",");
+        log(src1);
+        text.append(",");
+        log(src2);
+    }
     text.append(",");
     text.append(textForCondition[int(cond)] + 2); // skip b.
     text.append("\n");
diff --git a/CodeGen/src/AssemblyBuilderX64.cpp b/CodeGen/src/AssemblyBuilderX64.cpp
index d86a37c..ed95004 100644
--- a/CodeGen/src/AssemblyBuilderX64.cpp
+++ b/CodeGen/src/AssemblyBuilderX64.cpp
@@ -31,7 +31,8 @@ static_assert(sizeof(setccTextForCondition) / sizeof(setccTextForCondition[0]) =
 #define OP_PLUS_REG(op, reg) ((op) + (reg & 0x7))
 #define OP_PLUS_CC(op, cc) ((op) + uint8_t(cc))
 
-#define REX_W(value) (value ? 0x8 : 0x0)
+#define REX_W_BIT(value) (value ? 0x8 : 0x0)
+#define REX_W(reg) REX_W_BIT((reg).size == SizeX64::qword || ((reg).size == SizeX64::byte && (reg).index >= 4))
 #define REX_R(reg) (((reg).index & 0x8) >> 1)
 #define REX_X(reg) (((reg).index & 0x8) >> 2)
 #define REX_B(reg) (((reg).index & 0x8) >> 3)
@@ -1116,7 +1117,7 @@ void AssemblyBuilderX64::placeAvx(
 
 void AssemblyBuilderX64::placeRex(RegisterX64 op)
 {
-    uint8_t code = REX_W(op.size == SizeX64::qword) | REX_B(op);
+    uint8_t code = REX_W(op) | REX_B(op);
 
     if (code != 0)
         place(code | 0x40);
@@ -1127,9 +1128,9 @@ void AssemblyBuilderX64::placeRex(OperandX64 op)
     uint8_t code = 0;
 
     if (op.cat == CategoryX64::reg)
-        code = REX_W(op.base.size == SizeX64::qword) | REX_B(op.base);
+        code = REX_W(op.base) | REX_B(op.base);
     else if (op.cat == CategoryX64::mem)
-        code = REX_W(op.memSize == SizeX64::qword) | REX_X(op.index) | REX_B(op.base);
+        code = REX_W_BIT(op.memSize == SizeX64::qword) | REX_X(op.index) | REX_B(op.base);
     else
         LUAU_ASSERT(!"No encoding for left operand of this category");
 
@@ -1154,7 +1155,7 @@ void AssemblyBuilderX64::placeRexNoW(OperandX64 op)
 
 void AssemblyBuilderX64::placeRex(RegisterX64 lhs, OperandX64 rhs)
 {
-    uint8_t code = REX_W(lhs.size == SizeX64::qword);
+    uint8_t code = REX_W(lhs);
 
     if (rhs.cat == CategoryX64::imm)
         code |= REX_B(lhs);
diff --git a/CodeGen/src/BitUtils.h b/CodeGen/src/BitUtils.h
new file mode 100644
index 0000000..93f7cc8
--- /dev/null
+++ b/CodeGen/src/BitUtils.h
@@ -0,0 +1,36 @@
+// This file is part of the Luau programming language and is licensed under MIT License; see LICENSE.txt for details
+#pragma once
+
+#include <stdint.h>
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
+namespace Luau
+{
+namespace CodeGen
+{
+
+inline int countlz(uint32_t n)
+{
+#ifdef _MSC_VER
+    unsigned long rl;
+    return _BitScanReverse(&rl, n) ? 31 - int(rl) : 32;
+#else
+    return n == 0 ? 32 : __builtin_clz(n);
+#endif
+}
+
+inline int countrz(uint32_t n)
+{
+#ifdef _MSC_VER
+    unsigned long rl;
+    return _BitScanForward(&rl, n) ? int(rl) : 32;
+#else
+    return n == 0 ? 32 : __builtin_ctz(n);
+#endif
+}
+
+} // namespace CodeGen
+} // namespace Luau
diff --git a/CodeGen/src/CodeBlockUnwind.cpp b/CodeGen/src/CodeBlockUnwind.cpp
index 72842be..ccd15fa 100644
--- a/CodeGen/src/CodeBlockUnwind.cpp
+++ b/CodeGen/src/CodeBlockUnwind.cpp
@@ -54,31 +54,6 @@ namespace CodeGen
 
 void* createBlockUnwindInfo(void* context, uint8_t* block, size_t blockSize, size_t& beginOffset)
 {
-#if defined(_WIN32) && defined(_M_X64)
-    UnwindBuilder* unwind = (UnwindBuilder*)context;
-
-    // All unwinding related data is placed together at the start of the block
-    size_t unwindSize = sizeof(RUNTIME_FUNCTION) + unwind->getSize();
-    unwindSize = (unwindSize + (kCodeAlignment - 1)) & ~(kCodeAlignment - 1); // Match code allocator alignment
-    LUAU_ASSERT(blockSize >= unwindSize);
-
-    RUNTIME_FUNCTION* runtimeFunc = (RUNTIME_FUNCTION*)block;
-    runtimeFunc->BeginAddress = DWORD(unwindSize);                    // Code will start after the unwind info
-    runtimeFunc->EndAddress = DWORD(blockSize);                       // Whole block is a part of a 'single function'
-    runtimeFunc->UnwindInfoAddress = DWORD(sizeof(RUNTIME_FUNCTION)); // Unwind info is placed at the start of the block
-
-    char* unwindData = (char*)block + runtimeFunc->UnwindInfoAddress;
-    unwind->finalize(unwindData, block + unwindSize, blockSize - unwindSize);
-
-    if (!RtlAddFunctionTable(runtimeFunc, 1, uintptr_t(block)))
-    {
-        LUAU_ASSERT(!"failed to allocate function table");
-        return nullptr;
-    }
-
-    beginOffset = unwindSize + unwind->getBeginOffset();
-    return block;
-#elif !defined(_WIN32)
     UnwindBuilder* unwind = (UnwindBuilder*)context;
 
     // All unwinding related data is placed together at the start of the block
@@ -87,37 +62,34 @@ void* createBlockUnwindInfo(void* context, uint8_t* block, size_t blockSize, siz
     LUAU_ASSERT(blockSize >= unwindSize);
 
     char* unwindData = (char*)block;
-    unwind->finalize(unwindData, block, blockSize);
+    unwind->finalize(unwindData, unwindSize, block, blockSize);
 
-#if defined(__APPLE__)
+#if defined(_WIN32) && defined(_M_X64)
+    if (!RtlAddFunctionTable((RUNTIME_FUNCTION*)block, uint32_t(unwind->getFunctionCount()), uintptr_t(block)))
+    {
+        LUAU_ASSERT(!"failed to allocate function table");
+        return nullptr;
+    }
+#elif defined(__APPLE__)
     visitFdeEntries(unwindData, __register_frame);
-#else
+#elif !defined(_WIN32)
     __register_frame(unwindData);
 #endif
 
     beginOffset = unwindSize + unwind->getBeginOffset();
     return block;
-#endif
-
-    return nullptr;
 }
 
 void destroyBlockUnwindInfo(void* context, void* unwindData)
 {
 #if defined(_WIN32) && defined(_M_X64)
-    RUNTIME_FUNCTION* runtimeFunc = (RUNTIME_FUNCTION*)unwindData;
-
-    if (!RtlDeleteFunctionTable(runtimeFunc))
+    if (!RtlDeleteFunctionTable((RUNTIME_FUNCTION*)unwindData))
         LUAU_ASSERT(!"failed to deallocate function table");
-#elif !defined(_WIN32)
-
-#if defined(__APPLE__)
+#elif defined(__APPLE__)
     visitFdeEntries((char*)unwindData, __deregister_frame);
-#else
+#elif !defined(_WIN32)
     __deregister_frame(unwindData);
 #endif
-
-#endif
 }
 
 } // namespace CodeGen
diff --git a/CodeGen/src/CodeGen.cpp b/CodeGen/src/CodeGen.cpp
index 8e6e949..6cd9ea0 100644
--- a/CodeGen/src/CodeGen.cpp
+++ b/CodeGen/src/CodeGen.cpp
@@ -176,6 +176,10 @@ static bool lowerImpl(AssemblyBuilder& build, IrLowering& lowering, IrFunction&
 
             IrInst& inst = function.instructions[index];
 
+            // Substitutions might have meta information about operand restore location from memory
+            if (inst.cmd == IrCmd::SUBSTITUTE && inst.b.kind != IrOpKind::None)
+                function.recordRestoreOp(inst.a.index, inst.b);
+
             // Skip pseudo instructions, but make sure they are not used at this stage
             // This also prevents them from getting into text output when that's enabled
             if (isPseudo(inst.cmd))
@@ -195,7 +199,18 @@ static bool lowerImpl(AssemblyBuilder& build, IrLowering& lowering, IrFunction&
             lowering.lowerInst(inst, index, next);
 
             if (lowering.hasError())
+            {
+                // Place labels for all blocks that we're skipping
+                // This is needed to avoid AssemblyBuilder assertions about jumps in earlier blocks with unplaced labels
+                for (size_t j = i + 1; j < sortedBlocks.size(); ++j)
+                {
+                    IrBlock& abandoned = function.blocks[sortedBlocks[j]];
+
+                    build.setLabel(abandoned.label);
+                }
+
                 return false;
+            }
         }
 
         if (options.includeIr)
@@ -223,12 +238,8 @@ static bool lowerImpl(AssemblyBuilder& build, IrLowering& lowering, IrFunction&
 [[maybe_unused]] static bool lowerIr(
     X64::AssemblyBuilderX64& build, IrBuilder& ir, NativeState& data, ModuleHelpers& helpers, Proto* proto, AssemblyOptions options)
 {
-    constexpr uint32_t kFunctionAlignment = 32;
-
     optimizeMemoryOperandsX64(ir.function);
 
-    build.align(kFunctionAlignment, X64::AlignmentDataX64::Ud2);
-
     X64::IrLoweringX64 lowering(build, helpers, data, ir.function);
 
     return lowerImpl(build, lowering, ir.function, proto->bytecodeid, options);
@@ -237,9 +248,6 @@ static bool lowerImpl(AssemblyBuilder& build, IrLowering& lowering, IrFunction&
 [[maybe_unused]] static bool lowerIr(
     A64::AssemblyBuilderA64& build, IrBuilder& ir, NativeState& data, ModuleHelpers& helpers, Proto* proto, AssemblyOptions options)
 {
-    if (!A64::IrLoweringA64::canLower(ir.function))
-        return false;
-
     A64::IrLoweringA64 lowering(build, helpers, data, proto, ir.function);
 
     return lowerImpl(build, lowering, ir.function, proto->bytecodeid, options);
@@ -432,13 +440,13 @@ void create(lua_State* L)
     initHelperFunctions(data);
 
 #if defined(__x86_64__) || defined(_M_X64)
-    if (!X64::initEntryFunction(data))
+    if (!X64::initHeaderFunctions(data))
     {
         destroyNativeState(L);
         return;
     }
 #elif defined(__aarch64__)
-    if (!A64::initEntryFunction(data))
+    if (!A64::initHeaderFunctions(data))
     {
         destroyNativeState(L);
         return;
diff --git a/CodeGen/src/CodeGenA64.cpp b/CodeGen/src/CodeGenA64.cpp
index e7a1e2e..7f29beb 100644
--- a/CodeGen/src/CodeGenA64.cpp
+++ b/CodeGen/src/CodeGenA64.cpp
@@ -17,14 +17,107 @@ namespace CodeGen
 namespace A64
 {
 
-bool initEntryFunction(NativeState& data)
+struct EntryLocations
 {
-    AssemblyBuilderA64 build(/* logText= */ false);
-    UnwindBuilder& unwind = *data.unwindBuilder.get();
+    Label start;
+    Label prologueEnd;
+    Label epilogueStart;
+};
+
+static void emitExit(AssemblyBuilderA64& build, bool continueInVm)
+{
+    build.mov(x0, continueInVm);
+    build.ldr(x1, mem(rNativeContext, offsetof(NativeContext, gateExit)));
+    build.br(x1);
+}
+
+static void emitInterrupt(AssemblyBuilderA64& build)
+{
+    // x0 = pc offset
+    // x1 = return address in native code
+    // x2 = interrupt
+
+    // Stash return address in rBase; we need to reload rBase anyway
+    build.mov(rBase, x1);
+
+    // Update savedpc; required in case interrupt errors
+    build.add(x0, rCode, x0);
+    build.ldr(x1, mem(rState, offsetof(lua_State, ci)));
+    build.str(x0, mem(x1, offsetof(CallInfo, savedpc)));
+
+    // Call interrupt
+    build.mov(x0, rState);
+    build.mov(w1, -1);
+    build.blr(x2);
+
+    // Check if we need to exit
+    Label skip;
+    build.ldrb(w0, mem(rState, offsetof(lua_State, status)));
+    build.cbz(w0, skip);
+
+    // L->ci->savedpc--
+    // note: recomputing this avoids having to stash x0
+    build.ldr(x1, mem(rState, offsetof(lua_State, ci)));
+    build.ldr(x0, mem(x1, offsetof(CallInfo, savedpc)));
+    build.sub(x0, x0, sizeof(Instruction));
+    build.str(x0, mem(x1, offsetof(CallInfo, savedpc)));
+
+    emitExit(build, /* continueInVm */ false);
+
+    build.setLabel(skip);
+
+    // Return back to caller; rBase has stashed return address
+    build.mov(x0, rBase);
+
+    emitUpdateBase(build); // interrupt may have reallocated stack
+
+    build.br(x0);
+}
+
+static void emitReentry(AssemblyBuilderA64& build, ModuleHelpers& helpers)
+{
+    // x0 = closure object to reenter (equal to clvalue(L->ci->func))
+
+    // If the fallback requested an exit, we need to do this right away
+    build.cbz(x0, helpers.exitNoContinueVm);
+
+    emitUpdateBase(build);
+
+    // Need to update state of the current function before we jump away
+    build.ldr(x1, mem(x0, offsetof(Closure, l.p))); // cl->l.p aka proto
+
+    build.mov(rClosure, x0);
+    build.ldr(rConstants, mem(x1, offsetof(Proto, k))); // proto->k
+    build.ldr(rCode, mem(x1, offsetof(Proto, code)));   // proto->code
+
+    // Get instruction index from instruction pointer
+    // To get instruction index from instruction pointer, we need to divide byte offset by 4
+    // But we will actually need to scale instruction index by 8 back to byte offset later, so the net effect is a multiplication by 2
+    build.ldr(x2, mem(rState, offsetof(lua_State, ci))); // L->ci
+    build.ldr(x2, mem(x2, offsetof(CallInfo, savedpc))); // L->ci->savedpc
+    build.sub(x2, x2, rCode);
+    build.add(x2, x2, x2); // TODO: this would not be necessary if we supported shifted register offsets in loads
+
+    // We need to check if the new function can be executed natively
+    // TODO: This can be done earlier in the function flow, to reduce the JIT->VM transition penalty
+    build.ldr(x1, mem(x1, offsetofProtoExecData));
+    build.cbz(x1, helpers.exitContinueVm);
+
+    // Get new instruction location and jump to it
+    build.ldr(x1, mem(x1, offsetof(NativeProto, instTargets)));
+    build.ldr(x1, mem(x1, x2));
+    build.br(x1);
+}
+
+static EntryLocations buildEntryFunction(AssemblyBuilderA64& build, UnwindBuilder& unwind)
+{
+    EntryLocations locations;
 
     // Arguments: x0 = lua_State*, x1 = Proto*, x2 = native code pointer to jump to, x3 = NativeContext*
 
-    unwind.start();
+    locations.start = build.setLabel();
+    unwind.startFunction();
+
     unwind.allocStack(8); // TODO: this is just a hack to make UnwindBuilder assertions cooperate
 
     // prologue
@@ -38,9 +131,7 @@ bool initEntryFunction(NativeState& data)
 
     build.mov(x29, sp); // this is only necessary if we maintain frame pointers, which we do in the JIT for now
 
-    unwind.finish();
-
-    size_t prologueSize = build.setLabel().location;
+    locations.prologueEnd = build.setLabel();
 
     // Setup native execution environment
     build.mov(rState, x0);
@@ -58,7 +149,7 @@ bool initEntryFunction(NativeState& data)
     build.br(x2);
 
     // Even though we jumped away, we will return here in the end
-    Label returnOff = build.setLabel();
+    locations.epilogueStart = build.setLabel();
 
     // Cleanup and exit
     build.ldp(x23, x24, mem(sp, 48));
@@ -69,12 +160,30 @@ bool initEntryFunction(NativeState& data)
 
     build.ret();
 
+    // Our entry function is special, it spans the whole remaining code area
+    unwind.finishFunction(build.getLabelOffset(locations.start), kFullBlockFuncton);
+
+    return locations;
+}
+
+bool initHeaderFunctions(NativeState& data)
+{
+    AssemblyBuilderA64 build(/* logText= */ false);
+    UnwindBuilder& unwind = *data.unwindBuilder.get();
+
+    unwind.startInfo();
+
+    EntryLocations entryLocations = buildEntryFunction(build, unwind);
+
     build.finalize();
 
+    unwind.finishInfo();
+
     LUAU_ASSERT(build.data.empty());
 
+    uint8_t* codeStart = nullptr;
     if (!data.codeAllocator.allocate(build.data.data(), int(build.data.size()), reinterpret_cast<const uint8_t*>(build.code.data()),
-            int(build.code.size() * sizeof(build.code[0])), data.gateData, data.gateDataSize, data.context.gateEntry))
+            int(build.code.size() * sizeof(build.code[0])), data.gateData, data.gateDataSize, codeStart))
     {
         LUAU_ASSERT(!"failed to create entry function");
         return false;
@@ -82,9 +191,10 @@ bool initEntryFunction(NativeState& data)
 
     // Set the offset at the begining so that functions in new blocks will not overlay the locations
     // specified by the unwind information of the entry function
-    unwind.setBeginOffset(prologueSize);
+    unwind.setBeginOffset(build.getLabelOffset(entryLocations.prologueEnd));
 
-    data.context.gateExit = data.context.gateEntry + build.getLabelOffset(returnOff);
+    data.context.gateEntry = codeStart + build.getLabelOffset(entryLocations.start);
+    data.context.gateExit = codeStart + build.getLabelOffset(entryLocations.epilogueStart);
 
     return true;
 }
diff --git a/CodeGen/src/CodeGenA64.h b/CodeGen/src/CodeGenA64.h
index 7b792cc..f6fda72 100644
--- a/CodeGen/src/CodeGenA64.h
+++ b/CodeGen/src/CodeGenA64.h
@@ -14,7 +14,7 @@ namespace A64
 
 class AssemblyBuilderA64;
 
-bool initEntryFunction(NativeState& data);
+bool initHeaderFunctions(NativeState& data);
 void assembleHelpers(AssemblyBuilderA64& build, ModuleHelpers& helpers);
 
 } // namespace A64
diff --git a/CodeGen/src/CodeGenUtils.cpp b/CodeGen/src/CodeGenUtils.cpp
index ae3dbd4..7a9192a 100644
--- a/CodeGen/src/CodeGenUtils.cpp
+++ b/CodeGen/src/CodeGenUtils.cpp
@@ -13,12 +13,62 @@ namespace Luau
 namespace CodeGen
 {
 
+// Finds the next non-nil entry of table 'h', scanning forward from slot 'index'.
+// Slots [0, sizearray) address the array portion; later slots address the hash portion, offset by sizearray.
+// On a hit: ra + 2 receives the resume slot (index + 1, stored as a pointer value), ra + 3 the key and
+// ra + 4 the value, and the function returns true. Returns false when no non-nil entry remains.
+bool forgLoopTableIter(lua_State* L, Table* h, int index, TValue* ra)
+{
+    int sizearray = h->sizearray;
+
+    // first we advance index through the array portion (array slot i is reported with numeric key i + 1)
+    while (unsigned(index) < unsigned(sizearray))
+    {
+        TValue* e = &h->array[index];
+
+        if (!ttisnil(e))
+        {
+            setpvalue(ra + 2, reinterpret_cast<void*>(uintptr_t(index + 1)));
+            setnvalue(ra + 3, double(index + 1));
+            setobj2s(L, ra + 4, e);
+
+            return true;
+        }
+
+        index++;
+    }
+
+    int sizenode = 1 << h->lsizenode;
+
+    // then we advance index through the hash portion; the unsigned compare also rejects index < sizearray
+    while (unsigned(index - sizearray) < unsigned(sizenode))
+    {
+        LuaNode* n = &h->node[index - sizearray];
+
+        if (!ttisnil(gval(n)))
+        {
+            setpvalue(ra + 2, reinterpret_cast<void*>(uintptr_t(index + 1)));
+            getnodekey(L, ra + 3, n);
+            setobj(L, ra + 4, gval(n));
+
+            return true;
+        }
+
+        index++;
+    }
+
+    return false;
+}
+
 bool forgLoopNodeIter(lua_State* L, Table* h, int index, TValue* ra)
 {
+    int sizearray = h->sizearray;
+    int sizenode = 1 << h->lsizenode;
+
     // then we advance index through the hash portion
-    while (unsigned(index - h->sizearray) < unsigned(1 << h->lsizenode))
+    while (unsigned(index - sizearray) < unsigned(sizenode))
     {
-        LuaNode* n = &h->node[index - h->sizearray];
+        LuaNode* n = &h->node[index - sizearray];
 
         if (!ttisnil(gval(n)))
         {
diff --git a/CodeGen/src/CodeGenUtils.h b/CodeGen/src/CodeGenUtils.h
index 6066a69..10e88c1 100644
--- a/CodeGen/src/CodeGenUtils.h
+++ b/CodeGen/src/CodeGenUtils.h
@@ -8,6 +8,7 @@ namespace Luau
 namespace CodeGen
 {
 
+bool forgLoopTableIter(lua_State* L, Table* h, int index, TValue* ra);
 bool forgLoopNodeIter(lua_State* L, Table* h, int index, TValue* ra);
 bool forgLoopNonTableFallback(lua_State* L, int insnA, int aux);
 
diff --git a/CodeGen/src/CodeGenX64.cpp b/CodeGen/src/CodeGenX64.cpp
index 7df1a90..2acb69f 100644
--- a/CodeGen/src/CodeGenX64.cpp
+++ b/CodeGen/src/CodeGenX64.cpp
@@ -41,12 +41,21 @@ namespace CodeGen
 namespace X64
 {
 
-bool initEntryFunction(NativeState& data)
+struct EntryLocations
 {
-    AssemblyBuilderX64 build(/* logText= */ false);
-    UnwindBuilder& unwind = *data.unwindBuilder.get();
+    Label start;
+    Label prologueEnd;
+    Label epilogueStart;
+};
 
-    unwind.start();
+static EntryLocations buildEntryFunction(AssemblyBuilderX64& build, UnwindBuilder& unwind)
+{
+    EntryLocations locations;
+
+    build.align(kFunctionAlignment, X64::AlignmentDataX64::Ud2);
+
+    locations.start = build.setLabel();
+    unwind.startFunction();
 
     // Save common non-volatile registers
     build.push(rbp);
@@ -84,9 +93,7 @@ bool initEntryFunction(NativeState& data)
     build.sub(rsp, kStackSize + kLocalsSize);
     unwind.allocStack(kStackSize + kLocalsSize);
 
-    unwind.finish();
-
-    size_t prologueSize = build.setLabel().location;
+    locations.prologueEnd = build.setLabel();
 
     // Setup native execution environment
     build.mov(rState, rArg1);
@@ -104,7 +111,7 @@ bool initEntryFunction(NativeState& data)
     build.jmp(rArg3);
 
     // Even though we jumped away, we will return here in the end
-    Label returnOff = build.setLabel();
+    locations.epilogueStart = build.setLabel();
 
     // Cleanup and exit
     build.add(rsp, kStackSize + kLocalsSize);
@@ -123,12 +130,30 @@ bool initEntryFunction(NativeState& data)
     build.pop(rbp);
     build.ret();
 
+    // Our entry function is special, it spans the whole remaining code area
+    unwind.finishFunction(build.getLabelOffset(locations.start), kFullBlockFuncton);
+
+    return locations;
+}
+
+bool initHeaderFunctions(NativeState& data)
+{
+    AssemblyBuilderX64 build(/* logText= */ false);
+    UnwindBuilder& unwind = *data.unwindBuilder.get();
+
+    unwind.startInfo();
+
+    EntryLocations entryLocations = buildEntryFunction(build, unwind);
+
     build.finalize();
 
+    unwind.finishInfo();
+
     LUAU_ASSERT(build.data.empty());
 
-    if (!data.codeAllocator.allocate(build.data.data(), int(build.data.size()), build.code.data(), int(build.code.size()), data.gateData,
-            data.gateDataSize, data.context.gateEntry))
+    uint8_t* codeStart = nullptr;
+    if (!data.codeAllocator.allocate(
+            build.data.data(), int(build.data.size()), build.code.data(), int(build.code.size()), data.gateData, data.gateDataSize, codeStart))
     {
         LUAU_ASSERT(!"failed to create entry function");
         return false;
@@ -136,9 +161,10 @@ bool initEntryFunction(NativeState& data)
 
     // Set the offset at the begining so that functions in new blocks will not overlay the locations
     // specified by the unwind information of the entry function
-    unwind.setBeginOffset(prologueSize);
+    unwind.setBeginOffset(build.getLabelOffset(entryLocations.prologueEnd));
 
-    data.context.gateExit = data.context.gateEntry + returnOff.location;
+    data.context.gateEntry = codeStart + build.getLabelOffset(entryLocations.start);
+    data.context.gateExit = codeStart + build.getLabelOffset(entryLocations.epilogueStart);
 
     return true;
 }
diff --git a/CodeGen/src/CodeGenX64.h b/CodeGen/src/CodeGenX64.h
index 1f48311..1f0f27d 100644
--- a/CodeGen/src/CodeGenX64.h
+++ b/CodeGen/src/CodeGenX64.h
@@ -14,7 +14,7 @@ namespace X64
 
 class AssemblyBuilderX64;
 
-bool initEntryFunction(NativeState& data);
+bool initHeaderFunctions(NativeState& data);
 void assembleHelpers(AssemblyBuilderX64& build, ModuleHelpers& helpers);
 
 } // namespace X64
diff --git a/CodeGen/src/EmitBuiltinsX64.cpp b/CodeGen/src/EmitBuiltinsX64.cpp
index b010ce6..4026b95 100644
--- a/CodeGen/src/EmitBuiltinsX64.cpp
+++ b/CodeGen/src/EmitBuiltinsX64.cpp
@@ -107,47 +107,11 @@ void emitBuiltinMathLog(IrRegAllocX64& regs, AssemblyBuilderX64& build, int npar
     regs.assertAllFree();
     build.vmovsd(xmm0, luauRegValue(arg));
 
-    if (nparams == 1)
-    {
-        build.call(qword[rNativeContext + offsetof(NativeContext, libm_log)]);
-    }
-    else
-    {
-        Label log10check, logdivlog, exit;
-
-        // Using 'rbx' for non-volatile temporary storage of log(arg1) result
-        RegisterX64 tmp = rbx;
-        OperandX64 arg2value = qword[args + offsetof(TValue, value)];
-
-        build.vmovsd(xmm1, arg2value);
-
-        jumpOnNumberCmp(build, noreg, build.f64(2.0), xmm1, IrCondition::NotEqual, log10check);
-
+    // TODO: IR builtin lowering assumes that the only valid 2-argument call is log2; ideally, we use a less hacky way to indicate that
+    if (nparams == 2)
         build.call(qword[rNativeContext + offsetof(NativeContext, libm_log2)]);
-        build.jmp(exit);
-
-        build.setLabel(log10check);
-        jumpOnNumberCmp(build, noreg, build.f64(10.0), xmm1, IrCondition::NotEqual, logdivlog);
-
-        build.call(qword[rNativeContext + offsetof(NativeContext, libm_log10)]);
-        build.jmp(exit);
-
-        build.setLabel(logdivlog);
-
-        // log(arg1)
+    else
         build.call(qword[rNativeContext + offsetof(NativeContext, libm_log)]);
-        build.vmovq(tmp, xmm0);
-
-        // log(arg2)
-        build.vmovsd(xmm0, arg2value);
-        build.call(qword[rNativeContext + offsetof(NativeContext, libm_log)]);
-
-        // log(arg1) / log(arg2)
-        build.vmovq(xmm1, tmp);
-        build.vdivsd(xmm0, xmm1, xmm0);
-
-        build.setLabel(exit);
-    }
 
     build.vmovsd(luauRegValue(ra), xmm0);
 }
@@ -256,62 +220,68 @@ void emitBuiltin(IrRegAllocX64& regs, AssemblyBuilderX64& build, int bfid, int r
 
     switch (bfid)
     {
-    case LBF_ASSERT:
-    case LBF_MATH_DEG:
-    case LBF_MATH_RAD:
-    case LBF_MATH_MIN:
-    case LBF_MATH_MAX:
-    case LBF_MATH_CLAMP:
-    case LBF_MATH_FLOOR:
-    case LBF_MATH_CEIL:
-    case LBF_MATH_SQRT:
-    case LBF_MATH_POW:
-    case LBF_MATH_ABS:
-    case LBF_MATH_ROUND:
-        // These instructions are fully translated to IR
-        break;
     case LBF_MATH_EXP:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
         return emitBuiltinMathExp(regs, build, nparams, ra, arg, argsOp, nresults);
     case LBF_MATH_FMOD:
+        LUAU_ASSERT(nparams == 2 && nresults == 1);
         return emitBuiltinMathFmod(regs, build, nparams, ra, arg, argsOp, nresults);
     case LBF_MATH_ASIN:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
         return emitBuiltinMathAsin(regs, build, nparams, ra, arg, argsOp, nresults);
     case LBF_MATH_SIN:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
         return emitBuiltinMathSin(regs, build, nparams, ra, arg, argsOp, nresults);
     case LBF_MATH_SINH:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
         return emitBuiltinMathSinh(regs, build, nparams, ra, arg, argsOp, nresults);
     case LBF_MATH_ACOS:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
         return emitBuiltinMathAcos(regs, build, nparams, ra, arg, argsOp, nresults);
     case LBF_MATH_COS:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
         return emitBuiltinMathCos(regs, build, nparams, ra, arg, argsOp, nresults);
     case LBF_MATH_COSH:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
         return emitBuiltinMathCosh(regs, build, nparams, ra, arg, argsOp, nresults);
     case LBF_MATH_ATAN:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
         return emitBuiltinMathAtan(regs, build, nparams, ra, arg, argsOp, nresults);
     case LBF_MATH_TAN:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
         return emitBuiltinMathTan(regs, build, nparams, ra, arg, argsOp, nresults);
     case LBF_MATH_TANH:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
         return emitBuiltinMathTanh(regs, build, nparams, ra, arg, argsOp, nresults);
     case LBF_MATH_ATAN2:
+        LUAU_ASSERT(nparams == 2 && nresults == 1);
         return emitBuiltinMathAtan2(regs, build, nparams, ra, arg, argsOp, nresults);
     case LBF_MATH_LOG10:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
         return emitBuiltinMathLog10(regs, build, nparams, ra, arg, argsOp, nresults);
     case LBF_MATH_LOG:
+        LUAU_ASSERT((nparams == 1 || nparams == 2) && nresults == 1);
         return emitBuiltinMathLog(regs, build, nparams, ra, arg, argsOp, nresults);
     case LBF_MATH_LDEXP:
+        LUAU_ASSERT(nparams == 2 && nresults == 1);
         return emitBuiltinMathLdexp(regs, build, nparams, ra, arg, argsOp, nresults);
     case LBF_MATH_FREXP:
+        LUAU_ASSERT(nparams == 1 && (nresults == 1 || nresults == 2));
         return emitBuiltinMathFrexp(regs, build, nparams, ra, arg, argsOp, nresults);
     case LBF_MATH_MODF:
+        LUAU_ASSERT(nparams == 1 && (nresults == 1 || nresults == 2));
         return emitBuiltinMathModf(regs, build, nparams, ra, arg, argsOp, nresults);
     case LBF_MATH_SIGN:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
         return emitBuiltinMathSign(regs, build, nparams, ra, arg, argsOp, nresults);
     case LBF_TYPE:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
         return emitBuiltinType(regs, build, nparams, ra, arg, argsOp, nresults);
     case LBF_TYPEOF:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
         return emitBuiltinTypeof(regs, build, nparams, ra, arg, argsOp, nresults);
     default:
-        LUAU_ASSERT(!"missing x64 lowering");
+        LUAU_ASSERT(!"Missing x64 lowering");
         break;
     }
 }
diff --git a/CodeGen/src/EmitCommon.h b/CodeGen/src/EmitCommon.h
index a71eafd..6a74966 100644
--- a/CodeGen/src/EmitCommon.h
+++ b/CodeGen/src/EmitCommon.h
@@ -13,8 +13,8 @@ constexpr unsigned kLuaNodeSizeLog2 = 5;
 constexpr unsigned kLuaNodeTagMask = 0xf;
 constexpr unsigned kNextBitOffset = 4;
 
-constexpr unsigned kOffsetOfLuaNodeTag = 12;  // offsetof cannot be used on a bit field
-constexpr unsigned kOffsetOfLuaNodeNext = 12; // offsetof cannot be used on a bit field
+constexpr unsigned kOffsetOfTKeyTag = 12;  // offsetof cannot be used on a bit field
+constexpr unsigned kOffsetOfTKeyNext = 12; // offsetof cannot be used on a bit field
 constexpr unsigned kOffsetOfInstructionC = 3;
 
 // Leaf functions that are placed in every module to perform common instruction sequences
diff --git a/CodeGen/src/EmitCommonA64.cpp b/CodeGen/src/EmitCommonA64.cpp
deleted file mode 100644
index 1758e4f..0000000
--- a/CodeGen/src/EmitCommonA64.cpp
+++ /dev/null
@@ -1,130 +0,0 @@
-// This file is part of the Luau programming language and is licensed under MIT License; see LICENSE.txt for details
-#include "EmitCommonA64.h"
-
-#include "NativeState.h"
-#include "CustomExecUtils.h"
-
-namespace Luau
-{
-namespace CodeGen
-{
-namespace A64
-{
-
-void emitUpdateBase(AssemblyBuilderA64& build)
-{
-    build.ldr(rBase, mem(rState, offsetof(lua_State, base)));
-}
-
-void emitExit(AssemblyBuilderA64& build, bool continueInVm)
-{
-    build.mov(x0, continueInVm);
-    build.ldr(x1, mem(rNativeContext, offsetof(NativeContext, gateExit)));
-    build.br(x1);
-}
-
-void emitInterrupt(AssemblyBuilderA64& build)
-{
-    // x0 = pc offset
-    // x1 = return address in native code
-    // x2 = interrupt
-
-    // Stash return address in rBase; we need to reload rBase anyway
-    build.mov(rBase, x1);
-
-    // Update savedpc; required in case interrupt errors
-    build.add(x0, rCode, x0);
-    build.ldr(x1, mem(rState, offsetof(lua_State, ci)));
-    build.str(x0, mem(x1, offsetof(CallInfo, savedpc)));
-
-    // Call interrupt
-    build.mov(x0, rState);
-    build.mov(w1, -1);
-    build.blr(x2);
-
-    // Check if we need to exit
-    Label skip;
-    build.ldrb(w0, mem(rState, offsetof(lua_State, status)));
-    build.cbz(w0, skip);
-
-    // L->ci->savedpc--
-    // note: recomputing this avoids having to stash x0
-    build.ldr(x1, mem(rState, offsetof(lua_State, ci)));
-    build.ldr(x0, mem(x1, offsetof(CallInfo, savedpc)));
-    build.sub(x0, x0, sizeof(Instruction));
-    build.str(x0, mem(x1, offsetof(CallInfo, savedpc)));
-
-    emitExit(build, /* continueInVm */ false);
-
-    build.setLabel(skip);
-
-    // Return back to caller; rBase has stashed return address
-    build.mov(x0, rBase);
-
-    emitUpdateBase(build); // interrupt may have reallocated stack
-
-    build.br(x0);
-}
-
-void emitReentry(AssemblyBuilderA64& build, ModuleHelpers& helpers)
-{
-    // x0 = closure object to reentry (equal to clvalue(L->ci->func))
-
-    // If the fallback requested an exit, we need to do this right away
-    build.cbz(x0, helpers.exitNoContinueVm);
-
-    emitUpdateBase(build);
-
-    // Need to update state of the current function before we jump away
-    build.ldr(x1, mem(x0, offsetof(Closure, l.p))); // cl->l.p aka proto
-
-    build.mov(rClosure, x0);
-    build.ldr(rConstants, mem(x1, offsetof(Proto, k))); // proto->k
-    build.ldr(rCode, mem(x1, offsetof(Proto, code)));   // proto->code
-
-    // Get instruction index from instruction pointer
-    // To get instruction index from instruction pointer, we need to divide byte offset by 4
-    // But we will actually need to scale instruction index by 8 back to byte offset later so it cancels out
-    build.ldr(x2, mem(rState, offsetof(lua_State, ci))); // L->ci
-    build.ldr(x2, mem(x2, offsetof(CallInfo, savedpc))); // L->ci->savedpc
-    build.sub(x2, x2, rCode);
-    build.add(x2, x2, x2); // TODO: this would not be necessary if we supported shifted register offsets in loads
-
-    // We need to check if the new function can be executed natively
-    // TODO: This can be done earlier in the function flow, to reduce the JIT->VM transition penalty
-    build.ldr(x1, mem(x1, offsetofProtoExecData));
-    build.cbz(x1, helpers.exitContinueVm);
-
-    // Get new instruction location and jump to it
-    build.ldr(x1, mem(x1, offsetof(NativeProto, instTargets)));
-    build.ldr(x1, mem(x1, x2));
-    build.br(x1);
-}
-
-void emitFallback(AssemblyBuilderA64& build, int op, int pcpos)
-{
-    // fallback(L, instruction, base, k)
-    build.mov(x0, rState);
-
-    // TODO: refactor into a common helper
-    if (pcpos * sizeof(Instruction) <= AssemblyBuilderA64::kMaxImmediate)
-    {
-        build.add(x1, rCode, uint16_t(pcpos * sizeof(Instruction)));
-    }
-    else
-    {
-        build.mov(x1, pcpos * sizeof(Instruction));
-        build.add(x1, rCode, x1);
-    }
-
-    build.mov(x2, rBase);
-    build.mov(x3, rConstants);
-    build.ldr(x4, mem(rNativeContext, offsetof(NativeContext, fallback) + op * sizeof(NativeFallback) + offsetof(NativeFallback, fallback)));
-    build.blr(x4);
-
-    emitUpdateBase(build);
-}
-
-} // namespace A64
-} // namespace CodeGen
-} // namespace Luau
diff --git a/CodeGen/src/EmitCommonA64.h b/CodeGen/src/EmitCommonA64.h
index 2a65afa..8cb54c1 100644
--- a/CodeGen/src/EmitCommonA64.h
+++ b/CodeGen/src/EmitCommonA64.h
@@ -7,6 +7,7 @@
 
 #include "lobject.h"
 #include "ltm.h"
+#include "lstate.h"
 
 // AArch64 ABI reminder:
 // Arguments: x0-x7, v0-v7
@@ -38,15 +39,19 @@ constexpr RegisterA64 rBase = x24;      // StkId base
 
 // Native code is as stackless as the interpreter, so we can place some data on the stack once and have it accessible at any point
 // See CodeGenA64.cpp for layout
-constexpr unsigned kStackSize = 64; // 8 stashed registers
+constexpr unsigned kStashSlots = 8; // stashed non-volatile registers
+constexpr unsigned kSpillSlots = 0; // slots for spilling temporary registers (unused)
+constexpr unsigned kTempSlots = 2;  // 16 bytes of temporary space, such luxury!
 
-void emitUpdateBase(AssemblyBuilderA64& build);
+constexpr unsigned kStackSize = (kStashSlots + kSpillSlots + kTempSlots) * 8;
 
-// TODO: Move these to CodeGenA64 so that they can't be accidentally called during lowering
-void emitExit(AssemblyBuilderA64& build, bool continueInVm);
-void emitInterrupt(AssemblyBuilderA64& build);
-void emitReentry(AssemblyBuilderA64& build, ModuleHelpers& helpers);
-void emitFallback(AssemblyBuilderA64& build, int op, int pcpos);
+constexpr AddressA64 sSpillArea = mem(sp, kStashSlots * 8);
+constexpr AddressA64 sTemporary = mem(sp, (kStashSlots + kSpillSlots) * 8);
+
+inline void emitUpdateBase(AssemblyBuilderA64& build)
+{
+    build.ldr(rBase, mem(rState, offsetof(lua_State, base)));
+}
 
 } // namespace A64
 } // namespace CodeGen
diff --git a/CodeGen/src/EmitCommonX64.cpp b/CodeGen/src/EmitCommonX64.cpp
index 9136add..b6d8b85 100644
--- a/CodeGen/src/EmitCommonX64.cpp
+++ b/CodeGen/src/EmitCommonX64.cpp
@@ -279,32 +279,37 @@ void emitUpdateBase(AssemblyBuilderX64& build)
     build.mov(rBase, qword[rState + offsetof(lua_State, base)]);
 }
 
-// Note: only uses rax/rdx, the caller may use other registers
-static void emitSetSavedPc(AssemblyBuilderX64& build, int pcpos)
+static void emitSetSavedPc(IrRegAllocX64& regs, AssemblyBuilderX64& build, int pcpos)
 {
-    build.mov(rdx, sCode);
-    build.add(rdx, pcpos * sizeof(Instruction));
-    build.mov(rax, qword[rState + offsetof(lua_State, ci)]);
-    build.mov(qword[rax + offsetof(CallInfo, savedpc)], rdx);
+    ScopedRegX64 tmp1{regs, SizeX64::qword};
+    ScopedRegX64 tmp2{regs, SizeX64::qword};
+
+    build.mov(tmp1.reg, sCode);
+    build.add(tmp1.reg, pcpos * sizeof(Instruction));
+    build.mov(tmp2.reg, qword[rState + offsetof(lua_State, ci)]);
+    build.mov(qword[tmp2.reg + offsetof(CallInfo, savedpc)], tmp1.reg);
 }
 
-void emitInterrupt(AssemblyBuilderX64& build, int pcpos)
+void emitInterrupt(IrRegAllocX64& regs, AssemblyBuilderX64& build, int pcpos)
 {
     Label skip;
 
+    ScopedRegX64 tmp{regs, SizeX64::qword};
+
     // Skip if there is no interrupt set
-    build.mov(r8, qword[rState + offsetof(lua_State, global)]);
-    build.mov(r8, qword[r8 + offsetof(global_State, cb.interrupt)]);
-    build.test(r8, r8);
+    build.mov(tmp.reg, qword[rState + offsetof(lua_State, global)]);
+    build.mov(tmp.reg, qword[tmp.reg + offsetof(global_State, cb.interrupt)]);
+    build.test(tmp.reg, tmp.reg);
     build.jcc(ConditionX64::Zero, skip);
 
-    emitSetSavedPc(build, pcpos + 1); // uses rax/rdx
+    emitSetSavedPc(regs, build, pcpos + 1);
 
     // Call interrupt
     // TODO: This code should move to the end of the function, or even be outlined so that it can be shared by multiple interruptible instructions
-    build.mov(rArg1, rState);
-    build.mov(dwordReg(rArg2), -1); // function accepts 'int' here and using qword reg would've forced 8 byte constant here
-    build.call(r8);
+    IrCallWrapperX64 callWrap(regs, build);
+    callWrap.addArgument(SizeX64::qword, rState);
+    callWrap.addArgument(SizeX64::dword, -1);
+    callWrap.call(tmp.release());
 
     emitUpdateBase(build); // interrupt may have reallocated stack
 
@@ -320,41 +325,23 @@ void emitInterrupt(AssemblyBuilderX64& build, int pcpos)
     build.setLabel(skip);
 }
 
-void emitFallback(AssemblyBuilderX64& build, NativeState& data, int op, int pcpos)
+void emitFallback(IrRegAllocX64& regs, AssemblyBuilderX64& build, NativeState& data, int op, int pcpos)
 {
-    NativeFallback& opinfo = data.context.fallback[op];
-    LUAU_ASSERT(opinfo.fallback);
-
-    if (build.logText)
-        build.logAppend("; fallback\n");
+    LUAU_ASSERT(data.context.fallback[op]);
 
     // fallback(L, instruction, base, k)
-    build.mov(rArg1, rState);
-    build.mov(rArg2, sCode);
-    build.add(rArg2, pcpos * sizeof(Instruction));
-    build.mov(rArg3, rBase);
-    build.mov(rArg4, rConstants);
-    build.call(qword[rNativeContext + offsetof(NativeContext, fallback) + op * sizeof(NativeFallback) + offsetof(NativeFallback, fallback)]);
+    IrCallWrapperX64 callWrap(regs, build);
+    callWrap.addArgument(SizeX64::qword, rState);
+
+    RegisterX64 reg = callWrap.suggestNextArgumentRegister(SizeX64::qword);
+    build.mov(reg, sCode);
+    callWrap.addArgument(SizeX64::qword, addr[reg + pcpos * sizeof(Instruction)]);
+
+    callWrap.addArgument(SizeX64::qword, rBase);
+    callWrap.addArgument(SizeX64::qword, rConstants);
+    callWrap.call(qword[rNativeContext + offsetof(NativeContext, fallback) + op * sizeof(FallbackFn)]);
 
     emitUpdateBase(build);
-
-    // Some instructions may jump to a different instruction or a completely different function
-    if (opinfo.flags & kFallbackUpdatePc)
-    {
-        build.mov(rcx, sClosure);
-        build.mov(rcx, qword[rcx + offsetof(Closure, l.p)]);
-
-        // Get instruction index from returned instruction pointer
-        // To get instruction index from instruction pointer, we need to divide byte offset by 4
-        // But we will actually need to scale instruction index by 8 back to byte offset later so it cancels out
-        build.sub(rax, sCode);
-
-        build.mov(rdx, qword[rcx + offsetofProtoExecData]);
-
-        // Get new instruction location and jump to it
-        build.mov(rcx, qword[rdx + offsetof(NativeProto, instTargets)]);
-        build.jmp(qword[rax * 2 + rcx]);
-    }
 }
 
 void emitContinueCallInVm(AssemblyBuilderX64& build)
diff --git a/CodeGen/src/EmitCommonX64.h b/CodeGen/src/EmitCommonX64.h
index 6aac5a1..d4684fe 100644
--- a/CodeGen/src/EmitCommonX64.h
+++ b/CodeGen/src/EmitCommonX64.h
@@ -34,6 +34,8 @@ namespace X64
 
 struct IrRegAllocX64;
 
+constexpr uint32_t kFunctionAlignment = 32;
+
 // Data that is very common to access is placed in non-volatile registers
 constexpr RegisterX64 rState = r15;         // lua_State* L
 constexpr RegisterX64 rBase = r14;          // StkId base
@@ -134,7 +136,7 @@ inline OperandX64 luauNodeKeyValue(RegisterX64 node)
 // Note: tag has dirty upper bits
 inline OperandX64 luauNodeKeyTag(RegisterX64 node)
 {
-    return dword[node + offsetof(LuaNode, key) + kOffsetOfLuaNodeTag];
+    return dword[node + offsetof(LuaNode, key) + kOffsetOfTKeyTag];
 }
 
 inline OperandX64 luauNodeValue(RegisterX64 node)
@@ -162,12 +164,6 @@ inline void jumpIfTagIsNot(AssemblyBuilderX64& build, int ri, lua_Type tag, Labe
     build.jcc(ConditionX64::NotEqual, label);
 }
 
-inline void jumpIfTagIsNot(AssemblyBuilderX64& build, RegisterX64 reg, lua_Type tag, Label& label)
-{
-    build.cmp(dword[reg + offsetof(TValue, tt)], tag);
-    build.jcc(ConditionX64::NotEqual, label);
-}
-
 // Note: fallthrough label should be placed after this condition
 inline void jumpIfFalsy(AssemblyBuilderX64& build, int ri, Label& target, Label& fallthrough)
 {
@@ -188,26 +184,6 @@ inline void jumpIfTruthy(AssemblyBuilderX64& build, int ri, Label& target, Label
     build.jcc(ConditionX64::NotEqual, target); // true if boolean value is 'true'
 }
 
-inline void jumpIfMetatablePresent(AssemblyBuilderX64& build, RegisterX64 table, Label& target)
-{
-    build.cmp(qword[table + offsetof(Table, metatable)], 0);
-    build.jcc(ConditionX64::NotEqual, target);
-}
-
-inline void jumpIfUnsafeEnv(AssemblyBuilderX64& build, RegisterX64 tmp, Label& label)
-{
-    build.mov(tmp, sClosure);
-    build.mov(tmp, qword[tmp + offsetof(Closure, env)]);
-    build.test(byte[tmp + offsetof(Table, safeenv)], 1);
-    build.jcc(ConditionX64::Zero, label); // Not a safe environment
-}
-
-inline void jumpIfTableIsReadOnly(AssemblyBuilderX64& build, RegisterX64 table, Label& label)
-{
-    build.cmp(byte[table + offsetof(Table, readonly)], 0);
-    build.jcc(ConditionX64::NotEqual, label);
-}
-
 inline void jumpIfNodeKeyTagIsNot(AssemblyBuilderX64& build, RegisterX64 tmp, RegisterX64 node, lua_Type tag, Label& label)
 {
     tmp.size = SizeX64::dword;
@@ -224,13 +200,6 @@ inline void jumpIfNodeValueTagIs(AssemblyBuilderX64& build, RegisterX64 node, lu
     build.jcc(ConditionX64::Equal, label);
 }
 
-inline void jumpIfNodeHasNext(AssemblyBuilderX64& build, RegisterX64 node, Label& label)
-{
-    build.mov(ecx, dword[node + offsetof(LuaNode, key) + kOffsetOfLuaNodeNext]);
-    build.shr(ecx, kNextBitOffset);
-    build.jcc(ConditionX64::NotZero, label);
-}
-
 inline void jumpIfNodeKeyNotInExpectedSlot(AssemblyBuilderX64& build, RegisterX64 tmp, RegisterX64 node, OperandX64 expectedKey, Label& label)
 {
     jumpIfNodeKeyTagIsNot(build, tmp, node, LUA_TSTRING, label);
@@ -260,8 +229,8 @@ void callStepGc(IrRegAllocX64& regs, AssemblyBuilderX64& build);
 
 void emitExit(AssemblyBuilderX64& build, bool continueInVm);
 void emitUpdateBase(AssemblyBuilderX64& build);
-void emitInterrupt(AssemblyBuilderX64& build, int pcpos);
-void emitFallback(AssemblyBuilderX64& build, NativeState& data, int op, int pcpos);
+void emitInterrupt(IrRegAllocX64& regs, AssemblyBuilderX64& build, int pcpos);
+void emitFallback(IrRegAllocX64& regs, AssemblyBuilderX64& build, NativeState& data, int op, int pcpos);
 
 void emitContinueCallInVm(AssemblyBuilderX64& build);
 
diff --git a/CodeGen/src/EmitInstructionA64.cpp b/CodeGen/src/EmitInstructionA64.cpp
deleted file mode 100644
index 400ba77..0000000
--- a/CodeGen/src/EmitInstructionA64.cpp
+++ /dev/null
@@ -1,74 +0,0 @@
-// This file is part of the Luau programming language and is licensed under MIT License; see LICENSE.txt for details
-#include "EmitInstructionA64.h"
-
-#include "Luau/AssemblyBuilderA64.h"
-
-#include "EmitCommonA64.h"
-#include "NativeState.h"
-#include "CustomExecUtils.h"
-
-namespace Luau
-{
-namespace CodeGen
-{
-namespace A64
-{
-
-void emitInstReturn(AssemblyBuilderA64& build, ModuleHelpers& helpers, int ra, int n)
-{
-    // callFallback(L, ra, n)
-    build.mov(x0, rState);
-    build.add(x1, rBase, uint16_t(ra * sizeof(TValue)));
-    build.mov(w2, n);
-    build.ldr(x3, mem(rNativeContext, offsetof(NativeContext, returnFallback)));
-    build.blr(x3);
-
-    // reentry with x0=closure (NULL will trigger exit)
-    build.b(helpers.reentry);
-}
-
-void emitInstCall(AssemblyBuilderA64& build, ModuleHelpers& helpers, int ra, int nparams, int nresults)
-{
-    // argtop = (nparams == LUA_MULTRET) ? L->top : ra + 1 + nparams;
-    if (nparams == LUA_MULTRET)
-        build.ldr(x2, mem(rState, offsetof(lua_State, top)));
-    else
-        build.add(x2, rBase, uint16_t((ra + 1 + nparams) * sizeof(TValue)));
-
-    // callFallback(L, ra, argtop, nresults)
-    build.mov(x0, rState);
-    build.add(x1, rBase, uint16_t(ra * sizeof(TValue)));
-    build.mov(w3, nresults);
-    build.ldr(x4, mem(rNativeContext, offsetof(NativeContext, callFallback)));
-    build.blr(x4);
-
-    // reentry with x0=closure (NULL will trigger exit)
-    build.b(helpers.reentry);
-}
-
-void emitInstGetImport(AssemblyBuilderA64& build, int ra, uint32_t aux)
-{
-    // luaV_getimport(L, cl->env, k, aux, /* propagatenil= */ false)
-    build.mov(x0, rState);
-    build.ldr(x1, mem(rClosure, offsetof(Closure, env)));
-    build.mov(x2, rConstants);
-    build.mov(w3, aux);
-    build.mov(w4, 0);
-    build.ldr(x5, mem(rNativeContext, offsetof(NativeContext, luaV_getimport)));
-    build.blr(x5);
-
-    emitUpdateBase(build);
-
-    // setobj2s(L, ra, L->top - 1)
-    build.ldr(x0, mem(rState, offsetof(lua_State, top)));
-    build.sub(x0, x0, sizeof(TValue));
-    build.ldr(q0, x0);
-    build.str(q0, mem(rBase, ra * sizeof(TValue)));
-
-    // L->top--
-    build.str(x0, mem(rState, offsetof(lua_State, top)));
-}
-
-} // namespace A64
-} // namespace CodeGen
-} // namespace Luau
diff --git a/CodeGen/src/EmitInstructionA64.h b/CodeGen/src/EmitInstructionA64.h
deleted file mode 100644
index 278d8e8..0000000
--- a/CodeGen/src/EmitInstructionA64.h
+++ /dev/null
@@ -1,24 +0,0 @@
-// This file is part of the Luau programming language and is licensed under MIT License; see LICENSE.txt for details
-#pragma once
-
-#include <stdint.h>
-
-namespace Luau
-{
-namespace CodeGen
-{
-
-struct ModuleHelpers;
-
-namespace A64
-{
-
-class AssemblyBuilderA64;
-
-void emitInstReturn(AssemblyBuilderA64& build, ModuleHelpers& helpers, int ra, int n);
-void emitInstCall(AssemblyBuilderA64& build, ModuleHelpers& helpers, int ra, int nparams, int nresults);
-void emitInstGetImport(AssemblyBuilderA64& build, int ra, uint32_t aux);
-
-} // namespace A64
-} // namespace CodeGen
-} // namespace Luau
diff --git a/CodeGen/src/EmitInstructionX64.cpp b/CodeGen/src/EmitInstructionX64.cpp
index c0a6427..9a10bfd 100644
--- a/CodeGen/src/EmitInstructionX64.cpp
+++ b/CodeGen/src/EmitInstructionX64.cpp
@@ -415,7 +415,7 @@ void emitInstSetList(IrRegAllocX64& regs, AssemblyBuilderX64& build, int ra, int
     callBarrierTableFast(regs, build, table, {});
 }
 
-void emitinstForGLoop(AssemblyBuilderX64& build, int ra, int aux, Label& loopRepeat, Label& loopExit)
+void emitInstForGLoop(AssemblyBuilderX64& build, int ra, int aux, Label& loopRepeat)
 {
     // ipairs-style traversal is handled in IR
     LUAU_ASSERT(aux >= 0);
@@ -484,78 +484,6 @@ void emitinstForGLoop(AssemblyBuilderX64& build, int ra, int aux, Label& loopRep
     build.jcc(ConditionX64::NotZero, loopRepeat);
 }
 
-void emitinstForGLoopFallback(AssemblyBuilderX64& build, int ra, int aux, Label& loopRepeat)
-{
-    build.mov(rArg1, rState);
-    build.mov(dwordReg(rArg2), ra);
-    build.mov(dwordReg(rArg3), aux);
-    build.call(qword[rNativeContext + offsetof(NativeContext, forgLoopNonTableFallback)]);
-    emitUpdateBase(build);
-    build.test(al, al);
-    build.jcc(ConditionX64::NotZero, loopRepeat);
-}
-
-void emitInstForGPrepXnextFallback(AssemblyBuilderX64& build, int pcpos, int ra, Label& target)
-{
-    build.mov(rArg1, rState);
-    build.lea(rArg2, luauRegAddress(ra));
-    build.mov(dwordReg(rArg3), pcpos + 1);
-    build.call(qword[rNativeContext + offsetof(NativeContext, forgPrepXnextFallback)]);
-    build.jmp(target);
-}
-
-void emitInstGetImportFallback(AssemblyBuilderX64& build, int ra, uint32_t aux)
-{
-    build.mov(rax, sClosure);
-
-    // luaV_getimport(L, cl->env, k, aux, /* propagatenil= */ false)
-    build.mov(rArg1, rState);
-    build.mov(rArg2, qword[rax + offsetof(Closure, env)]);
-    build.mov(rArg3, rConstants);
-    build.mov(dwordReg(rArg4), aux);
-
-    if (build.abi == ABIX64::Windows)
-        build.mov(sArg5, 0);
-    else
-        build.xor_(rArg5, rArg5);
-
-    build.call(qword[rNativeContext + offsetof(NativeContext, luaV_getimport)]);
-
-    emitUpdateBase(build);
-
-    // setobj2s(L, ra, L->top - 1)
-    build.mov(rax, qword[rState + offsetof(lua_State, top)]);
-    build.sub(rax, sizeof(TValue));
-    build.vmovups(xmm0, xmmword[rax]);
-    build.vmovups(luauReg(ra), xmm0);
-
-    // L->top--
-    build.mov(qword[rState + offsetof(lua_State, top)], rax);
-}
-
-void emitInstCoverage(AssemblyBuilderX64& build, int pcpos)
-{
-    build.mov(rcx, sCode);
-    build.add(rcx, pcpos * sizeof(Instruction));
-
-    // hits = LUAU_INSN_E(*pc)
-    build.mov(edx, dword[rcx]);
-    build.sar(edx, 8);
-
-    // hits = (hits < (1 << 23) - 1) ? hits + 1 : hits;
-    build.xor_(eax, eax);
-    build.cmp(edx, (1 << 23) - 1);
-    build.setcc(ConditionX64::NotEqual, al);
-    build.add(edx, eax);
-
-
-    // VM_PATCH_E(pc, hits);
-    build.sal(edx, 8);
-    build.movzx(eax, byte[rcx]);
-    build.or_(eax, edx);
-    build.mov(dword[rcx], eax);
-}
-
 } // namespace X64
 } // namespace CodeGen
 } // namespace Luau
diff --git a/CodeGen/src/EmitInstructionX64.h b/CodeGen/src/EmitInstructionX64.h
index d58e133..84fe113 100644
--- a/CodeGen/src/EmitInstructionX64.h
+++ b/CodeGen/src/EmitInstructionX64.h
@@ -20,11 +20,7 @@ struct IrRegAllocX64;
 void emitInstCall(AssemblyBuilderX64& build, ModuleHelpers& helpers, int ra, int nparams, int nresults);
 void emitInstReturn(AssemblyBuilderX64& build, ModuleHelpers& helpers, int ra, int actualResults);
 void emitInstSetList(IrRegAllocX64& regs, AssemblyBuilderX64& build, int ra, int rb, int count, uint32_t index);
-void emitinstForGLoop(AssemblyBuilderX64& build, int ra, int aux, Label& loopRepeat, Label& loopExit);
-void emitinstForGLoopFallback(AssemblyBuilderX64& build, int ra, int aux, Label& loopRepeat);
-void emitInstForGPrepXnextFallback(AssemblyBuilderX64& build, int pcpos, int ra, Label& target);
-void emitInstGetImportFallback(AssemblyBuilderX64& build, int ra, uint32_t aux);
-void emitInstCoverage(AssemblyBuilderX64& build, int pcpos);
+void emitInstForGLoop(AssemblyBuilderX64& build, int ra, int aux, Label& loopRepeat);
 
 } // namespace X64
 } // namespace CodeGen
diff --git a/CodeGen/src/Fallbacks.cpp b/CodeGen/src/Fallbacks.cpp
index e84ee21..1c0dce5 100644
--- a/CodeGen/src/Fallbacks.cpp
+++ b/CodeGen/src/Fallbacks.cpp
@@ -416,6 +416,44 @@ const Instruction* execute_LOP_NAMECALL(lua_State* L, const Instruction* pc, Stk
     return pc;
 }
 
+// LOP_SETLIST fallback: copies c stack values from rb into table ra's array at 1-based index..index+c-1; returns NULL if ra is not a table
+const Instruction* execute_LOP_SETLIST(lua_State* L, const Instruction* pc, StkId base, TValue* k)
+{
+    [[maybe_unused]] Closure* cl = clvalue(L->ci->func);
+    Instruction insn = *pc++;
+    StkId ra = VM_REG(LUAU_INSN_A(insn));
+    StkId rb = &base[LUAU_INSN_B(insn)]; // note: this can point to L->top if c == LUA_MULTRET making VM_REG unsafe to use
+    int c = LUAU_INSN_C(insn) - 1; // value count; LUA_MULTRET means "everything from rb up to L->top"
+    uint32_t index = *pc++; // second instruction word: 1-based array position of the first stored value
+
+    if (c == LUA_MULTRET)
+    {
+        c = int(L->top - rb);
+        L->top = L->ci->top;
+    }
+
+    // TODO: we really don't need this anymore
+    if (!ttistable(ra))
+        return NULL; // temporary workaround to weaken a rather powerful exploitation primitive in case of a MITM attack on bytecode
+
+    Table* h = hvalue(ra); // read the table payload only after the tag check above, so hvalue() never sees a non-table value
+
+    int last = index + c - 1;
+    if (last > h->sizearray)
+    {
+        VM_PROTECT_PC(); // luaH_resizearray may fail due to OOM
+        luaH_resizearray(L, h, last);
+    }
+
+    TValue* array = h->array; // fetched after the potential resize above
+
+    for (int i = 0; i < c; ++i)
+        setobj2t(L, &array[index + i - 1], rb + i);
+
+    luaC_barrierfast(L, h);
+    return pc;
+}
+
 const Instruction* execute_LOP_FORGPREP(lua_State* L, const Instruction* pc, StkId base, TValue* k)
 {
     [[maybe_unused]] Closure* cl = clvalue(L->ci->func);
diff --git a/CodeGen/src/Fallbacks.h b/CodeGen/src/Fallbacks.h
index bfc0e2b..0d2d218 100644
--- a/CodeGen/src/Fallbacks.h
+++ b/CodeGen/src/Fallbacks.h
@@ -16,6 +16,7 @@ const Instruction* execute_LOP_GETTABLEKS(lua_State* L, const Instruction* pc, S
 const Instruction* execute_LOP_SETTABLEKS(lua_State* L, const Instruction* pc, StkId base, TValue* k);
 const Instruction* execute_LOP_NEWCLOSURE(lua_State* L, const Instruction* pc, StkId base, TValue* k);
 const Instruction* execute_LOP_NAMECALL(lua_State* L, const Instruction* pc, StkId base, TValue* k);
+const Instruction* execute_LOP_SETLIST(lua_State* L, const Instruction* pc, StkId base, TValue* k);
 const Instruction* execute_LOP_FORGPREP(lua_State* L, const Instruction* pc, StkId base, TValue* k);
 const Instruction* execute_LOP_GETVARARGS(lua_State* L, const Instruction* pc, StkId base, TValue* k);
 const Instruction* execute_LOP_DUPCLOSURE(lua_State* L, const Instruction* pc, StkId base, TValue* k);
diff --git a/CodeGen/src/IrAnalysis.cpp b/CodeGen/src/IrAnalysis.cpp
index 2246e5c..f3870e9 100644
--- a/CodeGen/src/IrAnalysis.cpp
+++ b/CodeGen/src/IrAnalysis.cpp
@@ -354,6 +354,8 @@ static RegisterSet computeBlockLiveInRegSet(IrFunction& function, const IrBlock&
         case IrCmd::RETURN:
             useRange(vmRegOp(inst.a), function.intOp(inst.b));
             break;
+
+            // TODO: FASTCALL is more restrictive than INVOKE_FASTCALL; we should either determine the exact semantics, or rework it
         case IrCmd::FASTCALL:
         case IrCmd::INVOKE_FASTCALL:
             if (int count = function.intOp(inst.e); count != -1)
diff --git a/CodeGen/src/IrBuilder.cpp b/CodeGen/src/IrBuilder.cpp
index 48c0e25..d86dfe0 100644
--- a/CodeGen/src/IrBuilder.cpp
+++ b/CodeGen/src/IrBuilder.cpp
@@ -468,7 +468,8 @@ void IrBuilder::clone(const IrBlock& source, bool removeCurrentTerminator)
         IrInst clone = function.instructions[index];
 
         // Skip pseudo instructions to make clone more compact, but validate that they have no users
-        if (isPseudo(clone.cmd))
+        // But if substitution tracks a location, that tracking has to be preserved
+        if (isPseudo(clone.cmd) && !(clone.cmd == IrCmd::SUBSTITUTE && clone.b.kind != IrOpKind::None))
         {
             LUAU_ASSERT(clone.useCount == 0);
             continue;
diff --git a/CodeGen/src/IrCallWrapperX64.cpp b/CodeGen/src/IrCallWrapperX64.cpp
index 8ac5f8b..f466df4 100644
--- a/CodeGen/src/IrCallWrapperX64.cpp
+++ b/CodeGen/src/IrCallWrapperX64.cpp
@@ -13,6 +13,10 @@ namespace CodeGen
 namespace X64
 {
 
+static const std::array<OperandX64, 6> kWindowsGprOrder = {rcx, rdx, r8, r9, addr[rsp + 32], addr[rsp + 40]};
+static const std::array<OperandX64, 6> kSystemvGprOrder = {rdi, rsi, rdx, rcx, r8, r9};
+static const std::array<OperandX64, 4> kXmmOrder = {xmm0, xmm1, xmm2, xmm3}; // Common order for first 4 fp arguments on Windows/SystemV
+
 static bool sameUnderlyingRegister(RegisterX64 a, RegisterX64 b)
 {
     SizeX64 underlyingSizeA = a.size == SizeX64::xmmword ? SizeX64::xmmword : SizeX64::qword;
@@ -37,21 +41,35 @@ void IrCallWrapperX64::addArgument(SizeX64 targetSize, OperandX64 source, IrOp s
     LUAU_ASSERT(instIdx != kInvalidInstIdx || sourceOp.kind == IrOpKind::None);
 
     LUAU_ASSERT(argCount < kMaxCallArguments);
-    args[argCount++] = {targetSize, source, sourceOp};
+    CallArgument& arg = args[argCount++];
+    arg = {targetSize, source, sourceOp};
+
+    arg.target = getNextArgumentTarget(targetSize);
+
+    if (build.abi == ABIX64::Windows)
+    {
+        // On Windows, gpr/xmm register positions move in sync
+        gprPos++;
+        xmmPos++;
+    }
+    else
+    {
+        if (targetSize == SizeX64::xmmword)
+            xmmPos++;
+        else
+            gprPos++;
+    }
 }
 
 void IrCallWrapperX64::addArgument(SizeX64 targetSize, ScopedRegX64& scopedReg)
 {
-    LUAU_ASSERT(argCount < kMaxCallArguments);
-    args[argCount++] = {targetSize, scopedReg.release(), {}};
+    addArgument(targetSize, scopedReg.release(), {}); // forward to the main overload (empty IR source op) so the target slot is assigned in one place
 }
 
 void IrCallWrapperX64::call(const OperandX64& func)
 {
     funcOp = func;
 
-    assignTargetRegisters();
-
     countRegisterUses();
 
     for (int i = 0; i < argCount; ++i)
@@ -190,44 +208,33 @@ void IrCallWrapperX64::call(const OperandX64& func)
     build.call(funcOp);
 }
 
-void IrCallWrapperX64::assignTargetRegisters()
+RegisterX64 IrCallWrapperX64::suggestNextArgumentRegister(SizeX64 size) const
 {
-    static const std::array<OperandX64, 6> kWindowsGprOrder = {rcx, rdx, r8, r9, addr[rsp + 32], addr[rsp + 40]};
-    static const std::array<OperandX64, 6> kSystemvGprOrder = {rdi, rsi, rdx, rcx, r8, r9};
+    OperandX64 target = getNextArgumentTarget(size); // where the next added argument of this size would be passed
+    // Take the ABI register itself when the target is a register; otherwise (memory target) allocate any register of the requested size
+    return target.cat == CategoryX64::reg ? regs.takeReg(target.base, kInvalidInstIdx) : regs.allocReg(size, kInvalidInstIdx);
+}
+
+OperandX64 IrCallWrapperX64::getNextArgumentTarget(SizeX64 size) const
+{
+    if (size == SizeX64::xmmword)
+    {
+        LUAU_ASSERT(size_t(xmmPos) < kXmmOrder.size());
+        return kXmmOrder[xmmPos]; // positions are not advanced here (const method); addArgument bumps xmmPos/gprPos
+    }
 
     const std::array<OperandX64, 6>& gprOrder = build.abi == ABIX64::Windows ? kWindowsGprOrder : kSystemvGprOrder;
-    static const std::array<OperandX64, 4> kXmmOrder = {xmm0, xmm1, xmm2, xmm3}; // Common order for first 4 fp arguments on Windows/SystemV
 
-    int gprPos = 0;
-    int xmmPos = 0;
+    LUAU_ASSERT(size_t(gprPos) < gprOrder.size());
+    OperandX64 target = gprOrder[gprPos]; // local copy, so the size adjustment below does not modify the shared order table
 
-    for (int i = 0; i < argCount; i++)
-    {
-        CallArgument& arg = args[i];
+    // Keep requested argument size
+    if (target.cat == CategoryX64::reg)
+        target.base.size = size;
+    else if (target.cat == CategoryX64::mem)
+        target.memSize = size;
 
-        if (arg.targetSize == SizeX64::xmmword)
-        {
-            LUAU_ASSERT(size_t(xmmPos) < kXmmOrder.size());
-            arg.target = kXmmOrder[xmmPos++];
-
-            if (build.abi == ABIX64::Windows)
-                gprPos++; // On Windows, gpr/xmm register positions move in sync
-        }
-        else
-        {
-            LUAU_ASSERT(size_t(gprPos) < gprOrder.size());
-            arg.target = gprOrder[gprPos++];
-
-            if (build.abi == ABIX64::Windows)
-                xmmPos++; // On Windows, gpr/xmm register positions move in sync
-
-            // Keep requested argument size
-            if (arg.target.cat == CategoryX64::reg)
-                arg.target.base.size = arg.targetSize;
-            else if (arg.target.cat == CategoryX64::mem)
-                arg.target.memSize = arg.targetSize;
-        }
-    }
+    return target;
 }
 
 void IrCallWrapperX64::countRegisterUses()
@@ -376,7 +383,7 @@ RegisterX64 IrCallWrapperX64::findConflictingTarget() const
 void IrCallWrapperX64::renameConflictingRegister(RegisterX64 conflict)
 {
     // Get a fresh register
-    RegisterX64 freshReg = conflict.size == SizeX64::xmmword ? regs.allocXmmReg(kInvalidInstIdx) : regs.allocGprReg(conflict.size, kInvalidInstIdx);
+    RegisterX64 freshReg = regs.allocReg(conflict.size, kInvalidInstIdx);
 
     if (conflict.size == SizeX64::xmmword)
         build.vmovsd(freshReg, conflict, conflict);
diff --git a/CodeGen/src/IrLoweringA64.cpp b/CodeGen/src/IrLoweringA64.cpp
index 7f0305c..3f05d53 100644
--- a/CodeGen/src/IrLoweringA64.cpp
+++ b/CodeGen/src/IrLoweringA64.cpp
@@ -8,7 +8,6 @@
 #include "Luau/IrUtils.h"
 
 #include "EmitCommonA64.h"
-#include "EmitInstructionA64.h"
 #include "NativeState.h"
 
 #include "lstate.h"
@@ -27,13 +26,14 @@ namespace A64
 #ifdef TRACE
 struct LoweringStatsA64
 {
-    size_t can;
+    size_t missing; // functions not counted as successfully lowered; NOTE(review): incremented outside this hunk - confirm where
     size_t total;
 
     ~LoweringStatsA64()
     {
         if (total)
-            printf("A64 lowering succeeded for %.1f%% functions (%d/%d)\n", double(can) / double(total) * 100, int(can), int(total));
+            printf("A64 lowering succeeded for %.1f%% functions (%d/%d)\n", double(total - missing) / double(total) * 100, int(total - missing),
+                int(total));
     }
 } gStatsA64;
 #endif
@@ -78,32 +78,230 @@ inline ConditionA64 getConditionFP(IrCondition cond)
     }
 }
 
-// TODO: instead of temp1/temp2 we can take a register that we will use for ra->value; that way callers to this function will be able to use it when
-// calling luaC_barrier*
-static void checkObjectBarrierConditions(AssemblyBuilderA64& build, RegisterA64 object, RegisterA64 temp1, RegisterA64 temp2, int ra, Label& skip)
+static void checkObjectBarrierConditions(AssemblyBuilderA64& build, RegisterA64 object, RegisterA64 temp, int ra, Label& skip)
 {
-    RegisterA64 temp1w = castReg(KindA64::w, temp1);
-    RegisterA64 temp2w = castReg(KindA64::w, temp2);
+    RegisterA64 tempw = castReg(KindA64::w, temp); // 32-bit alias of temp, used for the tag and mark-byte loads
 
     // iscollectable(ra)
-    build.ldr(temp1w, mem(rBase, ra * sizeof(TValue) + offsetof(TValue, tt)));
-    build.cmp(temp1w, LUA_TSTRING);
+    build.ldr(tempw, mem(rBase, ra * sizeof(TValue) + offsetof(TValue, tt)));
+    build.cmp(tempw, LUA_TSTRING); // tags below LUA_TSTRING take the skip branch
     build.b(ConditionA64::Less, skip);
 
     // isblack(obj2gco(o))
     // TODO: conditional bit test with BLACKBIT
-    build.ldrb(temp1w, mem(object, offsetof(GCheader, marked)));
-    build.mov(temp2w, bitmask(BLACKBIT));
-    build.and_(temp1w, temp1w, temp2w);
-    build.cbz(temp1w, skip);
+    build.ldrb(tempw, mem(object, offsetof(GCheader, marked)));
+    build.tst(tempw, bitmask(BLACKBIT));
+    build.b(ConditionA64::Equal, skip); // Equal = Zero after tst
 
     // iswhite(gcvalue(ra))
-    // TODO: tst with bitmask(WHITE0BIT, WHITE1BIT)
-    build.ldr(temp1, mem(rBase, ra * sizeof(TValue) + offsetof(TValue, value)));
-    build.ldrb(temp1w, mem(temp1, offsetof(GCheader, marked)));
-    build.mov(temp2w, bit2mask(WHITE0BIT, WHITE1BIT));
-    build.and_(temp1w, temp1w, temp2w);
-    build.cbz(temp1w, skip);
+    build.ldr(temp, mem(rBase, ra * sizeof(TValue) + offsetof(TValue, value))); // temp = object pointer held in VM register ra
+    build.ldrb(tempw, mem(temp, offsetof(GCheader, marked)));
+    build.tst(tempw, bit2mask(WHITE0BIT, WHITE1BIT));
+    build.b(ConditionA64::Equal, skip); // Equal = Zero after tst
+}
+
+static void emitAddOffset(AssemblyBuilderA64& build, RegisterA64 dst, RegisterA64 src, size_t offset) // emits dst = src + offset
+{
+    LUAU_ASSERT(dst != src); // the large-offset path below writes dst before it reads src
+    LUAU_ASSERT(offset <= INT_MAX); // offset is narrowed to int for the mov below
+
+    if (offset <= AssemblyBuilderA64::kMaxImmediate)
+    {
+        build.add(dst, src, uint16_t(offset)); // small offset: single add with an immediate operand
+    }
+    else
+    {
+        build.mov(dst, int(offset)); // large offset: materialize it in dst, then add src
+        build.add(dst, dst, src);
+    }
+}
+
+static void emitFallback(AssemblyBuilderA64& build, int op, int pcpos)
+{
+    // fallback(L, instruction, base, k): call the handler stored for bytecode op 'op' on the instruction at index pcpos
+    build.mov(x0, rState);
+    emitAddOffset(build, x1, rCode, pcpos * sizeof(Instruction)); // x1 = rCode + byte offset of instruction pcpos
+    build.mov(x2, rBase);
+    build.mov(x3, rConstants);
+    build.ldr(x4, mem(rNativeContext, offsetof(NativeContext, fallback) + op * sizeof(FallbackFn))); // x4 = NativeContext::fallback slot for op
+    build.blr(x4);
+
+    emitUpdateBase(build); // NOTE(review): presumably refreshes rBase after the call - confirm against emitUpdateBase
+}
+
+static void emitInvokeLibm1(AssemblyBuilderA64& build, size_t func, int res, int arg) // func is a byte offset into NativeContext
+{
+    build.ldr(d0, mem(rBase, arg * sizeof(TValue) + offsetof(TValue, value.n))); // d0 = number stored in VM register arg
+    build.ldr(x0, mem(rNativeContext, uint32_t(func))); // x0 = function pointer read from NativeContext at offset func
+    build.blr(x0);
+    build.str(d0, mem(rBase, res * sizeof(TValue) + offsetof(TValue, value.n))); // d0 after the call goes to VM register res; the tag is not written here
+}
+
+static void emitInvokeLibm2(AssemblyBuilderA64& build, size_t func, int res, int arg, IrOp args, bool argsInt = false)
+{
+    if (args.kind == IrOpKind::VmReg) // second operand is loaded into d1, from a VM register or from a constant
+        build.ldr(d1, mem(rBase, args.index * sizeof(TValue) + offsetof(TValue, value.n)));
+    else if (args.kind == IrOpKind::VmConst)
+    {
+        size_t constantOffset = args.index * sizeof(TValue) + offsetof(TValue, value.n);
+
+        // Note: cumulative offset is guaranteed to be divisible by 8 (since we're loading a double); we can use that to expand the useful range that
+        // doesn't require temporaries
+        if (constantOffset / 8 <= AddressA64::kMaxOffset)
+        {
+            build.ldr(d1, mem(rConstants, int(constantOffset)));
+        }
+        else
+        {
+            emitAddOffset(build, x0, rConstants, constantOffset); // x0 is used as scratch; it is only given an argument value further below
+            build.ldr(d1, x0);
+        }
+    }
+    else
+        LUAU_ASSERT(!"Unsupported instruction form");
+
+    if (argsInt)
+        build.fcvtzs(w0, d1); // convert the second operand to an integer in w0 (emitBuiltin requests this for ldexp)
+
+    build.ldr(d0, mem(rBase, arg * sizeof(TValue) + offsetof(TValue, value.n))); // d0 = first operand from VM register arg
+    build.ldr(x1, mem(rNativeContext, uint32_t(func))); // callee address goes in x1 because x0/w0 may hold the integer operand
+    build.blr(x1);
+    build.str(d0, mem(rBase, res * sizeof(TValue) + offsetof(TValue, value.n))); // d0 after the call goes to VM register res; the tag is not written here
+}
+
+static void emitInvokeLibm1P(AssemblyBuilderA64& build, size_t func, int arg) // calls func(number, pointer to the sTemporary stack slot)
+{
+    build.ldr(d0, mem(rBase, arg * sizeof(TValue) + offsetof(TValue, value.n))); // d0 = number stored in VM register arg
+    build.add(x0, sp, sTemporary.data); // sp-relative offset
+    build.ldr(x1, mem(rNativeContext, uint32_t(func))); // x1 = function pointer read from NativeContext at offset func
+    build.blr(x1); // no result is stored here; callers read d0 and sTemporary afterwards
+}
+
+static bool emitBuiltin(AssemblyBuilderA64& build, IrRegAllocA64& regs, int bfid, int res, int arg, IrOp args, int nparams, int nresults)
+{
+    switch (bfid) // every case stores only result payloads (value.n / value.gc) into VM register res (and res + 1); tags are not written here
+    {
+    case LBF_MATH_EXP:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
+        emitInvokeLibm1(build, offsetof(NativeContext, libm_exp), res, arg);
+        return true;
+    case LBF_MATH_FMOD:
+        LUAU_ASSERT(nparams == 2 && nresults == 1);
+        emitInvokeLibm2(build, offsetof(NativeContext, libm_fmod), res, arg, args);
+        return true;
+    case LBF_MATH_ASIN:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
+        emitInvokeLibm1(build, offsetof(NativeContext, libm_asin), res, arg);
+        return true;
+    case LBF_MATH_SIN:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
+        emitInvokeLibm1(build, offsetof(NativeContext, libm_sin), res, arg);
+        return true;
+    case LBF_MATH_SINH:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
+        emitInvokeLibm1(build, offsetof(NativeContext, libm_sinh), res, arg);
+        return true;
+    case LBF_MATH_ACOS:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
+        emitInvokeLibm1(build, offsetof(NativeContext, libm_acos), res, arg);
+        return true;
+    case LBF_MATH_COS:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
+        emitInvokeLibm1(build, offsetof(NativeContext, libm_cos), res, arg);
+        return true;
+    case LBF_MATH_COSH:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
+        emitInvokeLibm1(build, offsetof(NativeContext, libm_cosh), res, arg);
+        return true;
+    case LBF_MATH_ATAN:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
+        emitInvokeLibm1(build, offsetof(NativeContext, libm_atan), res, arg);
+        return true;
+    case LBF_MATH_TAN:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
+        emitInvokeLibm1(build, offsetof(NativeContext, libm_tan), res, arg);
+        return true;
+    case LBF_MATH_TANH:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
+        emitInvokeLibm1(build, offsetof(NativeContext, libm_tanh), res, arg);
+        return true;
+    case LBF_MATH_ATAN2:
+        LUAU_ASSERT(nparams == 2 && nresults == 1);
+        emitInvokeLibm2(build, offsetof(NativeContext, libm_atan2), res, arg, args);
+        return true;
+    case LBF_MATH_LOG10:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
+        emitInvokeLibm1(build, offsetof(NativeContext, libm_log10), res, arg);
+        return true;
+    case LBF_MATH_LOG:
+        LUAU_ASSERT((nparams == 1 || nparams == 2) && nresults == 1);
+        // TODO: IR builtin lowering assumes that the only valid 2-argument call is log2; ideally, we use a less hacky way to indicate that
+        if (nparams == 2)
+            emitInvokeLibm1(build, offsetof(NativeContext, libm_log2), res, arg); // the second argument itself is never read here
+        else
+            emitInvokeLibm1(build, offsetof(NativeContext, libm_log), res, arg);
+        return true;
+    case LBF_MATH_LDEXP:
+        LUAU_ASSERT(nparams == 2 && nresults == 1);
+        emitInvokeLibm2(build, offsetof(NativeContext, libm_ldexp), res, arg, args, /* argsInt= */ true);
+        return true;
+    case LBF_MATH_FREXP:
+        LUAU_ASSERT(nparams == 1 && (nresults == 1 || nresults == 2));
+        emitInvokeLibm1P(build, offsetof(NativeContext, libm_frexp), arg);
+        build.str(d0, mem(rBase, res * sizeof(TValue) + offsetof(TValue, value.n))); // first result: the value returned in d0
+        if (nresults == 2)
+        {
+            build.ldr(w0, sTemporary); // second result: the 32-bit integer the callee wrote through the sTemporary pointer
+            build.scvtf(d1, w0); // converted to a double before it is stored
+            build.str(d1, mem(rBase, (res + 1) * sizeof(TValue) + offsetof(TValue, value.n)));
+        }
+        return true;
+    case LBF_MATH_MODF:
+        LUAU_ASSERT(nparams == 1 && (nresults == 1 || nresults == 2));
+        emitInvokeLibm1P(build, offsetof(NativeContext, libm_modf), arg);
+        build.ldr(d1, sTemporary); // first result comes from the double the callee wrote through the sTemporary pointer
+        build.str(d1, mem(rBase, res * sizeof(TValue) + offsetof(TValue, value.n)));
+        if (nresults == 2)
+            build.str(d0, mem(rBase, (res + 1) * sizeof(TValue) + offsetof(TValue, value.n))); // second result: the value returned in d0
+        return true;
+    case LBF_MATH_SIGN:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
+        // TODO: this can be improved with fmov(constant), for now we just load from memory
+        build.ldr(d0, mem(rBase, arg * sizeof(TValue) + offsetof(TValue, value.n)));
+        build.fcmpz(d0); // compare against zero once; both fcsel below consume these flags
+        build.adr(x0, 0.0);
+        build.ldr(d0, x0); // default result is 0.0
+        build.adr(x0, 1.0);
+        build.ldr(d1, x0);
+        build.fcsel(d0, d1, d0, getConditionFP(IrCondition::Greater)); // 1.0 when the argument compared greater than zero
+        build.adr(x0, -1.0);
+        build.ldr(d1, x0);
+        build.fcsel(d0, d1, d0, getConditionFP(IrCondition::Less)); // -1.0 when the argument compared less than zero
+        build.str(d0, mem(rBase, res * sizeof(TValue) + offsetof(TValue, value.n)));
+        return true;
+
+    case LBF_TYPE:
+        build.ldr(w0, mem(rBase, arg * sizeof(TValue) + offsetof(TValue, tt))); // w0 = type tag of the argument
+        build.ldr(x1, mem(rState, offsetof(lua_State, global)));
+        // TODO: this can use load with shifted/extended offset
+        LUAU_ASSERT(sizeof(TString*) == 8);
+        build.add(x1, x1, zextReg(w0), 3); // x1 = global + tag * 8, i.e. tag * sizeof(TString*)
+        build.ldr(x0, mem(x1, offsetof(global_State, ttname))); // x0 = global->ttname[tag]
+        build.str(x0, mem(rBase, res * sizeof(TValue) + offsetof(TValue, value.gc)));
+        return true;
+
+    case LBF_TYPEOF:
+        build.mov(x0, rState);
+        build.add(x1, rBase, uint16_t(arg * sizeof(TValue))); // x1 = address of VM register arg
+        build.ldr(x2, mem(rNativeContext, offsetof(NativeContext, luaT_objtypenamestr)));
+        build.blr(x2);
+        build.str(x0, mem(rBase, res * sizeof(TValue) + offsetof(TValue, value.gc))); // pointer returned in x0 becomes the result payload
+        return true;
+
+    default:
+        LUAU_ASSERT(!"Missing A64 lowering");
+        return false;
+    }
 }
 
 IrLoweringA64::IrLoweringA64(AssemblyBuilderA64& build, ModuleHelpers& helpers, NativeState& data, Proto* proto, IrFunction& function)
@@ -116,119 +314,10 @@ IrLoweringA64::IrLoweringA64(AssemblyBuilderA64& build, ModuleHelpers& helpers,
 {
     // In order to allocate registers during lowering, we need to know where instruction results are last used
     updateLastUseLocations(function);
-}
 
-// TODO: Eventually this can go away
-bool IrLoweringA64::canLower(const IrFunction& function)
-{
 #ifdef TRACE
     gStatsA64.total++;
 #endif
-
-    for (const IrInst& inst : function.instructions)
-    {
-        switch (inst.cmd)
-        {
-        case IrCmd::NOP:
-        case IrCmd::LOAD_TAG:
-        case IrCmd::LOAD_POINTER:
-        case IrCmd::LOAD_DOUBLE:
-        case IrCmd::LOAD_INT:
-        case IrCmd::LOAD_TVALUE:
-        case IrCmd::LOAD_NODE_VALUE_TV:
-        case IrCmd::LOAD_ENV:
-        case IrCmd::GET_ARR_ADDR:
-        case IrCmd::GET_SLOT_NODE_ADDR:
-        case IrCmd::GET_HASH_NODE_ADDR:
-        case IrCmd::STORE_TAG:
-        case IrCmd::STORE_POINTER:
-        case IrCmd::STORE_DOUBLE:
-        case IrCmd::STORE_INT:
-        case IrCmd::STORE_TVALUE:
-        case IrCmd::STORE_NODE_VALUE_TV:
-        case IrCmd::ADD_INT:
-        case IrCmd::SUB_INT:
-        case IrCmd::ADD_NUM:
-        case IrCmd::SUB_NUM:
-        case IrCmd::MUL_NUM:
-        case IrCmd::DIV_NUM:
-        case IrCmd::MOD_NUM:
-        case IrCmd::POW_NUM:
-        case IrCmd::MIN_NUM:
-        case IrCmd::MAX_NUM:
-        case IrCmd::UNM_NUM:
-        case IrCmd::FLOOR_NUM:
-        case IrCmd::CEIL_NUM:
-        case IrCmd::ROUND_NUM:
-        case IrCmd::SQRT_NUM:
-        case IrCmd::ABS_NUM:
-        case IrCmd::JUMP:
-        case IrCmd::JUMP_IF_TRUTHY:
-        case IrCmd::JUMP_IF_FALSY:
-        case IrCmd::JUMP_EQ_TAG:
-        case IrCmd::JUMP_EQ_INT:
-        case IrCmd::JUMP_EQ_POINTER:
-        case IrCmd::JUMP_CMP_NUM:
-        case IrCmd::JUMP_CMP_ANY:
-        case IrCmd::TABLE_LEN:
-        case IrCmd::NEW_TABLE:
-        case IrCmd::DUP_TABLE:
-        case IrCmd::TRY_NUM_TO_INDEX:
-        case IrCmd::INT_TO_NUM:
-        case IrCmd::ADJUST_STACK_TO_REG:
-        case IrCmd::ADJUST_STACK_TO_TOP:
-        case IrCmd::INVOKE_FASTCALL:
-        case IrCmd::CHECK_FASTCALL_RES:
-        case IrCmd::DO_ARITH:
-        case IrCmd::DO_LEN:
-        case IrCmd::GET_TABLE:
-        case IrCmd::SET_TABLE:
-        case IrCmd::GET_IMPORT:
-        case IrCmd::CONCAT:
-        case IrCmd::GET_UPVALUE:
-        case IrCmd::SET_UPVALUE:
-        case IrCmd::PREPARE_FORN:
-        case IrCmd::CHECK_TAG:
-        case IrCmd::CHECK_READONLY:
-        case IrCmd::CHECK_NO_METATABLE:
-        case IrCmd::CHECK_SAFE_ENV:
-        case IrCmd::CHECK_ARRAY_SIZE:
-        case IrCmd::CHECK_SLOT_MATCH:
-        case IrCmd::INTERRUPT:
-        case IrCmd::CHECK_GC:
-        case IrCmd::BARRIER_OBJ:
-        case IrCmd::BARRIER_TABLE_BACK:
-        case IrCmd::BARRIER_TABLE_FORWARD:
-        case IrCmd::SET_SAVEDPC:
-        case IrCmd::CLOSE_UPVALS:
-        case IrCmd::CAPTURE:
-        case IrCmd::CALL:
-        case IrCmd::RETURN:
-        case IrCmd::FALLBACK_GETGLOBAL:
-        case IrCmd::FALLBACK_SETGLOBAL:
-        case IrCmd::FALLBACK_GETTABLEKS:
-        case IrCmd::FALLBACK_SETTABLEKS:
-        case IrCmd::FALLBACK_NAMECALL:
-        case IrCmd::FALLBACK_PREPVARARGS:
-        case IrCmd::FALLBACK_GETVARARGS:
-        case IrCmd::FALLBACK_NEWCLOSURE:
-        case IrCmd::FALLBACK_DUPCLOSURE:
-        case IrCmd::SUBSTITUTE:
-            continue;
-
-        default:
-#ifdef TRACE
-            printf("A64 lowering missing %s\n", getCmdName(inst.cmd));
-#endif
-            return false;
-        }
-    }
-
-#ifdef TRACE
-    gStatsA64.can++;
-#endif
-
-    return true;
 }
 
 void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
@@ -245,14 +334,14 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
     case IrCmd::LOAD_POINTER:
     {
         inst.regA64 = regs.allocReg(KindA64::x);
-        AddressA64 addr = tempAddr(inst.a, offsetof(TValue, value));
+        AddressA64 addr = tempAddr(inst.a, offsetof(TValue, value.gc));
         build.ldr(inst.regA64, addr);
         break;
     }
     case IrCmd::LOAD_DOUBLE:
     {
         inst.regA64 = regs.allocReg(KindA64::d);
-        AddressA64 addr = tempAddr(inst.a, offsetof(TValue, value));
+        AddressA64 addr = tempAddr(inst.a, offsetof(TValue, value.n));
         build.ldr(inst.regA64, addr);
         break;
     }
@@ -287,13 +376,21 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
 
         if (inst.b.kind == IrOpKind::Inst)
         {
-            // TODO: This is a temporary hack that reads wN register as if it was xN. This should use unsigned extension shift once we support it.
-            build.add(inst.regA64, inst.regA64, castReg(KindA64::x, regOp(inst.b)), kTValueSizeLog2);
+            build.add(inst.regA64, inst.regA64, zextReg(regOp(inst.b)), kTValueSizeLog2);
         }
         else if (inst.b.kind == IrOpKind::Constant)
         {
-            LUAU_ASSERT(size_t(intOp(inst.b)) <= AssemblyBuilderA64::kMaxImmediate >> kTValueSizeLog2); // TODO: handle out of range values
-            build.add(inst.regA64, inst.regA64, uint16_t(intOp(inst.b) << kTValueSizeLog2));
+            // TODO: refactor into a common helper? can't use emitAddOffset because we need a temp register
+            if (intOp(inst.b) * sizeof(TValue) <= AssemblyBuilderA64::kMaxImmediate)
+            {
+                build.add(inst.regA64, inst.regA64, uint16_t(intOp(inst.b) * sizeof(TValue)));
+            }
+            else
+            {
+                RegisterA64 temp = regs.allocTemp(KindA64::x);
+                build.mov(temp, intOp(inst.b) * sizeof(TValue));
+                build.add(inst.regA64, inst.regA64, temp);
+            }
         }
         else
             LUAU_ASSERT(!"Unsupported instruction form");
@@ -314,8 +411,7 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
 
         // note: this may clobber inst.a, so it's important that we don't use it after this
         build.ldr(inst.regA64, mem(regOp(inst.a), offsetof(Table, node)));
-        // TODO: This is a temporary hack that reads wN register as if it was xN. This should use unsigned extension shift once we support it.
-        build.add(inst.regA64, inst.regA64, castReg(KindA64::x, temp2), kLuaNodeSizeLog2);
+        build.add(inst.regA64, inst.regA64, zextReg(temp2), kLuaNodeSizeLog2);
         break;
     }
     case IrCmd::GET_HASH_NODE_ADDR:
@@ -324,18 +420,16 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
         RegisterA64 temp1 = regs.allocTemp(KindA64::w);
         RegisterA64 temp2 = regs.allocTemp(KindA64::w);
 
-        // TODO: this can use bic (andnot) to do hash & ~(-1 << lsizenode) instead but we don't support it yet
-        build.mov(temp1, 1);
+        // hash & ((1 << lsizenode) - 1) == hash & ~(-1 << lsizenode)
+        build.mov(temp1, -1);
         build.ldrb(temp2, mem(regOp(inst.a), offsetof(Table, lsizenode)));
         build.lsl(temp1, temp1, temp2);
-        build.sub(temp1, temp1, 1);
         build.mov(temp2, uintOp(inst.b));
-        build.and_(temp2, temp2, temp1);
+        build.bic(temp2, temp2, temp1);
 
         // note: this may clobber inst.a, so it's important that we don't use it after this
         build.ldr(inst.regA64, mem(regOp(inst.a), offsetof(Table, node)));
-        // TODO: This is a temporary hack that reads wN register as if it was xN. This should use unsigned extension shift once we support it.
-        build.add(inst.regA64, inst.regA64, castReg(KindA64::x, temp2), kLuaNodeSizeLog2);
+        build.add(inst.regA64, inst.regA64, zextReg(temp2), kLuaNodeSizeLog2);
         break;
     }
     case IrCmd::STORE_TAG:
@@ -501,6 +595,37 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
         build.fabs(inst.regA64, temp);
         break;
     }
+    case IrCmd::NOT_ANY:
+    {
+        inst.regA64 = regs.allocReuse(KindA64::w, index, {inst.a, inst.b});
+
+        if (inst.a.kind == IrOpKind::Constant)
+        {
+            // other cases should've been constant folded
+            LUAU_ASSERT(tagOp(inst.a) == LUA_TBOOLEAN);
+            build.eor(inst.regA64, regOp(inst.b), 1);
+        }
+        else
+        {
+            Label notbool, exit;
+
+            // use the fact that NIL is the only value less than BOOLEAN to do two tag comparisons at once
+            LUAU_ASSERT(LUA_TNIL == 0 && LUA_TBOOLEAN == 1);
+            build.cmp(regOp(inst.a), LUA_TBOOLEAN);
+            build.b(ConditionA64::NotEqual, notbool);
+
+            // boolean => invert value
+            build.eor(inst.regA64, regOp(inst.b), 1);
+            build.b(exit);
+
+            // not boolean => result is true iff tag was nil
+            build.setLabel(notbool);
+            build.cset(inst.regA64, ConditionA64::Less);
+
+            build.setLabel(exit);
+        }
+        break;
+    }
     case IrCmd::JUMP:
         jumpOrFallthrough(blockOp(inst.a), next);
         break;
@@ -537,10 +662,12 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
         break;
     }
     case IrCmd::JUMP_EQ_TAG:
-        if (inst.b.kind == IrOpKind::Constant)
+        if (inst.a.kind == IrOpKind::Inst && inst.b.kind == IrOpKind::Constant)
             build.cmp(regOp(inst.a), tagOp(inst.b));
-        else if (inst.b.kind == IrOpKind::Inst)
+        else if (inst.a.kind == IrOpKind::Inst && inst.b.kind == IrOpKind::Inst)
             build.cmp(regOp(inst.a), regOp(inst.b));
+        else if (inst.a.kind == IrOpKind::Constant && inst.b.kind == IrOpKind::Inst)
+            build.cmp(regOp(inst.b), tagOp(inst.a));
         else
             LUAU_ASSERT(!"Unsupported instruction form");
 
@@ -570,10 +697,20 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
     {
         IrCondition cond = conditionOp(inst.c);
 
-        RegisterA64 temp1 = tempDouble(inst.a);
-        RegisterA64 temp2 = tempDouble(inst.b);
+        if (inst.b.kind == IrOpKind::Constant && doubleOp(inst.b) == 0.0)
+        {
+            RegisterA64 temp = tempDouble(inst.a);
+
+            build.fcmpz(temp);
+        }
+        else
+        {
+            RegisterA64 temp1 = tempDouble(inst.a);
+            RegisterA64 temp2 = tempDouble(inst.b);
+
+            build.fcmp(temp1, temp2);
+        }
 
-        build.fcmp(temp1, temp2);
         build.b(getConditionFP(cond), labelOp(inst.d));
         jumpOrFallthrough(blockOp(inst.e), next);
         break;
@@ -607,6 +744,30 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
         jumpOrFallthrough(blockOp(inst.e), next);
         break;
     }
+    case IrCmd::JUMP_SLOT_MATCH:
+    {
+        // TODO: share code with CHECK_SLOT_MATCH
+        RegisterA64 temp1 = regs.allocTemp(KindA64::x);
+        RegisterA64 temp1w = castReg(KindA64::w, temp1);
+        RegisterA64 temp2 = regs.allocTemp(KindA64::x);
+
+        build.ldr(temp1w, mem(regOp(inst.a), offsetof(LuaNode, key) + kOffsetOfTKeyTag));
+        build.and_(temp1w, temp1w, kLuaNodeTagMask);
+        build.cmp(temp1w, LUA_TSTRING);
+        build.b(ConditionA64::NotEqual, labelOp(inst.d));
+
+        AddressA64 addr = tempAddr(inst.b, offsetof(TValue, value));
+        build.ldr(temp1, mem(regOp(inst.a), offsetof(LuaNode, key.value)));
+        build.ldr(temp2, addr);
+        build.cmp(temp1, temp2);
+        build.b(ConditionA64::NotEqual, labelOp(inst.d));
+
+        build.ldr(temp1w, mem(regOp(inst.a), offsetof(LuaNode, val.tt)));
+        LUAU_ASSERT(LUA_TNIL == 0);
+        build.cbz(temp1w, labelOp(inst.d));
+        jumpOrFallthrough(blockOp(inst.c), next);
+        break;
+    }
     case IrCmd::TABLE_LEN:
     {
         regs.assertAllFreeExcept(regOp(inst.a));
@@ -664,6 +825,32 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
         }
         break;
     }
+    case IrCmd::TRY_CALL_FASTGETTM:
+    {
+        regs.assertAllFreeExcept(regOp(inst.a));
+
+        RegisterA64 temp1 = regs.allocTemp(KindA64::x);
+        RegisterA64 temp2 = regs.allocTemp(KindA64::w);
+
+        build.ldr(temp1, mem(regOp(inst.a), offsetof(Table, metatable)));
+        build.cbz(temp1, labelOp(inst.c)); // no metatable
+
+        build.ldrb(temp2, mem(temp1, offsetof(Table, tmcache)));
+        build.tst(temp2, 1 << intOp(inst.b));             // can't use tbz/tbnz because their jump offsets are too short
+        build.b(ConditionA64::NotEqual, labelOp(inst.c)); // Equal = Zero after tst; tmcache caches *absence* of metamethods
+
+        build.mov(x0, temp1);
+        build.mov(w1, intOp(inst.b));
+        build.ldr(x2, mem(rState, offsetof(lua_State, global)));
+        build.ldr(x2, mem(x2, offsetof(global_State, tmname) + intOp(inst.b) * sizeof(TString*)));
+        build.ldr(x3, mem(rNativeContext, offsetof(NativeContext, luaT_gettm)));
+        build.blr(x3);
+
+        // TODO: we could takeReg x0 but it's unclear if we will be able to keep x0 allocatable due to aliasing concerns
+        inst.regA64 = regs.allocReg(KindA64::x);
+        build.mov(inst.regA64, x0);
+        break;
+    }
     case IrCmd::INT_TO_NUM:
     {
         inst.regA64 = regs.allocReg(KindA64::d);
@@ -683,8 +870,7 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
         else if (inst.b.kind == IrOpKind::Inst)
         {
             build.add(temp, rBase, uint16_t(vmRegOp(inst.a) * sizeof(TValue)));
-            // TODO: This is a temporary hack that reads wN register as if it was xN. This should use unsigned extension shift once we support it.
-            build.add(temp, temp, castReg(KindA64::x, regOp(inst.b)), kTValueSizeLog2);
+            build.add(temp, temp, zextReg(regOp(inst.b)), kTValueSizeLog2);
             build.str(temp, mem(rState, offsetof(lua_State, top)));
         }
         else
@@ -699,6 +885,12 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
         build.str(temp, mem(rState, offsetof(lua_State, top)));
         break;
     }
+    case IrCmd::FASTCALL:
+        regs.assertAllFree();
+        // TODO: emitBuiltin should be exhaustive
+        if (!emitBuiltin(build, regs, uintOp(inst.a), vmRegOp(inst.b), vmRegOp(inst.c), inst.d, intOp(inst.e), intOp(inst.f)))
+            error = true;
+        break;
     case IrCmd::INVOKE_FASTCALL:
     {
         regs.assertAllFree();
@@ -710,18 +902,7 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
         if (inst.d.kind == IrOpKind::VmReg)
             build.add(x4, rBase, uint16_t(vmRegOp(inst.d) * sizeof(TValue)));
         else if (inst.d.kind == IrOpKind::VmConst)
-        {
-            // TODO: refactor into a common helper
-            if (vmConstOp(inst.d) * sizeof(TValue) <= AssemblyBuilderA64::kMaxImmediate)
-            {
-                build.add(x4, rConstants, uint16_t(vmConstOp(inst.d) * sizeof(TValue)));
-            }
-            else
-            {
-                build.mov(x4, vmConstOp(inst.d) * sizeof(TValue));
-                build.add(x4, rConstants, x4);
-            }
-        }
+            emitAddOffset(build, x4, rConstants, vmConstOp(inst.d) * sizeof(TValue));
         else
             LUAU_ASSERT(boolOp(inst.d) == false);
 
@@ -742,7 +923,7 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
         build.ldr(x6, mem(rNativeContext, offsetof(NativeContext, luauF_table) + uintOp(inst.a) * sizeof(luau_FastFunction)));
         build.blr(x6);
 
-        // TODO: we could takeReg w0 but it's unclear if we will be able to keep x0 allocatable due to aliasing concerns
+        // since w0 came from a call, we need to move it so that we don't violate zextReg safety contract
         inst.regA64 = regs.allocReg(KindA64::w);
         build.mov(inst.regA64, w0);
         break;
@@ -758,18 +939,7 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
         build.add(x2, rBase, uint16_t(vmRegOp(inst.b) * sizeof(TValue)));
 
         if (inst.c.kind == IrOpKind::VmConst)
-        {
-            // TODO: refactor into a common helper
-            if (vmConstOp(inst.c) * sizeof(TValue) <= AssemblyBuilderA64::kMaxImmediate)
-            {
-                build.add(x3, rConstants, uint16_t(vmConstOp(inst.c) * sizeof(TValue)));
-            }
-            else
-            {
-                build.mov(x3, vmConstOp(inst.c) * sizeof(TValue));
-                build.add(x3, rConstants, x3);
-            }
-        }
+            emitAddOffset(build, x3, rConstants, vmConstOp(inst.c) * sizeof(TValue));
         else
             build.add(x3, rBase, uint16_t(vmRegOp(inst.c) * sizeof(TValue)));
 
@@ -835,7 +1005,25 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
         break;
     case IrCmd::GET_IMPORT:
         regs.assertAllFree();
-        emitInstGetImport(build, vmRegOp(inst.a), uintOp(inst.b));
+        // luaV_getimport(L, cl->env, k, aux, /* propagatenil= */ false)
+        build.mov(x0, rState);
+        build.ldr(x1, mem(rClosure, offsetof(Closure, env)));
+        build.mov(x2, rConstants);
+        build.mov(w3, uintOp(inst.b));
+        build.mov(w4, 0);
+        build.ldr(x5, mem(rNativeContext, offsetof(NativeContext, luaV_getimport)));
+        build.blr(x5);
+
+        emitUpdateBase(build);
+
+        // setobj2s(L, ra, L->top - 1)
+        build.ldr(x0, mem(rState, offsetof(lua_State, top)));
+        build.sub(x0, x0, sizeof(TValue));
+        build.ldr(q0, x0);
+        build.str(q0, mem(rBase, vmRegOp(inst.a) * sizeof(TValue)));
+
+        // L->top--
+        build.str(x0, mem(rState, offsetof(lua_State, top)));
         break;
     case IrCmd::CONCAT:
         regs.assertAllFree();
@@ -877,7 +1065,6 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
         RegisterA64 temp1 = regs.allocTemp(KindA64::x);
         RegisterA64 temp2 = regs.allocTemp(KindA64::x);
         RegisterA64 temp3 = regs.allocTemp(KindA64::q);
-        RegisterA64 temp4 = regs.allocTemp(KindA64::x);
 
         // UpVal*
         build.ldr(temp1, mem(rClosure, offsetof(Closure, l.uprefs) + sizeof(TValue) * vmUpvalueOp(inst.a) + offsetof(TValue, value.gc)));
@@ -887,7 +1074,7 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
         build.str(temp3, temp2);
 
         Label skip;
-        checkObjectBarrierConditions(build, temp1, temp2, temp4, vmRegOp(inst.b), skip);
+        checkObjectBarrierConditions(build, temp1, temp2, vmRegOp(inst.b), skip);
 
         build.mov(x0, rState);
         build.mov(x1, temp1); // TODO: aliasing hazard
@@ -945,8 +1132,17 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
             build.cmp(temp, regOp(inst.b));
         else if (inst.b.kind == IrOpKind::Constant)
         {
-            LUAU_ASSERT(size_t(intOp(inst.b)) <= AssemblyBuilderA64::kMaxImmediate); // TODO: handle out of range values
-            build.cmp(temp, uint16_t(intOp(inst.b)));
+            // TODO: refactor into a common helper?
+            if (size_t(intOp(inst.b)) <= AssemblyBuilderA64::kMaxImmediate)
+            {
+                build.cmp(temp, uint16_t(intOp(inst.b)));
+            }
+            else
+            {
+                RegisterA64 temp2 = regs.allocTemp(KindA64::w);
+                build.mov(temp2, intOp(inst.b));
+                build.cmp(temp, temp2);
+            }
         }
         else
             LUAU_ASSERT(!"Unsupported instruction form");
@@ -959,12 +1155,9 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
         RegisterA64 temp1 = regs.allocTemp(KindA64::x);
         RegisterA64 temp1w = castReg(KindA64::w, temp1);
         RegisterA64 temp2 = regs.allocTemp(KindA64::x);
-        RegisterA64 temp2w = castReg(KindA64::w, temp2);
 
-        build.ldr(temp1w, mem(regOp(inst.a), kOffsetOfLuaNodeTag));
-        // TODO: this needs bitfield extraction, or and-immediate
-        build.mov(temp2w, kLuaNodeTagMask);
-        build.and_(temp1w, temp1w, temp2w);
+        build.ldr(temp1w, mem(regOp(inst.a), offsetof(LuaNode, key) + kOffsetOfTKeyTag));
+        build.and_(temp1w, temp1w, kLuaNodeTagMask);
         build.cmp(temp1w, LUA_TSTRING);
         build.b(ConditionA64::NotEqual, labelOp(inst.c));
 
@@ -979,6 +1172,15 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
         build.cbz(temp1w, labelOp(inst.c));
         break;
     }
+    case IrCmd::CHECK_NODE_NO_NEXT:
+    {
+        RegisterA64 temp = regs.allocTemp(KindA64::w);
+
+        build.ldr(temp, mem(regOp(inst.a), offsetof(LuaNode, key) + kOffsetOfTKeyNext));
+        build.and_(temp, temp, ~((1u << kNextBitOffset) - 1)); // TODO: this would be cleaner with a right shift
+        build.cbnz(temp, labelOp(inst.b));
+        break;
+    }
     case IrCmd::INTERRUPT:
     {
         unsigned int pcpos = uintOp(inst.a);
@@ -1023,11 +1225,10 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
     {
         regs.assertAllFreeExcept(regOp(inst.a));
 
-        Label skip;
-        RegisterA64 temp1 = regs.allocTemp(KindA64::x);
-        RegisterA64 temp2 = regs.allocTemp(KindA64::x);
+        RegisterA64 temp = regs.allocTemp(KindA64::x);
 
-        checkObjectBarrierConditions(build, regOp(inst.a), temp1, temp2, vmRegOp(inst.b), skip);
+        Label skip;
+        checkObjectBarrierConditions(build, regOp(inst.a), temp, vmRegOp(inst.b), skip);
 
         build.mov(x0, rState);
         build.mov(x1, regOp(inst.a)); // TODO: aliasing hazard
@@ -1044,15 +1245,13 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
         regs.assertAllFreeExcept(regOp(inst.a));
 
         Label skip;
-        RegisterA64 temp1 = regs.allocTemp(KindA64::w);
-        RegisterA64 temp2 = regs.allocTemp(KindA64::w);
+        RegisterA64 temp = regs.allocTemp(KindA64::w);
 
         // isblack(obj2gco(t))
-        build.ldrb(temp1, mem(regOp(inst.a), offsetof(GCheader, marked)));
+        build.ldrb(temp, mem(regOp(inst.a), offsetof(GCheader, marked)));
         // TODO: conditional bit test with BLACKBIT
-        build.mov(temp2, bitmask(BLACKBIT));
-        build.and_(temp1, temp1, temp2);
-        build.cbz(temp1, skip);
+        build.tst(temp, bitmask(BLACKBIT));
+        build.b(ConditionA64::Equal, skip); // Equal = Zero after tst
 
         build.mov(x0, rState);
         build.mov(x1, regOp(inst.a)); // TODO: aliasing hazard here and below
@@ -1068,11 +1267,10 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
     {
         regs.assertAllFreeExcept(regOp(inst.a));
 
-        Label skip;
-        RegisterA64 temp1 = regs.allocTemp(KindA64::x);
-        RegisterA64 temp2 = regs.allocTemp(KindA64::x);
+        RegisterA64 temp = regs.allocTemp(KindA64::x);
 
-        checkObjectBarrierConditions(build, regOp(inst.a), temp1, temp2, vmRegOp(inst.b), skip);
+        Label skip;
+        checkObjectBarrierConditions(build, regOp(inst.a), temp, vmRegOp(inst.b), skip);
 
         build.mov(x0, rState);
         build.mov(x1, regOp(inst.a)); // TODO: aliasing hazard
@@ -1086,21 +1284,10 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
     }
     case IrCmd::SET_SAVEDPC:
     {
-        unsigned int pcpos = uintOp(inst.a);
         RegisterA64 temp1 = regs.allocTemp(KindA64::x);
         RegisterA64 temp2 = regs.allocTemp(KindA64::x);
 
-        // TODO: refactor into a common helper
-        if (pcpos * sizeof(Instruction) <= AssemblyBuilderA64::kMaxImmediate)
-        {
-            build.add(temp1, rCode, uint16_t(pcpos * sizeof(Instruction)));
-        }
-        else
-        {
-            build.mov(temp1, pcpos * sizeof(Instruction));
-            build.add(temp1, rCode, temp1);
-        }
-
+        emitAddOffset(build, temp1, rCode, uintOp(inst.a) * sizeof(Instruction));
         build.ldr(temp2, mem(rState, offsetof(lua_State, ci)));
         build.str(temp1, mem(temp2, offsetof(CallInfo, savedpc)));
         break;
@@ -1133,14 +1320,100 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
     case IrCmd::CAPTURE:
         // no-op
         break;
+    case IrCmd::SETLIST:
+        regs.assertAllFree();
+        emitFallback(build, LOP_SETLIST, uintOp(inst.a));
+        break;
     case IrCmd::CALL:
         regs.assertAllFree();
-        emitInstCall(build, helpers, vmRegOp(inst.a), intOp(inst.b), intOp(inst.c));
+        // argtop = (nparams == LUA_MULTRET) ? L->top : ra + 1 + nparams;
+        if (intOp(inst.b) == LUA_MULTRET)
+            build.ldr(x2, mem(rState, offsetof(lua_State, top)));
+        else
+            build.add(x2, rBase, uint16_t((vmRegOp(inst.a) + 1 + intOp(inst.b)) * sizeof(TValue)));
+
+        // callFallback(L, ra, argtop, nresults)
+        build.mov(x0, rState);
+        build.add(x1, rBase, uint16_t(vmRegOp(inst.a) * sizeof(TValue)));
+        build.mov(w3, intOp(inst.c));
+        build.ldr(x4, mem(rNativeContext, offsetof(NativeContext, callFallback)));
+        build.blr(x4);
+
+        // reentry with x0=closure (NULL will trigger exit)
+        build.b(helpers.reentry);
         break;
     case IrCmd::RETURN:
         regs.assertAllFree();
-        emitInstReturn(build, helpers, vmRegOp(inst.a), intOp(inst.b));
+        // returnFallback(L, ra, n)
+        build.mov(x0, rState);
+        build.add(x1, rBase, uint16_t(vmRegOp(inst.a) * sizeof(TValue)));
+        build.mov(w2, intOp(inst.b));
+        build.ldr(x3, mem(rNativeContext, offsetof(NativeContext, returnFallback)));
+        build.blr(x3);
+
+        // reentry with x0=closure (NULL will trigger exit)
+        build.b(helpers.reentry);
         break;
+    case IrCmd::FORGLOOP:
+        // register layout: ra + 1 = table, ra + 2 = internal index, ra + 3 .. ra + aux = iteration variables
+        regs.assertAllFree();
+        // clear extra variables since we might have more than two
+        if (intOp(inst.b) > 2)
+        {
+            build.mov(w0, LUA_TNIL);
+            for (int i = 2; i < intOp(inst.b); ++i)
+                build.str(w0, mem(rBase, (vmRegOp(inst.a) + 3 + i) * sizeof(TValue) + offsetof(TValue, tt)));
+        }
+        // we use full iter fallback for now; in the future it could be worthwhile to accelerate array iteration here
+        build.mov(x0, rState);
+        build.ldr(x1, mem(rBase, (vmRegOp(inst.a) + 1) * sizeof(TValue) + offsetof(TValue, value.gc)));
+        build.ldr(w2, mem(rBase, (vmRegOp(inst.a) + 2) * sizeof(TValue) + offsetof(TValue, value.p)));
+        build.add(x3, rBase, uint16_t(vmRegOp(inst.a) * sizeof(TValue)));
+        build.ldr(x4, mem(rNativeContext, offsetof(NativeContext, forgLoopTableIter)));
+        build.blr(x4);
+        // note: no emitUpdateBase necessary because forgLoopTableIter does not reallocate stack
+        build.cbnz(w0, labelOp(inst.c));
+        jumpOrFallthrough(blockOp(inst.d), next);
+        break;
+    case IrCmd::FORGLOOP_FALLBACK:
+        regs.assertAllFree();
+        build.mov(x0, rState);
+        build.mov(w1, vmRegOp(inst.a));
+        build.mov(w2, intOp(inst.b));
+        build.ldr(x3, mem(rNativeContext, offsetof(NativeContext, forgLoopNonTableFallback)));
+        build.blr(x3);
+        emitUpdateBase(build);
+        build.cbnz(w0, labelOp(inst.c));
+        jumpOrFallthrough(blockOp(inst.d), next);
+        break;
+    case IrCmd::FORGPREP_XNEXT_FALLBACK:
+        regs.assertAllFree();
+        build.mov(x0, rState);
+        build.add(x1, rBase, uint16_t(vmRegOp(inst.b) * sizeof(TValue)));
+        build.mov(w2, uintOp(inst.a) + 1);
+        build.ldr(x3, mem(rNativeContext, offsetof(NativeContext, forgPrepXnextFallback)));
+        build.blr(x3);
+        // note: no emitUpdateBase necessary because forgPrepXnextFallback does not reallocate stack
+        jumpOrFallthrough(blockOp(inst.c), next);
+        break;
+    case IrCmd::COVERAGE:
+    {
+        RegisterA64 temp1 = regs.allocTemp(KindA64::x);
+        RegisterA64 temp2 = regs.allocTemp(KindA64::w);
+        RegisterA64 temp3 = regs.allocTemp(KindA64::w);
+
+        build.mov(temp1, uintOp(inst.a) * sizeof(Instruction));
+        build.ldr(temp2, mem(rCode, temp1));
+
+        // increments E (high 24 bits); if the result overflows a 23-bit counter, high bit becomes 1
+        // note: cmp can be eliminated with adds but we aren't concerned with code size for coverage
+        build.add(temp3, temp2, 256);
+        build.cmp(temp3, 0);
+        build.csel(temp2, temp2, temp3, ConditionA64::Less);
+
+        build.str(temp2, mem(rCode, temp1));
+        break;
+    }
 
         // Full instruction fallbacks
     case IrCmd::FALLBACK_GETGLOBAL:
@@ -1208,9 +1481,25 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
         regs.assertAllFree();
         emitFallback(build, LOP_DUPCLOSURE, uintOp(inst.a));
         break;
+    case IrCmd::FALLBACK_FORGPREP:
+        regs.assertAllFree();
+        emitFallback(build, LOP_FORGPREP, uintOp(inst.a));
+        jumpOrFallthrough(blockOp(inst.c), next);
+        break;
 
-    default:
-        LUAU_ASSERT(!"Not supported yet");
+    // Pseudo instructions
+    case IrCmd::NOP:
+    case IrCmd::SUBSTITUTE:
+        LUAU_ASSERT(!"Pseudo instructions should not be lowered");
+        break;
+
+    // Unsupported instructions
+    // Note: when adding implementations for these, please move the case: label so that implemented instructions match the order in IrData.h
+    case IrCmd::STORE_VECTOR:
+#ifdef TRACE
+        gStatsA64.missing++;
+#endif
+        error = true;
         break;
     }
 
@@ -1220,7 +1509,7 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
 
 bool IrLoweringA64::hasError() const
 {
-    return false;
+    return error;
 }
 
 bool IrLoweringA64::isFallthroughBlock(IrBlock target, IrBlock next)
@@ -1287,17 +1576,7 @@ AddressA64 IrLoweringA64::tempAddr(IrOp op, int offset)
 
         RegisterA64 temp = regs.allocTemp(KindA64::x);
 
-        // TODO: refactor into a common helper
-        if (constantOffset <= AssemblyBuilderA64::kMaxImmediate)
-        {
-            build.add(temp, rConstants, uint16_t(constantOffset));
-        }
-        else
-        {
-            build.mov(temp, int(constantOffset));
-            build.add(temp, rConstants, temp);
-        }
-
+        emitAddOffset(build, temp, rConstants, constantOffset);
         return temp;
     }
     // If we have a register, we assume it's a pointer to TValue
diff --git a/CodeGen/src/IrLoweringA64.h b/CodeGen/src/IrLoweringA64.h
index b374a26..0c9f874 100644
--- a/CodeGen/src/IrLoweringA64.h
+++ b/CodeGen/src/IrLoweringA64.h
@@ -26,8 +26,6 @@ struct IrLoweringA64
 {
     IrLoweringA64(AssemblyBuilderA64& build, ModuleHelpers& helpers, NativeState& data, Proto* proto, IrFunction& function);
 
-    static bool canLower(const IrFunction& function);
-
     void lowerInst(IrInst& inst, uint32_t index, IrBlock& next);
 
     bool hasError() const;
@@ -61,6 +59,8 @@ struct IrLoweringA64
     IrFunction& function;
 
     IrRegAllocA64 regs;
+
+    bool error = false;
 };
 
 } // namespace A64
diff --git a/CodeGen/src/IrLoweringX64.cpp b/CodeGen/src/IrLoweringX64.cpp
index f2dfdb3..51325a3 100644
--- a/CodeGen/src/IrLoweringX64.cpp
+++ b/CodeGen/src/IrLoweringX64.cpp
@@ -31,6 +31,8 @@ IrLoweringX64::IrLoweringX64(AssemblyBuilderX64& build, ModuleHelpers& helpers,
 {
     // In order to allocate registers during lowering, we need to know where instruction results are last used
     updateLastUseLocations(function);
+
+    build.align(kFunctionAlignment, X64::AlignmentDataX64::Ud2);
 }
 
 void IrLoweringX64::storeDoubleAsFloat(OperandX64 dst, IrOp src)
@@ -59,7 +61,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
     switch (inst.cmd)
     {
     case IrCmd::LOAD_TAG:
-        inst.regX64 = regs.allocGprReg(SizeX64::dword, index);
+        inst.regX64 = regs.allocReg(SizeX64::dword, index);
 
         if (inst.a.kind == IrOpKind::VmReg)
             build.mov(inst.regX64, luauRegTag(vmRegOp(inst.a)));
@@ -73,7 +75,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
             LUAU_ASSERT(!"Unsupported instruction form");
         break;
     case IrCmd::LOAD_POINTER:
-        inst.regX64 = regs.allocGprReg(SizeX64::qword, index);
+        inst.regX64 = regs.allocReg(SizeX64::qword, index);
 
         if (inst.a.kind == IrOpKind::VmReg)
             build.mov(inst.regX64, luauRegValue(vmRegOp(inst.a)));
@@ -87,7 +89,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
             LUAU_ASSERT(!"Unsupported instruction form");
         break;
     case IrCmd::LOAD_DOUBLE:
-        inst.regX64 = regs.allocXmmReg(index);
+        inst.regX64 = regs.allocReg(SizeX64::xmmword, index);
 
         if (inst.a.kind == IrOpKind::VmReg)
             build.vmovsd(inst.regX64, luauRegValue(vmRegOp(inst.a)));
@@ -97,12 +99,12 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
             LUAU_ASSERT(!"Unsupported instruction form");
         break;
     case IrCmd::LOAD_INT:
-        inst.regX64 = regs.allocGprReg(SizeX64::dword, index);
+        inst.regX64 = regs.allocReg(SizeX64::dword, index);
 
         build.mov(inst.regX64, luauRegValueInt(vmRegOp(inst.a)));
         break;
     case IrCmd::LOAD_TVALUE:
-        inst.regX64 = regs.allocXmmReg(index);
+        inst.regX64 = regs.allocReg(SizeX64::xmmword, index);
 
         if (inst.a.kind == IrOpKind::VmReg)
             build.vmovups(inst.regX64, luauReg(vmRegOp(inst.a)));
@@ -114,12 +116,12 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
             LUAU_ASSERT(!"Unsupported instruction form");
         break;
     case IrCmd::LOAD_NODE_VALUE_TV:
-        inst.regX64 = regs.allocXmmReg(index);
+        inst.regX64 = regs.allocReg(SizeX64::xmmword, index);
 
         build.vmovups(inst.regX64, luauNodeValue(regOp(inst.a)));
         break;
     case IrCmd::LOAD_ENV:
-        inst.regX64 = regs.allocGprReg(SizeX64::qword, index);
+        inst.regX64 = regs.allocReg(SizeX64::qword, index);
 
         build.mov(inst.regX64, sClosure);
         build.mov(inst.regX64, qword[inst.regX64 + offsetof(Closure, env)]);
@@ -127,7 +129,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
     case IrCmd::GET_ARR_ADDR:
         if (inst.b.kind == IrOpKind::Inst)
         {
-            inst.regX64 = regs.allocGprRegOrReuse(SizeX64::qword, index, {inst.b});
+            inst.regX64 = regs.allocRegOrReuse(SizeX64::qword, index, {inst.b});
 
             if (dwordReg(inst.regX64) != regOp(inst.b))
                 build.mov(dwordReg(inst.regX64), regOp(inst.b));
@@ -137,7 +139,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
         }
         else if (inst.b.kind == IrOpKind::Constant)
         {
-            inst.regX64 = regs.allocGprRegOrReuse(SizeX64::qword, index, {inst.a});
+            inst.regX64 = regs.allocRegOrReuse(SizeX64::qword, index, {inst.a});
 
             build.mov(inst.regX64, qword[regOp(inst.a) + offsetof(Table, array)]);
 
@@ -151,7 +153,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
         break;
     case IrCmd::GET_SLOT_NODE_ADDR:
     {
-        inst.regX64 = regs.allocGprReg(SizeX64::qword, index);
+        inst.regX64 = regs.allocReg(SizeX64::qword, index);
 
         ScopedRegX64 tmp{regs, SizeX64::qword};
 
@@ -160,11 +162,11 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
     }
     case IrCmd::GET_HASH_NODE_ADDR:
     {
-        inst.regX64 = regs.allocGprReg(SizeX64::qword, index);
-
         // Custom bit shift value can only be placed in cl
         ScopedRegX64 shiftTmp{regs, regs.takeReg(rcx, kInvalidInstIdx)};
 
+        inst.regX64 = regs.allocReg(SizeX64::qword, index);
+
         ScopedRegX64 tmp{regs, SizeX64::qword};
 
         build.mov(inst.regX64, qword[regOp(inst.a) + offsetof(Table, node)]);
@@ -232,7 +234,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
         build.vmovups(luauNodeValue(regOp(inst.a)), regOp(inst.b));
         break;
     case IrCmd::ADD_INT:
-        inst.regX64 = regs.allocGprRegOrReuse(SizeX64::dword, index, {inst.a});
+        inst.regX64 = regs.allocRegOrReuse(SizeX64::dword, index, {inst.a});
 
         if (inst.regX64 == regOp(inst.a) && intOp(inst.b) == 1)
             build.inc(inst.regX64);
@@ -242,7 +244,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
             build.lea(inst.regX64, addr[regOp(inst.a) + intOp(inst.b)]);
         break;
     case IrCmd::SUB_INT:
-        inst.regX64 = regs.allocGprRegOrReuse(SizeX64::dword, index, {inst.a});
+        inst.regX64 = regs.allocRegOrReuse(SizeX64::dword, index, {inst.a});
 
         if (inst.regX64 == regOp(inst.a) && intOp(inst.b) == 1)
             build.dec(inst.regX64);
@@ -252,7 +254,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
             build.lea(inst.regX64, addr[regOp(inst.a) - intOp(inst.b)]);
         break;
     case IrCmd::ADD_NUM:
-        inst.regX64 = regs.allocXmmRegOrReuse(index, {inst.a, inst.b});
+        inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a, inst.b});
 
         if (inst.a.kind == IrOpKind::Constant)
         {
@@ -267,7 +269,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
         }
         break;
     case IrCmd::SUB_NUM:
-        inst.regX64 = regs.allocXmmRegOrReuse(index, {inst.a, inst.b});
+        inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a, inst.b});
 
         if (inst.a.kind == IrOpKind::Constant)
         {
@@ -282,7 +284,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
         }
         break;
     case IrCmd::MUL_NUM:
-        inst.regX64 = regs.allocXmmRegOrReuse(index, {inst.a, inst.b});
+        inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a, inst.b});
 
         if (inst.a.kind == IrOpKind::Constant)
         {
@@ -297,7 +299,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
         }
         break;
     case IrCmd::DIV_NUM:
-        inst.regX64 = regs.allocXmmRegOrReuse(index, {inst.a, inst.b});
+        inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a, inst.b});
 
         if (inst.a.kind == IrOpKind::Constant)
         {
@@ -313,7 +315,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
         break;
     case IrCmd::MOD_NUM:
     {
-        inst.regX64 = regs.allocXmmRegOrReuse(index, {inst.a, inst.b});
+        inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a, inst.b});
 
         ScopedRegX64 optLhsTmp{regs};
         RegisterX64 lhs;
@@ -362,7 +364,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
         break;
     }
     case IrCmd::MIN_NUM:
-        inst.regX64 = regs.allocXmmRegOrReuse(index, {inst.a, inst.b});
+        inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a, inst.b});
 
         if (inst.a.kind == IrOpKind::Constant)
         {
@@ -377,7 +379,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
         }
         break;
     case IrCmd::MAX_NUM:
-        inst.regX64 = regs.allocXmmRegOrReuse(index, {inst.a, inst.b});
+        inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a, inst.b});
 
         if (inst.a.kind == IrOpKind::Constant)
         {
@@ -393,7 +395,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
         break;
     case IrCmd::UNM_NUM:
     {
-        inst.regX64 = regs.allocXmmRegOrReuse(index, {inst.a});
+        inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a});
 
         RegisterX64 src = regOp(inst.a);
 
@@ -410,18 +412,18 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
         break;
     }
     case IrCmd::FLOOR_NUM:
-        inst.regX64 = regs.allocXmmRegOrReuse(index, {inst.a});
+        inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a});
 
         build.vroundsd(inst.regX64, inst.regX64, memRegDoubleOp(inst.a), RoundingModeX64::RoundToNegativeInfinity);
         break;
     case IrCmd::CEIL_NUM:
-        inst.regX64 = regs.allocXmmRegOrReuse(index, {inst.a});
+        inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a});
 
         build.vroundsd(inst.regX64, inst.regX64, memRegDoubleOp(inst.a), RoundingModeX64::RoundToPositiveInfinity);
         break;
     case IrCmd::ROUND_NUM:
     {
-        inst.regX64 = regs.allocXmmRegOrReuse(index, {inst.a});
+        inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a});
 
         ScopedRegX64 tmp1{regs, SizeX64::xmmword};
         ScopedRegX64 tmp2{regs, SizeX64::xmmword};
@@ -439,12 +441,12 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
         break;
     }
     case IrCmd::SQRT_NUM:
-        inst.regX64 = regs.allocXmmRegOrReuse(index, {inst.a});
+        inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a});
 
         build.vsqrtsd(inst.regX64, inst.regX64, memRegDoubleOp(inst.a));
         break;
     case IrCmd::ABS_NUM:
-        inst.regX64 = regs.allocXmmRegOrReuse(index, {inst.a});
+        inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a});
 
         if (inst.a.kind != IrOpKind::Inst)
             build.vmovsd(inst.regX64, memRegDoubleOp(inst.a));
@@ -456,7 +458,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
     case IrCmd::NOT_ANY:
     {
         // TODO: if we have a single user which is a STORE_INT, we are missing the opportunity to write directly to target
-        inst.regX64 = regs.allocGprRegOrReuse(SizeX64::dword, index, {inst.a, inst.b});
+        inst.regX64 = regs.allocRegOrReuse(SizeX64::dword, index, {inst.a, inst.b});
 
         Label saveone, savezero, exit;
 
@@ -558,7 +560,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
         callWrap.addArgument(SizeX64::qword, regOp(inst.a), inst.a);
         callWrap.call(qword[rNativeContext + offsetof(NativeContext, luaH_getn)]);
 
-        inst.regX64 = regs.allocXmmReg(index);
+        inst.regX64 = regs.allocReg(SizeX64::xmmword, index);
         build.vcvtsi2sd(inst.regX64, inst.regX64, eax);
         break;
     }
@@ -566,8 +568,8 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
     {
         IrCallWrapperX64 callWrap(regs, build, index);
         callWrap.addArgument(SizeX64::qword, rState);
-        callWrap.addArgument(SizeX64::dword, int32_t(uintOp(inst.a)), inst.a);
-        callWrap.addArgument(SizeX64::dword, int32_t(uintOp(inst.b)), inst.b);
+        callWrap.addArgument(SizeX64::dword, int32_t(uintOp(inst.a)));
+        callWrap.addArgument(SizeX64::dword, int32_t(uintOp(inst.b)));
         callWrap.call(qword[rNativeContext + offsetof(NativeContext, luaH_new)]);
         inst.regX64 = regs.takeReg(rax, index);
         break;
@@ -583,7 +585,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
     }
     case IrCmd::TRY_NUM_TO_INDEX:
     {
-        inst.regX64 = regs.allocGprReg(SizeX64::dword, index);
+        inst.regX64 = regs.allocReg(SizeX64::dword, index);
 
         ScopedRegX64 tmp{regs, SizeX64::xmmword};
 
@@ -620,7 +622,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
         break;
     }
     case IrCmd::INT_TO_NUM:
-        inst.regX64 = regs.allocXmmReg(index);
+        inst.regX64 = regs.allocReg(SizeX64::xmmword, index);
 
         build.vcvtsi2sd(inst.regX64, inst.regX64, regOp(inst.a));
         break;
@@ -688,11 +690,10 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
 
         if (nparams == LUA_MULTRET)
         {
-            // Compute 'L->top - (ra + 1)', on SystemV, take r9 register to compute directly into the argument
-            // TODO: IrCallWrapperX64 should provide a way to 'guess' target argument register correctly
-            RegisterX64 reg = build.abi == ABIX64::Windows ? regs.allocGprReg(SizeX64::qword, kInvalidInstIdx) : regs.takeReg(rArg6, kInvalidInstIdx);
+            RegisterX64 reg = callWrap.suggestNextArgumentRegister(SizeX64::qword);
             ScopedRegX64 tmp{regs, SizeX64::qword};
 
+            // L->top - (ra + 1)
             build.mov(reg, qword[rState + offsetof(lua_State, top)]);
             build.lea(tmp.reg, addr[rBase + (ra + 1) * sizeof(TValue)]);
             build.sub(reg, tmp.reg);
@@ -759,9 +760,35 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
         }
         break;
     case IrCmd::GET_IMPORT:
-        regs.assertAllFree();
-        emitInstGetImportFallback(build, vmRegOp(inst.a), uintOp(inst.b));
+    {
+        ScopedRegX64 tmp1{regs, SizeX64::qword};
+
+        build.mov(tmp1.reg, sClosure);
+
+        IrCallWrapperX64 callWrap(regs, build, index);
+        callWrap.addArgument(SizeX64::qword, rState);
+        callWrap.addArgument(SizeX64::qword, qword[tmp1.release() + offsetof(Closure, env)]);
+        callWrap.addArgument(SizeX64::qword, rConstants);
+        callWrap.addArgument(SizeX64::dword, uintOp(inst.b));
+        callWrap.addArgument(SizeX64::dword, 0);
+        callWrap.call(qword[rNativeContext + offsetof(NativeContext, luaV_getimport)]);
+
+        emitUpdateBase(build);
+
+        ScopedRegX64 tmp2{regs, SizeX64::qword};
+
+        // setobj2s(L, ra, L->top - 1)
+        build.mov(tmp2.reg, qword[rState + offsetof(lua_State, top)]);
+        build.sub(tmp2.reg, sizeof(TValue));
+
+        ScopedRegX64 tmp3{regs, SizeX64::xmmword};
+        build.vmovups(tmp3.reg, xmmword[tmp2.reg]);
+        build.vmovups(luauReg(vmRegOp(inst.a)), tmp3.reg);
+
+        // L->top--
+        build.mov(qword[rState + offsetof(lua_State, top)], tmp2.reg);
         break;
+    }
     case IrCmd::CONCAT:
     {
         IrCallWrapperX64 callWrap(regs, build, index);
@@ -783,7 +810,6 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
 
         // uprefs[] is either an actual value, or it points to UpVal object which has a pointer to value
         Label skip;
-        // TODO: jumpIfTagIsNot can be generalized to take OperandX64 and then we can use it here; let's wait until we see this more though
         build.cmp(dword[tmp1.reg + offsetof(TValue, tt)], LUA_TUPVAL);
         build.jcc(ConditionX64::NotEqual, skip);
 
@@ -822,36 +848,25 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
         callPrepareForN(regs, build, vmRegOp(inst.a), vmRegOp(inst.b), vmRegOp(inst.c));
         break;
     case IrCmd::CHECK_TAG:
-        if (inst.a.kind == IrOpKind::Inst)
-        {
-            build.cmp(regOp(inst.a), tagOp(inst.b));
-            build.jcc(ConditionX64::NotEqual, labelOp(inst.c));
-        }
-        else if (inst.a.kind == IrOpKind::VmReg)
-        {
-            jumpIfTagIsNot(build, vmRegOp(inst.a), lua_Type(tagOp(inst.b)), labelOp(inst.c));
-        }
-        else if (inst.a.kind == IrOpKind::VmConst)
-        {
-            build.cmp(luauConstantTag(vmConstOp(inst.a)), tagOp(inst.b));
-            build.jcc(ConditionX64::NotEqual, labelOp(inst.c));
-        }
-        else
-        {
-            LUAU_ASSERT(!"Unsupported instruction form");
-        }
+        build.cmp(memRegTagOp(inst.a), tagOp(inst.b));
+        build.jcc(ConditionX64::NotEqual, labelOp(inst.c));
         break;
     case IrCmd::CHECK_READONLY:
-        jumpIfTableIsReadOnly(build, regOp(inst.a), labelOp(inst.b));
+        build.cmp(byte[regOp(inst.a) + offsetof(Table, readonly)], 0);
+        build.jcc(ConditionX64::NotEqual, labelOp(inst.b));
         break;
     case IrCmd::CHECK_NO_METATABLE:
-        jumpIfMetatablePresent(build, regOp(inst.a), labelOp(inst.b));
+        build.cmp(qword[regOp(inst.a) + offsetof(Table, metatable)], 0);
+        build.jcc(ConditionX64::NotEqual, labelOp(inst.b));
         break;
     case IrCmd::CHECK_SAFE_ENV:
     {
         ScopedRegX64 tmp{regs, SizeX64::qword};
 
-        jumpIfUnsafeEnv(build, tmp.reg, labelOp(inst.a));
+        build.mov(tmp.reg, sClosure);
+        build.mov(tmp.reg, qword[tmp.reg + offsetof(Closure, env)]);
+        build.cmp(byte[tmp.reg + offsetof(Table, safeenv)], 0);
+        build.jcc(ConditionX64::Equal, labelOp(inst.a));
         break;
     }
     case IrCmd::CHECK_ARRAY_SIZE:
@@ -872,11 +887,16 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
         break;
     }
     case IrCmd::CHECK_NODE_NO_NEXT:
-        jumpIfNodeHasNext(build, regOp(inst.a), labelOp(inst.b));
+    {
+        ScopedRegX64 tmp{regs, SizeX64::dword};
+
+        build.mov(tmp.reg, dword[regOp(inst.a) + offsetof(LuaNode, key) + kOffsetOfTKeyNext]);
+        build.shr(tmp.reg, kNextBitOffset);
+        build.jcc(ConditionX64::NotZero, labelOp(inst.b));
         break;
+    }
     case IrCmd::INTERRUPT:
-        regs.assertAllFree();
-        emitInterrupt(build, uintOp(inst.a));
+        emitInterrupt(regs, build, uintOp(inst.a));
         break;
     case IrCmd::CHECK_GC:
         callStepGc(regs, build);
@@ -970,94 +990,127 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
         break;
     case IrCmd::FORGLOOP:
         regs.assertAllFree();
-        emitinstForGLoop(build, vmRegOp(inst.a), intOp(inst.b), labelOp(inst.c), labelOp(inst.d));
+        emitInstForGLoop(build, vmRegOp(inst.a), intOp(inst.b), labelOp(inst.c));
+        jumpOrFallthrough(blockOp(inst.d), next);
         break;
     case IrCmd::FORGLOOP_FALLBACK:
-        regs.assertAllFree();
-        emitinstForGLoopFallback(build, vmRegOp(inst.a), intOp(inst.b), labelOp(inst.c));
-        build.jmp(labelOp(inst.d));
+    {
+        IrCallWrapperX64 callWrap(regs, build, index);
+        callWrap.addArgument(SizeX64::qword, rState);
+        callWrap.addArgument(SizeX64::dword, vmRegOp(inst.a));
+        callWrap.addArgument(SizeX64::dword, intOp(inst.b));
+        callWrap.call(qword[rNativeContext + offsetof(NativeContext, forgLoopNonTableFallback)]);
+
+        emitUpdateBase(build);
+
+        build.test(al, al);
+        build.jcc(ConditionX64::NotZero, labelOp(inst.c));
+        jumpOrFallthrough(blockOp(inst.d), next);
         break;
+    }
     case IrCmd::FORGPREP_XNEXT_FALLBACK:
-        regs.assertAllFree();
-        emitInstForGPrepXnextFallback(build, uintOp(inst.a), vmRegOp(inst.b), labelOp(inst.c));
+    {
+        IrCallWrapperX64 callWrap(regs, build, index);
+        callWrap.addArgument(SizeX64::qword, rState);
+        callWrap.addArgument(SizeX64::qword, luauRegAddress(vmRegOp(inst.b)));
+        callWrap.addArgument(SizeX64::dword, uintOp(inst.a) + 1);
+        callWrap.call(qword[rNativeContext + offsetof(NativeContext, forgPrepXnextFallback)]);
+        jumpOrFallthrough(blockOp(inst.c), next);
         break;
+    }
     case IrCmd::COVERAGE:
-        regs.assertAllFree();
-        emitInstCoverage(build, uintOp(inst.a));
+    {
+        ScopedRegX64 tmp1{regs, SizeX64::qword};
+        ScopedRegX64 tmp2{regs, SizeX64::dword};
+        ScopedRegX64 tmp3{regs, SizeX64::dword};
+
+        build.mov(tmp1.reg, sCode);
+        build.add(tmp1.reg, uintOp(inst.a) * sizeof(Instruction));
+
+        // hits = LUAU_INSN_E(*pc)
+        build.mov(tmp2.reg, dword[tmp1.reg]);
+        build.sar(tmp2.reg, 8);
+
+        // hits = (hits < (1 << 23) - 1) ? hits + 1 : hits;
+        build.xor_(tmp3.reg, tmp3.reg);
+        build.cmp(tmp2.reg, (1 << 23) - 1);
+        build.setcc(ConditionX64::NotEqual, byteReg(tmp3.reg));
+        build.add(tmp2.reg, tmp3.reg);
+
+        // VM_PATCH_E(pc, hits);
+        build.sal(tmp2.reg, 8);
+        build.movzx(tmp3.reg, byte[tmp1.reg]);
+        build.or_(tmp3.reg, tmp2.reg);
+        build.mov(dword[tmp1.reg], tmp3.reg);
         break;
+    }
 
         // Full instruction fallbacks
     case IrCmd::FALLBACK_GETGLOBAL:
         LUAU_ASSERT(inst.b.kind == IrOpKind::VmReg);
         LUAU_ASSERT(inst.c.kind == IrOpKind::VmConst);
 
-        regs.assertAllFree();
-        emitFallback(build, data, LOP_GETGLOBAL, uintOp(inst.a));
+        emitFallback(regs, build, data, LOP_GETGLOBAL, uintOp(inst.a));
         break;
     case IrCmd::FALLBACK_SETGLOBAL:
         LUAU_ASSERT(inst.b.kind == IrOpKind::VmReg);
         LUAU_ASSERT(inst.c.kind == IrOpKind::VmConst);
 
-        regs.assertAllFree();
-        emitFallback(build, data, LOP_SETGLOBAL, uintOp(inst.a));
+        emitFallback(regs, build, data, LOP_SETGLOBAL, uintOp(inst.a));
         break;
     case IrCmd::FALLBACK_GETTABLEKS:
         LUAU_ASSERT(inst.b.kind == IrOpKind::VmReg);
         LUAU_ASSERT(inst.c.kind == IrOpKind::VmReg);
         LUAU_ASSERT(inst.d.kind == IrOpKind::VmConst);
 
-        regs.assertAllFree();
-        emitFallback(build, data, LOP_GETTABLEKS, uintOp(inst.a));
+        emitFallback(regs, build, data, LOP_GETTABLEKS, uintOp(inst.a));
         break;
     case IrCmd::FALLBACK_SETTABLEKS:
         LUAU_ASSERT(inst.b.kind == IrOpKind::VmReg);
         LUAU_ASSERT(inst.c.kind == IrOpKind::VmReg);
         LUAU_ASSERT(inst.d.kind == IrOpKind::VmConst);
 
-        regs.assertAllFree();
-        emitFallback(build, data, LOP_SETTABLEKS, uintOp(inst.a));
+        emitFallback(regs, build, data, LOP_SETTABLEKS, uintOp(inst.a));
         break;
     case IrCmd::FALLBACK_NAMECALL:
         LUAU_ASSERT(inst.b.kind == IrOpKind::VmReg);
         LUAU_ASSERT(inst.c.kind == IrOpKind::VmReg);
         LUAU_ASSERT(inst.d.kind == IrOpKind::VmConst);
 
-        regs.assertAllFree();
-        emitFallback(build, data, LOP_NAMECALL, uintOp(inst.a));
+        emitFallback(regs, build, data, LOP_NAMECALL, uintOp(inst.a));
         break;
     case IrCmd::FALLBACK_PREPVARARGS:
         LUAU_ASSERT(inst.b.kind == IrOpKind::Constant);
 
-        regs.assertAllFree();
-        emitFallback(build, data, LOP_PREPVARARGS, uintOp(inst.a));
+        emitFallback(regs, build, data, LOP_PREPVARARGS, uintOp(inst.a));
         break;
     case IrCmd::FALLBACK_GETVARARGS:
         LUAU_ASSERT(inst.b.kind == IrOpKind::VmReg);
         LUAU_ASSERT(inst.c.kind == IrOpKind::Constant);
 
-        regs.assertAllFree();
-        emitFallback(build, data, LOP_GETVARARGS, uintOp(inst.a));
+        emitFallback(regs, build, data, LOP_GETVARARGS, uintOp(inst.a));
         break;
     case IrCmd::FALLBACK_NEWCLOSURE:
         LUAU_ASSERT(inst.b.kind == IrOpKind::VmReg);
         LUAU_ASSERT(inst.c.kind == IrOpKind::Constant);
 
-        regs.assertAllFree();
-        emitFallback(build, data, LOP_NEWCLOSURE, uintOp(inst.a));
+        emitFallback(regs, build, data, LOP_NEWCLOSURE, uintOp(inst.a));
         break;
     case IrCmd::FALLBACK_DUPCLOSURE:
         LUAU_ASSERT(inst.b.kind == IrOpKind::VmReg);
         LUAU_ASSERT(inst.c.kind == IrOpKind::VmConst);
 
-        regs.assertAllFree();
-        emitFallback(build, data, LOP_DUPCLOSURE, uintOp(inst.a));
+        emitFallback(regs, build, data, LOP_DUPCLOSURE, uintOp(inst.a));
         break;
     case IrCmd::FALLBACK_FORGPREP:
-        regs.assertAllFree();
-        emitFallback(build, data, LOP_FORGPREP, uintOp(inst.a));
+        emitFallback(regs, build, data, LOP_FORGPREP, uintOp(inst.a));
+        jumpOrFallthrough(blockOp(inst.c), next);
         break;
-    default:
-        LUAU_ASSERT(!"Not supported yet");
+
+    // Pseudo instructions
+    case IrCmd::NOP:
+    case IrCmd::SUBSTITUTE:
+        LUAU_ASSERT(!"Pseudo instructions should not be lowered");
         break;
     }
 
diff --git a/CodeGen/src/IrRegAllocA64.cpp b/CodeGen/src/IrRegAllocA64.cpp
index c6db9e9..9a06cf6 100644
--- a/CodeGen/src/IrRegAllocA64.cpp
+++ b/CodeGen/src/IrRegAllocA64.cpp
@@ -1,9 +1,7 @@
 // This file is part of the Luau programming language and is licensed under MIT License; see LICENSE.txt for details
 #include "IrRegAllocA64.h"
 
-#ifdef _MSC_VER
-#include <intrin.h>
-#endif
+#include "BitUtils.h"
 
 namespace Luau
 {
@@ -12,19 +10,6 @@ namespace CodeGen
 namespace A64
 {
 
-inline int setBit(uint32_t n)
-{
-    LUAU_ASSERT(n);
-
-#ifdef _MSC_VER
-    unsigned long rl;
-    _BitScanReverse(&rl, n);
-    return int(rl);
-#else
-    return 31 - __builtin_clz(n);
-#endif
-}
-
 IrRegAllocA64::IrRegAllocA64(IrFunction& function, std::initializer_list<std::pair<RegisterA64, RegisterA64>> regs)
     : function(function)
 {
@@ -52,7 +37,7 @@ RegisterA64 IrRegAllocA64::allocReg(KindA64 kind)
         return noreg;
     }
 
-    int index = setBit(set.free);
+    int index = 31 - countlz(set.free);
     set.free &= ~(1u << index);
 
     return RegisterA64{kind, uint8_t(index)};
@@ -68,7 +53,7 @@ RegisterA64 IrRegAllocA64::allocTemp(KindA64 kind)
         return noreg;
     }
 
-    int index = setBit(set.free);
+    int index = 31 - countlz(set.free);
 
     set.free &= ~(1u << index);
     set.temp |= 1u << index;
diff --git a/CodeGen/src/IrRegAllocX64.cpp b/CodeGen/src/IrRegAllocX64.cpp
index dc9e7f9..24d8f51 100644
--- a/CodeGen/src/IrRegAllocX64.cpp
+++ b/CodeGen/src/IrRegAllocX64.cpp
@@ -1,6 +1,8 @@
 // This file is part of the Luau programming language and is licensed under MIT License; see LICENSE.txt for details
 #include "Luau/IrRegAllocX64.h"
 
+#include "Luau/IrUtils.h"
+
 #include "EmitCommonX64.h"
 
 namespace Luau
@@ -12,11 +14,6 @@ namespace X64
 
 static const RegisterX64 kGprAllocOrder[] = {rax, rdx, rcx, rbx, rsi, rdi, r8, r9, r10, r11};
 
-static bool isFullTvalueOperand(IrCmd cmd)
-{
-    return cmd == IrCmd::LOAD_TVALUE || cmd == IrCmd::LOAD_NODE_VALUE_TV;
-}
-
 IrRegAllocX64::IrRegAllocX64(AssemblyBuilderX64& build, IrFunction& function)
     : build(build)
     , function(function)
@@ -27,50 +24,43 @@ IrRegAllocX64::IrRegAllocX64(AssemblyBuilderX64& build, IrFunction& function)
     xmmInstUsers.fill(kInvalidInstIdx);
 }
 
-RegisterX64 IrRegAllocX64::allocGprReg(SizeX64 preferredSize, uint32_t instIdx)
+RegisterX64 IrRegAllocX64::allocReg(SizeX64 size, uint32_t instIdx)
 {
-    LUAU_ASSERT(
-        preferredSize == SizeX64::byte || preferredSize == SizeX64::word || preferredSize == SizeX64::dword || preferredSize == SizeX64::qword);
-
-    for (RegisterX64 reg : kGprAllocOrder)
+    if (size == SizeX64::xmmword)
     {
-        if (freeGprMap[reg.index])
+        for (size_t i = 0; i < freeXmmMap.size(); ++i)
         {
-            freeGprMap[reg.index] = false;
-            gprInstUsers[reg.index] = instIdx;
-            return RegisterX64{preferredSize, reg.index};
+            if (freeXmmMap[i])
+            {
+                freeXmmMap[i] = false;
+                xmmInstUsers[i] = instIdx;
+                return RegisterX64{size, uint8_t(i)};
+            }
         }
     }
-
-    // If possible, spill the value with the furthest next use
-    if (uint32_t furthestUseTarget = findInstructionWithFurthestNextUse(gprInstUsers); furthestUseTarget != kInvalidInstIdx)
-        return takeReg(function.instructions[furthestUseTarget].regX64, instIdx);
-
-    LUAU_ASSERT(!"Out of GPR registers to allocate");
-    return noreg;
-}
-
-RegisterX64 IrRegAllocX64::allocXmmReg(uint32_t instIdx)
-{
-    for (size_t i = 0; i < freeXmmMap.size(); ++i)
+    else
     {
-        if (freeXmmMap[i])
+        for (RegisterX64 reg : kGprAllocOrder)
         {
-            freeXmmMap[i] = false;
-            xmmInstUsers[i] = instIdx;
-            return RegisterX64{SizeX64::xmmword, uint8_t(i)};
+            if (freeGprMap[reg.index])
+            {
+                freeGprMap[reg.index] = false;
+                gprInstUsers[reg.index] = instIdx;
+                return RegisterX64{size, reg.index};
+            }
         }
     }
 
     // Out of registers, spill the value with the furthest next use
-    if (uint32_t furthestUseTarget = findInstructionWithFurthestNextUse(xmmInstUsers); furthestUseTarget != kInvalidInstIdx)
+    const std::array<uint32_t, 16>& regInstUsers = size == SizeX64::xmmword ? xmmInstUsers : gprInstUsers;
+    if (uint32_t furthestUseTarget = findInstructionWithFurthestNextUse(regInstUsers); furthestUseTarget != kInvalidInstIdx)
         return takeReg(function.instructions[furthestUseTarget].regX64, instIdx);
 
-    LUAU_ASSERT(!"Out of XMM registers to allocate");
+    LUAU_ASSERT(!"Out of registers to allocate");
     return noreg;
 }
 
-RegisterX64 IrRegAllocX64::allocGprRegOrReuse(SizeX64 preferredSize, uint32_t instIdx, std::initializer_list<IrOp> oprefs)
+RegisterX64 IrRegAllocX64::allocRegOrReuse(SizeX64 size, uint32_t instIdx, std::initializer_list<IrOp> oprefs)
 {
     for (IrOp op : oprefs)
     {
@@ -81,39 +71,24 @@ RegisterX64 IrRegAllocX64::allocGprRegOrReuse(SizeX64 preferredSize, uint32_t in
 
         if (source.lastUse == instIdx && !source.reusedReg && !source.spilled)
         {
-            LUAU_ASSERT(source.regX64.size != SizeX64::xmmword);
+            // Sizes are not compared directly: only the register class (XMM vs. GPR) has to match for reuse
+            if ((size == SizeX64::xmmword) != (source.regX64.size == SizeX64::xmmword))
+                continue;
+
             LUAU_ASSERT(source.regX64 != noreg);
 
             source.reusedReg = true;
-            gprInstUsers[source.regX64.index] = instIdx;
-            return RegisterX64{preferredSize, source.regX64.index};
+
+            if (size == SizeX64::xmmword)
+                xmmInstUsers[source.regX64.index] = instIdx;
+            else
+                gprInstUsers[source.regX64.index] = instIdx;
+
+            return RegisterX64{size, source.regX64.index};
         }
     }
 
-    return allocGprReg(preferredSize, instIdx);
-}
-
-RegisterX64 IrRegAllocX64::allocXmmRegOrReuse(uint32_t instIdx, std::initializer_list<IrOp> oprefs)
-{
-    for (IrOp op : oprefs)
-    {
-        if (op.kind != IrOpKind::Inst)
-            continue;
-
-        IrInst& source = function.instructions[op.index];
-
-        if (source.lastUse == instIdx && !source.reusedReg && !source.spilled)
-        {
-            LUAU_ASSERT(source.regX64.size == SizeX64::xmmword);
-            LUAU_ASSERT(source.regX64 != noreg);
-
-            source.reusedReg = true;
-            xmmInstUsers[source.regX64.index] = instIdx;
-            return source.regX64;
-        }
-    }
-
-    return allocXmmReg(instIdx);
+    return allocReg(size, instIdx);
 }
 
 RegisterX64 IrRegAllocX64::takeReg(RegisterX64 reg, uint32_t instIdx)
@@ -197,41 +172,34 @@ bool IrRegAllocX64::isLastUseReg(const IrInst& target, uint32_t instIdx) const
 
 void IrRegAllocX64::preserve(IrInst& inst)
 {
-    bool doubleSlot = isFullTvalueOperand(inst.cmd);
+    IrSpillX64 spill;
+    spill.instIdx = function.getInstIndex(inst);
+    spill.valueKind = getCmdValueKind(inst.cmd);
+    spill.spillId = nextSpillId++;
+    spill.originalLoc = inst.regX64;
 
-    // Find a free stack slot. Two consecutive slots might be required for 16 byte TValues, so '- 1' is used
-    for (unsigned i = 0; i < unsigned(usedSpillSlots.size() - 1); ++i)
+    // Loads from VmReg/VmConst don't have to be spilled; they can be reloaded from the VM register/constant later
+    if (!hasRestoreOp(inst))
     {
-        if (usedSpillSlots.test(i))
-            continue;
+        unsigned i = findSpillStackSlot(spill.valueKind);
 
-        if (doubleSlot && usedSpillSlots.test(i + 1))
-        {
-            ++i; // No need to retest this double position
-            continue;
-        }
-
-        if (inst.regX64.size == SizeX64::xmmword && doubleSlot)
-        {
+        if (spill.valueKind == IrValueKind::Tvalue)
             build.vmovups(xmmword[sSpillArea + i * 8], inst.regX64);
-        }
-        else if (inst.regX64.size == SizeX64::xmmword)
-        {
+        else if (spill.valueKind == IrValueKind::Double)
             build.vmovsd(qword[sSpillArea + i * 8], inst.regX64);
-        }
+        else if (spill.valueKind == IrValueKind::Pointer)
+            build.mov(qword[sSpillArea + i * 8], inst.regX64);
+        else if (spill.valueKind == IrValueKind::Tag || spill.valueKind == IrValueKind::Int)
+            build.mov(dword[sSpillArea + i * 8], inst.regX64);
         else
-        {
-            OperandX64 location = addr[sSpillArea + i * 8];
-            location.memSize = inst.regX64.size; // Override memory access size
-            build.mov(location, inst.regX64);
-        }
+            LUAU_ASSERT(!"unsupported value kind");
 
         usedSpillSlots.set(i);
 
         if (i + 1 > maxUsedSlot)
             maxUsedSlot = i + 1;
 
-        if (doubleSlot)
+        if (spill.valueKind == IrValueKind::Tvalue)
         {
             usedSpillSlots.set(i + 1);
 
@@ -239,22 +207,15 @@ void IrRegAllocX64::preserve(IrInst& inst)
                 maxUsedSlot = i + 2;
         }
 
-        IrSpillX64 spill;
-        spill.instIdx = function.getInstIndex(inst);
-        spill.useDoubleSlot = doubleSlot;
         spill.stackSlot = uint8_t(i);
-        spill.originalLoc = inst.regX64;
-
-        spills.push_back(spill);
-
-        freeReg(inst.regX64);
-
-        inst.regX64 = noreg;
-        inst.spilled = true;
-        return;
     }
 
-    LUAU_ASSERT(!"nowhere to spill");
+    spills.push_back(spill);
+
+    freeReg(inst.regX64);
+
+    inst.regX64 = noreg;
+    inst.spilled = true;
 }
 
 void IrRegAllocX64::restore(IrInst& inst, bool intoOriginalLocation)
@@ -267,35 +228,34 @@ void IrRegAllocX64::restore(IrInst& inst, bool intoOriginalLocation)
 
         if (spill.instIdx == instIdx)
         {
-            LUAU_ASSERT(spill.stackSlot != kNoStackSlot);
-            RegisterX64 reg;
+            RegisterX64 reg = intoOriginalLocation ? takeReg(spill.originalLoc, instIdx) : allocReg(spill.originalLoc.size, instIdx);
+            OperandX64 restoreLocation = noreg;
 
-            if (spill.originalLoc.size == SizeX64::xmmword)
+            if (spill.stackSlot != kNoStackSlot)
             {
-                reg = intoOriginalLocation ? takeReg(spill.originalLoc, instIdx) : allocXmmReg(instIdx);
+                restoreLocation = addr[sSpillArea + spill.stackSlot * 8];
+                restoreLocation.memSize = reg.size;
 
-                if (spill.useDoubleSlot)
-                    build.vmovups(reg, xmmword[sSpillArea + spill.stackSlot * 8]);
-                else
-                    build.vmovsd(reg, qword[sSpillArea + spill.stackSlot * 8]);
+                usedSpillSlots.set(spill.stackSlot, false);
+
+                if (spill.valueKind == IrValueKind::Tvalue)
+                    usedSpillSlots.set(spill.stackSlot + 1, false);
             }
             else
             {
-                reg = intoOriginalLocation ? takeReg(spill.originalLoc, instIdx) : allocGprReg(spill.originalLoc.size, instIdx);
-
-                OperandX64 location = addr[sSpillArea + spill.stackSlot * 8];
-                location.memSize = reg.size; // Override memory access size
-                build.mov(reg, location);
+                restoreLocation = getRestoreAddress(inst, getRestoreOp(inst));
             }
 
+            if (spill.valueKind == IrValueKind::Tvalue)
+                build.vmovups(reg, restoreLocation);
+            else if (spill.valueKind == IrValueKind::Double)
+                build.vmovsd(reg, restoreLocation);
+            else
+                build.mov(reg, restoreLocation);
+
             inst.regX64 = reg;
             inst.spilled = false;
 
-            usedSpillSlots.set(spill.stackSlot, false);
-
-            if (spill.useDoubleSlot)
-                usedSpillSlots.set(spill.stackSlot + 1, false);
-
             spills[i] = spills.back();
             spills.pop_back();
             return;
@@ -334,6 +294,81 @@ bool IrRegAllocX64::shouldFreeGpr(RegisterX64 reg) const
     return false;
 }
 
+unsigned IrRegAllocX64::findSpillStackSlot(IrValueKind valueKind)
+{
+    // Find a free stack slot. Two consecutive slots might be required for 16 byte TValues, so '- 1' is used
+    for (unsigned i = 0; i < unsigned(usedSpillSlots.size() - 1); ++i)
+    {
+        if (usedSpillSlots.test(i))
+            continue;
+
+        if (valueKind == IrValueKind::Tvalue && usedSpillSlots.test(i + 1))
+        {
+            ++i; // No need to retest this double position
+            continue;
+        }
+
+        return i;
+    }
+
+    LUAU_ASSERT(!"nowhere to spill");
+    return ~0u;
+}
+
+IrOp IrRegAllocX64::getRestoreOp(const IrInst& inst) const
+{
+    switch (inst.cmd)
+    {
+    case IrCmd::LOAD_TAG:
+    case IrCmd::LOAD_POINTER:
+    case IrCmd::LOAD_DOUBLE:
+    case IrCmd::LOAD_INT:
+    case IrCmd::LOAD_TVALUE:
+    {
+        IrOp location = inst.a;
+
+        // Might have an alternative location
+        if (IrOp alternative = function.findRestoreOp(inst); alternative.kind != IrOpKind::None)
+            location = alternative;
+
+        if (location.kind == IrOpKind::VmReg || location.kind == IrOpKind::VmConst)
+            return location;
+
+        break;
+    }
+    default:
+        break;
+    }
+
+    return IrOp();
+}
+
+bool IrRegAllocX64::hasRestoreOp(const IrInst& inst) const
+{
+    return getRestoreOp(inst).kind != IrOpKind::None;
+}
+
+OperandX64 IrRegAllocX64::getRestoreAddress(const IrInst& inst, IrOp restoreOp)
+{
+    switch (inst.cmd)
+    {
+    case IrCmd::LOAD_TAG:
+        return restoreOp.kind == IrOpKind::VmReg ? luauRegTag(vmRegOp(restoreOp)) : luauConstantTag(vmConstOp(restoreOp));
+    case IrCmd::LOAD_POINTER:
+    case IrCmd::LOAD_DOUBLE:
+        return restoreOp.kind == IrOpKind::VmReg ? luauRegValue(vmRegOp(restoreOp)) : luauConstantValue(vmConstOp(restoreOp));
+    case IrCmd::LOAD_INT:
+        LUAU_ASSERT(restoreOp.kind == IrOpKind::VmReg);
+        return luauRegValueInt(vmRegOp(restoreOp));
+    case IrCmd::LOAD_TVALUE:
+        return restoreOp.kind == IrOpKind::VmReg ? luauReg(vmRegOp(restoreOp)) : luauConstant(vmConstOp(restoreOp));
+    default:
+        break;
+    }
+
+    return noreg;
+}
+
 uint32_t IrRegAllocX64::findInstructionWithFurthestNextUse(const std::array<uint32_t, 16>& regInstUsers) const
 {
     uint32_t furthestUseTarget = kInvalidInstIdx;
@@ -411,11 +446,7 @@ ScopedRegX64::~ScopedRegX64()
 void ScopedRegX64::alloc(SizeX64 size)
 {
     LUAU_ASSERT(reg == noreg);
-
-    if (size == SizeX64::xmmword)
-        reg = owner.allocXmmReg(kInvalidInstIdx);
-    else
-        reg = owner.allocGprReg(size, kInvalidInstIdx);
+    reg = owner.allocReg(size, kInvalidInstIdx);
 }
 
 void ScopedRegX64::free()
@@ -435,38 +466,36 @@ RegisterX64 ScopedRegX64::release()
 ScopedSpills::ScopedSpills(IrRegAllocX64& owner)
     : owner(owner)
 {
-    snapshot = owner.spills;
+    startSpillId = owner.nextSpillId;
 }
 
 ScopedSpills::~ScopedSpills()
 {
-    // Taking a copy of current spills because we are going to potentially restore them
-    std::vector<IrSpillX64> current = owner.spills;
+    unsigned endSpillId = owner.nextSpillId;
 
-    // Restore registers that were spilled inside scope protected by this object
-    for (IrSpillX64& curr : current)
+    for (size_t i = 0; i < owner.spills.size();)
     {
-        // If spill existed before current scope, it can be restored outside of it
-        if (!wasSpilledBefore(curr))
+        IrSpillX64& spill = owner.spills[i];
+
+        // Restoring spills inside this scope cannot create new spills
+        LUAU_ASSERT(spill.spillId < endSpillId);
+
+        // If spill was created inside current scope, it has to be restored
+        if (spill.spillId >= startSpillId)
         {
-            IrInst& inst = owner.function.instructions[curr.instIdx];
+            IrInst& inst = owner.function.instructions[spill.instIdx];
 
             owner.restore(inst, /*intoOriginalLocation*/ true);
+
+            // Spill restore removes the spill entry, so loop is repeated at the same 'i'
+        }
+        else
+        {
+            i++;
         }
     }
 }
 
-bool ScopedSpills::wasSpilledBefore(const IrSpillX64& spill) const
-{
-    for (const IrSpillX64& preexisting : snapshot)
-    {
-        if (spill.instIdx == preexisting.instIdx)
-            return true;
-    }
-
-    return false;
-}
-
 } // namespace X64
 } // namespace CodeGen
 } // namespace Luau
diff --git a/CodeGen/src/IrTranslateBuiltins.cpp b/CodeGen/src/IrTranslateBuiltins.cpp
index ba49156..539fcf7 100644
--- a/CodeGen/src/IrTranslateBuiltins.cpp
+++ b/CodeGen/src/IrTranslateBuiltins.cpp
@@ -8,6 +8,8 @@
 
 // TODO: when nresults is less than our actual result count, we can skip computing/writing unused results
 
+static const int kMinMaxUnrolledParams = 5;
+
 namespace Luau
 {
 namespace CodeGen
@@ -23,7 +25,7 @@ BuiltinImplResult translateBuiltinNumberToNumber(
         return {BuiltinImplType::None, -1};
 
     build.loadAndCheckTag(build.vmReg(arg), LUA_TNUMBER, fallback);
-    build.inst(IrCmd::FASTCALL, build.constUint(bfid), build.vmReg(ra), build.vmReg(arg), args, build.constInt(nparams), build.constInt(nresults));
+    build.inst(IrCmd::FASTCALL, build.constUint(bfid), build.vmReg(ra), build.vmReg(arg), args, build.constInt(1), build.constInt(1));
 
     if (ra != arg)
         build.inst(IrCmd::STORE_TAG, build.vmReg(ra), build.constTag(LUA_TNUMBER));
@@ -40,7 +42,7 @@ BuiltinImplResult translateBuiltin2NumberToNumber(
 
     build.loadAndCheckTag(build.vmReg(arg), LUA_TNUMBER, fallback);
     build.loadAndCheckTag(args, LUA_TNUMBER, fallback);
-    build.inst(IrCmd::FASTCALL, build.constUint(bfid), build.vmReg(ra), build.vmReg(arg), args, build.constInt(nparams), build.constInt(nresults));
+    build.inst(IrCmd::FASTCALL, build.constUint(bfid), build.vmReg(ra), build.vmReg(arg), args, build.constInt(2), build.constInt(1));
 
     if (ra != arg)
         build.inst(IrCmd::STORE_TAG, build.vmReg(ra), build.constTag(LUA_TNUMBER));
@@ -56,12 +58,13 @@ BuiltinImplResult translateBuiltinNumberTo2Number(
         return {BuiltinImplType::None, -1};
 
     build.loadAndCheckTag(build.vmReg(arg), LUA_TNUMBER, fallback);
-    build.inst(IrCmd::FASTCALL, build.constUint(bfid), build.vmReg(ra), build.vmReg(arg), args, build.constInt(nparams), build.constInt(nresults));
+    build.inst(
+        IrCmd::FASTCALL, build.constUint(bfid), build.vmReg(ra), build.vmReg(arg), args, build.constInt(1), build.constInt(nresults == 1 ? 1 : 2));
 
     if (ra != arg)
         build.inst(IrCmd::STORE_TAG, build.vmReg(ra), build.constTag(LUA_TNUMBER));
 
-    if (nresults > 1)
+    if (nresults != 1)
         build.inst(IrCmd::STORE_TAG, build.vmReg(ra + 1), build.constTag(LUA_TNUMBER));
 
     return {BuiltinImplType::UsesFallback, 2};
@@ -125,12 +128,33 @@ BuiltinImplResult translateBuiltinMathLog(
     if (nparams < 1 || nresults > 1)
         return {BuiltinImplType::None, -1};
 
-    build.loadAndCheckTag(build.vmReg(arg), LUA_TNUMBER, fallback);
+    LuauBuiltinFunction fcId = bfid;
+    int fcParams = 1;
 
     if (nparams != 1)
-        build.loadAndCheckTag(args, LUA_TNUMBER, fallback);
+    {
+        if (args.kind != IrOpKind::VmConst)
+            return {BuiltinImplType::None, -1};
 
-    build.inst(IrCmd::FASTCALL, build.constUint(bfid), build.vmReg(ra), build.vmReg(arg), args, build.constInt(nparams), build.constInt(nresults));
+        LUAU_ASSERT(build.function.proto);
+        TValue protok = build.function.proto->k[vmConstOp(args)];
+
+        if (protok.tt != LUA_TNUMBER)
+            return {BuiltinImplType::None, -1};
+
+        // TODO: IR builtin lowering assumes that the only valid 2-argument call is log2; ideally, we use a less hacky way to indicate that
+        if (protok.value.n == 2.0)
+            fcParams = 2;
+        else if (protok.value.n == 10.0)
+            fcId = LBF_MATH_LOG10;
+        else
+            // TODO: We can precompute log(args) and divide by it, but that requires extra LOAD/STORE so for now just fall back as this is rare
+            return {BuiltinImplType::None, -1};
+    }
+
+    build.loadAndCheckTag(build.vmReg(arg), LUA_TNUMBER, fallback);
+
+    build.inst(IrCmd::FASTCALL, build.constUint(fcId), build.vmReg(ra), build.vmReg(arg), args, build.constInt(fcParams), build.constInt(1));
 
     if (ra != arg)
         build.inst(IrCmd::STORE_TAG, build.vmReg(ra), build.constTag(LUA_TNUMBER));
@@ -140,17 +164,26 @@ BuiltinImplResult translateBuiltinMathLog(
 
 BuiltinImplResult translateBuiltinMathMin(IrBuilder& build, int nparams, int ra, int arg, IrOp args, int nresults, IrOp fallback)
 {
-    // TODO: this can be extended for other number of arguments
-    if (nparams != 2 || nresults > 1)
+    if (nparams < 2 || nparams > kMinMaxUnrolledParams || nresults > 1)
         return {BuiltinImplType::None, -1};
 
     build.loadAndCheckTag(build.vmReg(arg), LUA_TNUMBER, fallback);
     build.loadAndCheckTag(args, LUA_TNUMBER, fallback);
 
+    for (int i = 3; i <= nparams; ++i)
+        build.loadAndCheckTag(build.vmReg(vmRegOp(args) + (i - 2)), LUA_TNUMBER, fallback);
+
     IrOp varg1 = build.inst(IrCmd::LOAD_DOUBLE, build.vmReg(arg));
     IrOp varg2 = build.inst(IrCmd::LOAD_DOUBLE, args);
 
     IrOp res = build.inst(IrCmd::MIN_NUM, varg2, varg1); // Swapped arguments are required for consistency with VM builtins
+
+    for (int i = 3; i <= nparams; ++i)
+    {
+        IrOp arg = build.inst(IrCmd::LOAD_DOUBLE, build.vmReg(vmRegOp(args) + (i - 2)));
+        res = build.inst(IrCmd::MIN_NUM, arg, res);
+    }
+
     build.inst(IrCmd::STORE_DOUBLE, build.vmReg(ra), res);
 
     if (ra != arg)
@@ -161,17 +194,26 @@ BuiltinImplResult translateBuiltinMathMin(IrBuilder& build, int nparams, int ra,
 
 BuiltinImplResult translateBuiltinMathMax(IrBuilder& build, int nparams, int ra, int arg, IrOp args, int nresults, IrOp fallback)
 {
-    // TODO: this can be extended for other number of arguments
-    if (nparams != 2 || nresults > 1)
+    if (nparams < 2 || nparams > kMinMaxUnrolledParams || nresults > 1)
         return {BuiltinImplType::None, -1};
 
     build.loadAndCheckTag(build.vmReg(arg), LUA_TNUMBER, fallback);
     build.loadAndCheckTag(args, LUA_TNUMBER, fallback);
 
+    for (int i = 3; i <= nparams; ++i)
+        build.loadAndCheckTag(build.vmReg(vmRegOp(args) + (i - 2)), LUA_TNUMBER, fallback);
+
     IrOp varg1 = build.inst(IrCmd::LOAD_DOUBLE, build.vmReg(arg));
     IrOp varg2 = build.inst(IrCmd::LOAD_DOUBLE, args);
 
     IrOp res = build.inst(IrCmd::MAX_NUM, varg2, varg1); // Swapped arguments are required for consistency with VM builtins
+
+    for (int i = 3; i <= nparams; ++i)
+    {
+        IrOp arg = build.inst(IrCmd::LOAD_DOUBLE, build.vmReg(vmRegOp(args) + (i - 2)));
+        res = build.inst(IrCmd::MAX_NUM, arg, res);
+    }
+
     build.inst(IrCmd::STORE_DOUBLE, build.vmReg(ra), res);
 
     if (ra != arg)
@@ -254,8 +296,7 @@ BuiltinImplResult translateBuiltinType(IrBuilder& build, int nparams, int ra, in
     if (nparams < 1 || nresults > 1)
         return {BuiltinImplType::None, -1};
 
-    build.inst(
-        IrCmd::FASTCALL, build.constUint(LBF_TYPE), build.vmReg(ra), build.vmReg(arg), args, build.constInt(nparams), build.constInt(nresults));
+    build.inst(IrCmd::FASTCALL, build.constUint(LBF_TYPE), build.vmReg(ra), build.vmReg(arg), args, build.constInt(1), build.constInt(1));
 
     build.inst(IrCmd::STORE_TAG, build.vmReg(ra), build.constTag(LUA_TSTRING));
 
@@ -267,8 +308,7 @@ BuiltinImplResult translateBuiltinTypeof(IrBuilder& build, int nparams, int ra,
     if (nparams < 1 || nresults > 1)
         return {BuiltinImplType::None, -1};
 
-    build.inst(
-        IrCmd::FASTCALL, build.constUint(LBF_TYPEOF), build.vmReg(ra), build.vmReg(arg), args, build.constInt(nparams), build.constInt(nresults));
+    build.inst(IrCmd::FASTCALL, build.constUint(LBF_TYPEOF), build.vmReg(ra), build.vmReg(arg), args, build.constInt(1), build.constInt(1));
 
     build.inst(IrCmd::STORE_TAG, build.vmReg(ra), build.constTag(LUA_TSTRING));
 
diff --git a/CodeGen/src/IrUtils.cpp b/CodeGen/src/IrUtils.cpp
index c5e7c88..3811ca2 100644
--- a/CodeGen/src/IrUtils.cpp
+++ b/CodeGen/src/IrUtils.cpp
@@ -284,7 +284,7 @@ void replace(IrFunction& function, IrBlock& block, uint32_t instIdx, IrInst repl
     block.useCount--;
 }
 
-void substitute(IrFunction& function, IrInst& inst, IrOp replacement)
+void substitute(IrFunction& function, IrInst& inst, IrOp replacement, IrOp location)
 {
     LUAU_ASSERT(!isBlockTerminator(inst.cmd));
 
@@ -298,7 +298,7 @@ void substitute(IrFunction& function, IrInst& inst, IrOp replacement)
     removeUse(function, inst.f);
 
     inst.a = replacement;
-    inst.b = {};
+    inst.b = location;
     inst.c = {};
     inst.d = {};
     inst.e = {};
diff --git a/CodeGen/src/NativeState.cpp b/CodeGen/src/NativeState.cpp
index 5247969..cb128de 100644
--- a/CodeGen/src/NativeState.cpp
+++ b/CodeGen/src/NativeState.cpp
@@ -16,7 +16,7 @@
 #include <math.h>
 #include <string.h>
 
-#define CODEGEN_SET_FALLBACK(op, flags) data.context.fallback[op] = {execute_##op, flags}
+#define CODEGEN_SET_FALLBACK(op) data.context.fallback[op] = {execute_##op}
 
 namespace Luau
 {
@@ -36,20 +36,21 @@ NativeState::~NativeState() = default;
 void initFallbackTable(NativeState& data)
 {
     // When fallback is completely removed, remove it from includeInsts list in lvmexecute_split.py
-    CODEGEN_SET_FALLBACK(LOP_NEWCLOSURE, 0);
-    CODEGEN_SET_FALLBACK(LOP_NAMECALL, 0);
-    CODEGEN_SET_FALLBACK(LOP_FORGPREP, kFallbackUpdatePc);
-    CODEGEN_SET_FALLBACK(LOP_GETVARARGS, 0);
-    CODEGEN_SET_FALLBACK(LOP_DUPCLOSURE, 0);
-    CODEGEN_SET_FALLBACK(LOP_PREPVARARGS, 0);
-    CODEGEN_SET_FALLBACK(LOP_BREAK, 0);
+    CODEGEN_SET_FALLBACK(LOP_NEWCLOSURE);
+    CODEGEN_SET_FALLBACK(LOP_NAMECALL);
+    CODEGEN_SET_FALLBACK(LOP_FORGPREP);
+    CODEGEN_SET_FALLBACK(LOP_GETVARARGS);
+    CODEGEN_SET_FALLBACK(LOP_DUPCLOSURE);
+    CODEGEN_SET_FALLBACK(LOP_PREPVARARGS);
+    CODEGEN_SET_FALLBACK(LOP_BREAK);
+    CODEGEN_SET_FALLBACK(LOP_SETLIST);
 
     // Fallbacks that are called from partial implementation of an instruction
     // TODO: these fallbacks should be replaced with special functions that exclude the (redundantly executed) fast path from the fallback
-    CODEGEN_SET_FALLBACK(LOP_GETGLOBAL, 0);
-    CODEGEN_SET_FALLBACK(LOP_SETGLOBAL, 0);
-    CODEGEN_SET_FALLBACK(LOP_GETTABLEKS, 0);
-    CODEGEN_SET_FALLBACK(LOP_SETTABLEKS, 0);
+    CODEGEN_SET_FALLBACK(LOP_GETGLOBAL);
+    CODEGEN_SET_FALLBACK(LOP_SETGLOBAL);
+    CODEGEN_SET_FALLBACK(LOP_GETTABLEKS);
+    CODEGEN_SET_FALLBACK(LOP_SETTABLEKS);
 }
 
 void initHelperFunctions(NativeState& data)
@@ -105,6 +106,7 @@ void initHelperFunctions(NativeState& data)
     data.context.libm_tan = tan;
     data.context.libm_tanh = tanh;
 
+    data.context.forgLoopTableIter = forgLoopTableIter;
     data.context.forgLoopNodeIter = forgLoopNodeIter;
     data.context.forgLoopNonTableFallback = forgLoopNonTableFallback;
     data.context.forgPrepXnextFallback = forgPrepXnextFallback;
diff --git a/CodeGen/src/NativeState.h b/CodeGen/src/NativeState.h
index 2d97e63..99d4089 100644
--- a/CodeGen/src/NativeState.h
+++ b/CodeGen/src/NativeState.h
@@ -23,15 +23,7 @@ namespace CodeGen
 
 class UnwindBuilder;
 
-using FallbackFn = const Instruction*(lua_State* L, const Instruction* pc, StkId base, TValue* k);
-
-constexpr uint8_t kFallbackUpdatePc = 1 << 0;
-
-struct NativeFallback
-{
-    FallbackFn* fallback;
-    uint8_t flags;
-};
+using FallbackFn = const Instruction* (*)(lua_State* L, const Instruction* pc, StkId base, TValue* k);
 
 struct NativeProto
 {
@@ -96,6 +88,7 @@ struct NativeContext
     double (*libm_modf)(double, double*) = nullptr;
 
     // Helper functions
+    bool (*forgLoopTableIter)(lua_State* L, Table* h, int index, TValue* ra) = nullptr;
     bool (*forgLoopNodeIter)(lua_State* L, Table* h, int index, TValue* ra) = nullptr;
     bool (*forgLoopNonTableFallback)(lua_State* L, int insnA, int aux) = nullptr;
     void (*forgPrepXnextFallback)(lua_State* L, TValue* ra, int pc) = nullptr;
@@ -106,7 +99,7 @@ struct NativeContext
     Closure* (*returnFallback)(lua_State* L, StkId ra, int n) = nullptr;
 
     // Opcode fallbacks, implemented in C
-    NativeFallback fallback[LOP__COUNT] = {};
+    FallbackFn fallback[LOP__COUNT] = {};
 
     // Fast call methods, implemented in C
     luau_FastFunction luauF_table[256] = {};
diff --git a/CodeGen/src/OptimizeConstProp.cpp b/CodeGen/src/OptimizeConstProp.cpp
index 7157a18..c7d3d8e 100644
--- a/CodeGen/src/OptimizeConstProp.cpp
+++ b/CodeGen/src/OptimizeConstProp.cpp
@@ -502,6 +502,8 @@ static void constPropInInst(ConstPropState& state, IrBuilder& build, IrFunction&
             }
         }
         break;
+
+        // TODO: FASTCALL is more restrictive than INVOKE_FASTCALL; we should either determine the exact semantics, or rework it
     case IrCmd::FASTCALL:
     case IrCmd::INVOKE_FASTCALL:
         handleBuiltinEffects(state, LuauBuiltinFunction(function.uintOp(inst.a)), vmRegOp(inst.b), function.intOp(inst.f));
diff --git a/CodeGen/src/UnwindBuilderDwarf2.cpp b/CodeGen/src/UnwindBuilderDwarf2.cpp
index 0b3134b..b20a6b2 100644
--- a/CodeGen/src/UnwindBuilderDwarf2.cpp
+++ b/CodeGen/src/UnwindBuilderDwarf2.cpp
@@ -132,7 +132,7 @@ size_t UnwindBuilderDwarf2::getBeginOffset() const
     return beginOffset;
 }
 
-void UnwindBuilderDwarf2::start()
+void UnwindBuilderDwarf2::startInfo()
 {
     uint8_t* cieLength = pos;
     pos = writeu32(pos, 0); // Length (to be filled later)
@@ -149,13 +149,23 @@ void UnwindBuilderDwarf2::start()
     // Optional CIE augmentation section (not present)
 
     // Call frame instructions (common for all FDEs, of which we have 1)
-    stackOffset = 8; // Return address was pushed by calling the function
-
-    pos = defineCfaExpression(pos, DW_REG_RSP, stackOffset); // Define CFA to be the rsp + 8
+    pos = defineCfaExpression(pos, DW_REG_RSP, 8);           // Define CFA to be the rsp + 8
     pos = defineSavedRegisterLocation(pos, DW_REG_RA, 8);    // Define return address register (RA) to be located at CFA - 8
 
     pos = alignPosition(cieLength, pos);
     writeu32(cieLength, unsigned(pos - cieLength - 4)); // Length field itself is excluded from length
+}
+
+void UnwindBuilderDwarf2::startFunction()
+{
+    // End offset is filled in later and everything gets adjusted at the end
+    UnwindFunctionDwarf2 func;
+    func.beginOffset = 0;
+    func.endOffset = 0;
+    func.fdeEntryStartPos = uint32_t(pos - rawData);
+    unwindFunctions.push_back(func);
+
+    stackOffset = 8; // Return address was pushed by calling the function
 
     fdeEntryStart = pos;                          // Will be written at the end
     pos = writeu32(pos, 0);                       // Length (to be filled later)
@@ -198,14 +208,20 @@ void UnwindBuilderDwarf2::setupFrameReg(X64::RegisterX64 reg, int espOffset)
     // Cfa is based on rsp, so no additonal commands are required
 }
 
-void UnwindBuilderDwarf2::finish()
+void UnwindBuilderDwarf2::finishFunction(uint32_t beginOffset, uint32_t endOffset)
 {
+    unwindFunctions.back().beginOffset = beginOffset;
+    unwindFunctions.back().endOffset = endOffset;
+
     LUAU_ASSERT(stackOffset % 16 == 0 && "stack has to be aligned to 16 bytes after prologue");
     LUAU_ASSERT(fdeEntryStart != nullptr);
 
     pos = alignPosition(fdeEntryStart, pos);
     writeu32(fdeEntryStart, unsigned(pos - fdeEntryStart - 4)); // Length field itself is excluded from length
+}
 
+void UnwindBuilderDwarf2::finishInfo()
+{
     // Terminate section
     pos = writeu32(pos, 0);
 
@@ -217,15 +233,26 @@ size_t UnwindBuilderDwarf2::getSize() const
     return size_t(pos - rawData);
 }
 
-void UnwindBuilderDwarf2::finalize(char* target, void* funcAddress, size_t funcSize) const
+size_t UnwindBuilderDwarf2::getFunctionCount() const
+{
+    return unwindFunctions.size();
+}
+
+void UnwindBuilderDwarf2::finalize(char* target, size_t offset, void* funcAddress, size_t funcSize) const
 {
     memcpy(target, rawData, getSize());
 
-    LUAU_ASSERT(fdeEntryStart != nullptr);
-    unsigned fdeEntryStartPos = unsigned(fdeEntryStart - rawData);
+    for (const UnwindFunctionDwarf2& func : unwindFunctions)
+    {
+        uint8_t* fdeEntryStart = (uint8_t*)target + func.fdeEntryStartPos;
 
-    writeu64((uint8_t*)target + fdeEntryStartPos + kFdeInitialLocationOffset, uintptr_t(funcAddress));
-    writeu64((uint8_t*)target + fdeEntryStartPos + kFdeAddressRangeOffset, funcSize);
+        writeu64(fdeEntryStart + kFdeInitialLocationOffset, uintptr_t(funcAddress) + offset + func.beginOffset);
+
+        if (func.endOffset == kFullBlockFuncton)
+            writeu64(fdeEntryStart + kFdeAddressRangeOffset, funcSize - offset);
+        else
+            writeu64(fdeEntryStart + kFdeAddressRangeOffset, func.endOffset - func.beginOffset);
+    }
 }
 
 } // namespace CodeGen
diff --git a/CodeGen/src/UnwindBuilderWin.cpp b/CodeGen/src/UnwindBuilderWin.cpp
index 2173300..5f4f16a 100644
--- a/CodeGen/src/UnwindBuilderWin.cpp
+++ b/CodeGen/src/UnwindBuilderWin.cpp
@@ -21,17 +21,6 @@ namespace Luau
 namespace CodeGen
 {
 
-// This struct matches the layout of UNWIND_INFO from ehdata.h
-struct UnwindInfoWin
-{
-    uint8_t version : 3;
-    uint8_t flags : 5;
-    uint8_t prologsize;
-    uint8_t unwindcodecount;
-    uint8_t framereg : 4;
-    uint8_t frameregoff : 4;
-};
-
 void UnwindBuilderWin::setBeginOffset(size_t beginOffset)
 {
     this->beginOffset = beginOffset;
@@ -42,11 +31,28 @@ size_t UnwindBuilderWin::getBeginOffset() const
     return beginOffset;
 }
 
-void UnwindBuilderWin::start()
-{
-    stackOffset = 8; // Return address was pushed by calling the function
+void UnwindBuilderWin::startInfo() {}
 
+void UnwindBuilderWin::startFunction()
+{
+    // End offset is filled in later and everything gets adjusted at the end
+    UnwindFunctionWin func;
+    func.beginOffset = 0;
+    func.endOffset = 0;
+    func.unwindInfoOffset = uint32_t(rawDataPos - rawData);
+    unwindFunctions.push_back(func);
+
+    unwindCodes.clear();
     unwindCodes.reserve(16);
+
+    prologSize = 0;
+
+    // rax has register index 0, which in Windows unwind info means that frame register is not used
+    frameReg = X64::rax;
+    frameRegOffset = 0;
+
+    // Return address was pushed by calling the function
+    stackOffset = 8;
 }
 
 void UnwindBuilderWin::spill(int espOffset, X64::RegisterX64 reg)
@@ -85,49 +91,89 @@ void UnwindBuilderWin::setupFrameReg(X64::RegisterX64 reg, int espOffset)
     unwindCodes.push_back({prologSize, UWOP_SET_FPREG, frameRegOffset});
 }
 
-void UnwindBuilderWin::finish()
+void UnwindBuilderWin::finishFunction(uint32_t beginOffset, uint32_t endOffset)
 {
+    unwindFunctions.back().beginOffset = beginOffset; // completes the entry that startFunction pushed with zero offsets
+    unwindFunctions.back().endOffset = endOffset;
+
     // Windows unwind code count is stored in uint8_t, so we can't have more
     LUAU_ASSERT(unwindCodes.size() < 256);
 
     LUAU_ASSERT(stackOffset % 16 == 0 && "stack has to be aligned to 16 bytes after prologue");
 
-    size_t codeArraySize = unwindCodes.size();
-    codeArraySize = (codeArraySize + 1) & ~1; // Size has to be even, but unwind code count doesn't have to
-
-    infoSize = sizeof(UnwindInfoWin) + sizeof(UnwindCodeWin) * codeArraySize;
-}
-
-size_t UnwindBuilderWin::getSize() const
-{
-    return infoSize;
-}
-
-void UnwindBuilderWin::finalize(char* target, void* funcAddress, size_t funcSize) const
-{
     UnwindInfoWin info;
     info.version = 1;
     info.flags = 0; // No EH
     info.prologsize = prologSize;
     info.unwindcodecount = uint8_t(unwindCodes.size());
+
+    LUAU_ASSERT(frameReg.index < 16);
     info.framereg = frameReg.index;
+
+    LUAU_ASSERT(frameRegOffset < 16);
     info.frameregoff = frameRegOffset;
 
-    memcpy(target, &info, sizeof(info));
-    target += sizeof(UnwindInfoWin);
+    LUAU_ASSERT(rawDataPos + sizeof(info) <= rawData + kRawDataLimit);
+    memcpy(rawDataPos, &info, sizeof(info));
+    rawDataPos += sizeof(info);
 
     if (!unwindCodes.empty())
     {
         // Copy unwind codes in reverse order
         // Some unwind codes take up two array slots, but we don't use those atm
-        char* pos = target + sizeof(UnwindCodeWin) * (unwindCodes.size() - 1);
+        uint8_t* unwindCodePos = rawDataPos + sizeof(UnwindCodeWin) * (unwindCodes.size() - 1);
+        LUAU_ASSERT(unwindCodePos + sizeof(UnwindCodeWin) <= rawData + kRawDataLimit); // the whole last slot must fit, not just its first byte
 
         for (size_t i = 0; i < unwindCodes.size(); i++)
         {
-            memcpy(pos, &unwindCodes[i], sizeof(UnwindCodeWin));
-            pos -= sizeof(UnwindCodeWin);
+            memcpy(unwindCodePos, &unwindCodes[i], sizeof(UnwindCodeWin));
+            unwindCodePos -= sizeof(UnwindCodeWin);
         }
     }
+
+    rawDataPos += sizeof(UnwindCodeWin) * unwindCodes.size();
+
+    // Size has to be even, but unwind code count doesn't have to
+    if (unwindCodes.size() % 2 != 0)
+        rawDataPos += sizeof(UnwindCodeWin);
+
+    LUAU_ASSERT(rawDataPos <= rawData + kRawDataLimit); // final check, after the optional padding slot has been counted
+}
+
+void UnwindBuilderWin::finishInfo() {}
+
+size_t UnwindBuilderWin::getSize() const
+{
+    return sizeof(UnwindFunctionWin) * unwindFunctions.size() + size_t(rawDataPos - rawData);
+}
+
+size_t UnwindBuilderWin::getFunctionCount() const
+{
+    return unwindFunctions.size();
+}
+
+void UnwindBuilderWin::finalize(char* target, size_t offset, void* funcAddress, size_t funcSize) const
+{
+    // Copy adjusted function information
+    for (UnwindFunctionWin func : unwindFunctions)
+    {
+        // Code will start after the unwind info
+        func.beginOffset += uint32_t(offset);
+
+        // Whole block is a part of a 'single function'
+        if (func.endOffset == kFullBlockFuncton)
+            func.endOffset = uint32_t(funcSize);
+        else
+            func.endOffset += uint32_t(offset);
+
+        // Unwind data is placed right after the RUNTIME_FUNCTION data
+        func.unwindInfoOffset += uint32_t(sizeof(UnwindFunctionWin) * unwindFunctions.size());
+        memcpy(target, &func, sizeof(func));
+        target += sizeof(func);
+    }
+
+    // Copy unwind codes
+    memcpy(target, rawData, size_t(rawDataPos - rawData));
 }
 
 } // namespace CodeGen
diff --git a/Sources.cmake b/Sources.cmake
index 3508ec3..9f54b91 100644
--- a/Sources.cmake
+++ b/Sources.cmake
@@ -89,9 +89,7 @@ target_sources(Luau.CodeGen PRIVATE
     CodeGen/src/CodeGenA64.cpp
     CodeGen/src/CodeGenX64.cpp
     CodeGen/src/EmitBuiltinsX64.cpp
-    CodeGen/src/EmitCommonA64.cpp
     CodeGen/src/EmitCommonX64.cpp
-    CodeGen/src/EmitInstructionA64.cpp
     CodeGen/src/EmitInstructionX64.cpp
     CodeGen/src/Fallbacks.cpp
     CodeGen/src/IrAnalysis.cpp
@@ -111,6 +109,7 @@ target_sources(Luau.CodeGen PRIVATE
     CodeGen/src/UnwindBuilderDwarf2.cpp
     CodeGen/src/UnwindBuilderWin.cpp
 
+    CodeGen/src/BitUtils.h
     CodeGen/src/ByteUtils.h
     CodeGen/src/CustomExecUtils.h
     CodeGen/src/CodeGenUtils.h
@@ -120,7 +119,6 @@ target_sources(Luau.CodeGen PRIVATE
     CodeGen/src/EmitCommon.h
     CodeGen/src/EmitCommonA64.h
     CodeGen/src/EmitCommonX64.h
-    CodeGen/src/EmitInstructionA64.h
     CodeGen/src/EmitInstructionX64.h
     CodeGen/src/Fallbacks.h
     CodeGen/src/FallbacksProlog.h
diff --git a/VM/src/lapi.cpp b/VM/src/lapi.cpp
index 08d64d5..054faa7 100644
--- a/VM/src/lapi.cpp
+++ b/VM/src/lapi.cpp
@@ -538,6 +538,8 @@ const void* lua_topointer(lua_State* L, int idx)
     StkId o = index2addr(L, idx);
     switch (ttype(o))
     {
+    case LUA_TSTRING:
+        return tsvalue(o);
     case LUA_TTABLE:
         return hvalue(o);
     case LUA_TFUNCTION:
diff --git a/VM/src/ltable.cpp b/VM/src/ltable.cpp
index 5eceea7..c963ac8 100644
--- a/VM/src/ltable.cpp
+++ b/VM/src/ltable.cpp
@@ -33,8 +33,6 @@
 
 #include <string.h>
 
-LUAU_FASTFLAGVARIABLE(LuauArrBoundResizeFix, false)
-
 // max size of both array and hash part is 2^MAXBITS
 #define MAXBITS 26
 #define MAXSIZE (1 << MAXBITS)
@@ -466,30 +464,22 @@ static void rehash(lua_State* L, Table* t, const TValue* ek)
     int na = computesizes(nums, &nasize);
     int nh = totaluse - na;
 
-    if (FFlag::LuauArrBoundResizeFix)
+    // enforce the boundary invariant; for performance, only do hash lookups if we must
+    int nadjusted = adjustasize(t, nasize, ek);
+
+    // count how many extra elements belong to array part instead of hash part
+    int aextra = nadjusted - nasize;
+
+    if (aextra != 0)
     {
-        // enforce the boundary invariant; for performance, only do hash lookups if we must
-        int nadjusted = adjustasize(t, nasize, ek);
+        // we no longer need to store those extra array elements in hash part
+        nh -= aextra;
 
-        // count how many extra elements belong to array part instead of hash part
-        int aextra = nadjusted - nasize;
+        // because hash nodes are twice as large as array nodes, the memory we saved for hash parts can be used by array part
+        // this follows the general sparse array part optimization where array is allocated when 50% occupation is reached
+        nasize = nadjusted + aextra;
 
-        if (aextra != 0)
-        {
-            // we no longer need to store those extra array elements in hash part
-            nh -= aextra;
-
-            // because hash nodes are twice as large as array nodes, the memory we saved for hash parts can be used by array part
-            // this follows the general sparse array part optimization where array is allocated when 50% occupation is reached
-            nasize = nadjusted + aextra;
-
-            // since the size was changed, it's again important to enforce the boundary invariant at the new size
-            nasize = adjustasize(t, nasize, ek);
-        }
-    }
-    else
-    {
-        // enforce the boundary invariant; for performance, only do hash lookups if we must
+        // since the size was changed, it's again important to enforce the boundary invariant at the new size
         nasize = adjustasize(t, nasize, ek);
     }
 
diff --git a/fuzz/linter.cpp b/fuzz/linter.cpp
index 854c632..8efd424 100644
--- a/fuzz/linter.cpp
+++ b/fuzz/linter.cpp
@@ -21,7 +21,7 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t* Data, size_t Size)
     static Luau::NullFileResolver fileResolver;
     static Luau::NullConfigResolver configResolver;
     static Luau::Frontend frontend{&fileResolver, &configResolver};
-    static int once = (Luau::registerBuiltinGlobals(frontend), 1);
+    static int once = (Luau::registerBuiltinGlobals(frontend, frontend.globals, false), 1);
     (void)once;
     static int once2 = (Luau::freeze(frontend.globals.globalTypes), 1);
     (void)once2;
diff --git a/fuzz/proto.cpp b/fuzz/proto.cpp
index ffeb491..9366da5 100644
--- a/fuzz/proto.cpp
+++ b/fuzz/proto.cpp
@@ -97,12 +97,12 @@ lua_State* createGlobalState()
     return L;
 }
 
-int registerTypes(Luau::TypeChecker& typeChecker, Luau::GlobalTypes& globals)
+int registerTypes(Luau::Frontend& frontend, Luau::GlobalTypes& globals, bool forAutocomplete)
 {
     using namespace Luau;
     using std::nullopt;
 
-    Luau::registerBuiltinGlobals(typeChecker, globals);
+    Luau::registerBuiltinGlobals(frontend, globals, forAutocomplete);
 
     TypeArena& arena = globals.globalTypes;
     BuiltinTypes& builtinTypes = *globals.builtinTypes;
@@ -147,10 +147,10 @@ int registerTypes(Luau::TypeChecker& typeChecker, Luau::GlobalTypes& globals)
 
 static void setupFrontend(Luau::Frontend& frontend)
 {
-    registerTypes(frontend.typeChecker, frontend.globals);
+    registerTypes(frontend, frontend.globals, false);
     Luau::freeze(frontend.globals.globalTypes);
 
-    registerTypes(frontend.typeCheckerForAutocomplete, frontend.globalsForAutocomplete);
+    registerTypes(frontend, frontend.globalsForAutocomplete, true);
     Luau::freeze(frontend.globalsForAutocomplete.globalTypes);
 
     frontend.iceHandler.onInternalError = [](const char* error) {
diff --git a/fuzz/typeck.cpp b/fuzz/typeck.cpp
index 4f8f885..87a8827 100644
--- a/fuzz/typeck.cpp
+++ b/fuzz/typeck.cpp
@@ -26,7 +26,7 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t* Data, size_t Size)
     static Luau::NullFileResolver fileResolver;
     static Luau::NullConfigResolver configResolver;
     static Luau::Frontend frontend{&fileResolver, &configResolver};
-    static int once = (Luau::registerBuiltinGlobals(frontend), 1);
+    static int once = (Luau::registerBuiltinGlobals(frontend, frontend.globals, false), 1);
     (void)once;
     static int once2 = (Luau::freeze(frontend.globals.globalTypes), 1);
     (void)once2;
diff --git a/tests/AssemblyBuilderA64.test.cpp b/tests/AssemblyBuilderA64.test.cpp
index 1690c74..a0df0f9 100644
--- a/tests/AssemblyBuilderA64.test.cpp
+++ b/tests/AssemblyBuilderA64.test.cpp
@@ -86,6 +86,7 @@ TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "Binary")
     SINGLE_COMPARE(add(x0, x1, x2, 7), 0x8B021C20);
     SINGLE_COMPARE(sub(x0, x1, x2), 0xCB020020);
     SINGLE_COMPARE(and_(x0, x1, x2), 0x8A020020);
+    SINGLE_COMPARE(bic(x0, x1, x2), 0x8A220020);
     SINGLE_COMPARE(orr(x0, x1, x2), 0xAA020020);
     SINGLE_COMPARE(eor(x0, x1, x2), 0xCA020020);
     SINGLE_COMPARE(lsl(x0, x1, x2), 0x9AC22020);
@@ -94,6 +95,7 @@ TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "Binary")
     SINGLE_COMPARE(asr(x0, x1, x2), 0x9AC22820);
     SINGLE_COMPARE(ror(x0, x1, x2), 0x9AC22C20);
     SINGLE_COMPARE(cmp(x0, x1), 0xEB01001F);
+    SINGLE_COMPARE(tst(x0, x1), 0xEA01001F);
 
     // reg, imm
     SINGLE_COMPARE(add(x3, x7, 78), 0x910138E3);
@@ -102,6 +104,24 @@ TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "Binary")
     SINGLE_COMPARE(cmp(w0, 42), 0x7100A81F);
 }
 
+TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "BinaryImm")
+{
+    // each instruction taking a mask immediate (and_, orr, eor, tst), all with the same mask value of 1
+    SINGLE_COMPARE(and_(w1, w2, 1), 0x12000041);
+    SINGLE_COMPARE(orr(w1, w2, 1), 0x32000041);
+    SINGLE_COMPARE(eor(w1, w2, 1), 0x52000041);
+    SINGLE_COMPARE(tst(w1, 1), 0x7200003f);
+
+    // various mask forms: low-bit runs (1, 3, 7, 0x7fffffff), runs starting above bit 0 (6, 12) and the top bit alone (0x80000000)
+    SINGLE_COMPARE(and_(w0, w0, 1), 0x12000000);
+    SINGLE_COMPARE(and_(w0, w0, 3), 0x12000400);
+    SINGLE_COMPARE(and_(w0, w0, 7), 0x12000800);
+    SINGLE_COMPARE(and_(w0, w0, 2147483647), 0x12007800);
+    SINGLE_COMPARE(and_(w0, w0, 6), 0x121F0400);
+    SINGLE_COMPARE(and_(w0, w0, 12), 0x121E0400);
+    SINGLE_COMPARE(and_(w0, w0, 2147483648), 0x12010000);
+}
+
 TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "Loads")
 {
     // address forms
@@ -359,11 +379,13 @@ TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "AddressOffsetSize")
     SINGLE_COMPARE(str(q0, mem(x1, 16)), 0x3D800420);
 }
 
-TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "ConditionalSelect")
+TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "Conditionals")
 {
     SINGLE_COMPARE(csel(x0, x1, x2, ConditionA64::Equal), 0x9A820020);
     SINGLE_COMPARE(csel(w0, w1, w2, ConditionA64::Equal), 0x1A820020);
     SINGLE_COMPARE(fcsel(d0, d1, d2, ConditionA64::Equal), 0x1E620C20);
+
+    SINGLE_COMPARE(cset(x1, ConditionA64::Less), 0x9A9FA7E1);
 }
 
 TEST_CASE("LogTest")
@@ -394,6 +416,7 @@ TEST_CASE("LogTest")
     build.ldr(q1, x2);
 
     build.csel(x0, x1, x2, ConditionA64::Equal);
+    build.cset(x0, ConditionA64::Equal);
 
     build.fcmp(d0, d1);
     build.fcmpz(d0);
@@ -423,6 +446,7 @@ TEST_CASE("LogTest")
  fabs        d1,d2
  ldr         q1,[x2]
  csel        x0,x1,x2,eq
+ cset        x0,eq
  fcmp        d0,d1
  fcmp        d0,#0
 .L1:
diff --git a/tests/AssemblyBuilderX64.test.cpp b/tests/AssemblyBuilderX64.test.cpp
index 054eca7..bafb68b 100644
--- a/tests/AssemblyBuilderX64.test.cpp
+++ b/tests/AssemblyBuilderX64.test.cpp
@@ -67,6 +67,9 @@ TEST_CASE_FIXTURE(AssemblyBuilderX64Fixture, "BaseBinaryInstructionForms")
     SINGLE_COMPARE(add(rax, 0x7f), 0x48, 0x83, 0xc0, 0x7f);
     SINGLE_COMPARE(add(rax, 0x80), 0x48, 0x81, 0xc0, 0x80, 0x00, 0x00, 0x00);
     SINGLE_COMPARE(add(r10, 0x7fffffff), 0x49, 0x81, 0xc2, 0xff, 0xff, 0xff, 0x7f);
+    SINGLE_COMPARE(add(al, 3), 0x80, 0xc0, 0x03);
+    SINGLE_COMPARE(add(sil, 3), 0x48, 0x80, 0xc6, 0x03);
+    SINGLE_COMPARE(add(r11b, 3), 0x49, 0x80, 0xc3, 0x03);
 
     // reg, [reg]
     SINGLE_COMPARE(add(rax, qword[rax]), 0x48, 0x03, 0x00);
@@ -191,6 +194,8 @@ TEST_CASE_FIXTURE(AssemblyBuilderX64Fixture, "FormsOfMov")
     SINGLE_COMPARE(mov64(rcx, 0x1234567812345678ll), 0x48, 0xb9, 0x78, 0x56, 0x34, 0x12, 0x78, 0x56, 0x34, 0x12);
     SINGLE_COMPARE(mov(ecx, 2), 0xb9, 0x02, 0x00, 0x00, 0x00);
     SINGLE_COMPARE(mov(cl, 2), 0xb1, 0x02);
+    SINGLE_COMPARE(mov(sil, 2), 0x48, 0xb6, 0x02);
+    SINGLE_COMPARE(mov(r9b, 2), 0x49, 0xb1, 0x02);
     SINGLE_COMPARE(mov(rcx, qword[rdi]), 0x48, 0x8b, 0x0f);
     SINGLE_COMPARE(mov(dword[rax], 0xabcd), 0xc7, 0x00, 0xcd, 0xab, 0x00, 0x00);
     SINGLE_COMPARE(mov(r13, 1), 0x49, 0xbd, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
@@ -201,6 +206,8 @@ TEST_CASE_FIXTURE(AssemblyBuilderX64Fixture, "FormsOfMov")
     SINGLE_COMPARE(mov(qword[rdx], r9), 0x4c, 0x89, 0x0a);
     SINGLE_COMPARE(mov(byte[rsi], 0x3), 0xc6, 0x06, 0x03);
     SINGLE_COMPARE(mov(byte[rsi], al), 0x88, 0x06);
+    SINGLE_COMPARE(mov(byte[rsi], dil), 0x48, 0x88, 0x3e);
+    SINGLE_COMPARE(mov(byte[rsi], r10b), 0x4c, 0x88, 0x16);
 }
 
 TEST_CASE_FIXTURE(AssemblyBuilderX64Fixture, "FormsOfMovExtended")
@@ -229,6 +236,8 @@ TEST_CASE_FIXTURE(AssemblyBuilderX64Fixture, "FormsOfShift")
 {
     SINGLE_COMPARE(shl(al, 1), 0xd0, 0xe0);
     SINGLE_COMPARE(shl(al, cl), 0xd2, 0xe0);
+    SINGLE_COMPARE(shl(sil, cl), 0x48, 0xd2, 0xe6);
+    SINGLE_COMPARE(shl(r10b, cl), 0x49, 0xd2, 0xe2);
     SINGLE_COMPARE(shr(al, 4), 0xc0, 0xe8, 0x04);
     SINGLE_COMPARE(shr(eax, 1), 0xd1, 0xe8);
     SINGLE_COMPARE(sal(eax, cl), 0xd3, 0xe0);
@@ -247,6 +256,7 @@ TEST_CASE_FIXTURE(AssemblyBuilderX64Fixture, "FormsOfLea")
 TEST_CASE_FIXTURE(AssemblyBuilderX64Fixture, "FormsOfSetcc")
 {
     SINGLE_COMPARE(setcc(ConditionX64::NotEqual, bl), 0x0f, 0x95, 0xc3);
+    SINGLE_COMPARE(setcc(ConditionX64::NotEqual, dil), 0x48, 0x0f, 0x95, 0xc7);
     SINGLE_COMPARE(setcc(ConditionX64::BelowEqual, byte[rcx]), 0x0f, 0x96, 0x01);
 }
 
diff --git a/tests/Autocomplete.test.cpp b/tests/Autocomplete.test.cpp
index c79bf35..3dc75d6 100644
--- a/tests/Autocomplete.test.cpp
+++ b/tests/Autocomplete.test.cpp
@@ -3473,4 +3473,34 @@ TEST_CASE_FIXTURE(ACFixture, "autocomplete_response_perf1" * doctest::timeout(0.
     CHECK(ac.entryMap.count("Instance"));
 }
 
+TEST_CASE_FIXTURE(ACFixture, "strict_mode_force") // NOTE(review): presumably autocomplete analyzes in strict mode despite --!nonstrict - confirm
+{
+    check(R"(
+--!nonstrict
+local a: {x: number} = {x=1}
+local b = a
+local c = b.@1
+    )");
+
+    auto ac = autocomplete('1'); // '1' presumably selects the @1 marker embedded in the source above - confirm against ACFixture
+
+    CHECK_EQ(1, ac.entryMap.size()); // exactly one suggestion is expected...
+    CHECK(ac.entryMap.count("x")); // ...and it is 'x', the only property in a's annotation
+}
+
+TEST_CASE_FIXTURE(ACFixture, "suggest_exported_types") // an 'export type' alias must be offered when completing a type annotation
+{
+    ScopedFastFlag luauCopyExportedTypes{"LuauCopyExportedTypes", true}; // the behavior under test is gated behind this flag
+
+    check(R"(
+export type Type = {a: number}
+local a: T@1
+    )");
+
+    auto ac = autocomplete('1'); // '1' presumably selects the @1 marker placed right after the partial annotation 'T' - confirm
+
+    CHECK(ac.entryMap.count("Type")); // the exported alias declared on the first line is among the entries
+    CHECK_EQ(ac.context, AutocompleteContext::Type); // and the completion is classified as a type-position completion
+}
+
 TEST_SUITE_END();
diff --git a/tests/CodeAllocator.test.cpp b/tests/CodeAllocator.test.cpp
index 359f2ba..01deddd 100644
--- a/tests/CodeAllocator.test.cpp
+++ b/tests/CodeAllocator.test.cpp
@@ -135,7 +135,8 @@ TEST_CASE("WindowsUnwindCodesX64")
 
     UnwindBuilderWin unwind;
 
-    unwind.start();
+    unwind.startInfo();
+    unwind.startFunction();
     unwind.spill(16, rdx);
     unwind.spill(8, rcx);
     unwind.save(rdi);
@@ -148,14 +149,15 @@ TEST_CASE("WindowsUnwindCodesX64")
     unwind.save(r15);
     unwind.allocStack(72);
     unwind.setupFrameReg(rbp, 48);
-    unwind.finish();
+    unwind.finishFunction(0x11223344, 0x55443322);
+    unwind.finishInfo();
 
     std::vector<char> data;
     data.resize(unwind.getSize());
-    unwind.finalize(data.data(), nullptr, 0);
+    unwind.finalize(data.data(), 0, nullptr, 0);
 
-    std::vector<uint8_t> expected{0x01, 0x23, 0x0a, 0x35, 0x23, 0x33, 0x1e, 0x82, 0x1a, 0xf0, 0x18, 0xe0, 0x16, 0xd0, 0x14, 0xc0, 0x12, 0x50, 0x10,
-        0x30, 0x0e, 0x60, 0x0c, 0x70};
+    std::vector<uint8_t> expected{0x44, 0x33, 0x22, 0x11, 0x22, 0x33, 0x44, 0x55, 0x0c, 0x00, 0x00, 0x00, 0x01, 0x23, 0x0a, 0x35, 0x23, 0x33, 0x1e,
+        0x82, 0x1a, 0xf0, 0x18, 0xe0, 0x16, 0xd0, 0x14, 0xc0, 0x12, 0x50, 0x10, 0x30, 0x0e, 0x60, 0x0c, 0x70};
 
     REQUIRE(data.size() == expected.size());
     CHECK(memcmp(data.data(), expected.data(), expected.size()) == 0);
@@ -168,7 +170,8 @@ TEST_CASE("Dwarf2UnwindCodesX64")
 
     UnwindBuilderDwarf2 unwind;
 
-    unwind.start();
+    unwind.startInfo();
+    unwind.startFunction();
     unwind.save(rdi);
     unwind.save(rsi);
     unwind.save(rbx);
@@ -179,11 +182,12 @@ TEST_CASE("Dwarf2UnwindCodesX64")
     unwind.save(r15);
     unwind.allocStack(72);
     unwind.setupFrameReg(rbp, 48);
-    unwind.finish();
+    unwind.finishFunction(0, 0);
+    unwind.finishInfo();
 
     std::vector<char> data;
     data.resize(unwind.getSize());
-    unwind.finalize(data.data(), nullptr, 0);
+    unwind.finalize(data.data(), 0, nullptr, 0);
 
     std::vector<uint8_t> expected{0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x01, 0x78, 0x10, 0x0c, 0x07, 0x08, 0x05, 0x10, 0x01,
         0x00, 0x00, 0x00, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@@ -211,6 +215,8 @@ constexpr X64::RegisterX64 rArg3 = X64::rdx;
 
 constexpr X64::RegisterX64 rNonVol1 = X64::r12;
 constexpr X64::RegisterX64 rNonVol2 = X64::rbx;
+constexpr X64::RegisterX64 rNonVol3 = X64::r13;
+constexpr X64::RegisterX64 rNonVol4 = X64::r14;
 
 TEST_CASE("GeneratedCodeExecutionX64")
 {
@@ -260,7 +266,10 @@ TEST_CASE("GeneratedCodeExecutionWithThrowX64")
     std::unique_ptr<UnwindBuilder> unwind = std::make_unique<UnwindBuilderDwarf2>();
 #endif
 
-    unwind->start();
+    unwind->startInfo();
+
+    Label functionBegin = build.setLabel();
+    unwind->startFunction();
 
     // Prologue
     build.push(rNonVol1);
@@ -279,8 +288,6 @@ TEST_CASE("GeneratedCodeExecutionWithThrowX64")
     build.lea(rbp, addr[rsp + stackSize]);
     unwind->setupFrameReg(rbp, stackSize);
 
-    unwind->finish();
-
     // Body
     build.mov(rNonVol1, rArg1);
     build.mov(rNonVol2, rArg2);
@@ -296,8 +303,12 @@ TEST_CASE("GeneratedCodeExecutionWithThrowX64")
     build.pop(rNonVol1);
     build.ret();
 
+    unwind->finishFunction(build.getLabelOffset(functionBegin), ~0u);
+
     build.finalize();
 
+    unwind->finishInfo();
+
     size_t blockSize = 1024 * 1024;
     size_t maxTotalSize = 1024 * 1024;
     CodeAllocator allocator(blockSize, maxTotalSize);
@@ -326,6 +337,152 @@ TEST_CASE("GeneratedCodeExecutionWithThrowX64")
     }
 }
 
+TEST_CASE("GeneratedCodeExecutionMultipleFunctionsWithThrowX64") // two generated functions with different frames, one unwind info
+{
+    using namespace X64;
+
+    AssemblyBuilderX64 build(/* logText= */ false);
+
+#if defined(_WIN32)
+    std::unique_ptr<UnwindBuilder> unwind = std::make_unique<UnwindBuilderWin>();
+#else
+    std::unique_ptr<UnwindBuilder> unwind = std::make_unique<UnwindBuilderDwarf2>();
+#endif
+
+    unwind->startInfo(); // opened once; each function below is bracketed by startFunction/finishFunction
+
+    Label start1;
+    Label start2;
+
+    // First function: saves rNonVol1, rNonVol2 and rbp, with a 16-byte locals area
+    {
+        build.setLabel(start1); // entry offset, later passed to finishFunction and used to compute f1
+        unwind->startFunction();
+
+        // Prologue: every stack-affecting instruction is followed by the matching unwind builder call
+        build.push(rNonVol1);
+        unwind->save(rNonVol1);
+        build.push(rNonVol2);
+        unwind->save(rNonVol2);
+        build.push(rbp);
+        unwind->save(rbp);
+
+        int stackSize = 32;
+        int localsSize = 16;
+
+        build.sub(rsp, stackSize + localsSize);
+        unwind->allocStack(stackSize + localsSize);
+
+        build.lea(rbp, addr[rsp + stackSize]);
+        unwind->setupFrameReg(rbp, stackSize);
+
+        // Body: adds 15 to the first argument and calls the second argument with the result
+        build.mov(rNonVol1, rArg1);
+        build.mov(rNonVol2, rArg2);
+
+        build.add(rNonVol1, 15);
+        build.mov(rArg1, rNonVol1);
+        build.call(rNonVol2);
+
+        // Epilogue: pops in the reverse order of the prologue pushes
+        build.lea(rsp, addr[rbp + localsSize]);
+        build.pop(rbp);
+        build.pop(rNonVol2);
+        build.pop(rNonVol1);
+        build.ret();
+
+        Label end1 = build.setLabel(); // offset just past the first function's last instruction
+        unwind->finishFunction(build.getLabelOffset(start1), build.getLabelOffset(end1));
+    }
+
+    // Second function with different layout: two more saved registers (rNonVol3, rNonVol4) and a 32-byte locals area
+    {
+        build.setLabel(start2); // entry offset, later passed to finishFunction and used to compute f2
+        unwind->startFunction();
+
+        // Prologue: every stack-affecting instruction is followed by the matching unwind builder call
+        build.push(rNonVol1);
+        unwind->save(rNonVol1);
+        build.push(rNonVol2);
+        unwind->save(rNonVol2);
+        build.push(rNonVol3);
+        unwind->save(rNonVol3);
+        build.push(rNonVol4);
+        unwind->save(rNonVol4);
+        build.push(rbp);
+        unwind->save(rbp);
+
+        int stackSize = 32;
+        int localsSize = 32;
+
+        build.sub(rsp, stackSize + localsSize);
+        unwind->allocStack(stackSize + localsSize);
+
+        build.lea(rbp, addr[rsp + stackSize]);
+        unwind->setupFrameReg(rbp, stackSize);
+
+        // Body: same computation as the first function, but carried in rNonVol3/rNonVol4
+        build.mov(rNonVol3, rArg1);
+        build.mov(rNonVol4, rArg2);
+
+        build.add(rNonVol3, 15);
+        build.mov(rArg1, rNonVol3);
+        build.call(rNonVol4);
+
+        // Epilogue: pops in the reverse order of the prologue pushes
+        build.lea(rsp, addr[rbp + localsSize]);
+        build.pop(rbp);
+        build.pop(rNonVol4);
+        build.pop(rNonVol3);
+        build.pop(rNonVol2);
+        build.pop(rNonVol1);
+        build.ret();
+
+        unwind->finishFunction(build.getLabelOffset(start2), ~0u); // NOTE(review): ~0u presumably means 'up to the end of the code' - confirm
+    }
+
+    build.finalize();
+
+    unwind->finishInfo(); // closes the info opened by startInfo, after both functions have been recorded
+
+    size_t blockSize = 1024 * 1024;
+    size_t maxTotalSize = 1024 * 1024;
+    CodeAllocator allocator(blockSize, maxTotalSize);
+
+    allocator.context = unwind.get(); // the unwind builder is handed to the block unwind info callbacks through the context pointer
+    allocator.createBlockUnwindInfo = createBlockUnwindInfo;
+    allocator.destroyBlockUnwindInfo = destroyBlockUnwindInfo;
+
+    uint8_t* nativeData;
+    size_t sizeNativeData;
+    uint8_t* nativeEntry;
+    REQUIRE(allocator.allocate(build.data.data(), build.data.size(), build.code.data(), build.code.size(), nativeData, sizeNativeData, nativeEntry));
+    REQUIRE(nativeEntry);
+
+    using FunctionType = int64_t(int64_t, void (*)(int64_t));
+    FunctionType* f1 = (FunctionType*)(nativeEntry + start1.location); // each entry point is nativeEntry plus the start label's location
+    FunctionType* f2 = (FunctionType*)(nativeEntry + start2.location);
+
+    // To simplify debugging, CHECK_THROWS_WITH_AS is not used here. NOTE(review): nothing fails if a call returns without throwing.
+    try
+    {
+        f1(10, throwing);
+    }
+    catch (const std::runtime_error& error)
+    {
+        CHECK(strcmp(error.what(), "testing") == 0);
+    }
+
+    try
+    {
+        f2(10, throwing);
+    }
+    catch (const std::runtime_error& error)
+    {
+        CHECK(strcmp(error.what(), "testing") == 0);
+    }
+}
+
 TEST_CASE("GeneratedCodeExecutionWithThrowOutsideTheGateX64")
 {
     using namespace X64;
@@ -338,7 +495,10 @@ TEST_CASE("GeneratedCodeExecutionWithThrowOutsideTheGateX64")
     std::unique_ptr<UnwindBuilder> unwind = std::make_unique<UnwindBuilderDwarf2>();
 #endif
 
-    unwind->start();
+    unwind->startInfo();
+
+    Label functionBegin = build.setLabel();
+    unwind->startFunction();
 
     // Prologue (some of these registers don't have to be saved, but we want to have a big prologue)
     build.push(r10);
@@ -365,8 +525,6 @@ TEST_CASE("GeneratedCodeExecutionWithThrowOutsideTheGateX64")
     build.lea(rbp, addr[rsp + stackSize]);
     unwind->setupFrameReg(rbp, stackSize);
 
-    unwind->finish();
-
     size_t prologueSize = build.setLabel().location;
 
     // Body
@@ -387,8 +545,12 @@ TEST_CASE("GeneratedCodeExecutionWithThrowOutsideTheGateX64")
     build.pop(r10);
     build.ret();
 
+    unwind->finishFunction(build.getLabelOffset(functionBegin), ~0u);
+
     build.finalize();
 
+    unwind->finishInfo();
+
     size_t blockSize = 4096; // Force allocate to create a new block each time
     size_t maxTotalSize = 1024 * 1024;
     CodeAllocator allocator(blockSize, maxTotalSize);
diff --git a/tests/Conformance.test.cpp b/tests/Conformance.test.cpp
index 0a9d1f7..ee7066b 100644
--- a/tests/Conformance.test.cpp
+++ b/tests/Conformance.test.cpp
@@ -285,8 +285,16 @@ TEST_CASE("Tables")
         lua_pushcfunction(
             L,
             [](lua_State* L) {
-                unsigned v = luaL_checkunsigned(L, 1);
-                lua_pushlightuserdata(L, reinterpret_cast<void*>(uintptr_t(v)));
+                if (lua_type(L, 1) == LUA_TNUMBER)
+                {
+                    unsigned v = luaL_checkunsigned(L, 1);
+                    lua_pushlightuserdata(L, reinterpret_cast<void*>(uintptr_t(v)));
+                }
+                else
+                {
+                    const void* p = lua_topointer(L, 1);
+                    lua_pushlightuserdata(L, const_cast<void*>(p));
+                }
                 return 1;
             },
             "makelud");
@@ -402,21 +410,24 @@ TEST_CASE("PCall")
 {
     ScopedFastFlag sff("LuauBetterOOMHandling", true);
 
-    runConformance("pcall.lua", [](lua_State* L) {
-        lua_pushcfunction(L, cxxthrow, "cxxthrow");
-        lua_setglobal(L, "cxxthrow");
+    runConformance(
+        "pcall.lua",
+        [](lua_State* L) {
+            lua_pushcfunction(L, cxxthrow, "cxxthrow");
+            lua_setglobal(L, "cxxthrow");
 
-        lua_pushcfunction(
-            L,
-            [](lua_State* L) -> int {
-                lua_State* co = lua_tothread(L, 1);
-                lua_xmove(L, co, 1);
-                lua_resumeerror(co, L);
-                return 0;
-            },
-            "resumeerror");
-        lua_setglobal(L, "resumeerror");
-    }, nullptr, lua_newstate(limitedRealloc, nullptr));
+            lua_pushcfunction(
+                L,
+                [](lua_State* L) -> int {
+                    lua_State* co = lua_tothread(L, 1);
+                    lua_xmove(L, co, 1);
+                    lua_resumeerror(co, L);
+                    return 0;
+                },
+                "resumeerror");
+            lua_setglobal(L, "resumeerror");
+        },
+        nullptr, lua_newstate(limitedRealloc, nullptr));
 }
 
 TEST_CASE("Pack")
diff --git a/tests/Fixture.cpp b/tests/Fixture.cpp
index aebf177..aba2891 100644
--- a/tests/Fixture.cpp
+++ b/tests/Fixture.cpp
@@ -21,6 +21,7 @@
 static const char* mainModuleName = "MainModule";
 
 LUAU_FASTFLAG(DebugLuauDeferredConstraintResolution);
+LUAU_FASTFLAG(LuauOnDemandTypecheckers);
 
 extern std::optional<unsigned> randomSeed; // tests/main.cpp
 
@@ -180,9 +181,16 @@ AstStatBlock* Fixture::parse(const std::string& source, const ParseOptions& pars
 
                 Luau::lint(sourceModule->root, *sourceModule->names, frontend.globals.globalScope, module.get(), sourceModule->hotcomments, {});
             }
+            else if (!FFlag::LuauOnDemandTypecheckers)
+            {
+                ModulePtr module = frontend.typeChecker_DEPRECATED.check(*sourceModule, sourceModule->mode.value_or(Luau::Mode::Nonstrict));
+
+                Luau::lint(sourceModule->root, *sourceModule->names, frontend.globals.globalScope, module.get(), sourceModule->hotcomments, {});
+            }
             else
             {
-                ModulePtr module = frontend.typeChecker.check(*sourceModule, sourceModule->mode.value_or(Luau::Mode::Nonstrict));
+                TypeChecker typeChecker(frontend.globals.globalScope, &moduleResolver, builtinTypes, &frontend.iceHandler);
+                ModulePtr module = typeChecker.check(*sourceModule, sourceModule->mode.value_or(Luau::Mode::Nonstrict), std::nullopt);
 
                 Luau::lint(sourceModule->root, *sourceModule->names, frontend.globals.globalScope, module.get(), sourceModule->hotcomments, {});
             }
diff --git a/tests/Module.test.cpp b/tests/Module.test.cpp
index 7e61235..3c613a1 100644
--- a/tests/Module.test.cpp
+++ b/tests/Module.test.cpp
@@ -3,6 +3,7 @@
 #include "Luau/Module.h"
 #include "Luau/Scope.h"
 #include "Luau/RecursionCounter.h"
+#include "Luau/Parser.h"
 
 #include "Fixture.h"
 
@@ -42,6 +43,38 @@ TEST_CASE_FIXTURE(Fixture, "is_within_comment")
     CHECK(!isWithinComment(*sm, Position{7, 11}));
 }
 
+TEST_CASE_FIXTURE(Fixture, "is_within_comment_parse_result") // isWithinComment driven by a bare ParseResult, parsed without the fixture
+{
+    std::string src = R"(
+        --!strict
+        local foo = {}
+        function foo:bar() end
+
+        --[[
+            foo:
+        ]] foo:bar()
+
+        --[[]]--[[]] -- Two distinct comments that have zero characters of space between them.
+    )";
+
+    Luau::Allocator alloc;
+    Luau::AstNameTable names{alloc};
+    Luau::ParseOptions parseOptions;
+    parseOptions.captureComments = true; // NOTE(review): presumably required for commentLocations to be populated - confirm
+    Luau::ParseResult parseResult = Luau::Parser::parse(src.data(), src.size(), names, alloc, parseOptions);
+
+    CHECK_EQ(5, parseResult.commentLocations.size()); // --!strict, the multi-line block, two empty blocks and the trailing line comment
+
+    CHECK(isWithinComment(parseResult, Position{1, 15})); // inside the --!strict line (positions appear to be 0-based)
+    CHECK(isWithinComment(parseResult, Position{6, 16})); // on the 'foo:' line inside the multi-line block comment
+    CHECK(isWithinComment(parseResult, Position{9, 13})); // last character of the first empty block comment
+    CHECK(isWithinComment(parseResult, Position{9, 14})); // first character of the adjacent second block comment
+
+    CHECK(!isWithinComment(parseResult, Position{2, 15})); // inside 'local foo = {}'
+    CHECK(!isWithinComment(parseResult, Position{7, 10})); // the space right after the closing ']]'
+    CHECK(!isWithinComment(parseResult, Position{7, 11})); // start of the 'foo:bar()' call that follows the comment
+}
+
 TEST_CASE_FIXTURE(Fixture, "dont_clone_persistent_primitive")
 {
     TypeArena dest;
@@ -319,6 +352,10 @@ TEST_CASE_FIXTURE(Fixture, "clone_recursion_limit")
 
 TEST_CASE_FIXTURE(Fixture, "any_persistance_does_not_leak")
 {
+    ScopedFastFlag flags[] = {
+        {"LuauOccursIsntAlwaysFailure", true},
+    };
+
     fileResolver.source["Module/A"] = R"(
 export type A = B
 type B = A
@@ -332,7 +369,7 @@ type B = A
     auto mod = frontend.moduleResolver.getModule("Module/A");
     auto it = mod->exportedTypeBindings.find("A");
     REQUIRE(it != mod->exportedTypeBindings.end());
-    CHECK(toString(it->second.type) == "any");
+    CHECK(toString(it->second.type) == "*error-type*");
 }
 
 TEST_CASE_FIXTURE(BuiltinsFixture, "do_not_clone_reexports")
diff --git a/tests/StringUtils.test.cpp b/tests/StringUtils.test.cpp
index afef3b0..786f965 100644
--- a/tests/StringUtils.test.cpp
+++ b/tests/StringUtils.test.cpp
@@ -106,4 +106,22 @@ TEST_CASE("AreWeUsingDistanceWithAdjacentTranspositionsAndNotOptimalStringAlignm
     CHECK_EQ(distance, 2);
 }
 
+TEST_CASE("EditDistanceSupportsUnicode") // the expected distances below equal the byte length of the replacement character
+{
+    // ASCII character: one byte replaced
+    CHECK_EQ(Luau::editDistance("A block", "X block"), 1);
+
+    // UTF-8 2 byte character: the expected distance counts bytes, not characters
+    CHECK_EQ(Luau::editDistance("A block", "À block"), 2);
+
+    // UTF-8 3 byte character
+    CHECK_EQ(Luau::editDistance("A block", "⪻ block"), 3);
+
+    // UTF-8 4 byte character
+    CHECK_EQ(Luau::editDistance("A block", "𒋄 block"), 4);
+
+    // UTF-8 extreme characters: letters stacked with many combining marks
+    CHECK_EQ(Luau::editDistance("A block", "R̴̨̢̟̚ŏ̶̳̳͚́ͅb̶̡̻̞̐̿ͅl̸̼͝ợ̷̜͓̒̏͜͝ẍ̴̝̦̟̰́̒́̌ block"), 85);
+}
+
 TEST_SUITE_END();
diff --git a/tests/TypeInfer.annotations.test.cpp b/tests/TypeInfer.annotations.test.cpp
index 2c87cb4..3de5299 100644
--- a/tests/TypeInfer.annotations.test.cpp
+++ b/tests/TypeInfer.annotations.test.cpp
@@ -435,6 +435,10 @@ TEST_CASE_FIXTURE(Fixture, "typeof_expr")
 
 TEST_CASE_FIXTURE(Fixture, "corecursive_types_error_on_tight_loop")
 {
+    ScopedFastFlag flags[] = {
+        {"LuauOccursIsntAlwaysFailure", true},
+    };
+
     CheckResult result = check(R"(
         type A = B
         type B = A
@@ -443,10 +447,10 @@ TEST_CASE_FIXTURE(Fixture, "corecursive_types_error_on_tight_loop")
         local bb:B
     )");
 
-    TypeId fType = requireType("aa");
-    const AnyType* ftv = get<AnyType>(follow(fType));
-    REQUIRE(ftv != nullptr);
-    REQUIRE(!result.errors.empty());
+    LUAU_REQUIRE_ERROR_COUNT(1, result);
+
+    OccursCheckFailed* ocf = get<OccursCheckFailed>(result.errors[0]);
+    REQUIRE(ocf);
 }
 
 TEST_CASE_FIXTURE(Fixture, "type_alias_always_resolve_to_a_real_type")
@@ -762,6 +766,7 @@ TEST_CASE_FIXTURE(Fixture, "occurs_check_on_cyclic_union_type")
 {
     CheckResult result = check(R"(
         type T = T | T
+        local x : T
     )");
 
     LUAU_REQUIRE_ERROR_COUNT(1, result);
diff --git a/tests/TypeInfer.functions.test.cpp b/tests/TypeInfer.functions.test.cpp
index f1d42c6..942ce19 100644
--- a/tests/TypeInfer.functions.test.cpp
+++ b/tests/TypeInfer.functions.test.cpp
@@ -1281,6 +1281,39 @@ f(function(x) return x * 2 end)
     LUAU_REQUIRE_NO_ERRORS(result);
 }
 
+TEST_CASE_FIXTURE(Fixture, "variadic_any_is_compatible_with_a_generic_TypePack") // forwarding an unannotated '...' must type-check
+{
+    ScopedFastFlag sff[] = {
+        {"LuauVariadicAnyCanBeGeneric", true} // the behavior under test is gated behind this flag
+    };
+
+    CheckResult result = check(R"(
+        --!strict
+        local function f(...) return ... end
+        local g = function(...) return f(...) end
+    )");
+
+    LUAU_REQUIRE_NO_ERRORS(result); // the same script was previously tracked as a known failure in the provisional suite
+}
+
+// Regression test for https://github.com/Roblox/luau/issues/767: a generic pack T... passed to a '...: any' parameter
+TEST_CASE_FIXTURE(BuiltinsFixture, "variadic_any_is_compatible_with_a_generic_TypePack_2")
+{
+    ScopedFastFlag sff{"LuauVariadicAnyCanBeGeneric", true}; // the behavior under test is gated behind this flag
+
+    CheckResult result = check(R"(
+        local function somethingThatsAny(...: any)
+            print(...)
+        end
+
+        local function x<T...>(...: T...)
+            somethingThatsAny(...) -- Failed to unify variadic type packs
+        end
+    )");
+
+    LUAU_REQUIRE_NO_ERRORS(result); // the inline note in the script records the error that used to be reported on that call
+}
+
 TEST_CASE_FIXTURE(Fixture, "infer_anonymous_function_arguments_outside_call")
 {
     CheckResult result = check(R"(
diff --git a/tests/TypeInfer.operators.test.cpp b/tests/TypeInfer.operators.test.cpp
index 174bc31..d224195 100644
--- a/tests/TypeInfer.operators.test.cpp
+++ b/tests/TypeInfer.operators.test.cpp
@@ -53,10 +53,6 @@ TEST_CASE_FIXTURE(Fixture, "or_joins_types_with_no_superfluous_union")
 
 TEST_CASE_FIXTURE(Fixture, "and_does_not_always_add_boolean")
 {
-    ScopedFastFlag sff[]{
-        {"LuauTryhardAnd", true},
-    };
-
     CheckResult result = check(R"(
         local s = "a" and 10
         local x:boolean|number = s
@@ -737,6 +733,8 @@ TEST_CASE_FIXTURE(Fixture, "error_on_invalid_operand_types_to_relational_operato
 
 TEST_CASE_FIXTURE(Fixture, "cli_38355_recursive_union")
 {
+    ScopedFastFlag sff{"LuauOccursIsntAlwaysFailure", true};
+
     CheckResult result = check(R"(
         --!strict
         local _
@@ -744,7 +742,7 @@ TEST_CASE_FIXTURE(Fixture, "cli_38355_recursive_union")
     )");
 
     LUAU_REQUIRE_ERROR_COUNT(1, result);
-    CHECK_EQ("Type contains a self-recursive construct that cannot be resolved", toString(result.errors[0]));
+    CHECK_EQ("Unknown type used in + operation; consider adding a type annotation to '_'", toString(result.errors[0]));
 }
 
 TEST_CASE_FIXTURE(BuiltinsFixture, "UnknownGlobalCompoundAssign")
@@ -1048,10 +1046,6 @@ TEST_CASE_FIXTURE(BuiltinsFixture, "mm_comparisons_must_return_a_boolean")
 
 TEST_CASE_FIXTURE(BuiltinsFixture, "reworked_and")
 {
-    ScopedFastFlag sff[]{
-        {"LuauTryhardAnd", true},
-    };
-
     CheckResult result = check(R"(
 local a: number? = 5
 local b: boolean = (a or 1) > 10
@@ -1077,10 +1071,6 @@ local w = c and 1
 
 TEST_CASE_FIXTURE(BuiltinsFixture, "reworked_or")
 {
-    ScopedFastFlag sff[]{
-        {"LuauTryhardAnd", true},
-    };
-
     CheckResult result = check(R"(
 local a: number | false = 5
 local b: number? = 6
@@ -1115,11 +1105,6 @@ local f1 = f or 'f'
 
 TEST_CASE_FIXTURE(BuiltinsFixture, "reducing_and")
 {
-    ScopedFastFlag sff[]{
-        {"LuauTryhardAnd", true},
-        {"LuauReducingAndOr", true},
-    };
-
     CheckResult result = check(R"(
 type Foo = { name: string?, flag: boolean? }
 local arr: {Foo} = {}
@@ -1137,4 +1122,61 @@ end
     LUAU_REQUIRE_NO_ERRORS(result);
 }
 
+TEST_CASE_FIXTURE(BuiltinsFixture, "luau_polyfill_is_array_simplified") // reduced form of the "luau_polyfill_is_array" script
+{
+    CheckResult result = check(R"(
+     --!strict
+     return function(value: any) : boolean
+        if typeof(value) ~= "number" then
+           return false
+        end
+        if value % 1 ~= 0 or value < 1 then
+           return false
+        end
+        return true
+     end 
+    )");
+
+    LUAU_REQUIRE_NO_ERRORS(result); // '%' and '<' on an 'any' value guarded by the typeof check must not produce errors
+}
+
+TEST_CASE_FIXTURE(BuiltinsFixture, "luau_polyfill_is_array") // an isArray-style helper over an 'any' value, checked in strict mode
+{
+    CheckResult result = check(R"(
+--!strict
+return function(value: any): boolean
+    if typeof(value) ~= "table" then
+        return false
+    end
+    if next(value) == nil then
+        -- an empty table is an empty array
+        return true
+    end
+
+    local length = #value
+
+    if length == 0 then
+        return false
+    end
+
+    local count = 0
+    local sum = 0
+    for key in pairs(value) do
+        if typeof(key) ~= "number" then
+            return false
+        end
+        if key % 1 ~= 0 or key < 1 then
+            return false
+        end
+        count += 1
+        sum += key
+    end
+
+    return sum == (count * (count + 1) / 2)
+end
+    )");
+
+    LUAU_REQUIRE_NO_ERRORS(result); // the typeof guards, arithmetic and comparisons on 'value' and 'key' must not produce errors
+}
+
 TEST_SUITE_END();
diff --git a/tests/TypeInfer.provisional.test.cpp b/tests/TypeInfer.provisional.test.cpp
index 87419de..e074bc8 100644
--- a/tests/TypeInfer.provisional.test.cpp
+++ b/tests/TypeInfer.provisional.test.cpp
@@ -320,23 +320,6 @@ TEST_CASE_FIXTURE(Fixture, "weird_fail_to_unify_type_pack")
     LUAU_REQUIRE_ERRORS(result); // Should not have any errors.
 }
 
-TEST_CASE_FIXTURE(Fixture, "weird_fail_to_unify_variadic_pack")
-{
-    ScopedFastFlag sff[] = {
-        // I'm not sure why this is broken without DCR, but it seems to be fixed
-        // when DCR is enabled.
-        {"DebugLuauDeferredConstraintResolution", false},
-    };
-
-    CheckResult result = check(R"(
-        --!strict
-        local function f(...) return ... end
-        local g = function(...) return f(...) end
-    )");
-
-    LUAU_REQUIRE_ERRORS(result); // Should not have any errors.
-}
-
 // Belongs in TypeInfer.builtins.test.cpp.
 TEST_CASE_FIXTURE(BuiltinsFixture, "pcall_returns_at_least_two_value_but_function_returns_nothing")
 {
@@ -819,4 +802,23 @@ TEST_CASE_FIXTURE(BuiltinsFixture, "table_insert_with_a_singleton_argument")
     }
 }
 
+// Provisional: we really should be warning on this. We have no guarantee that T has any properties.
+TEST_CASE_FIXTURE(Fixture, "lookup_prop_of_intersection_containing_unions_of_tables_that_have_the_prop")
+{
+    CheckResult result = check(R"(
+        local function mergeOptions<T>(options: T & ({variable: string} | {variable: number}))
+            return options.variable
+        end
+    )");
+
+    LUAU_REQUIRE_NO_ERRORS(result); // pins the current behavior; the disabled checks below describe the desired one
+
+    // LUAU_REQUIRE_ERROR_COUNT(1, result);
+
+    // const UnknownProperty* unknownProp = get<UnknownProperty>(result.errors[0]);
+    // REQUIRE(unknownProp);
+
+    // CHECK("variable" == unknownProp->key);
+}
+
 TEST_SUITE_END();
diff --git a/tests/TypeInfer.test.cpp b/tests/TypeInfer.test.cpp
index 3088235..f540be0 100644
--- a/tests/TypeInfer.test.cpp
+++ b/tests/TypeInfer.test.cpp
@@ -1195,6 +1195,21 @@ local b = typeof(foo) ~= 'nil'
     CHECK(toString(result.errors[1]) == "Unknown global 'foo'");
 }
 
+TEST_CASE_FIXTURE(Fixture, "occurs_isnt_always_failure") // assigning between two optionals built from the same parameter must be accepted
+{
+    ScopedFastFlag sff{"LuauOccursIsntAlwaysFailure", true}; // the behavior under test is gated behind this flag
+
+    CheckResult result = check(R"(
+function f(x, c)                   -- x : X
+    local y = if c then x else nil -- y : X?
+    local z = if c then x else nil -- z : X?
+    y = z
+end
+    )");
+
+    LUAU_REQUIRE_NO_ERRORS(result); // 'y = z' (X? assigned to X?, per the inline notes in the script) must not report an error
+}
+
 TEST_CASE_FIXTURE(Fixture, "dcr_delays_expansion_of_function_containing_blocked_parameter_type")
 {
     ScopedFastFlag sff[] = {
diff --git a/tests/TypeInfer.unionTypes.test.cpp b/tests/TypeInfer.unionTypes.test.cpp
index 19a19e4..19b2214 100644
--- a/tests/TypeInfer.unionTypes.test.cpp
+++ b/tests/TypeInfer.unionTypes.test.cpp
@@ -776,4 +776,20 @@ TEST_CASE_FIXTURE(Fixture, "generic_function_with_optional_arg")
     LUAU_REQUIRE_NO_ERRORS(result);
 }
 
+TEST_CASE_FIXTURE(Fixture, "lookup_prop_of_intersection_containing_unions") // missing property on T & ({} | {}) is one UnknownProperty
+{
+    CheckResult result = check(R"(
+        local function mergeOptions<T>(options: T & ({} | {}))
+            return options.variables
+        end
+    )");
+
+    LUAU_REQUIRE_ERROR_COUNT(1, result); // a single error, even though the union inside the intersection has two table options
+
+    const UnknownProperty* unknownProp = get<UnknownProperty>(result.errors[0]);
+    REQUIRE(unknownProp); // the error must be an UnknownProperty
+
+    CHECK("variables" == unknownProp->key); // and it must name the property that was looked up
+}
+
 TEST_SUITE_END();
diff --git a/tests/TypeInfer.unknownnever.test.cpp b/tests/TypeInfer.unknownnever.test.cpp
index 410fd52..8558670 100644
--- a/tests/TypeInfer.unknownnever.test.cpp
+++ b/tests/TypeInfer.unknownnever.test.cpp
@@ -301,11 +301,6 @@ TEST_CASE_FIXTURE(Fixture, "length_of_never")
 
 TEST_CASE_FIXTURE(Fixture, "dont_unify_operands_if_one_of_the_operand_is_never_in_any_ordering_operators")
 {
-    ScopedFastFlag sff[]{
-        {"LuauTryhardAnd", true},
-        {"LuauReducingAndOr", true},
-    };
-
     CheckResult result = check(R"(
         local function ord(x: nil, y)
             return x ~= nil and x > y
diff --git a/tests/TypeVar.test.cpp b/tests/TypeVar.test.cpp
index 3f0becc..dbf58cc 100644
--- a/tests/TypeVar.test.cpp
+++ b/tests/TypeVar.test.cpp
@@ -273,12 +273,14 @@ TEST_CASE_FIXTURE(Fixture, "substitution_skip_failure")
 
     TypeId root = &ttvTweenResult;
 
-    frontend.typeChecker.currentModule = std::make_shared<Module>();
-    frontend.typeChecker.currentModule->scopes.emplace_back(Location{}, std::make_shared<Scope>(builtinTypes->anyTypePack));
+    ModulePtr currentModule = std::make_shared<Module>();
+    Anyification anyification(&currentModule->internalTypes, frontend.globals.globalScope, builtinTypes, &frontend.iceHandler, builtinTypes->anyType,
+        builtinTypes->anyTypePack);
+    std::optional<TypeId> any = anyification.substitute(root);
 
-    TypeId result = frontend.typeChecker.anyify(frontend.globals.globalScope, root, Location{});
-
-    CHECK_EQ("{| f: t1 |} where t1 = () -> {| f: () -> {| f: ({| f: t1 |}) -> (), signal: {| f: (any) -> () |} |} |}", toString(result));
+    REQUIRE(!anyification.normalizationTooComplex);
+    REQUIRE(any.has_value());
+    CHECK_EQ("{| f: t1 |} where t1 = () -> {| f: () -> {| f: ({| f: t1 |}) -> (), signal: {| f: (any) -> () |} |} |}", toString(*any));
 }
 
 TEST_CASE("tagging_tables")
diff --git a/tests/conformance/math.lua b/tests/conformance/math.lua
index ea3b5c8..4734273 100644
--- a/tests/conformance/math.lua
+++ b/tests/conformance/math.lua
@@ -347,5 +347,15 @@ assert(select('#', math.ceil(1.6)) == 1)
 assert(select('#', math.sqrt(9)) == 1)
 assert(select('#', math.deg(9)) == 1)
 assert(select('#', math.rad(9)) == 1)
+assert(select('#', math.sin(1.5)) == 1)
+assert(select('#', math.atan2(1.5, 0.5)) == 1)
+assert(select('#', math.modf(1.5)) == 2)
+assert(select('#', math.frexp(1.5)) == 2)
+
+-- test that fastcalls that return variadic results return them correctly in variadic position
+assert(select(1, math.modf(1.5)) == 1)
+assert(select(2, math.modf(1.5)) == 0.5)
+assert(select(1, math.frexp(1.5)) == 0.75)
+assert(select(2, math.frexp(1.5)) == 1)
 
 return('OK')
diff --git a/tests/conformance/tables.lua b/tests/conformance/tables.lua
index 596eed3..03b4639 100644
--- a/tests/conformance/tables.lua
+++ b/tests/conformance/tables.lua
@@ -715,4 +715,11 @@ do
   end
 end
 
+-- check that fast path for table lookup can't be tricked into assuming a light user data with string pointer is a string
+assert((function ()
+  local t = {}
+  t[makelud("hi")] = "no"
+  return t.hi
+end)() == nil)
+
 return"OK"
diff --git a/tools/lvmexecute_split.py b/tools/lvmexecute_split.py
index 16de45d..6e64bcd 100644
--- a/tools/lvmexecute_split.py
+++ b/tools/lvmexecute_split.py
@@ -34,7 +34,7 @@ source = """// This file is part of the Luau programming language and is license
 function = ""
 signature = ""
 
-includeInsts = ["LOP_NEWCLOSURE", "LOP_NAMECALL", "LOP_FORGPREP", "LOP_GETVARARGS", "LOP_DUPCLOSURE", "LOP_PREPVARARGS", "LOP_BREAK", "LOP_GETGLOBAL", "LOP_SETGLOBAL", "LOP_GETTABLEKS", "LOP_SETTABLEKS"]
+includeInsts = ["LOP_NEWCLOSURE", "LOP_NAMECALL", "LOP_FORGPREP", "LOP_GETVARARGS", "LOP_DUPCLOSURE", "LOP_PREPVARARGS", "LOP_BREAK", "LOP_GETGLOBAL", "LOP_SETGLOBAL", "LOP_GETTABLEKS", "LOP_SETTABLEKS", "LOP_SETLIST"]
 
 state = 0