diff --git a/Cargo.toml b/Cargo.toml index 1424d78..8ea5526 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "luau0-src" -version = "0.2.2+luau521" +version = "0.3.0+luau526" authors = ["Aleksandr Orlenko "] edition = "2018" repository = "https://github.com/khvzak/luau-src-rs" diff --git a/luau/Ast/include/Luau/Ast.h b/luau/Ast/include/Luau/Ast.h index 31cd01c..6f39e3f 100644 --- a/luau/Ast/include/Luau/Ast.h +++ b/luau/Ast/include/Luau/Ast.h @@ -313,7 +313,7 @@ template struct AstArray { T* data; - std::size_t size; + size_t size; const T* begin() const { diff --git a/luau/Ast/include/Luau/DenseHash.h b/luau/Ast/include/Luau/DenseHash.h index 65939be..f854311 100644 --- a/luau/Ast/include/Luau/DenseHash.h +++ b/luau/Ast/include/Luau/DenseHash.h @@ -32,6 +32,7 @@ class DenseHashTable { public: class const_iterator; + class iterator; DenseHashTable(const Key& empty_key, size_t buckets = 0) : count(0) @@ -43,7 +44,7 @@ public: // don't move this to initializer list! this works around an MSVC codegen issue on AMD CPUs: // https://developercommunity.visualstudio.com/t/stdvector-constructor-from-size-t-is-25-times-slow/1546547 if (buckets) - data.resize(buckets, ItemInterface::create(empty_key)); + resize_data(buckets); } void clear() @@ -125,7 +126,7 @@ public: if (data.empty() && data.capacity() >= newsize) { LUAU_ASSERT(count == 0); - data.resize(newsize, ItemInterface::create(empty_key)); + resize_data(newsize); return; } @@ -169,6 +170,21 @@ public: return const_iterator(this, data.size()); } + iterator begin() + { + size_t start = 0; + + while (start < data.size() && eq(ItemInterface::getKey(data[start]), empty_key)) + start++; + + return iterator(this, start); + } + + iterator end() + { + return iterator(this, data.size()); + } + size_t size() const { return count; @@ -233,7 +249,82 @@ public: size_t index; }; + class iterator + { + public: + iterator() + : set(0) + , index(0) + { + } + + iterator(DenseHashTable* set, size_t index) + : set(set) + , index(index) + { + } + + MutableItem& operator*() const + { + return *reinterpret_cast(&set->data[index]); + } + + MutableItem* operator->() const + { + return reinterpret_cast(&set->data[index]); + } + + bool operator==(const iterator& other) const + { + return set == other.set && index == other.index; + } + + bool operator!=(const iterator& other) const + { + return set != other.set || index != other.index; + } + + iterator& operator++() + { + size_t size = set->data.size(); + + do + { + index++; + } while (index < size && set->eq(ItemInterface::getKey(set->data[index]), set->empty_key)); + + return *this; + } + + iterator operator++(int) + { + iterator res = *this; + ++*this; + return res; + } + + private: + DenseHashTable* set; + size_t index; + }; + private: + template + void resize_data(size_t count, typename std::enable_if_t>* dummy = nullptr) + { + data.resize(count, ItemInterface::create(empty_key)); + } + + template + void resize_data(size_t count, typename std::enable_if_t>* dummy = nullptr) + { + size_t size = data.size(); + data.resize(count); + + for (size_t i = size; i < count; i++) + data[i].first = empty_key; + } + std::vector data; size_t count; Key empty_key; @@ -290,6 +381,7 @@ class DenseHashSet public: typedef typename Impl::const_iterator const_iterator; + typedef typename Impl::iterator iterator; DenseHashSet(const Key& empty_key, size_t buckets = 0) : impl(empty_key, buckets) @@ -336,6 +428,16 @@ public: { return impl.end(); } + + iterator begin() + { + return impl.begin(); + } + + iterator 
end() + { + return impl.end(); + } }; // This is a faster alternative of unordered_map, but it does not implement the same interface (i.e. it does not support erasing and has @@ -348,6 +450,7 @@ class DenseHashMap public: typedef typename Impl::const_iterator const_iterator; + typedef typename Impl::iterator iterator; DenseHashMap(const Key& empty_key, size_t buckets = 0) : impl(empty_key, buckets) @@ -401,10 +504,21 @@ public: { return impl.begin(); } + const_iterator end() const { return impl.end(); } + + iterator begin() + { + return impl.begin(); + } + + iterator end() + { + return impl.end(); + } }; } // namespace Luau diff --git a/luau/Ast/include/Luau/Lexer.h b/luau/Ast/include/Luau/Lexer.h index d7d867f..4f3dbbd 100644 --- a/luau/Ast/include/Luau/Lexer.h +++ b/luau/Ast/include/Luau/Lexer.h @@ -173,7 +173,7 @@ public: } const Lexeme& next(); - const Lexeme& next(bool skipComments); + const Lexeme& next(bool skipComments, bool updatePrevLocation); void nextline(); Lexeme lookahead(); diff --git a/luau/Ast/include/Luau/StringUtils.h b/luau/Ast/include/Luau/StringUtils.h index 6ecf060..6ae9e97 100644 --- a/luau/Ast/include/Luau/StringUtils.h +++ b/luau/Ast/include/Luau/StringUtils.h @@ -19,6 +19,7 @@ std::string format(const char* fmt, ...) LUAU_PRINTF_ATTR(1, 2); std::string vformat(const char* fmt, va_list args); void formatAppend(std::string& str, const char* fmt, ...) LUAU_PRINTF_ATTR(2, 3); +void vformatAppend(std::string& ret, const char* fmt, va_list args); std::string join(const std::vector& segments, std::string_view delimiter); std::string join(const std::vector& segments, std::string_view delimiter); diff --git a/luau/Ast/include/Luau/TimeTrace.h b/luau/Ast/include/Luau/TimeTrace.h index 5018456..9f7b2bd 100644 --- a/luau/Ast/include/Luau/TimeTrace.h +++ b/luau/Ast/include/Luau/TimeTrace.h @@ -9,14 +9,21 @@ LUAU_FASTFLAG(DebugLuauTimeTracing) +namespace Luau +{ +namespace TimeTrace +{ +double getClock(); +uint32_t getClockMicroseconds(); +} // namespace TimeTrace +} // namespace Luau + #if defined(LUAU_ENABLE_TIME_TRACE) namespace Luau { namespace TimeTrace { -uint32_t getClockMicroseconds(); - struct Token { const char* name; diff --git a/luau/Ast/src/Lexer.cpp b/luau/Ast/src/Lexer.cpp index 70c6c78..a1f1d46 100644 --- a/luau/Ast/src/Lexer.cpp +++ b/luau/Ast/src/Lexer.cpp @@ -6,8 +6,6 @@ #include -LUAU_FASTFLAGVARIABLE(LuauParseLocationIgnoreCommentSkip, false) - namespace Luau { @@ -349,13 +347,11 @@ void Lexer::setReadNames(bool read) const Lexeme& Lexer::next() { - return next(this->skipComments); + return next(this->skipComments, true); } -const Lexeme& Lexer::next(bool skipComments) +const Lexeme& Lexer::next(bool skipComments, bool updatePrevLocation) { - bool first = true; - // in skipComments mode we reject valid comments do { @@ -363,11 +359,11 @@ const Lexeme& Lexer::next(bool skipComments) while (isSpace(peekch())) consume(); - if (!FFlag::LuauParseLocationIgnoreCommentSkip || first) + if (updatePrevLocation) prevLocation = lexeme.location; lexeme = readNext(); - first = false; + updatePrevLocation = false; } while (skipComments && (lexeme.type == Lexeme::Comment || lexeme.type == Lexeme::BlockComment)); return lexeme; diff --git a/luau/Ast/src/Parser.cpp b/luau/Ast/src/Parser.cpp index f6dfd90..91f5cd2 100644 --- a/luau/Ast/src/Parser.cpp +++ b/luau/Ast/src/Parser.cpp @@ -10,6 +10,7 @@ // See docs/SyntaxChanges.md for an explanation. 
LUAU_FASTINTVARIABLE(LuauRecursionLimit, 1000) LUAU_FASTINTVARIABLE(LuauParseErrorLimit, 100) +LUAU_FASTFLAGVARIABLE(LuauParseLocationIgnoreCommentSkipInCapture, false) namespace Luau { @@ -165,6 +166,7 @@ Parser::Parser(const char* buffer, size_t bufferSize, AstNameTable& names, Alloc Function top; top.vararg = true; + functionStack.reserve(8); functionStack.push_back(top); nameSelf = names.addStatic("self"); @@ -184,6 +186,13 @@ Parser::Parser(const char* buffer, size_t bufferSize, AstNameTable& names, Alloc // all hot comments parsed after the first non-comment lexeme are special in that they don't affect type checking / linting mode hotcommentHeader = false; + + // preallocate some buffers that are very likely to grow anyway; this works around std::vector's inefficient growth policy for small arrays + localStack.reserve(16); + scratchStat.reserve(16); + scratchExpr.reserve(16); + scratchLocal.reserve(16); + scratchBinding.reserve(16); } bool Parser::blockFollow(const Lexeme& l) @@ -1420,6 +1429,11 @@ AstType* Parser::parseTypeAnnotation(TempVector& parts, const Location parts.push_back(parseSimpleTypeAnnotation(/* allowPack= */ false).type); isIntersection = true; } + else if (c == Lexeme::Dot3) + { + report(lexer.current().location, "Unexpected '...' after type annotation"); + nextLexeme(); + } else break; } @@ -1536,6 +1550,11 @@ AstTypeOrPack Parser::parseSimpleTypeAnnotation(bool allowPack) prefix = name.name; name = parseIndexName("field name", pointPosition); } + else if (lexer.current().type == Lexeme::Dot3) + { + report(lexer.current().location, "Unexpected '...' after type name; type pack is not allowed in this context"); + nextLexeme(); + } else if (name.name == "typeof") { Lexeme typeofBegin = lexer.current(); @@ -2778,7 +2797,7 @@ void Parser::nextLexeme() { if (options.captureComments) { - Lexeme::Type type = lexer.next(/* skipComments= */ false).type; + Lexeme::Type type = lexer.next(/* skipComments= */ false, true).type; while (type == Lexeme::BrokenComment || type == Lexeme::Comment || type == Lexeme::BlockComment) { @@ -2802,7 +2821,7 @@ void Parser::nextLexeme() hotcomments.push_back({hotcommentHeader, lexeme.location, std::string(text + 1, text + end)}); } - type = lexer.next(/* skipComments= */ false).type; + type = lexer.next(/* skipComments= */ false, !FFlag::LuauParseLocationIgnoreCommentSkipInCapture).type; } } else diff --git a/luau/Ast/src/StringUtils.cpp b/luau/Ast/src/StringUtils.cpp index 9c7fed3..0dc3f3f 100644 --- a/luau/Ast/src/StringUtils.cpp +++ b/luau/Ast/src/StringUtils.cpp @@ -11,7 +11,7 @@ namespace Luau { -static void vformatAppend(std::string& ret, const char* fmt, va_list args) +void vformatAppend(std::string& ret, const char* fmt, va_list args) { va_list argscopy; va_copy(argscopy, args); diff --git a/luau/Ast/src/TimeTrace.cpp b/luau/Ast/src/TimeTrace.cpp index 19564f0..e380768 100644 --- a/luau/Ast/src/TimeTrace.cpp +++ b/luau/Ast/src/TimeTrace.cpp @@ -26,9 +26,6 @@ #include LUAU_FASTFLAGVARIABLE(DebugLuauTimeTracing, false) - -#if defined(LUAU_ENABLE_TIME_TRACE) - namespace Luau { namespace TimeTrace @@ -67,6 +64,14 @@ static double getClockTimestamp() #endif } +double getClock() +{ + static double period = getClockPeriod(); + static double start = getClockTimestamp(); + + return (getClockTimestamp() - start) * period; +} + uint32_t getClockMicroseconds() { static double period = getClockPeriod() * 1e6; @@ -74,7 +79,15 @@ uint32_t getClockMicroseconds() return uint32_t((getClockTimestamp() - start) * period); } +} // namespace TimeTrace +} 
// namespace Luau +#if defined(LUAU_ENABLE_TIME_TRACE) + +namespace Luau +{ +namespace TimeTrace +{ struct GlobalContext { GlobalContext() = default; diff --git a/luau/Compiler/include/Luau/Bytecode.h b/luau/Compiler/include/Luau/Bytecode.h index c6e5a03..f71d893 100644 --- a/luau/Compiler/include/Luau/Bytecode.h +++ b/luau/Compiler/include/Luau/Bytecode.h @@ -353,6 +353,11 @@ enum LuauOpcode // AUX: constant index LOP_FASTCALL2K, + // FORGPREP: prepare loop variables for a generic for loop, jump to the loop backedge unconditionally + // A: target register; generic for loops assume a register layout [generator, state, index, variables...] + // D: jump offset (-32768..32767) + LOP_FORGPREP, + // Enum entry for number of opcodes, not a valid opcode by itself! LOP__COUNT }; diff --git a/luau/Compiler/include/Luau/BytecodeBuilder.h b/luau/Compiler/include/Luau/BytecodeBuilder.h index 287bf4e..b00440a 100644 --- a/luau/Compiler/include/Luau/BytecodeBuilder.h +++ b/luau/Compiler/include/Luau/BytecodeBuilder.h @@ -3,6 +3,7 @@ #include "Luau/Bytecode.h" #include "Luau/DenseHash.h" +#include "Luau/StringUtils.h" #include @@ -80,6 +81,8 @@ public: void pushDebugUpval(StringRef name); uint32_t getDebugPC() const; + void addDebugRemark(const char* format, ...) LUAU_PRINTF_ATTR(2, 3); + void finalize(); enum DumpFlags @@ -88,6 +91,7 @@ public: Dump_Lines = 1 << 1, Dump_Source = 1 << 2, Dump_Locals = 1 << 3, + Dump_Remarks = 1 << 4, }; void setDumpFlags(uint32_t flags) @@ -228,6 +232,9 @@ private: DenseHashMap stringTable; + std::vector> debugRemarks; + std::string debugRemarkBuffer; + BytecodeEncoder* encoder = nullptr; std::string bytecode; diff --git a/luau/Compiler/src/BytecodeBuilder.cpp b/luau/Compiler/src/BytecodeBuilder.cpp index 6944de0..fb70392 100644 --- a/luau/Compiler/src/BytecodeBuilder.cpp +++ b/luau/Compiler/src/BytecodeBuilder.cpp @@ -96,6 +96,7 @@ inline bool isJumpD(LuauOpcode op) case LOP_JUMPIFNOTLT: case LOP_FORNPREP: case LOP_FORNLOOP: + case LOP_FORGPREP: case LOP_FORGLOOP: case LOP_FORGPREP_INEXT: case LOP_FORGLOOP_INEXT: @@ -184,6 +185,13 @@ BytecodeBuilder::BytecodeBuilder(BytecodeEncoder* encoder) , encoder(encoder) { LUAU_ASSERT(stringTable.find(StringRef{"", 0}) == nullptr); + + // preallocate some buffers that are very likely to grow anyway; this works around std::vector's inefficient growth policy for small arrays + insns.reserve(32); + lines.reserve(32); + constants.reserve(16); + protos.reserve(16); + functions.reserve(8); } uint32_t BytecodeBuilder::beginFunction(uint8_t numparams, bool isvararg) @@ -219,8 +227,8 @@ void BytecodeBuilder::endFunction(uint8_t maxstacksize, uint8_t numupvalues) validate(); #endif - // very approximate: 4 bytes per instruction for code, 1 byte for debug line, and 1-2 bytes for aux data like constants - func.data.reserve(insns.size() * 7); + // very approximate: 4 bytes per instruction for code, 1 byte for debug line, and 1-2 bytes for aux data like constants plus overhead + func.data.reserve(32 + insns.size() * 7); writeFunction(func.data, currentFunction); @@ -242,10 +250,15 @@ void BytecodeBuilder::endFunction(uint8_t maxstacksize, uint8_t numupvalues) constantMap.clear(); tableShapeMap.clear(); + + debugRemarks.clear(); + debugRemarkBuffer.clear(); } void BytecodeBuilder::setMainFunction(uint32_t fid) { + LUAU_ASSERT(fid < functions.size()); + mainFunction = fid; } @@ -505,9 +518,40 @@ uint32_t BytecodeBuilder::getDebugPC() const return uint32_t(insns.size()); } +void BytecodeBuilder::addDebugRemark(const char* format, ...) 
+{ + if ((dumpFlags & Dump_Remarks) == 0) + return; + + size_t offset = debugRemarkBuffer.size(); + + va_list args; + va_start(args, format); + vformatAppend(debugRemarkBuffer, format, args); + va_end(args); + + // we null-terminate all remarks to avoid storing remark length + debugRemarkBuffer += '\0'; + + debugRemarks.emplace_back(uint32_t(insns.size()), uint32_t(offset)); +} + void BytecodeBuilder::finalize() { LUAU_ASSERT(bytecode.empty()); + + // preallocate space for bytecode blob + size_t capacity = 16; + + for (auto& p : stringTable) + capacity += p.first.length + 2; + + for (const Function& func : functions) + capacity += func.data.size(); + + bytecode.reserve(capacity); + + // assemble final bytecode blob bytecode = char(LBC_VERSION); writeStringTable(bytecode); @@ -663,6 +707,8 @@ void BytecodeBuilder::writeFunction(std::string& ss, uint32_t id) const void BytecodeBuilder::writeLineInfo(std::string& ss) const { + LUAU_ASSERT(!lines.empty()); + // this function encodes lines inside each span as a 8-bit delta to span baseline // span is always a power of two; depending on the line info input, it may need to be as low as 1 int span = 1 << 24; @@ -693,7 +739,17 @@ void BytecodeBuilder::writeLineInfo(std::string& ss) const } // second pass: compute span base - std::vector baseline((lines.size() - 1) / span + 1); + int baselineOne = 0; + std::vector baselineScratch; + int* baseline = &baselineOne; + size_t baselineSize = (lines.size() - 1) / span + 1; + + if (baselineSize > 1) + { + // avoid heap allocation for single-element baseline which is most functions (<256 lines) + baselineScratch.resize(baselineSize); + baseline = baselineScratch.data(); + } for (size_t offset = 0; offset < lines.size(); offset += span) { @@ -725,7 +781,7 @@ void BytecodeBuilder::writeLineInfo(std::string& ss) const int lastLine = 0; - for (size_t i = 0; i < baseline.size(); ++i) + for (size_t i = 0; i < baselineSize; ++i) { writeInt(ss, baseline[i] - lastLine); lastLine = baseline[i]; @@ -1214,6 +1270,11 @@ void BytecodeBuilder::validate() const VJUMP(LUAU_INSN_D(insn)); break; + case LOP_FORGPREP: + VREG(LUAU_INSN_A(insn) + 2 + 1); // forg loop protocol: A, A+1, A+2 are used for iteration protocol; A+3, ... are loop variables + VJUMP(LUAU_INSN_D(insn)); + break; + case LOP_FORGLOOP: VREG( LUAU_INSN_A(insn) + 2 + insns[i + 1]); // forg loop protocol: A, A+1, A+2 are used for iteration protocol; A+3, ... 
are loop variables @@ -1567,6 +1628,10 @@ const uint32_t* BytecodeBuilder::dumpInstruction(const uint32_t* code, std::stri formatAppend(result, "FORNLOOP R%d %+d\n", LUAU_INSN_A(insn), LUAU_INSN_D(insn)); break; + case LOP_FORGPREP: + formatAppend(result, "FORGPREP R%d %+d\n", LUAU_INSN_A(insn), LUAU_INSN_D(insn)); + break; + case LOP_FORGLOOP: formatAppend(result, "FORGLOOP R%d %+d %d\n", LUAU_INSN_A(insn), LUAU_INSN_D(insn), *code++); break; @@ -1665,6 +1730,7 @@ std::string BytecodeBuilder::dumpCurrentFunction() const const uint32_t* codeEnd = insns.data() + insns.size(); int lastLine = -1; + size_t nextRemark = 0; std::string result; @@ -1687,6 +1753,7 @@ std::string BytecodeBuilder::dumpCurrentFunction() const while (code != codeEnd) { uint8_t op = LUAU_INSN_OP(*code); + uint32_t pc = uint32_t(code - insns.data()); if (op == LOP_PREPVARARGS) { @@ -1695,9 +1762,18 @@ std::string BytecodeBuilder::dumpCurrentFunction() const continue; } + if (dumpFlags & Dump_Remarks) + { + while (nextRemark < debugRemarks.size() && debugRemarks[nextRemark].first == pc) + { + formatAppend(result, "REMARK %s\n", debugRemarkBuffer.c_str() + debugRemarks[nextRemark].second); + nextRemark++; + } + } + if (dumpFlags & Dump_Source) { - int line = lines[code - insns.data()]; + int line = lines[pc]; if (line > 0 && line != lastLine) { @@ -1709,7 +1785,7 @@ std::string BytecodeBuilder::dumpCurrentFunction() const if (dumpFlags & Dump_Lines) { - formatAppend(result, "%d: ", lines[code - insns.data()]); + formatAppend(result, "%d: ", lines[pc]); } code = dumpInstruction(code, result); @@ -1722,11 +1798,11 @@ void BytecodeBuilder::setDumpSource(const std::string& source) { dumpSource.clear(); - std::string::size_type pos = 0; + size_t pos = 0; while (pos != std::string::npos) { - std::string::size_type next = source.find('\n', pos); + size_t next = source.find('\n', pos); if (next == std::string::npos) { diff --git a/luau/Compiler/src/Compiler.cpp b/luau/Compiler/src/Compiler.cpp index 6330bf1..4fe2622 100644 --- a/luau/Compiler/src/Compiler.cpp +++ b/luau/Compiler/src/Compiler.cpp @@ -8,12 +8,27 @@ #include "Builtins.h" #include "ConstantFolding.h" +#include "CostModel.h" #include "TableShape.h" #include "ValueTracking.h" #include #include #include +#include + +LUAU_FASTFLAGVARIABLE(LuauCompileSupportInlining, false) + +LUAU_FASTFLAGVARIABLE(LuauCompileIter, false) +LUAU_FASTFLAGVARIABLE(LuauCompileIterNoReserve, false) +LUAU_FASTFLAGVARIABLE(LuauCompileIterNoPairs, false) + +LUAU_FASTINTVARIABLE(LuauCompileLoopUnrollThreshold, 25) +LUAU_FASTINTVARIABLE(LuauCompileLoopUnrollThresholdMaxBoost, 300) + +LUAU_FASTINTVARIABLE(LuauCompileInlineThreshold, 25) +LUAU_FASTINTVARIABLE(LuauCompileInlineThresholdMaxBoost, 300) +LUAU_FASTINTVARIABLE(LuauCompileInlineDepth, 5) namespace Luau { @@ -77,8 +92,12 @@ struct Compiler , globals(AstName()) , variables(nullptr) , constants(nullptr) + , locstants(nullptr) , tableShapes(nullptr) { + // preallocate some buffers that are very likely to grow anyway; this works around std::vector's inefficient growth policy for small arrays + localStack.reserve(16); + upvals.reserve(16); } uint8_t getLocal(AstLocal* local) @@ -138,6 +157,52 @@ struct Compiler } } + AstExprFunction* getFunctionExpr(AstExpr* node) + { + if (AstExprLocal* le = node->as()) + { + Variable* lv = variables.find(le->local); + + if (!lv || lv->written || !lv->init) + return nullptr; + + return getFunctionExpr(lv->init); + } + else if (AstExprGroup* ge = node->as()) + return getFunctionExpr(ge->expr); + else + return 
node->as(); + } + + bool canInlineFunctionBody(AstStat* stat) + { + struct CanInlineVisitor : AstVisitor + { + bool result = true; + + bool visit(AstExpr* node) override + { + // nested functions may capture function arguments, and our upval handling doesn't handle elided variables (constant) + // TODO: we could remove this case if we changed function compilation to create temporary locals for constant upvalues + // TODO: additionally we would need to change upvalue handling in compileExprFunction to handle upvalue->local migration + result = result && !node->is(); + return result; + } + + bool visit(AstStat* node) override + { + // loops may need to be unrolled which can result in cost amplification + result = result && !node->is(); + return result; + } + }; + + CanInlineVisitor canInline; + stat->visit(&canInline); + + return canInline.result; + } + uint32_t compileFunction(AstExprFunction* func) { LUAU_TIMETRACE_SCOPE("Compiler::compileFunction", "Compiler"); @@ -205,11 +270,21 @@ struct Compiler bytecode.endFunction(uint8_t(stackSize), uint8_t(upvals.size())); - stackSize = 0; - Function& f = functions[func]; f.id = fid; - f.upvals = std::move(upvals); + f.upvals = upvals; + + // record information for inlining + if (FFlag::LuauCompileSupportInlining && options.optimizationLevel >= 2 && !func->vararg && canInlineFunctionBody(func->body) && + !getfenvUsed && !setfenvUsed) + { + f.canInline = true; + f.stackSize = stackSize; + f.costModel = modelCost(func->body, func->args.data, func->args.size); + } + + upvals.clear(); // note: instead of std::move above, we copy & clear to preserve capacity for future pushes + stackSize = 0; return fid; } @@ -379,12 +454,183 @@ struct Compiler } } + bool tryCompileInlinedCall(AstExprCall* expr, AstExprFunction* func, uint8_t target, uint8_t targetCount, bool multRet, int thresholdBase, + int thresholdMaxBoost, int depthLimit) + { + Function* fi = functions.find(func); + LUAU_ASSERT(fi); + + // make sure we have enough register space + if (regTop > 128 || fi->stackSize > 32) + { + bytecode.addDebugRemark("inlining failed: high register pressure"); + return false; + } + + // we should ideally aggregate the costs during recursive inlining, but for now simply limit the depth + if (int(inlineFrames.size()) >= depthLimit) + { + bytecode.addDebugRemark("inlining failed: too many inlined frames"); + return false; + } + + // compiling recursive inlining is difficult because we share constant/variable state but need to bind variables to different registers + for (InlineFrame& frame : inlineFrames) + if (frame.func == func) + { + bytecode.addDebugRemark("inlining failed: can't inline recursive calls"); + return false; + } + + // TODO: we can compile multret functions if all returns of the function are multret as well + if (multRet) + { + bytecode.addDebugRemark("inlining failed: can't convert fixed returns to multret"); + return false; + } + + // TODO: we can compile functions with mismatching arity at call site but it's more annoying + if (func->args.size != expr->args.size) + { + bytecode.addDebugRemark("inlining failed: argument count mismatch (expected %d, got %d)", int(func->args.size), int(expr->args.size)); + return false; + } + + // we use a dynamic cost threshold that's based on the fixed limit boosted by the cost advantage we gain due to inlining + bool varc[8] = {}; + for (size_t i = 0; i < expr->args.size && i < 8; ++i) + varc[i] = isConstant(expr->args.data[i]); + + int inlinedCost = computeCost(fi->costModel, varc, std::min(int(expr->args.size), 
8)); + int baselineCost = computeCost(fi->costModel, nullptr, 0) + 3; + int inlineProfit = (inlinedCost == 0) ? thresholdMaxBoost : std::min(thresholdMaxBoost, 100 * baselineCost / inlinedCost); + + int threshold = thresholdBase * inlineProfit / 100; + + if (inlinedCost > threshold) + { + bytecode.addDebugRemark("inlining failed: too expensive (cost %d, profit %.2fx)", inlinedCost, double(inlineProfit) / 100); + return false; + } + + bytecode.addDebugRemark( + "inlining succeeded (cost %d, profit %.2fx, depth %d)", inlinedCost, double(inlineProfit) / 100, int(inlineFrames.size())); + + compileInlinedCall(expr, func, target, targetCount); + return true; + } + + void compileInlinedCall(AstExprCall* expr, AstExprFunction* func, uint8_t target, uint8_t targetCount) + { + RegScope rs(this); + + size_t oldLocals = localStack.size(); + + // note that we push the frame early; this is needed to block recursive inline attempts + inlineFrames.push_back({func, target, targetCount}); + + // evaluate all arguments; note that we don't emit code for constant arguments (relying on constant folding) + for (size_t i = 0; i < func->args.size; ++i) + { + AstLocal* var = func->args.data[i]; + AstExpr* arg = expr->args.data[i]; + + if (Variable* vv = variables.find(var); vv && vv->written) + { + // if the argument is mutated, we need to allocate a fresh register even if it's a constant + uint8_t reg = allocReg(arg, 1); + compileExprTemp(arg, reg); + pushLocal(var, reg); + } + else if (const Constant* cv = constants.find(arg); cv && cv->type != Constant::Type_Unknown) + { + // since the argument is not mutated, we can simply fold the value into the expressions that need it + locstants[var] = *cv; + } + else + { + AstExprLocal* le = arg->as(); + Variable* lv = le ? variables.find(le->local) : nullptr; + + // if the argument is a local that isn't mutated, we will simply reuse the existing register + if (isExprLocalReg(arg) && (!lv || !lv->written)) + { + uint8_t reg = getLocal(le->local); + pushLocal(var, reg); + } + else + { + uint8_t reg = allocReg(arg, 1); + compileExprTemp(arg, reg); + pushLocal(var, reg); + } + } + } + + // fold constant values updated above into expressions in the function body + foldConstants(constants, variables, locstants, func->body); + + bool usedFallthrough = false; + + for (size_t i = 0; i < func->body->body.size; ++i) + { + AstStat* stat = func->body->body.data[i]; + + if (AstStatReturn* ret = stat->as()) + { + // Optimization: use fallthrough when compiling return at the end of the function to avoid an extra JUMP + compileInlineReturn(ret, /* fallthrough= */ true); + // TODO: This doesn't work when return is part of control flow; ideally we would track the state somehow and generalize this + usedFallthrough = true; + break; + } + else + compileStat(stat); + } + + // for the fallthrough path we need to ensure we clear out target registers + if (!usedFallthrough && !allPathsEndWithReturn(func->body)) + { + for (size_t i = 0; i < targetCount; ++i) + bytecode.emitABC(LOP_LOADNIL, uint8_t(target + i), 0, 0); + } + + popLocals(oldLocals); + + size_t returnLabel = bytecode.emitLabel(); + patchJumps(expr, inlineFrames.back().returnJumps, returnLabel); + + inlineFrames.pop_back(); + + // clean up constant state for future inlining attempts + for (size_t i = 0; i < func->args.size; ++i) + if (Constant* var = locstants.find(func->args.data[i])) + var->type = Constant::Type_Unknown; + + foldConstants(constants, variables, locstants, func->body); + } + void compileExprCall(AstExprCall* expr, 
uint8_t target, uint8_t targetCount, bool targetTop = false, bool multRet = false) { LUAU_ASSERT(!targetTop || unsigned(target + targetCount) == regTop); setDebugLine(expr); // normally compileExpr sets up line info, but compileExprCall can be called directly + // try inlining the function + if (options.optimizationLevel >= 2 && !expr->self) + { + AstExprFunction* func = getFunctionExpr(expr->func); + Function* fi = func ? functions.find(func) : nullptr; + + if (fi && fi->canInline && + tryCompileInlinedCall(expr, func, target, targetCount, multRet, FInt::LuauCompileInlineThreshold, + FInt::LuauCompileInlineThresholdMaxBoost, FInt::LuauCompileInlineDepth)) + return; + + if (fi && !fi->canInline) + bytecode.addDebugRemark("inlining failed: complex constructs in function body"); + } + RegScope rs(this); unsigned int regCount = std::max(unsigned(1 + expr->self + expr->args.size), unsigned(targetCount)); @@ -749,7 +995,7 @@ struct Compiler { const Constant* c = constants.find(node); - if (!c) + if (!c || c->type == Constant::Type_Unknown) return -1; int cid = -1; @@ -1384,27 +1630,29 @@ struct Compiler { RegScope rs(this); + // note: cv may be invalidated by compileExpr* so we stop using it before calling compile recursively const Constant* cv = constants.find(expr->index); - if (cv && cv->type == Constant::Type_Number && double(int(cv->valueNumber)) == cv->valueNumber && cv->valueNumber >= 1 && - cv->valueNumber <= 256) + if (cv && cv->type == Constant::Type_Number && cv->valueNumber >= 1 && cv->valueNumber <= 256 && + double(int(cv->valueNumber)) == cv->valueNumber) { - uint8_t rt = compileExprAuto(expr->expr, rs); uint8_t i = uint8_t(int(cv->valueNumber) - 1); + uint8_t rt = compileExprAuto(expr->expr, rs); + setDebugLine(expr->index); bytecode.emitABC(LOP_GETTABLEN, target, rt, i); } else if (cv && cv->type == Constant::Type_String) { - uint8_t rt = compileExprAuto(expr->expr, rs); - BytecodeBuilder::StringRef iname = sref(cv->getString()); int32_t cid = bytecode.addConstantString(iname); if (cid < 0) CompileError::raise(expr->location, "Exceeded constant limit; simplify the code to compile"); + uint8_t rt = compileExprAuto(expr->expr, rs); + setDebugLine(expr->index); bytecode.emitABC(LOP_GETTABLEKS, target, rt, uint8_t(BytecodeBuilder::getStringHash(iname))); @@ -1550,8 +1798,9 @@ struct Compiler } else if (AstExprLocal* expr = node->as()) { - if (expr->upvalue) + if (FFlag::LuauCompileSupportInlining ? 
!isExprLocalReg(expr) : expr->upvalue) { + LUAU_ASSERT(expr->upvalue); uint8_t uid = getUpval(expr->local); bytecode.emitABC(LOP_GETUPVAL, target, uid, 0); @@ -1639,12 +1888,12 @@ struct Compiler // initializes target..target+targetCount-1 range using expressions from the list // if list has fewer expressions, and last expression is a call, we assume the call returns the rest of the values // if list has fewer expressions, and last expression isn't a call, we fill the rest with nil - // assumes target register range can be clobbered and is at the top of the register space - void compileExprListTop(const AstArray& list, uint8_t target, uint8_t targetCount) + // assumes target register range can be clobbered and is at the top of the register space if targetTop = true + void compileExprListTemp(const AstArray& list, uint8_t target, uint8_t targetCount, bool targetTop) { // we assume that target range is at the top of the register space and can be clobbered // this is what allows us to compile the last call expression - if it's a call - using targetTop=true - LUAU_ASSERT(unsigned(target + targetCount) == regTop); + LUAU_ASSERT(!targetTop || unsigned(target + targetCount) == regTop); if (list.size == targetCount) { @@ -1672,7 +1921,7 @@ struct Compiler if (AstExprCall* expr = last->as()) { - compileExprCall(expr, uint8_t(target + list.size - 1), uint8_t(targetCount - (list.size - 1)), /* targetTop= */ true); + compileExprCall(expr, uint8_t(target + list.size - 1), uint8_t(targetCount - (list.size - 1)), targetTop); } else if (AstExprVarargs* expr = last->as()) { @@ -1754,8 +2003,10 @@ struct Compiler if (AstExprLocal* expr = node->as()) { - if (expr->upvalue) + if (FFlag::LuauCompileSupportInlining ? !isExprLocalReg(expr) : expr->upvalue) { + LUAU_ASSERT(expr->upvalue); + LValue result = {LValue::Kind_Upvalue}; result.upval = getUpval(expr->local); result.location = node->location; @@ -1862,7 +2113,7 @@ struct Compiler bool isExprLocalReg(AstExpr* expr) { AstExprLocal* le = expr->as(); - if (!le || le->upvalue) + if (!le || (!FFlag::LuauCompileSupportInlining && le->upvalue)) return false; Local* l = locals.find(le->local); @@ -2069,6 +2320,23 @@ struct Compiler loops.pop_back(); } + void compileInlineReturn(AstStatReturn* stat, bool fallthrough) + { + setDebugLine(stat); // normally compileStat sets up line info, but compileInlineReturn can be called directly + + InlineFrame frame = inlineFrames.back(); + + compileExprListTemp(stat->list, frame.target, frame.targetCount, /* targetTop= */ false); + + if (!fallthrough) + { + size_t jumpLabel = bytecode.emitLabel(); + bytecode.emitAD(LOP_JUMP, 0, 0); + + inlineFrames.back().returnJumps.push_back(jumpLabel); + } + } + void compileStatReturn(AstStatReturn* stat) { RegScope rs(this); @@ -2127,16 +2395,137 @@ struct Compiler // note: allocReg in this case allocates into parent block register - note that we don't have RegScope here uint8_t vars = allocReg(stat, unsigned(stat->vars.size)); - compileExprListTop(stat->values, vars, uint8_t(stat->vars.size)); + compileExprListTemp(stat->values, vars, uint8_t(stat->vars.size), /* targetTop= */ true); for (size_t i = 0; i < stat->vars.size; ++i) pushLocal(stat->vars.data[i], uint8_t(vars + i)); } + int getConstantShort(AstExpr* expr) + { + const Constant* c = constants.find(expr); + + if (c && c->type == Constant::Type_Number) + { + double n = c->valueNumber; + + if (n >= -32767 && n <= 32767 && double(int(n)) == n) + return int(n); + } + + return INT_MIN; + } + + bool canUnrollForBody(AstStatFor* stat) + { 
+ struct CanUnrollVisitor : AstVisitor + { + bool result = true; + + bool visit(AstExpr* node) override + { + // functions may capture loop variable, and our upval handling doesn't handle elided variables (constant) + // TODO: we could remove this case if we changed function compilation to create temporary locals for constant upvalues + result = result && !node->is(); + return result; + } + + bool visit(AstStat* node) override + { + // while we can easily unroll nested loops, our cost model doesn't take unrolling into account so this can result in code explosion + // we also avoid continue/break since they introduce control flow across iterations + result = result && !node->is() && !node->is() && !node->is(); + return result; + } + }; + + CanUnrollVisitor canUnroll; + stat->body->visit(&canUnroll); + + return canUnroll.result; + } + + bool tryCompileUnrolledFor(AstStatFor* stat, int thresholdBase, int thresholdMaxBoost) + { + int from = getConstantShort(stat->from); + int to = getConstantShort(stat->to); + int step = stat->step ? getConstantShort(stat->step) : 1; + + // check that limits are reasonably small and trip count can be computed + if (from == INT_MIN || to == INT_MIN || step == INT_MIN || step == 0 || (step < 0 && to > from) || (step > 0 && to < from)) + { + bytecode.addDebugRemark("loop unroll failed: invalid iteration count"); + return false; + } + + if (!canUnrollForBody(stat)) + { + bytecode.addDebugRemark("loop unroll failed: unsupported loop body"); + return false; + } + + if (Variable* lv = variables.find(stat->var); lv && lv->written) + { + bytecode.addDebugRemark("loop unroll failed: mutable loop variable"); + return false; + } + + int tripCount = (to - from) / step + 1; + + if (tripCount > thresholdBase) + { + bytecode.addDebugRemark("loop unroll failed: too many iterations (%d)", tripCount); + return false; + } + + AstLocal* var = stat->var; + uint64_t costModel = modelCost(stat->body, &var, 1); + + // we use a dynamic cost threshold that's based on the fixed limit boosted by the cost advantage we gain due to unrolling + bool varc = true; + int unrolledCost = computeCost(costModel, &varc, 1) * tripCount; + int baselineCost = (computeCost(costModel, nullptr, 0) + 1) * tripCount; + int unrollProfit = (unrolledCost == 0) ? thresholdMaxBoost : std::min(thresholdMaxBoost, 100 * baselineCost / unrolledCost); + + int threshold = thresholdBase * unrollProfit / 100; + + if (unrolledCost > threshold) + { + bytecode.addDebugRemark( + "loop unroll failed: too expensive (iterations %d, cost %d, profit %.2fx)", tripCount, unrolledCost, double(unrollProfit) / 100); + return false; + } + + bytecode.addDebugRemark("loop unroll succeeded (iterations %d, cost %d, profit %.2fx)", tripCount, unrolledCost, double(unrollProfit) / 100); + + for (int i = from; step > 0 ? 
i <= to : i >= to; i += step) + { + // we need to re-fold constants in the loop body with the new value; this reuses computed constant values elsewhere in the tree + locstants[var].type = Constant::Type_Number; + locstants[var].valueNumber = i; + + foldConstants(constants, variables, locstants, stat); + + compileStat(stat->body); + } + + // clean up fold state in case we need to recompile - normally we compile the loop body once, but due to inlining we may need to do it again + locstants[var].type = Constant::Type_Unknown; + + foldConstants(constants, variables, locstants, stat); + + return true; + } + void compileStatFor(AstStatFor* stat) { RegScope rs(this); + // Optimization: small loops can be unrolled when it is profitable + if (options.optimizationLevel >= 2 && isConstant(stat->to) && isConstant(stat->from) && (!stat->step || isConstant(stat->step))) + if (tryCompileUnrolledFor(stat, FInt::LuauCompileLoopUnrollThreshold, FInt::LuauCompileLoopUnrollThresholdMaxBoost)) + return; + size_t oldLocals = localStack.size(); size_t oldJumps = loopJumps.size(); @@ -2210,12 +2599,17 @@ struct Compiler uint8_t regs = allocReg(stat, 3); // this puts initial values of (generator, state, index) into the loop registers - compileExprListTop(stat->values, regs, 3); + compileExprListTemp(stat->values, regs, 3, /* targetTop= */ true); - // for the general case, we will execute a CALL for every iteration that needs to evaluate "variables... = generator(state, index)" - // this requires at least extra 3 stack slots after index - // note that these stack slots overlap with the variables so we only need to reserve them to make sure stack frame is large enough - reserveReg(stat, 3); + // we don't need this because the extra stack space is just for calling the function with a loop protocol which is similar to calling + // metamethods - it should fit into the extra stack reservation + if (!FFlag::LuauCompileIterNoReserve) + { + // for the general case, we will execute a CALL for every iteration that needs to evaluate "variables... = generator(state, index)" + // this requires at least extra 3 stack slots after index + // note that these stack slots overlap with the variables so we only need to reserve them to make sure stack frame is large enough + reserveReg(stat, 3); + } // note that we reserve at least 2 variables; this allows our fast path to assume that we need 2 variables instead of 1 or 2 uint8_t vars = allocReg(stat, std::max(unsigned(stat->vars.size), 2u)); @@ -2224,7 +2618,7 @@ struct Compiler // Optimization: when we iterate through pairs/ipairs, we generate special bytecode that optimizes the traversal using internal iteration // index These instructions dynamically check if generator is equal to next/inext and bail out They assume that the generator produces 2 // variables, which is why we allocate at least 2 above (see vars assignment) - LuauOpcode skipOp = LOP_JUMP; + LuauOpcode skipOp = FFlag::LuauCompileIter ? LOP_FORGPREP : LOP_JUMP; LuauOpcode loopOp = LOP_FORGLOOP; if (options.optimizationLevel >= 1 && stat->vars.size <= 2) @@ -2241,7 +2635,7 @@ struct Compiler else if (builtin.isGlobal("pairs")) // for .. in pairs(t) { skipOp = LOP_FORGPREP_NEXT; - loopOp = LOP_FORGLOOP_NEXT; + loopOp = FFlag::LuauCompileIterNoPairs ? LOP_FORGLOOP : LOP_FORGLOOP_NEXT; } } else if (stat->values.size == 2) @@ -2251,7 +2645,7 @@ struct Compiler if (builtin.isGlobal("next")) // for .. in next,t { skipOp = LOP_FORGPREP_NEXT; - loopOp = LOP_FORGLOOP_NEXT; + loopOp = FFlag::LuauCompileIterNoPairs ? 
LOP_FORGLOOP : LOP_FORGLOOP_NEXT; } } } @@ -2388,10 +2782,10 @@ struct Compiler // compute values into temporaries uint8_t regs = allocReg(stat, unsigned(stat->vars.size)); - compileExprListTop(stat->values, regs, uint8_t(stat->vars.size)); + compileExprListTemp(stat->values, regs, uint8_t(stat->vars.size), /* targetTop= */ true); - // assign variables that have associated values; note that if we have fewer values than variables, we'll assign nil because compileExprListTop - // will generate nils + // assign variables that have associated values; note that if we have fewer values than variables, we'll assign nil because + // compileExprListTemp will generate nils for (size_t i = 0; i < stat->vars.size; ++i) { setDebugLine(stat->vars.data[i]); @@ -2549,7 +2943,10 @@ struct Compiler } else if (AstStatReturn* stat = node->as()) { - compileStatReturn(stat); + if (options.optimizationLevel >= 2 && !inlineFrames.empty()) + compileInlineReturn(stat, /* fallthrough= */ false); + else + compileStatReturn(stat); } else if (AstStatExpr* stat = node->as()) { @@ -2826,6 +3223,8 @@ struct Compiler : self(self) , functions(functions) { + // preallocate the result; this works around std::vector's inefficient growth policy for small arrays + functions.reserve(16); } bool visit(AstExprFunction* node) override @@ -2941,6 +3340,10 @@ struct Compiler { uint32_t id; std::vector upvals; + + uint64_t costModel = 0; + unsigned int stackSize = 0; + bool canInline = false; }; struct Local @@ -2970,6 +3373,16 @@ struct Compiler AstExpr* untilCondition; }; + struct InlineFrame + { + AstExprFunction* func; + + uint8_t target; + uint8_t targetCount; + + std::vector returnJumps; + }; + BytecodeBuilder& bytecode; CompileOptions options; @@ -2979,6 +3392,7 @@ struct Compiler DenseHashMap globals; DenseHashMap variables; DenseHashMap constants; + DenseHashMap locstants; DenseHashMap tableShapes; unsigned int regTop = 0; @@ -2991,6 +3405,7 @@ struct Compiler std::vector upvals; std::vector loopJumps; std::vector loops; + std::vector inlineFrames; }; void compileOrThrow(BytecodeBuilder& bytecode, AstStatBlock* root, const AstNameTable& names, const CompileOptions& options) @@ -3008,7 +3423,7 @@ void compileOrThrow(BytecodeBuilder& bytecode, AstStatBlock* root, const AstName if (options.optimizationLevel >= 1) { // this pass analyzes constantness of expressions - foldConstants(compiler.constants, compiler.variables, root); + foldConstants(compiler.constants, compiler.variables, compiler.locstants, root); // this pass analyzes table assignments to estimate table shapes for initially empty tables predictTableShapes(compiler.tableShapes, root); diff --git a/luau/Compiler/src/ConstantFolding.cpp b/luau/Compiler/src/ConstantFolding.cpp index 60a7c16..52ece73 100644 --- a/luau/Compiler/src/ConstantFolding.cpp +++ b/luau/Compiler/src/ConstantFolding.cpp @@ -3,6 +3,8 @@ #include +LUAU_FASTFLAG(LuauCompileSupportInlining) + namespace Luau { namespace Compile @@ -191,13 +193,13 @@ struct ConstantVisitor : AstVisitor { DenseHashMap& constants; DenseHashMap& variables; + DenseHashMap& locals; - DenseHashMap locals; - - ConstantVisitor(DenseHashMap& constants, DenseHashMap& variables) + ConstantVisitor( + DenseHashMap& constants, DenseHashMap& variables, DenseHashMap& locals) : constants(constants) , variables(variables) - , locals(nullptr) + , locals(locals) { } @@ -290,7 +292,8 @@ struct ConstantVisitor : AstVisitor Constant la = analyze(expr->left); Constant ra = analyze(expr->right); - if (la.type != Constant::Type_Unknown && ra.type 
!= Constant::Type_Unknown) + // note: ra doesn't need to be constant to fold and/or + if (la.type != Constant::Type_Unknown) foldBinary(result, expr->op, la, ra); } else if (AstExprTypeAssertion* expr = node->as()) @@ -313,12 +316,35 @@ struct ConstantVisitor : AstVisitor LUAU_ASSERT(!"Unknown expression type"); } - if (result.type != Constant::Type_Unknown) - constants[node] = result; + recordConstant(constants, node, result); return result; } + template + void recordConstant(DenseHashMap& map, T key, const Constant& value) + { + if (value.type != Constant::Type_Unknown) + map[key] = value; + else if (!FFlag::LuauCompileSupportInlining) + ; + else if (Constant* old = map.find(key)) + old->type = Constant::Type_Unknown; + } + + void recordValue(AstLocal* local, const Constant& value) + { + // note: we rely on trackValues to have been run before us + Variable* v = variables.find(local); + LUAU_ASSERT(v); + + if (!v->written) + { + v->constant = (value.type != Constant::Type_Unknown); + recordConstant(locals, local, value); + } + } + bool visit(AstExpr* node) override { // note: we short-circuit the visitor traversal through any expression trees by returning false @@ -335,18 +361,7 @@ struct ConstantVisitor : AstVisitor { Constant arg = analyze(node->values.data[i]); - if (arg.type != Constant::Type_Unknown) - { - // note: we rely on trackValues to have been run before us - Variable* v = variables.find(node->vars.data[i]); - LUAU_ASSERT(v); - - if (!v->written) - { - locals[node->vars.data[i]] = arg; - v->constant = true; - } - } + recordValue(node->vars.data[i], arg); } if (node->vars.size > node->values.size) @@ -360,15 +375,8 @@ struct ConstantVisitor : AstVisitor { for (size_t i = node->values.size; i < node->vars.size; ++i) { - // note: we rely on trackValues to have been run before us - Variable* v = variables.find(node->vars.data[i]); - LUAU_ASSERT(v); - - if (!v->written) - { - locals[node->vars.data[i]].type = Constant::Type_Nil; - v->constant = true; - } + Constant nil = {Constant::Type_Nil}; + recordValue(node->vars.data[i], nil); } } } @@ -384,9 +392,10 @@ struct ConstantVisitor : AstVisitor } }; -void foldConstants(DenseHashMap& constants, DenseHashMap& variables, AstNode* root) +void foldConstants(DenseHashMap& constants, DenseHashMap& variables, + DenseHashMap& locals, AstNode* root) { - ConstantVisitor visitor{constants, variables}; + ConstantVisitor visitor{constants, variables, locals}; root->visit(&visitor); } diff --git a/luau/Compiler/src/ConstantFolding.h b/luau/Compiler/src/ConstantFolding.h index c0e6353..0a995d7 100644 --- a/luau/Compiler/src/ConstantFolding.h +++ b/luau/Compiler/src/ConstantFolding.h @@ -42,7 +42,8 @@ struct Constant } }; -void foldConstants(DenseHashMap& constants, DenseHashMap& variables, AstNode* root); +void foldConstants(DenseHashMap& constants, DenseHashMap& variables, + DenseHashMap& locals, AstNode* root); } // namespace Compile } // namespace Luau diff --git a/luau/Compiler/src/CostModel.cpp b/luau/Compiler/src/CostModel.cpp new file mode 100644 index 0000000..9afd09f --- /dev/null +++ b/luau/Compiler/src/CostModel.cpp @@ -0,0 +1,258 @@ +// This file is part of the Luau programming language and is licensed under MIT License; see LICENSE.txt for details +#include "CostModel.h" + +#include "Luau/Common.h" +#include "Luau/DenseHash.h" + +namespace Luau +{ +namespace Compile +{ + +inline uint64_t parallelAddSat(uint64_t x, uint64_t y) +{ + uint64_t s = x + y; + uint64_t m = s & 0x8080808080808080ull; // saturation mask + + return (s ^ m) | (m - 
(m >> 7)); +} + +struct Cost +{ + static const uint64_t kLiteral = ~0ull; + + // cost model: 8 bytes, where first byte is the baseline cost, and the next 7 bytes are discounts for when variable #i is constant + uint64_t model; + // constant mask: 8-byte 0xff mask; equal to all ff's for literals, for variables only byte #i (1+) is set to align with model + uint64_t constant; + + Cost(int cost = 0, uint64_t constant = 0) + : model(cost < 0x7f ? cost : 0x7f) + , constant(constant) + { + } + + Cost operator+(const Cost& other) const + { + Cost result; + result.model = parallelAddSat(model, other.model); + return result; + } + + Cost& operator+=(const Cost& other) + { + model = parallelAddSat(model, other.model); + constant = 0; + return *this; + } + + static Cost fold(const Cost& x, const Cost& y) + { + uint64_t newmodel = parallelAddSat(x.model, y.model); + uint64_t newconstant = x.constant & y.constant; + + // the extra cost for folding is 1; the discount is 1 for the variable that is shared by x&y (or whichever one is used in x/y if the other is + // literal) + uint64_t extra = (newconstant == kLiteral) ? 0 : (1 | (0x0101010101010101ull & newconstant)); + + Cost result; + result.model = parallelAddSat(newmodel, extra); + result.constant = newconstant; + + return result; + } +}; + +struct CostVisitor : AstVisitor +{ + DenseHashMap vars; + Cost result; + + CostVisitor() + : vars(nullptr) + { + } + + Cost model(AstExpr* node) + { + if (AstExprGroup* expr = node->as()) + { + return model(expr->expr); + } + else if (node->is() || node->is() || node->is() || + node->is()) + { + return Cost(0, Cost::kLiteral); + } + else if (AstExprLocal* expr = node->as()) + { + const uint64_t* i = vars.find(expr->local); + + return Cost(0, i ? *i : 0); // locals typically don't require extra instructions to compute + } + else if (node->is()) + { + return 1; + } + else if (node->is()) + { + return 3; + } + else if (AstExprCall* expr = node->as()) + { + Cost cost = 3; + cost += model(expr->func); + + for (size_t i = 0; i < expr->args.size; ++i) + { + Cost ac = model(expr->args.data[i]); + // for constants/locals we still need to copy them to the argument list + cost += ac.model == 0 ? 
Cost(1) : ac; + } + + return cost; + } + else if (AstExprIndexName* expr = node->as()) + { + return model(expr->expr) + 1; + } + else if (AstExprIndexExpr* expr = node->as()) + { + return model(expr->expr) + model(expr->index) + 1; + } + else if (AstExprFunction* expr = node->as()) + { + return 10; // high baseline cost due to allocation + } + else if (AstExprTable* expr = node->as()) + { + Cost cost = 10; // high baseline cost due to allocation + + for (size_t i = 0; i < expr->items.size; ++i) + { + const AstExprTable::Item& item = expr->items.data[i]; + + if (item.key) + cost += model(item.key); + + cost += model(item.value); + cost += 1; + } + + return cost; + } + else if (AstExprUnary* expr = node->as()) + { + return Cost::fold(model(expr->expr), Cost(0, Cost::kLiteral)); + } + else if (AstExprBinary* expr = node->as()) + { + return Cost::fold(model(expr->left), model(expr->right)); + } + else if (AstExprTypeAssertion* expr = node->as()) + { + return model(expr->expr); + } + else if (AstExprIfElse* expr = node->as()) + { + return model(expr->condition) + model(expr->trueExpr) + model(expr->falseExpr) + 2; + } + else + { + LUAU_ASSERT(!"Unknown expression type"); + return {}; + } + } + + void assign(AstExpr* expr) + { + // variable assignments reset variable mask, so that further uses of this variable aren't discounted + // this doesn't work perfectly with backwards control flow like loops, but is good enough for a single pass + if (AstExprLocal* lv = expr->as()) + if (uint64_t* i = vars.find(lv->local)) + *i = 0; + } + + bool visit(AstExpr* node) override + { + // note: we short-circuit the visitor traversal through any expression trees by returning false + // recursive traversal is happening inside model() which makes it easier to get the resulting value of the subexpression + result += model(node); + + return false; + } + + bool visit(AstStat* node) override + { + if (node->is()) + result += 2; + else if (node->is() || node->is() || node->is() || node->is()) + result += 2; + else if (node->is() || node->is()) + result += 1; + + return true; + } + + bool visit(AstStatLocal* node) override + { + for (size_t i = 0; i < node->values.size; ++i) + { + Cost arg = model(node->values.data[i]); + + // propagate constant mask from expression through variables + if (arg.constant && i < node->vars.size) + vars[node->vars.data[i]] = arg.constant; + + result += arg; + } + + return false; + } + + bool visit(AstStatAssign* node) override + { + for (size_t i = 0; i < node->vars.size; ++i) + assign(node->vars.data[i]); + + return true; + } + + bool visit(AstStatCompoundAssign* node) override + { + assign(node->var); + + // if lhs is not a local, setting it requires an extra table operation + result += node->var->is() ? 
1 : 2; + + return true; + } +}; + +uint64_t modelCost(AstNode* root, AstLocal* const* vars, size_t varCount) +{ + CostVisitor visitor; + for (size_t i = 0; i < varCount && i < 7; ++i) + visitor.vars[vars[i]] = 0xffull << (i * 8 + 8); + + root->visit(&visitor); + + return visitor.result.model; +} + +int computeCost(uint64_t model, const bool* varsConst, size_t varCount) +{ + int cost = int(model & 0x7f); + + // don't apply discounts to what is likely a saturated sum + if (cost == 0x7f) + return cost; + + for (size_t i = 0; i < varCount && i < 7; ++i) + cost -= int((model >> (i * 8 + 8)) & 0x7f) * varsConst[i]; + + return cost; +} + +} // namespace Compile +} // namespace Luau diff --git a/luau/Compiler/src/CostModel.h b/luau/Compiler/src/CostModel.h new file mode 100644 index 0000000..c27861e --- /dev/null +++ b/luau/Compiler/src/CostModel.h @@ -0,0 +1,18 @@ +// This file is part of the Luau programming language and is licensed under MIT License; see LICENSE.txt for details +#pragma once + +#include "Luau/Ast.h" + +namespace Luau +{ +namespace Compile +{ + +// cost model: 8 bytes, where first byte is the baseline cost, and the next 7 bytes are discounts for when variable #i is constant +uint64_t modelCost(AstNode* root, AstLocal* const* vars, size_t varCount); + +// cost is computed as B - sum(Di * Ci), where B is baseline cost, Di is the discount for each variable and Ci is 1 when variable #i is constant +int computeCost(uint64_t model, const bool* varsConst, size_t varCount); + +} // namespace Compile +} // namespace Luau diff --git a/luau/VM/include/lua.h b/luau/VM/include/lua.h index d08b73e..c3ebadb 100644 --- a/luau/VM/include/lua.h +++ b/luau/VM/include/lua.h @@ -299,7 +299,7 @@ LUA_API uintptr_t lua_encodepointer(lua_State* L, uintptr_t p); LUA_API double lua_clock(); -LUA_API void lua_setuserdatadtor(lua_State* L, int tag, void (*dtor)(void*)); +LUA_API void lua_setuserdatadtor(lua_State* L, int tag, void (*dtor)(lua_State*, void*)); LUA_API void lua_clonefunction(lua_State* L, int idx); diff --git a/luau/VM/src/lapi.cpp b/luau/VM/src/lapi.cpp index 46b1093..f8baefa 100644 --- a/luau/VM/src/lapi.cpp +++ b/luau/VM/src/lapi.cpp @@ -14,6 +14,8 @@ #include +LUAU_FASTFLAG(LuauGcWorkTrackFix) + const char* lua_ident = "$Lua: Lua 5.1.4 Copyright (C) 1994-2008 Lua.org, PUC-Rio $\n" "$Authors: R. Ierusalimschy, L. H. de Figueiredo & W. Celes $\n" "$URL: www.lua.org $\n"; @@ -1050,6 +1052,7 @@ int lua_gc(lua_State* L, int what, int data) { size_t prevthreshold = g->GCthreshold; size_t amount = (cast_to(size_t, data) << 10); + ptrdiff_t oldcredit = g->gcstate == GCSpause ? 0 : g->GCthreshold - g->totalbytes; // temporarily adjust the threshold so that we can perform GC work if (amount <= g->totalbytes) @@ -1069,9 +1072,9 @@ int lua_gc(lua_State* L, int what, int data) while (g->GCthreshold <= g->totalbytes) { - luaC_step(L, false); + size_t stepsize = luaC_step(L, false); - actualwork += g->gcstepsize; + actualwork += FFlag::LuauGcWorkTrackFix ? stepsize : g->gcstepsize; if (g->gcstate == GCSpause) { /* end of cycle? 
*/ @@ -1107,11 +1110,20 @@ int lua_gc(lua_State* L, int what, int data) // if cycle hasn't finished, advance threshold forward for the amount of extra work performed if (g->gcstate != GCSpause) { - // if a new cycle was triggered by explicit step, we ignore old threshold as that shows an incorrect 'credit' of GC work - if (waspaused) - g->GCthreshold = g->totalbytes + actualwork; + if (FFlag::LuauGcWorkTrackFix) + { + // if a new cycle was triggered by explicit step, old 'credit' of GC work is 0 + ptrdiff_t newthreshold = g->totalbytes + actualwork + oldcredit; + g->GCthreshold = newthreshold < 0 ? 0 : newthreshold; + } else - g->GCthreshold = prevthreshold + actualwork; + { + // if a new cycle was triggered by explicit step, we ignore old threshold as that shows an incorrect 'credit' of GC work + if (waspaused) + g->GCthreshold = g->totalbytes + actualwork; + else + g->GCthreshold = prevthreshold + actualwork; + } } break; } @@ -1258,7 +1270,7 @@ const char* lua_setupvalue(lua_State* L, int funcindex, int n) L->top--; setobj(L, val, L->top); luaC_barrier(L, clvalue(fi), L->top); - luaC_upvalbarrier(L, NULL, val); + luaC_upvalbarrier(L, cast_to(UpVal*, NULL), val); } return name; } @@ -1311,7 +1323,7 @@ void lua_unref(lua_State* L, int ref) return; } -void lua_setuserdatadtor(lua_State* L, int tag, void (*dtor)(void*)) +void lua_setuserdatadtor(lua_State* L, int tag, void (*dtor)(lua_State*, void*)) { api_check(L, unsigned(tag) < LUA_UTAG_LIMIT); L->global->udatagc[tag] = dtor; diff --git a/luau/VM/src/lbuiltins.cpp b/luau/VM/src/lbuiltins.cpp index 718d387..6014919 100644 --- a/luau/VM/src/lbuiltins.cpp +++ b/luau/VM/src/lbuiltins.cpp @@ -15,6 +15,8 @@ #include #endif +LUAU_FASTFLAGVARIABLE(LuauFixBuiltinsStackLimit, false) + // luauF functions implement FASTCALL instruction that performs a direct execution of some builtin functions from the VM // The rule of thumb is that FASTCALL functions can not call user code, yield, fail, or reallocate stack. 
// If types of the arguments mismatch, luauF_* needs to return -1 and the execution will fall back to the usual call path @@ -1003,7 +1005,7 @@ static int luauF_tunpack(lua_State* L, StkId res, TValue* arg0, int nresults, St else if (nparams == 3 && ttisnumber(args) && ttisnumber(args + 1) && nvalue(args) == 1.0) n = int(nvalue(args + 1)); - if (n >= 0 && n <= t->sizearray && cast_int(L->stack_last - res) >= n) + if (n >= 0 && n <= t->sizearray && cast_int(L->stack_last - res) >= n && (!FFlag::LuauFixBuiltinsStackLimit || n + nparams <= LUAI_MAXCSTACK)) { TValue* array = t->array; for (int i = 0; i < n; ++i) diff --git a/luau/VM/src/lfunc.h b/luau/VM/src/lfunc.h index 8047ceb..a260d00 100644 --- a/luau/VM/src/lfunc.h +++ b/luau/VM/src/lfunc.h @@ -14,6 +14,6 @@ LUAI_FUNC UpVal* luaF_findupval(lua_State* L, StkId level); LUAI_FUNC void luaF_close(lua_State* L, StkId level); LUAI_FUNC void luaF_freeproto(lua_State* L, Proto* f, struct lua_Page* page); LUAI_FUNC void luaF_freeclosure(lua_State* L, Closure* c, struct lua_Page* page); -void luaF_unlinkupval(UpVal* uv); +LUAI_FUNC void luaF_unlinkupval(UpVal* uv); LUAI_FUNC void luaF_freeupval(lua_State* L, UpVal* uv, struct lua_Page* page); LUAI_FUNC const LocVar* luaF_getlocal(const Proto* func, int local_number, int pc); diff --git a/luau/VM/src/lgc.cpp b/luau/VM/src/lgc.cpp index 8fc930d..e7b73fe 100644 --- a/luau/VM/src/lgc.cpp +++ b/luau/VM/src/lgc.cpp @@ -13,9 +13,10 @@ #include -#define GC_SWEEPMAX 40 -#define GC_SWEEPCOST 10 -#define GC_SWEEPPAGESTEPCOST 4 +LUAU_FASTFLAGVARIABLE(LuauGcWorkTrackFix, false) +LUAU_FASTFLAGVARIABLE(LuauGcSweepCostFix, false) + +#define GC_SWEEPPAGESTEPCOST (FFlag::LuauGcSweepCostFix ? 16 : 4) #define GC_INTERRUPT(state) \ { \ @@ -64,7 +65,7 @@ static void recordGcStateStep(global_State* g, int startgcstate, double seconds, case GCSpropagate: case GCSpropagateagain: g->gcmetrics.currcycle.marktime += seconds; - g->gcmetrics.currcycle.markrequests += g->gcstepsize; + g->gcmetrics.currcycle.markwork += work; if (assist) g->gcmetrics.currcycle.markassisttime += seconds; @@ -74,7 +75,7 @@ static void recordGcStateStep(global_State* g, int startgcstate, double seconds, break; case GCSsweep: g->gcmetrics.currcycle.sweeptime += seconds; - g->gcmetrics.currcycle.sweeprequests += g->gcstepsize; + g->gcmetrics.currcycle.sweepwork += work; if (assist) g->gcmetrics.currcycle.sweepassisttime += seconds; @@ -87,13 +88,11 @@ static void recordGcStateStep(global_State* g, int startgcstate, double seconds, { g->gcmetrics.stepassisttimeacc += seconds; g->gcmetrics.currcycle.assistwork += work; - g->gcmetrics.currcycle.assistrequests += g->gcstepsize; } else { g->gcmetrics.stepexplicittimeacc += seconds; g->gcmetrics.currcycle.explicitwork += work; - g->gcmetrics.currcycle.explicitrequests += g->gcstepsize; } } @@ -878,11 +877,11 @@ static size_t getheaptrigger(global_State* g, size_t heapgoal) return heaptrigger < int64_t(g->totalbytes) ? g->totalbytes : (heaptrigger > int64_t(heapgoal) ? heapgoal : size_t(heaptrigger)); } -void luaC_step(lua_State* L, bool assist) +size_t luaC_step(lua_State* L, bool assist) { global_State* g = L->global; - int lim = (g->gcstepsize / 100) * g->gcstepmul; /* how much to work */ + int lim = FFlag::LuauGcWorkTrackFix ? 
g->gcstepsize * g->gcstepmul / 100 : (g->gcstepsize / 100) * g->gcstepmul; /* how much to work */ LUAU_ASSERT(g->totalbytes >= g->GCthreshold); size_t debt = g->totalbytes - g->GCthreshold; @@ -902,12 +901,13 @@ void luaC_step(lua_State* L, bool assist) int lastgcstate = g->gcstate; size_t work = gcstep(L, lim); - (void)work; #ifdef LUAI_GCMETRICS recordGcStateStep(g, lastgcstate, lua_clock() - lasttimestamp, assist, work); #endif + size_t actualstepsize = work * 100 / g->gcstepmul; + // at the end of the last cycle if (g->gcstate == GCSpause) { @@ -927,14 +927,16 @@ void luaC_step(lua_State* L, bool assist) } else { - g->GCthreshold = g->totalbytes + g->gcstepsize; + g->GCthreshold = g->totalbytes + (FFlag::LuauGcWorkTrackFix ? actualstepsize : g->gcstepsize); // compensate if GC is "behind schedule" (has some debt to pay) - if (g->GCthreshold > debt) + if (FFlag::LuauGcWorkTrackFix ? g->GCthreshold >= debt : g->GCthreshold > debt) g->GCthreshold -= debt; } GC_INTERRUPT(lastgcstate); + + return actualstepsize; } void luaC_fullgc(lua_State* L) diff --git a/luau/VM/src/lgc.h b/luau/VM/src/lgc.h index dcd070b..797284a 100644 --- a/luau/VM/src/lgc.h +++ b/luau/VM/src/lgc.h @@ -120,7 +120,7 @@ #define luaC_upvalbarrier(L, uv, tv) \ { \ - if (iscollectable(tv) && iswhite(gcvalue(tv)) && (!(uv) || ((UpVal*)uv)->v != &((UpVal*)uv)->u.value)) \ + if (iscollectable(tv) && iswhite(gcvalue(tv)) && (!(uv) || (uv)->v != &(uv)->u.value)) \ luaC_barrierupval(L, gcvalue(tv)); \ } @@ -133,7 +133,7 @@ #define luaC_init(L, o, tt) luaC_initobj(L, cast_to(GCObject*, (o)), tt) LUAI_FUNC void luaC_freeall(lua_State* L); -LUAI_FUNC void luaC_step(lua_State* L, bool assist); +LUAI_FUNC size_t luaC_step(lua_State* L, bool assist); LUAI_FUNC void luaC_fullgc(lua_State* L); LUAI_FUNC void luaC_initobj(lua_State* L, GCObject* o, uint8_t tt); LUAI_FUNC void luaC_initupval(lua_State* L, UpVal* uv); diff --git a/luau/VM/src/lstate.h b/luau/VM/src/lstate.h index e7c3737..423514a 100644 --- a/luau/VM/src/lstate.h +++ b/luau/VM/src/lstate.h @@ -106,7 +106,7 @@ struct GCCycleMetrics double markassisttime = 0.0; double markmaxexplicittime = 0.0; size_t markexplicitsteps = 0; - size_t markrequests = 0; + size_t markwork = 0; double atomicstarttimestamp = 0.0; size_t atomicstarttotalsizebytes = 0; @@ -122,10 +122,7 @@ struct GCCycleMetrics double sweepassisttime = 0.0; double sweepmaxexplicittime = 0.0; size_t sweepexplicitsteps = 0; - size_t sweeprequests = 0; - - size_t assistrequests = 0; - size_t explicitrequests = 0; + size_t sweepwork = 0; size_t assistwork = 0; size_t explicitwork = 0; @@ -203,7 +200,7 @@ typedef struct global_State uint64_t rngstate; /* PCG random number generator state */ uint64_t ptrenckey[4]; /* pointer encoding key for display */ - void (*udatagc[LUA_UTAG_LIMIT])(void*); /* for each userdata tag, a gc callback to be called immediately before freeing memory */ + void (*udatagc[LUA_UTAG_LIMIT])(lua_State*, void*); /* for each userdata tag, a gc callback to be called immediately before freeing memory */ lua_Callbacks cb; diff --git a/luau/VM/src/ltable.cpp b/luau/VM/src/ltable.cpp index 1c75c0b..8251b51 100644 --- a/luau/VM/src/ltable.cpp +++ b/luau/VM/src/ltable.cpp @@ -33,9 +33,6 @@ #include -LUAU_FASTFLAGVARIABLE(LuauTableRehashRework, false) -LUAU_FASTFLAGVARIABLE(LuauTableNewBoundary, false) - // max size of both array and hash part is 2^MAXBITS #define MAXBITS 26 #define MAXSIZE (1 << MAXBITS) @@ -390,6 +387,8 @@ static void resize(lua_State* L, Table* t, int nasize, int nhsize) setarrayvector(L, 
t, nasize); /* create new hash part with appropriate size */ setnodevector(L, t, nhsize); + /* used for the migration check at the end */ + LuaNode* nnew = t->node; if (nasize < oldasize) { /* array part must shrink? */ t->sizearray = nasize; @@ -398,57 +397,51 @@ static void resize(lua_State* L, Table* t, int nasize, int nhsize) { if (!ttisnil(&t->array[i])) { - if (FFlag::LuauTableRehashRework) - { - TValue ok; - setnvalue(&ok, cast_num(i + 1)); - setobjt2t(L, newkey(L, t, &ok), &t->array[i]); - } - else - { - setobjt2t(L, luaH_setnum(L, t, i + 1), &t->array[i]); - } + TValue ok; + setnvalue(&ok, cast_num(i + 1)); + setobjt2t(L, newkey(L, t, &ok), &t->array[i]); } } /* shrink array */ luaM_reallocarray(L, t->array, oldasize, nasize, TValue, t->memcat); } + /* used for the migration check at the end */ + TValue* anew = t->array; /* re-insert elements from hash part */ - if (FFlag::LuauTableRehashRework) + for (int i = twoto(oldhsize) - 1; i >= 0; i--) { - for (int i = twoto(oldhsize) - 1; i >= 0; i--) + LuaNode* old = nold + i; + if (!ttisnil(gval(old))) { - LuaNode* old = nold + i; - if (!ttisnil(gval(old))) - { - TValue ok; - getnodekey(L, &ok, old); - setobjt2t(L, arrayornewkey(L, t, &ok), gval(old)); - } - } - } - else - { - for (int i = twoto(oldhsize) - 1; i >= 0; i--) - { - LuaNode* old = nold + i; - if (!ttisnil(gval(old))) - { - TValue ok; - getnodekey(L, &ok, old); - setobjt2t(L, luaH_set(L, t, &ok), gval(old)); - } + TValue ok; + getnodekey(L, &ok, old); + setobjt2t(L, arrayornewkey(L, t, &ok), gval(old)); } } + /* make sure we haven't recursively rehashed during element migration */ + LUAU_ASSERT(nnew == t->node); + LUAU_ASSERT(anew == t->array); + if (nold != dummynode) luaM_freearray(L, nold, twoto(oldhsize), LuaNode, t->memcat); /* free old array */ } +static int adjustasize(Table* t, int size, const TValue* ek) +{ + bool tbound = t->node != dummynode || size < t->sizearray; + int ekindex = ek && ttisnumber(ek) ? arrayindex(nvalue(ek)) : -1; + /* move the array size up until the boundary is guaranteed to be inside the array part */ + while (size + 1 == ekindex || (tbound && !ttisnil(luaH_getnum(t, size + 1)))) + size++; + return size; +} + void luaH_resizearray(lua_State* L, Table* t, int nasize) { int nsize = (t->node == dummynode) ? 0 : sizenode(t); - resize(L, t, nasize, nsize); + int asize = adjustasize(t, nasize, NULL); + resize(L, t, asize, nsize); } void luaH_resizehash(lua_State* L, Table* t, int nhsize) @@ -470,21 +463,11 @@ static void rehash(lua_State* L, Table* t, const TValue* ek) totaluse++; /* compute new size for array part */ int na = computesizes(nums, &nasize); + int nh = totaluse - na; /* enforce the boundary invariant; for performance, only do hash lookups if we must */ - if (FFlag::LuauTableNewBoundary) - { - bool tbound = t->node != dummynode || nasize < t->sizearray; - int ekindex = ttisnumber(ek) ? 
arrayindex(nvalue(ek)) : -1; - /* move the array size up until the boundary is guaranteed to be inside the array part */ - while (nasize + 1 == ekindex || (tbound && !ttisnil(luaH_getnum(t, nasize + 1)))) - { - nasize++; - na++; - } - } + nasize = adjustasize(t, nasize, ek); /* resize the table to new computed sizes */ - LUAU_ASSERT(na <= totaluse); - resize(L, t, nasize, totaluse - na); + resize(L, t, nasize, nh); } /* @@ -544,11 +527,11 @@ static LuaNode* getfreepos(Table* t) static TValue* newkey(lua_State* L, Table* t, const TValue* key) { /* enforce boundary invariant */ - if (FFlag::LuauTableNewBoundary && ttisnumber(key) && nvalue(key) == t->sizearray + 1) + if (ttisnumber(key) && nvalue(key) == t->sizearray + 1) { rehash(L, t, key); /* grow table */ - // after rehash, numeric keys might be located in the new array part, but won't be found in the node part + /* after rehash, numeric keys might be located in the new array part, but won't be found in the node part */ return arrayornewkey(L, t, key); } @@ -560,15 +543,8 @@ static TValue* newkey(lua_State* L, Table* t, const TValue* key) { /* cannot find a free place? */ rehash(L, t, key); /* grow table */ - if (!FFlag::LuauTableRehashRework) - { - return luaH_set(L, t, key); /* re-insert key into grown table */ - } - else - { - // after rehash, numeric keys might be located in the new array part, but won't be found in the node part - return arrayornewkey(L, t, key); - } + /* after rehash, numeric keys might be located in the new array part, but won't be found in the node part */ + return arrayornewkey(L, t, key); } LUAU_ASSERT(n != dummynode); TValue mk; @@ -733,37 +709,6 @@ TValue* luaH_setstr(lua_State* L, Table* t, TString* key) } } -static LUAU_NOINLINE int unbound_search(Table* t, unsigned int j) -{ - LUAU_ASSERT(!FFlag::LuauTableNewBoundary); - unsigned int i = j; /* i is zero or a present index */ - j++; - /* find `i' and `j' such that i is present and j is not */ - while (!ttisnil(luaH_getnum(t, j))) - { - i = j; - j *= 2; - if (j > cast_to(unsigned int, INT_MAX)) - { /* overflow? */ - /* table was built with bad purposes: resort to linear search */ - i = 1; - while (!ttisnil(luaH_getnum(t, i))) - i++; - return i - 1; - } - } - /* now do a binary search between them */ - while (j - i > 1) - { - unsigned int m = (i + j) / 2; - if (ttisnil(luaH_getnum(t, m))) - j = m; - else - i = m; - } - return i; -} - static int updateaboundary(Table* t, int boundary) { if (boundary < t->sizearray && ttisnil(&t->array[boundary - 1])) @@ -820,17 +765,12 @@ int luaH_getn(Table* t) maybesetaboundary(t, boundary); return boundary; } - else if (FFlag::LuauTableNewBoundary) + else { /* validate boundary invariant */ LUAU_ASSERT(t->node == dummynode || ttisnil(luaH_getnum(t, j + 1))); return j; } - /* else must find a boundary in hash part */ - else if (t->node == dummynode) /* hash part is empty? */ - return j; /* that is easy... */ - else - return unbound_search(t, j); } Table* luaH_clone(lua_State* L, Table* tt) diff --git a/luau/VM/src/ltablib.cpp b/luau/VM/src/ltablib.cpp index 241a99e..9c1f387 100644 --- a/luau/VM/src/ltablib.cpp +++ b/luau/VM/src/ltablib.cpp @@ -199,9 +199,9 @@ static int tmove(lua_State* L) int tt = !lua_isnoneornil(L, 5) ? 
5 : 1; /* destination table */ luaL_checktype(L, tt, LUA_TTABLE); - void (*telemetrycb)(lua_State* L, int f, int e, int t, int nf, int nt) = lua_table_move_telemetry; + void (*telemetrycb)(lua_State * L, int f, int e, int t, int nf, int nt) = lua_table_move_telemetry; - if (DFFlag::LuauTableMoveTelemetry2 && telemetrycb) + if (DFFlag::LuauTableMoveTelemetry2 && telemetrycb && e >= f) { int nf = lua_objlen(L, 1); int nt = lua_objlen(L, tt); diff --git a/luau/VM/src/ltm.cpp b/luau/VM/src/ltm.cpp index 106efb2..9b99506 100644 --- a/luau/VM/src/ltm.cpp +++ b/luau/VM/src/ltm.cpp @@ -37,6 +37,8 @@ const char* const luaT_eventname[] = { "__newindex", "__mode", "__namecall", + "__call", + "__iter", "__eq", @@ -54,13 +56,13 @@ const char* const luaT_eventname[] = { "__lt", "__le", "__concat", - "__call", "__type", }; // clang-format on static_assert(sizeof(luaT_typenames) / sizeof(luaT_typenames[0]) == LUA_T_COUNT, "luaT_typenames size mismatch"); static_assert(sizeof(luaT_eventname) / sizeof(luaT_eventname[0]) == TM_N, "luaT_eventname size mismatch"); +static_assert(TM_EQ < 8, "fasttm optimization stores a bitfield with metamethods in a byte"); void luaT_init(lua_State* L) { diff --git a/luau/VM/src/ltm.h b/luau/VM/src/ltm.h index 0e4e915..e1b95c2 100644 --- a/luau/VM/src/ltm.h +++ b/luau/VM/src/ltm.h @@ -16,6 +16,8 @@ typedef enum TM_NEWINDEX, TM_MODE, TM_NAMECALL, + TM_CALL, + TM_ITER, TM_EQ, /* last tag method with `fast' access */ @@ -33,7 +35,6 @@ typedef enum TM_LT, TM_LE, TM_CONCAT, - TM_CALL, TM_TYPE, TM_N /* number of elements in the enum */ diff --git a/luau/VM/src/ludata.cpp b/luau/VM/src/ludata.cpp index 819d186..2815268 100644 --- a/luau/VM/src/ludata.cpp +++ b/luau/VM/src/ludata.cpp @@ -22,14 +22,21 @@ Udata* luaU_newudata(lua_State* L, size_t s, int tag) void luaU_freeudata(lua_State* L, Udata* u, lua_Page* page) { - void (*dtor)(void*) = nullptr; if (u->tag < LUA_UTAG_LIMIT) + { + void (*dtor)(lua_State*, void*) = nullptr; dtor = L->global->udatagc[u->tag]; + if (dtor) + dtor(L, u->data); + } else if (u->tag == UTAG_IDTOR) + { + void (*dtor)(void*) = nullptr; memcpy(&dtor, &u->data + u->len - sizeof(dtor), sizeof(dtor)); + if (dtor) + dtor(u->data); + } - if (dtor) - dtor(u->data); luaM_freegco(L, u, sizeudata(u->len), u->memcat, page); } diff --git a/luau/VM/src/lvmexecute.cpp b/luau/VM/src/lvmexecute.cpp index 34949ef..3c7c276 100644 --- a/luau/VM/src/lvmexecute.cpp +++ b/luau/VM/src/lvmexecute.cpp @@ -16,7 +16,10 @@ #include -LUAU_FASTFLAG(LuauTableNewBoundary) +LUAU_FASTFLAGVARIABLE(LuauIter, false) +LUAU_DYNAMIC_FASTFLAGVARIABLE(LuauIterCallTelemetry, false) + +void (*lua_iter_call_telemetry)(lua_State* L); // Disable c99-designator to avoid the warning in CGOTO dispatch table #ifdef __clang__ @@ -110,7 +113,7 @@ LUAU_FASTFLAG(LuauTableNewBoundary) VM_DISPATCH_OP(LOP_FORGLOOP_NEXT), VM_DISPATCH_OP(LOP_GETVARARGS), VM_DISPATCH_OP(LOP_DUPCLOSURE), VM_DISPATCH_OP(LOP_PREPVARARGS), \ VM_DISPATCH_OP(LOP_LOADKX), VM_DISPATCH_OP(LOP_JUMPX), VM_DISPATCH_OP(LOP_FASTCALL), VM_DISPATCH_OP(LOP_COVERAGE), \ VM_DISPATCH_OP(LOP_CAPTURE), VM_DISPATCH_OP(LOP_JUMPIFEQK), VM_DISPATCH_OP(LOP_JUMPIFNOTEQK), VM_DISPATCH_OP(LOP_FASTCALL1), \ - VM_DISPATCH_OP(LOP_FASTCALL2), VM_DISPATCH_OP(LOP_FASTCALL2K), + VM_DISPATCH_OP(LOP_FASTCALL2), VM_DISPATCH_OP(LOP_FASTCALL2K), VM_DISPATCH_OP(LOP_FORGPREP), #if defined(__GNUC__) || defined(__clang__) #define VM_USE_CGOTO 1 @@ -150,8 +153,20 @@ LUAU_NOINLINE static void luau_prepareFORN(lua_State* L, StkId plimit, StkId pst LUAU_NOINLINE static bool 
luau_loopFORG(lua_State* L, int a, int c) { + // note: it's safe to push arguments past top for complicated reasons (see top of the file) StkId ra = &L->base[a]; - LUAU_ASSERT(ra + 6 <= L->top); + LUAU_ASSERT(ra + 3 <= L->top); + + if (DFFlag::LuauIterCallTelemetry) + { + /* TODO: we might be able to stop supporting this depending on whether it's used in practice */ + void (*telemetrycb)(lua_State* L) = lua_iter_call_telemetry; + + if (telemetrycb && ttistable(ra) && fasttm(L, hvalue(ra)->metatable, TM_CALL)) + telemetrycb(L); + if (telemetrycb && ttisuserdata(ra) && fasttm(L, uvalue(ra)->metatable, TM_CALL)) + telemetrycb(L); + } setobjs2s(L, ra + 3 + 2, ra + 2); setobjs2s(L, ra + 3 + 1, ra + 1); @@ -2204,20 +2219,149 @@ static void luau_execute(lua_State* L) } } + VM_CASE(LOP_FORGPREP) + { + Instruction insn = *pc++; + StkId ra = VM_REG(LUAU_INSN_A(insn)); + + if (ttisfunction(ra)) + { + /* will be called during FORGLOOP */ + } + else if (FFlag::LuauIter) + { + Table* mt = ttistable(ra) ? hvalue(ra)->metatable : ttisuserdata(ra) ? uvalue(ra)->metatable : cast_to(Table*, NULL); + + if (const TValue* fn = fasttm(L, mt, TM_ITER)) + { + setobj2s(L, ra + 1, ra); + setobj2s(L, ra, fn); + + L->top = ra + 2; /* func + self arg */ + LUAU_ASSERT(L->top <= L->stack_last); + + VM_PROTECT(luaD_call(L, ra, 3)); + L->top = L->ci->top; + } + else if (fasttm(L, mt, TM_CALL)) + { + /* table or userdata with __call, will be called during FORGLOOP */ + /* TODO: we might be able to stop supporting this depending on whether it's used in practice */ + } + else if (ttistable(ra)) + { + /* set up registers for builtin iteration */ + setobj2s(L, ra + 1, ra); + setpvalue(ra + 2, reinterpret_cast(uintptr_t(0))); + setnilvalue(ra); + } + else + { + VM_PROTECT(luaG_typeerror(L, ra, "iterate over")); + } + } + + pc += LUAU_INSN_D(insn); + LUAU_ASSERT(unsigned(pc - cl->l.p->code) < unsigned(cl->l.p->sizecode)); + VM_NEXT(); + } + VM_CASE(LOP_FORGLOOP) { VM_INTERRUPT(); Instruction insn = *pc++; + StkId ra = VM_REG(LUAU_INSN_A(insn)); uint32_t aux = *pc; - // note: this is a slow generic path, fast-path is FORGLOOP_INEXT/NEXT - bool stop; - VM_PROTECT(stop = luau_loopFORG(L, LUAU_INSN_A(insn), aux)); + if (!FFlag::LuauIter) + { + bool stop; + VM_PROTECT(stop = luau_loopFORG(L, LUAU_INSN_A(insn), aux)); - // note that we need to increment pc by 1 to exit the loop since we need to skip over aux - pc += stop ? 1 : LUAU_INSN_D(insn); - LUAU_ASSERT(unsigned(pc - cl->l.p->code) < unsigned(cl->l.p->sizecode)); - VM_NEXT(); + // note that we need to increment pc by 1 to exit the loop since we need to skip over aux + pc += stop ? 
1 : LUAU_INSN_D(insn); + LUAU_ASSERT(unsigned(pc - cl->l.p->code) < unsigned(cl->l.p->sizecode)); + VM_NEXT(); + } + + // fast-path: builtin table iteration + if (ttisnil(ra) && ttistable(ra + 1) && ttislightuserdata(ra + 2)) + { + Table* h = hvalue(ra + 1); + int index = int(reinterpret_cast(pvalue(ra + 2))); + + int sizearray = h->sizearray; + int sizenode = 1 << h->lsizenode; + + // clear extra variables since we might have more than two + if (LUAU_UNLIKELY(aux > 2)) + for (int i = 2; i < int(aux); ++i) + setnilvalue(ra + 3 + i); + + // first we advance index through the array portion + while (unsigned(index) < unsigned(sizearray)) + { + if (!ttisnil(&h->array[index])) + { + setpvalue(ra + 2, reinterpret_cast(uintptr_t(index + 1))); + setnvalue(ra + 3, double(index + 1)); + setobj2s(L, ra + 4, &h->array[index]); + + pc += LUAU_INSN_D(insn); + LUAU_ASSERT(unsigned(pc - cl->l.p->code) < unsigned(cl->l.p->sizecode)); + VM_NEXT(); + } + + index++; + } + + // then we advance index through the hash portion + while (unsigned(index - sizearray) < unsigned(sizenode)) + { + LuaNode* n = &h->node[index - sizearray]; + + if (!ttisnil(gval(n))) + { + setpvalue(ra + 2, reinterpret_cast(uintptr_t(index + 1))); + getnodekey(L, ra + 3, n); + setobj2s(L, ra + 4, gval(n)); + + pc += LUAU_INSN_D(insn); + LUAU_ASSERT(unsigned(pc - cl->l.p->code) < unsigned(cl->l.p->sizecode)); + VM_NEXT(); + } + + index++; + } + + // fallthrough to exit + pc++; + VM_NEXT(); + } + else + { + // note: it's safe to push arguments past top for complicated reasons (see top of the file) + setobjs2s(L, ra + 3 + 2, ra + 2); + setobjs2s(L, ra + 3 + 1, ra + 1); + setobjs2s(L, ra + 3, ra); + + L->top = ra + 3 + 3; /* func + 2 args (state and index) */ + LUAU_ASSERT(L->top <= L->stack_last); + + VM_PROTECT(luaD_call(L, ra + 3, aux)); + L->top = L->ci->top; + + // recompute ra since stack might have been reallocated + ra = VM_REG(LUAU_INSN_A(insn)); + + // copy first variable back into the iteration index + setobjs2s(L, ra + 2, ra + 3); + + // note that we need to increment pc by 1 to exit the loop since we need to skip over aux + pc += ttisnil(ra + 3) ? 1 : LUAU_INSN_D(insn); + LUAU_ASSERT(unsigned(pc - cl->l.p->code) < unsigned(cl->l.p->sizecode)); + VM_NEXT(); + } } VM_CASE(LOP_FORGPREP_INEXT) @@ -2228,8 +2372,15 @@ static void luau_execute(lua_State* L) // fast-path: ipairs/inext if (cl->env->safeenv && ttistable(ra + 1) && ttisnumber(ra + 2) && nvalue(ra + 2) == 0.0) { + if (FFlag::LuauIter) + setnilvalue(ra); + setpvalue(ra + 2, reinterpret_cast(uintptr_t(0))); } + else if (FFlag::LuauIter && !ttisfunction(ra)) + { + VM_PROTECT(luaG_typeerror(L, ra, "iterate over")); + } pc += LUAU_INSN_D(insn); LUAU_ASSERT(unsigned(pc - cl->l.p->code) < unsigned(cl->l.p->sizecode)); @@ -2268,23 +2419,9 @@ static void luau_execute(lua_State* L) VM_NEXT(); } } - else if (FFlag::LuauTableNewBoundary || (h->lsizenode == 0 && ttisnil(gval(h->node)))) - { - // fallthrough to exit - VM_NEXT(); - } else { - // the table has a hash part; index + 1 may appear in it in which case we need to iterate through the hash portion as well - const TValue* val = luaH_getnum(h, index + 1); - - setpvalue(ra + 2, reinterpret_cast(uintptr_t(index + 1))); - setnvalue(ra + 3, double(index + 1)); - setobj2s(L, ra + 4, val); - - // note that nil elements inside the array terminate the traversal - pc += ttisnil(ra + 4) ? 
0 : LUAU_INSN_D(insn); - LUAU_ASSERT(unsigned(pc - cl->l.p->code) < unsigned(cl->l.p->sizecode)); + // fallthrough to exit VM_NEXT(); } } @@ -2308,8 +2445,15 @@ static void luau_execute(lua_State* L) // fast-path: pairs/next if (cl->env->safeenv && ttistable(ra + 1) && ttisnil(ra + 2)) { + if (FFlag::LuauIter) + setnilvalue(ra); + setpvalue(ra + 2, reinterpret_cast(uintptr_t(0))); } + else if (FFlag::LuauIter && !ttisfunction(ra)) + { + VM_PROTECT(luaG_typeerror(L, ra, "iterate over")); + } pc += LUAU_INSN_D(insn); LUAU_ASSERT(unsigned(pc - cl->l.p->code) < unsigned(cl->l.p->sizecode)); @@ -2704,7 +2848,7 @@ static void luau_execute(lua_State* L) { VM_PROTECT_PC(); - int n = f(L, ra, arg, nresults, nullptr, nparams); + int n = f(L, ra, arg, nresults, NULL, nparams); if (n >= 0) {
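
Note on the embedder-visible API change above: lua_setuserdatadtor (together with the matching udatagc field in lstate.h and the call site in luaU_freeudata) now takes a destructor of type void(lua_State*, void*) instead of void(void*), so tagged-userdata destructors receive the lua_State* as their first argument. Below is a minimal sketch of what an embedder-side registration could look like against the new signature; MyResource, closeMyResource, and registerResourceTag are hypothetical names used only for illustration and are not part of the diff.

#include "lua.h"

// Hypothetical payload stored in a tagged userdata (illustration only).
struct MyResource
{
    void close();
};

// Destructor callback matching the new signature: the lua_State* is now
// passed in addition to the userdata payload pointer.
static void closeMyResource(lua_State* L, void* ud)
{
    (void)L; // unused in this sketch, but available if the destructor needs the VM state
    static_cast<MyResource*>(ud)->close();
}

static void registerResourceTag(lua_State* L, int tag)
{
    // the tag must stay below LUA_UTAG_LIMIT; lua_setuserdatadtor api_checks this above
    lua_setuserdatadtor(L, tag, closeMyResource);
}

Presumably the extra parameter is there so a per-tag destructor can interact with the VM if it needs to; the updated luaU_freeudata in ludata.cpp passes L through when it invokes the callback.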