=?utf-8?q?Félix?= Cloutier <fclout...@apple.com>, =?utf-8?q?Félix?= Cloutier <fclout...@apple.com> Message-ID: In-Reply-To: <llvm.org/llvm/llvm-project/pull/135...@github.com>
https://github.com/apple-fcloutier updated https://github.com/llvm/llvm-project/pull/135864 >From 7e15ec24e6de96b9828f13f21099446e55831591 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Cloutier?= <fclout...@apple.com> Date: Tue, 15 Apr 2025 14:28:02 -0700 Subject: [PATCH 1/3] [clang] Constant-evaluate format strings as last resort Clang's -Wformat checker can see through an inconsistent set of operations. We can fall back to the recently-updated constant string evaluation infrastructure when Clang's initial evaluation fails for a second chance at figuring out what the format string is intended to be. This enables analyzing format strings that were built at compile-time with std::string and other constexpr-capable types in C++, as long as all pieces are also constexpr-visible, and a number of other patterns. Radar-ID: rdar://99940060 --- clang/docs/ReleaseNotes.rst | 3 + clang/include/clang/AST/Expr.h | 9 +- .../clang/Basic/DiagnosticSemaKinds.td | 2 + clang/lib/AST/ExprConstant.cpp | 40 ++++- clang/lib/Sema/SemaChecking.cpp | 144 ++++++++++++------ clang/test/Sema/format-strings.c | 14 ++ clang/test/SemaCXX/format-strings.cpp | 74 +++++++++ 7 files changed, 227 insertions(+), 59 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 77bf3355af9da..05566d66a65d2 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -265,6 +265,9 @@ related warnings within the method body. ``format_matches`` accepts an example valid format string as its third argument. For more information, see the Clang attributes documentation. +- Format string checking now supports the compile-time evaluation of format + strings as a fallback mechanism. + - Introduced a new statement attribute ``[[clang::atomic]]`` that enables fine-grained control over atomic code generation on a per-statement basis. Supported options include ``[no_]remote_memory``, diff --git a/clang/include/clang/AST/Expr.h b/clang/include/clang/AST/Expr.h index 20f70863a05b3..78eda8bc3c43e 100644 --- a/clang/include/clang/AST/Expr.h +++ b/clang/include/clang/AST/Expr.h @@ -791,7 +791,14 @@ class Expr : public ValueStmt { const Expr *PtrExpression, ASTContext &Ctx, EvalResult &Status) const; - /// If the current Expr can be evaluated to a pointer to a null-terminated + /// Fill \c Into with the first characters that can be constant-evaluated + /// from this \c Expr . When encountering a null character, stop and return + /// \c true (the null is not returned in \c Into ). Return \c false if + /// evaluation runs off the end of the constant-evaluated string before it + /// encounters a null character. + bool tryEvaluateString(ASTContext &Ctx, std::string &Into) const; + + /// If the current \c Expr can be evaluated to a pointer to a null-terminated /// constant string, return the constant string (without the terminating /// null). std::optional<std::string> tryEvaluateString(ASTContext &Ctx) const; diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 3cb2731488fab..4139ff2737c76 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -10170,6 +10170,8 @@ def warn_format_bool_as_character : Warning< "using '%0' format specifier, but argument has boolean value">, InGroup<Format>; def note_format_string_defined : Note<"format string is defined here">; +def note_format_string_evaluated_to : Note< + "format string was constant-evaluated">; def note_format_fix_specifier : Note<"did you mean to use '%0'?">; def note_printf_c_str: Note<"did you mean to call the %0 method?">; def note_format_security_fixit: Note< diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 80ece3c4ed7e1..fec92edf49096 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -17945,15 +17945,36 @@ bool Expr::tryEvaluateObjectSize(uint64_t &Result, ASTContext &Ctx, static bool EvaluateBuiltinStrLen(const Expr *E, uint64_t &Result, EvalInfo &Info, std::string *StringResult) { - if (!E->getType()->hasPointerRepresentation() || !E->isPRValue()) + QualType Ty = E->getType(); + if (!E->isPRValue()) return false; LValue String; - - if (!EvaluatePointer(E, String, Info)) + QualType CharTy; + if (Ty->canDecayToPointerType()) { + if (E->isGLValue()) { + if (!EvaluateLValue(E, String, Info)) + return false; + } else { + APValue &Value = Info.CurrentCall->createTemporary( + E, Ty, ScopeKind::FullExpression, String); + if (!EvaluateInPlace(Value, Info, String, E)) + return false; + } + // The result is a pointer to the first element of the array. + auto *AT = Info.Ctx.getAsArrayType(Ty); + CharTy = AT->getElementType(); + if (auto *CAT = dyn_cast<ConstantArrayType>(AT)) + String.addArray(Info, E, CAT); + else + String.addUnsizedArray(Info, E, CharTy); + } else if (Ty->hasPointerRepresentation()) { + if (!EvaluatePointer(E, String, Info)) + return false; + CharTy = Ty->getPointeeType(); + } else { return false; - - QualType CharTy = E->getType()->getPointeeType(); + } // Fast path: if it's a string literal, search the string value. if (const StringLiteral *S = dyn_cast_or_null<StringLiteral>( @@ -17995,13 +18016,16 @@ static bool EvaluateBuiltinStrLen(const Expr *E, uint64_t &Result, } } -std::optional<std::string> Expr::tryEvaluateString(ASTContext &Ctx) const { +bool Expr::tryEvaluateString(ASTContext &Ctx, std::string &StringResult) const { Expr::EvalStatus Status; EvalInfo Info(Ctx, Status, EvalInfo::EM_ConstantFold); uint64_t Result; - std::string StringResult; + return EvaluateBuiltinStrLen(this, Result, Info, &StringResult); +} - if (EvaluateBuiltinStrLen(this, Result, Info, &StringResult)) +std::optional<std::string> Expr::tryEvaluateString(ASTContext &Ctx) const { + std::string StringResult; + if (tryEvaluateString(Ctx, StringResult)) return StringResult; return {}; } diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index bffd0dd461d3d..017be929ca18e 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -98,6 +98,7 @@ #include "llvm/Support/Locale.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/SaveAndRestore.h" +#include "llvm/Support/SmallVectorMemoryBuffer.h" #include "llvm/Support/raw_ostream.h" #include "llvm/TargetParser/RISCVTargetParser.h" #include "llvm/TargetParser/Triple.h" @@ -5935,8 +5936,14 @@ static void CheckFormatString( llvm::SmallBitVector &CheckedVarArgs, UncoveredArgHandler &UncoveredArg, bool IgnoreStringsWithoutSpecifiers); -static const Expr *maybeConstEvalStringLiteral(ASTContext &Context, - const Expr *E); +enum StringLiteralConstEvalResult { + SLCER_NotEvaluated, + SLCER_NotNullTerminated, + SLCER_Evaluated, +}; + +static StringLiteralConstEvalResult +constEvalStringAsLiteral(Sema &S, const Expr *E, const StringLiteral *&SL); // Determine if an expression is a string literal or constant string. // If this function returns false on the arguments to a function expecting a @@ -5968,14 +5975,9 @@ static StringLiteralCheckType checkFormatStringExpr( switch (E->getStmtClass()) { case Stmt::InitListExprClass: - // Handle expressions like {"foobar"}. - if (const clang::Expr *SLE = maybeConstEvalStringLiteral(S.Context, E)) { - return checkFormatStringExpr( - S, ReferenceFormatString, SLE, Args, APK, format_idx, firstDataArg, - Type, CallType, /*InFunctionCall*/ false, CheckedVarArgs, - UncoveredArg, Offset, IgnoreStringsWithoutSpecifiers); - } - return SLCT_NotALiteral; + // try to constant-evaluate the string + break; + case Stmt::BinaryConditionalOperatorClass: case Stmt::ConditionalOperatorClass: { // The expression is a literal if both sub-expressions were, and it was @@ -6066,10 +6068,9 @@ static StringLiteralCheckType checkFormatStringExpr( if (InitList->isStringLiteralInit()) Init = InitList->getInit(0)->IgnoreParenImpCasts(); } - return checkFormatStringExpr( - S, ReferenceFormatString, Init, Args, APK, format_idx, - firstDataArg, Type, CallType, - /*InFunctionCall*/ false, CheckedVarArgs, UncoveredArg, Offset); + InFunctionCall = false; + E = Init; + goto tryAgain; } } @@ -6142,11 +6143,9 @@ static StringLiteralCheckType checkFormatStringExpr( } return SLCT_UncheckedLiteral; } - return checkFormatStringExpr( - S, ReferenceFormatString, PVFormatMatches->getFormatString(), - Args, APK, format_idx, firstDataArg, Type, CallType, - /*InFunctionCall*/ false, CheckedVarArgs, UncoveredArg, - Offset, IgnoreStringsWithoutSpecifiers); + E = PVFormatMatches->getFormatString(); + InFunctionCall = false; + goto tryAgain; } } @@ -6214,20 +6213,13 @@ static StringLiteralCheckType checkFormatStringExpr( unsigned BuiltinID = FD->getBuiltinID(); if (BuiltinID == Builtin::BI__builtin___CFStringMakeConstantString || BuiltinID == Builtin::BI__builtin___NSStringMakeConstantString) { - const Expr *Arg = CE->getArg(0); - return checkFormatStringExpr( - S, ReferenceFormatString, Arg, Args, APK, format_idx, - firstDataArg, Type, CallType, InFunctionCall, CheckedVarArgs, - UncoveredArg, Offset, IgnoreStringsWithoutSpecifiers); + E = CE->getArg(0); + goto tryAgain; } } } - if (const Expr *SLE = maybeConstEvalStringLiteral(S.Context, E)) - return checkFormatStringExpr( - S, ReferenceFormatString, SLE, Args, APK, format_idx, firstDataArg, - Type, CallType, /*InFunctionCall*/ false, CheckedVarArgs, - UncoveredArg, Offset, IgnoreStringsWithoutSpecifiers); - return SLCT_NotALiteral; + // try to constant-evaluate the string + break; } case Stmt::ObjCMessageExprClass: { const auto *ME = cast<ObjCMessageExpr>(E); @@ -6248,11 +6240,8 @@ static StringLiteralCheckType checkFormatStringExpr( IgnoreStringsWithoutSpecifiers = true; } - const Expr *Arg = ME->getArg(FA->getFormatIdx().getASTIndex()); - return checkFormatStringExpr( - S, ReferenceFormatString, Arg, Args, APK, format_idx, firstDataArg, - Type, CallType, InFunctionCall, CheckedVarArgs, UncoveredArg, - Offset, IgnoreStringsWithoutSpecifiers); + E = ME->getArg(FA->getFormatIdx().getASTIndex()); + goto tryAgain; } } @@ -6314,7 +6303,8 @@ static StringLiteralCheckType checkFormatStringExpr( } } - return SLCT_NotALiteral; + // try to constant-evaluate the string + break; } case Stmt::UnaryOperatorClass: { const UnaryOperator *UnaOp = cast<UnaryOperator>(E); @@ -6331,26 +6321,79 @@ static StringLiteralCheckType checkFormatStringExpr( } } - return SLCT_NotALiteral; + // try to constant-evaluate the string + break; } default: + // try to constant-evaluate the string + break; + } + + const StringLiteral *FakeLiteral = nullptr; + switch (constEvalStringAsLiteral(S, E, FakeLiteral)) { + case SLCER_NotEvaluated: return SLCT_NotALiteral; + + case SLCER_NotNullTerminated: + S.Diag(Args[format_idx]->getBeginLoc(), + diag::warn_printf_format_string_not_null_terminated) + << Args[format_idx]->getSourceRange(); + if (!InFunctionCall) + S.Diag(E->getBeginLoc(), diag::note_format_string_defined); + // Stop checking, as this might just mean we're missing a chunk of the + // format string and there would be other spurious format issues. + return SLCT_UncheckedLiteral; + + case SLCER_Evaluated: + InFunctionCall = false; + E = FakeLiteral; + goto tryAgain; } } -// If this expression can be evaluated at compile-time, -// check if the result is a StringLiteral and return it -// otherwise return nullptr -static const Expr *maybeConstEvalStringLiteral(ASTContext &Context, - const Expr *E) { +static StringLiteralConstEvalResult +constEvalStringAsLiteral(Sema &S, const Expr *E, const StringLiteral *&SL) { + // As a last resort, try to constant-evaluate the format string. If it + // evaluates to a string literal in the first place, we can point to that + // string literal in source and use that. Expr::EvalResult Result; - if (E->EvaluateAsRValue(Result, Context) && Result.Val.isLValue()) { + if (E->EvaluateAsRValue(Result, S.Context) && Result.Val.isLValue()) { const auto *LVE = Result.Val.getLValueBase().dyn_cast<const Expr *>(); - if (isa_and_nonnull<StringLiteral>(LVE)) - return LVE; + if (auto *BaseSL = dyn_cast_or_null<StringLiteral>(LVE)) { + SL = BaseSL; + return SLCER_Evaluated; + } } - return nullptr; + + // Otherwise, try to evaluate the expression as a string constant. + std::string FormatString; + if (!E->tryEvaluateString(S.Context, FormatString)) { + return FormatString.empty() ? SLCER_NotEvaluated : SLCER_NotNullTerminated; + } + + std::unique_ptr<llvm::MemoryBuffer> MemBuf; + { + llvm::SmallString<80> EscapedString; + { + llvm::raw_svector_ostream OS(EscapedString); + OS << '"'; + OS.write_escaped(FormatString); + OS << '"'; + } + MemBuf.reset(new llvm::SmallVectorMemoryBuffer(std::move(EscapedString), + "<scratch space>", true)); + } + + // Plop that string into a scratch buffer, create a string literal and then + // go with that. + auto ScratchFile = S.getSourceManager().createFileID(std::move(MemBuf)); + SourceLocation Begin = S.getSourceManager().getLocForStartOfFile(ScratchFile); + QualType SLType = S.Context.getStringLiteralArrayType(S.Context.CharTy, + FormatString.length()); + SL = StringLiteral::Create(S.Context, FormatString, + StringLiteralKind::Ordinary, false, SLType, Begin); + return SLCER_Evaluated; } StringRef Sema::GetFormatStringTypeName(Sema::FormatStringType FST) { @@ -6973,10 +7016,11 @@ void CheckFormatHandler::EmitFormatDiagnostic( S.Diag(IsStringLocation ? ArgumentExpr->getExprLoc() : Loc, PDiag) << ArgumentExpr->getSourceRange(); - const Sema::SemaDiagnosticBuilder &Note = - S.Diag(IsStringLocation ? Loc : StringRange.getBegin(), - diag::note_format_string_defined); - + SourceLocation DiagLoc = IsStringLocation ? Loc : StringRange.getBegin(); + unsigned DiagID = S.getSourceManager().isWrittenInScratchSpace(DiagLoc) + ? diag::note_format_string_evaluated_to + : diag::note_format_string_defined; + const Sema::SemaDiagnosticBuilder &Note = S.Diag(DiagLoc, DiagID); Note << StringRange; Note << FixIt; } diff --git a/clang/test/Sema/format-strings.c b/clang/test/Sema/format-strings.c index af30ad5d15fe2..a94e0619ce843 100644 --- a/clang/test/Sema/format-strings.c +++ b/clang/test/Sema/format-strings.c @@ -3,6 +3,11 @@ // RUN: %clang_cc1 -fblocks -fsyntax-only -verify -Wformat-nonliteral -isystem %S/Inputs -triple=x86_64-unknown-fuchsia %s // RUN: %clang_cc1 -fblocks -fsyntax-only -verify -Wformat-nonliteral -isystem %S/Inputs -triple=x86_64-linux-android %s +// expected-note@-5{{format string was constant-evaluated}} +// ^^^ there will be a <scratch space> SourceLocation caused by the +// test_consteval_init_array test, that -verify treats as if it showed up at +// line 1 of this file. + #include <stdarg.h> #include <stddef.h> #define __need_wint_t @@ -900,3 +905,12 @@ void test_promotion(void) { // pointers printf("%s", i); // expected-warning{{format specifies type 'char *' but the argument has type 'int'}} } + +void test_consteval_init_array(void) { + const char buf_not_terminated[] = {'%', 55 * 2 + 5, '\n'}; // expected-note{{format string is defined here}} + printf(buf_not_terminated, "hello"); // expected-warning{{format string is not null-terminated}} + + const char buf[] = {'%', 55 * 2 + 5, '\n', 0}; + printf(buf, "hello"); // no-warning + printf(buf, 123); // expected-warning{{format specifies type 'char *' but the argument has type 'int'}} +} diff --git a/clang/test/SemaCXX/format-strings.cpp b/clang/test/SemaCXX/format-strings.cpp index 48cf23999a94f..7b04ea7d8bc75 100644 --- a/clang/test/SemaCXX/format-strings.cpp +++ b/clang/test/SemaCXX/format-strings.cpp @@ -1,6 +1,14 @@ // RUN: %clang_cc1 -fsyntax-only -verify -Wformat-nonliteral -Wformat-non-iso -Wformat-pedantic -fblocks %s // RUN: %clang_cc1 -fsyntax-only -verify -Wformat-nonliteral -Wformat-non-iso -fblocks -std=c++98 %s // RUN: %clang_cc1 -fsyntax-only -verify -Wformat-nonliteral -Wformat-non-iso -Wformat-pedantic -fblocks -std=c++11 %s +// RUN: %clang_cc1 -fsyntax-only -verify -Wformat-nonliteral -Wformat-non-iso -Wformat-pedantic -fblocks -std=c++20 %s + +#if __cplusplus >= 202000l +// expected-note@-6{{format string was constant-evaluated}} +// ^^^ there will be a <scratch space> SourceLocation caused by the +// test_constexpr_string test, that -verify treats as if it showed up at +// line 1 of this file. +#endif #include <stdarg.h> @@ -238,3 +246,69 @@ void f(Scoped1 S1, Scoped2 S2) { } #endif + +#if __cplusplus >= 202000L +class my_string { + char *data; + unsigned size; + +public: + template<unsigned N> + constexpr my_string(const char (&literal)[N]) { + data = new char[N+1]; + for (size = 0; size < N; ++size) { + data[size] = literal[size]; + if (data[size] == 0) + break; + } + data[size] = 0; + } + + my_string(const my_string &) = delete; + + constexpr my_string(my_string &&that) { + data = that.data; + size = that.size; + that.data = nullptr; + that.size = 0; + } + + constexpr ~my_string() { + delete[] data; + } + + template<unsigned N> + constexpr void append(const char (&literal)[N]) { + char *cat = new char[size + N + 1]; + char *tmp = cat; + for (unsigned i = 0; i < size; ++i) { + *tmp++ = data[i]; + } + for (unsigned i = 0; i < N; ++i) { + *tmp = literal[i]; + if (*tmp == 0) + break; + ++tmp; + } + *tmp = 0; + delete[] data; + size = tmp - cat; + data = cat; + } + + constexpr const char *c_str() const { + return data; + } +}; + +constexpr my_string const_string() { + my_string str("hello %s"); + str.append(", %d"); + return str; +} + +void test_constexpr_string() { + printf(const_string().c_str(), "hello", 123); // no-warning + printf(const_string().c_str(), 123, 456); // expected-warning {{format specifies type 'char *' but the argument has type 'int'}} +} +#endif >From 3632a83d3af5ab3958226ebf6e416643d7bfb3f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Cloutier?= <fclout...@apple.com> Date: Tue, 22 Apr 2025 13:01:40 -0700 Subject: [PATCH 2/3] Create StringEvalResult to hold result of compile-time string evaluation Several users of compile-time string evaluation can meaningfully use the special case that compile-time string evaluation resolves to a string literal in source (for instance, to improve diagnostics). This changes Expr::tryEvaluateString to return a StringEvalResult, which can hold either a string literal and an offset or a std::string of evaluated characters. --- clang/include/clang/AST/Expr.h | 30 ++-- .../clang/Basic/DiagnosticSemaKinds.td | 2 +- clang/lib/AST/ExprConstant.cpp | 141 +++++++++++++----- clang/lib/Analysis/UnsafeBufferUsage.cpp | 5 +- clang/lib/CodeGen/CGBuiltin.cpp | 5 +- clang/lib/Sema/SemaChecking.cpp | 80 +++++----- clang/test/SemaCXX/verbose-trap.cpp | 10 +- 7 files changed, 182 insertions(+), 91 deletions(-) diff --git a/clang/include/clang/AST/Expr.h b/clang/include/clang/AST/Expr.h index 78eda8bc3c43e..54f0ac28ac082 100644 --- a/clang/include/clang/AST/Expr.h +++ b/clang/include/clang/AST/Expr.h @@ -791,17 +791,25 @@ class Expr : public ValueStmt { const Expr *PtrExpression, ASTContext &Ctx, EvalResult &Status) const; - /// Fill \c Into with the first characters that can be constant-evaluated - /// from this \c Expr . When encountering a null character, stop and return - /// \c true (the null is not returned in \c Into ). Return \c false if - /// evaluation runs off the end of the constant-evaluated string before it - /// encounters a null character. - bool tryEvaluateString(ASTContext &Ctx, std::string &Into) const; - - /// If the current \c Expr can be evaluated to a pointer to a null-terminated - /// constant string, return the constant string (without the terminating - /// null). - std::optional<std::string> tryEvaluateString(ASTContext &Ctx) const; + class StringEvalResult { + std::string Storage; + const StringLiteral *SL; + uint64_t Offset; + + public: + StringEvalResult(std::string Contents); + StringEvalResult(const StringLiteral *SL, uint64_t Offset); + + llvm::StringRef getString() const; + bool getStringLiteral(const StringLiteral *&SL, uint64_t &Offset) const; + }; + + /// If the current \c Expr can be evaluated to a pointer to a constant string, + /// return the constant string. The string may not be NUL-terminated. If + /// \c NullTerminated is supplied, it is set to whether there is at least one + /// NUL character in the string. + std::optional<StringEvalResult> + tryEvaluateString(ASTContext &Ctx, bool *NullTerminated = nullptr) const; /// Enumeration used to describe the kind of Null pointer constant /// returned from \c isNullPointerConstant(). diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 4139ff2737c76..e3b73c149b3d5 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -9105,7 +9105,7 @@ def err_expected_callable_argument : Error< def note_building_builtin_dump_struct_call : Note< "in call to printing function with arguments '(%0)' while dumping struct">; def err_builtin_verbose_trap_arg : Error< - "argument to __builtin_verbose_trap must %select{be a pointer to a constant string|not contain $}0">; + "argument to __builtin_verbose_trap must %select{be a pointer to a constant NUL-terminated string|not contain $}0">; def err_atomic_load_store_uses_lib : Error< "atomic %select{load|store}0 requires runtime support that is not " diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index fec92edf49096..6c5e8260131bb 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -1922,9 +1922,17 @@ static bool EvaluateComplex(const Expr *E, ComplexValue &Res, EvalInfo &Info); static bool EvaluateAtomic(const Expr *E, const LValue *This, APValue &Result, EvalInfo &Info); static bool EvaluateAsRValue(EvalInfo &Info, const Expr *E, APValue &Result); +static bool EvaluateStringAsLValue(EvalInfo &Info, const Expr *E, + QualType &CharTy, LValue &String); +static const StringLiteral *StringLValueIsLiteral(EvalInfo &Info, + LValue &String, + QualType CharTy, + uint64_t &Offset); +template <typename CharAction> +static bool IterateStringLValue(EvalInfo &Info, const Expr *E, QualType CharTy, + LValue &String, CharAction &&Action); static bool EvaluateBuiltinStrLen(const Expr *E, uint64_t &Result, - EvalInfo &Info, - std::string *StringResult = nullptr); + EvalInfo &Info); /// Evaluate an integer or fixed point expression into an APResult. static bool EvaluateFixedPointOrInteger(const Expr *E, APFixedPoint &Result, @@ -17943,14 +17951,12 @@ bool Expr::tryEvaluateObjectSize(uint64_t &Result, ASTContext &Ctx, return tryEvaluateBuiltinObjectSize(this, Type, Info, Result); } -static bool EvaluateBuiltinStrLen(const Expr *E, uint64_t &Result, - EvalInfo &Info, std::string *StringResult) { +static bool EvaluateStringAsLValue(EvalInfo &Info, const Expr *E, + QualType &CharTy, LValue &String) { QualType Ty = E->getType(); if (!E->isPRValue()) return false; - LValue String; - QualType CharTy; if (Ty->canDecayToPointerType()) { if (E->isGLValue()) { if (!EvaluateLValue(E, String, Info)) @@ -17975,8 +17981,13 @@ static bool EvaluateBuiltinStrLen(const Expr *E, uint64_t &Result, } else { return false; } + return true; +} - // Fast path: if it's a string literal, search the string value. +static const StringLiteral *StringLValueIsLiteral(EvalInfo &Info, + LValue &String, + QualType CharTy, + uint64_t &Offset) { if (const StringLiteral *S = dyn_cast_or_null<StringLiteral>( String.getLValueBase().dyn_cast<const Expr *>())) { StringRef Str = S->getBytes(); @@ -17985,49 +17996,111 @@ static bool EvaluateBuiltinStrLen(const Expr *E, uint64_t &Result, S->getCharByteWidth() == 1 && // FIXME: Add fast-path for wchar_t too. Info.Ctx.hasSameUnqualifiedType(CharTy, Info.Ctx.CharTy)) { - Str = Str.substr(Off); - - StringRef::size_type Pos = Str.find(0); - if (Pos != StringRef::npos) - Str = Str.substr(0, Pos); - - Result = Str.size(); - if (StringResult) - *StringResult = Str; - return true; + Offset = static_cast<uint64_t>(Off); + return S; } - - // Fall through to slow path. } + return nullptr; +} - // Slow path: scan the bytes of the string looking for the terminating 0. - for (uint64_t Strlen = 0; /**/; ++Strlen) { +template <typename CharAction> +static bool IterateStringLValue(EvalInfo &Info, const Expr *E, QualType CharTy, + LValue &String, CharAction &&Action) { + while (true) { APValue Char; if (!handleLValueToRValueConversion(Info, E, CharTy, String, Char) || !Char.isInt()) return false; - if (!Char.getInt()) { - Result = Strlen; + if (!Action(Char.getInt().getExtValue())) return true; - } else if (StringResult) - StringResult->push_back(Char.getInt().getExtValue()); if (!HandleLValueArrayAdjustment(Info, E, String, CharTy, 1)) return false; } } -bool Expr::tryEvaluateString(ASTContext &Ctx, std::string &StringResult) const { +static bool EvaluateBuiltinStrLen(const Expr *E, uint64_t &Result, + EvalInfo &Info) { + LValue String; + QualType CharTy; + if (!EvaluateStringAsLValue(Info, E, CharTy, String)) + return false; + + // Fast path: if it's a string literal, search the string value. + uint64_t Off; + if (const auto *S = StringLValueIsLiteral(Info, String, CharTy, Off)) { + StringRef Str = S->getBytes().substr(Off); + + StringRef::size_type Pos = Str.find(0); + if (Pos != StringRef::npos) + Str = Str.substr(0, Pos); + + Result = Str.size(); + return true; + } + + // Slow path: scan the bytes of the string looking for the terminating 0. + Result = 0; + return IterateStringLValue(Info, E, CharTy, String, [&](int Char) { + if (Char) { + Result++; + return true; + } else + return false; + }); +} + +Expr::StringEvalResult::StringEvalResult(const StringLiteral *SL, + uint64_t Offset) + : SL(SL), Offset(Offset) {} + +Expr::StringEvalResult::StringEvalResult(std::string Contents) + : Storage(std::move(Contents)), SL(nullptr), Offset(0) {} + +llvm::StringRef Expr::StringEvalResult::getString() const { + return SL ? SL->getBytes().substr(Offset) : Storage; +} + +bool Expr::StringEvalResult::getStringLiteral(const StringLiteral *&SL, + uint64_t &Offset) const { + if (this->SL) { + SL = this->SL; + Offset = this->Offset; + return true; + } + return false; +} + +std::optional<Expr::StringEvalResult> +Expr::tryEvaluateString(ASTContext &Ctx, bool *NullTerminated) const { + if (NullTerminated) + *NullTerminated = false; + Expr::EvalStatus Status; EvalInfo Info(Ctx, Status, EvalInfo::EM_ConstantFold); - uint64_t Result; - return EvaluateBuiltinStrLen(this, Result, Info, &StringResult); -} + LValue String; + QualType CharTy; + if (!EvaluateStringAsLValue(Info, this, CharTy, String)) + return {}; + + uint64_t Off; + if (const auto *S = StringLValueIsLiteral(Info, String, CharTy, Off)) { + if (NullTerminated) + *NullTerminated = true; + return StringEvalResult(S, Off); + } + + std::string Result; + bool NTFound = IterateStringLValue(Info, this, CharTy, String, [&](int Char) { + if (Char) { + Result.push_back(Char); + return true; + } else + return false; + }); -std::optional<std::string> Expr::tryEvaluateString(ASTContext &Ctx) const { - std::string StringResult; - if (tryEvaluateString(Ctx, StringResult)) - return StringResult; - return {}; + if (NullTerminated) + *NullTerminated = NTFound; + return StringEvalResult(Result); } template <typename T> diff --git a/clang/lib/Analysis/UnsafeBufferUsage.cpp b/clang/lib/Analysis/UnsafeBufferUsage.cpp index fbe753de9ef1f..10d4c63a1da05 100644 --- a/clang/lib/Analysis/UnsafeBufferUsage.cpp +++ b/clang/lib/Analysis/UnsafeBufferUsage.cpp @@ -725,12 +725,13 @@ static bool hasUnsafeFormatOrSArg(const CallExpr *Call, const Expr *&UnsafeArg, const Expr *Fmt = Call->getArg(FmtArgIdx); if (auto *SL = dyn_cast<clang::StringLiteral>(Fmt->IgnoreParenImpCasts())) { + std::optional<Expr::StringEvalResult> SER; StringRef FmtStr; if (SL->getCharByteWidth() == 1) FmtStr = SL->getString(); - else if (auto EvaledFmtStr = SL->tryEvaluateString(Ctx)) - FmtStr = *EvaledFmtStr; + else if ((SER = SL->tryEvaluateString(Ctx))) + FmtStr = SER->getString(); else goto CHECK_UNSAFE_PTR; diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index fe55dfffc1cbe..d92bb977c7546 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -3478,8 +3478,9 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, llvm::DILocation *TrapLocation = Builder.getCurrentDebugLocation(); if (getDebugInfo()) { TrapLocation = getDebugInfo()->CreateTrapFailureMessageFor( - TrapLocation, *E->getArg(0)->tryEvaluateString(getContext()), - *E->getArg(1)->tryEvaluateString(getContext())); + TrapLocation, + E->getArg(0)->tryEvaluateString(getContext())->getString(), + E->getArg(1)->tryEvaluateString(getContext())->getString()); } ApplyDebugLocation ApplyTrapDI(*this, TrapLocation); // Currently no attempt is made to prevent traps from being merged. diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 017be929ca18e..0ca41c9758af9 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -180,12 +180,14 @@ static bool checkBuiltinVerboseTrap(CallExpr *Call, Sema &S) { if (Arg->isValueDependent()) continue; - std::optional<std::string> ArgString = Arg->tryEvaluateString(S.Context); + // Arguments must be pointers to constant strings, must be NUL-terminated, + // and cannot contain '$'. + bool HasNulTerminator; + auto ArgString = Arg->tryEvaluateString(S.Context, &HasNulTerminator); int DiagMsgKind = -1; - // Arguments must be pointers to constant strings and cannot use '$'. - if (!ArgString.has_value()) + if (!(ArgString && HasNulTerminator)) DiagMsgKind = 0; - else if (ArgString->find('$') != std::string::npos) + else if (ArgString->getString().find('$') != llvm::StringRef::npos) DiagMsgKind = 1; if (DiagMsgKind >= 0) { @@ -5943,7 +5945,8 @@ enum StringLiteralConstEvalResult { }; static StringLiteralConstEvalResult -constEvalStringAsLiteral(Sema &S, const Expr *E, const StringLiteral *&SL); +constEvalStringAsLiteral(Sema &S, const Expr *E, const StringLiteral *&SL, + uint64_t &Offset); // Determine if an expression is a string literal or constant string. // If this function returns false on the arguments to a function expecting a @@ -6330,8 +6333,9 @@ static StringLiteralCheckType checkFormatStringExpr( break; } + uint64_t EvalOffset = 0; const StringLiteral *FakeLiteral = nullptr; - switch (constEvalStringAsLiteral(S, E, FakeLiteral)) { + switch (constEvalStringAsLiteral(S, E, FakeLiteral, EvalOffset)) { case SLCER_NotEvaluated: return SLCT_NotALiteral; @@ -6348,51 +6352,49 @@ static StringLiteralCheckType checkFormatStringExpr( case SLCER_Evaluated: InFunctionCall = false; E = FakeLiteral; + Offset = EvalOffset; goto tryAgain; } } static StringLiteralConstEvalResult -constEvalStringAsLiteral(Sema &S, const Expr *E, const StringLiteral *&SL) { - // As a last resort, try to constant-evaluate the format string. If it - // evaluates to a string literal in the first place, we can point to that - // string literal in source and use that. - Expr::EvalResult Result; - if (E->EvaluateAsRValue(Result, S.Context) && Result.Val.isLValue()) { - const auto *LVE = Result.Val.getLValueBase().dyn_cast<const Expr *>(); - if (auto *BaseSL = dyn_cast_or_null<StringLiteral>(LVE)) { - SL = BaseSL; - return SLCER_Evaluated; - } - } - - // Otherwise, try to evaluate the expression as a string constant. - std::string FormatString; - if (!E->tryEvaluateString(S.Context, FormatString)) { - return FormatString.empty() ? SLCER_NotEvaluated : SLCER_NotNullTerminated; - } - +constEvalStringAsLiteral(Sema &S, const Expr *E, const StringLiteral *&SL, + uint64_t &Offset) { + // As a last resort, try to constant-evaluate the format string. + bool HasNul; + auto SER = E->tryEvaluateString(S.Context, &HasNul); + if (!SER) + return SLCER_NotEvaluated; + if (!HasNul) + return SLCER_NotNullTerminated; + + // If it evaluates to a string literal in the first place, we can point to + // that string literal in source and use that. + if (SER->getStringLiteral(SL, Offset)) + return SLCER_Evaluated; + + // Otherwise, lop that string into a scratch buffer, create a string literal + // and then go with that. std::unique_ptr<llvm::MemoryBuffer> MemBuf; { llvm::SmallString<80> EscapedString; { llvm::raw_svector_ostream OS(EscapedString); OS << '"'; - OS.write_escaped(FormatString); + OS.write_escaped(SER->getString()); OS << '"'; } MemBuf.reset(new llvm::SmallVectorMemoryBuffer(std::move(EscapedString), "<scratch space>", true)); } - // Plop that string into a scratch buffer, create a string literal and then - // go with that. auto ScratchFile = S.getSourceManager().createFileID(std::move(MemBuf)); SourceLocation Begin = S.getSourceManager().getLocForStartOfFile(ScratchFile); - QualType SLType = S.Context.getStringLiteralArrayType(S.Context.CharTy, - FormatString.length()); - SL = StringLiteral::Create(S.Context, FormatString, + QualType SLType = S.Context.getStringLiteralArrayType( + S.Context.CharTy, SER->getString().size()); + SL = StringLiteral::Create(S.Context, SER->getString(), StringLiteralKind::Ordinary, false, SLType, Begin); + Offset = 0; return SLCER_Evaluated; } @@ -6481,6 +6483,11 @@ bool Sema::CheckFormatArguments(ArrayRef<const Expr *> Args, VariadicCallType CallType, SourceLocation Loc, SourceRange Range, llvm::SmallBitVector &CheckedVarArgs) { + // As a last resort, Clang attempts to evaluate the format string as a + // constant, which is expensive. Before we go down that route, check that + // the warnings are at least enabled at Loc, which in the common case points + // at the opening parenthesis of the function call. + // CHECK: printf/scanf-like function is called with no format string. if (format_idx >= Args.size()) { Diag(Loc, diag::warn_missing_format_string) << Range; @@ -6493,14 +6500,9 @@ bool Sema::CheckFormatArguments(ArrayRef<const Expr *> Args, // // Dynamically generated format strings are difficult to // automatically vet at compile time. Requiring that format strings - // are string literals: (1) permits the checking of format strings by - // the compiler and thereby (2) can practically remove the source of - // many format string exploits. - - // Format string can be either ObjC string (e.g. @"%d") or - // C string (e.g. "%d") - // ObjC string uses the same format specifiers as C string, so we can use - // the same format string checking logic for both ObjC and C strings. + // can evaluate to constant strings: (1) permits the checking of format + // strings by the compiler and thereby (2) can practically remove the source + // of many format string exploits. UncoveredArgHandler UncoveredArg; StringLiteralCheckType CT = checkFormatStringExpr( *this, ReferenceFormatString, OrigFormatExpr, Args, APK, format_idx, diff --git a/clang/test/SemaCXX/verbose-trap.cpp b/clang/test/SemaCXX/verbose-trap.cpp index 2503f9860d9c3..5562fb453948c 100644 --- a/clang/test/SemaCXX/verbose-trap.cpp +++ b/clang/test/SemaCXX/verbose-trap.cpp @@ -15,6 +15,9 @@ char const constMsg3[] = "hello"; template <const char * const category, const char * const reason> void f(const char * arg) { + const char buf[] = {'a', 'b', 'c'}; + const char buf_nt1[] = {'a', 'b', 'c', 0}; + const char buf_nt2[] = {'a', 'b', 0, 'c'}; __builtin_verbose_trap("cat1", "Arbitrary string literals can be used!"); __builtin_verbose_trap(" cat1 ", "Argument_must_not_be_null"); __builtin_verbose_trap("cat" "egory1", "hello" "world"); @@ -24,9 +27,12 @@ void f(const char * arg) { __builtin_verbose_trap(); // expected-error {{too few arguments}} __builtin_verbose_trap(""); // expected-error {{too few arguments}} __builtin_verbose_trap("", "", ""); // expected-error {{too many arguments}} - __builtin_verbose_trap("", 0); // expected-error {{argument to __builtin_verbose_trap must be a pointer to a constant string}} + __builtin_verbose_trap("", 0); // expected-error {{argument to __builtin_verbose_trap must be a pointer to a constant NUL-terminated string}} __builtin_verbose_trap(1, ""); // expected-error {{cannot initialize a parameter of type 'const char *' with an rvalue of type 'int'}} - __builtin_verbose_trap(arg, ""); // expected-error {{argument to __builtin_verbose_trap must be a pointer to a constant string}} + __builtin_verbose_trap(arg, ""); // expected-error {{argument to __builtin_verbose_trap must be a pointer to a constant NUL-terminated string}} + __builtin_verbose_trap(buf, ""); // expected-error {{argument to __builtin_verbose_trap must be a pointer to a constant NUL-terminated string}} + __builtin_verbose_trap(buf_nt1, ""); + __builtin_verbose_trap(buf_nt2, ""); __builtin_verbose_trap("cat$1", "hel$lo"); // expected-error 2 {{argument to __builtin_verbose_trap must not contain $}} __builtin_verbose_trap(category, reason); __builtin_verbose_trap(u8"cat1", u8"hello"); >From 504cdb609d50414c6825a0ced8afb3a91146d4c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Cloutier?= <fclout...@apple.com> Date: Tue, 22 Apr 2025 14:12:17 -0700 Subject: [PATCH 3/3] Address naming/documentation-level review feedback --- clang/docs/ReleaseNotes.rst | 35 +++++++++++++++++-- .../clang/Basic/DiagnosticSemaKinds.td | 2 +- clang/lib/AST/ExprConstant.cpp | 11 +++--- clang/lib/Sema/SemaChecking.cpp | 25 ++++++++----- 4 files changed, 57 insertions(+), 16 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 05566d66a65d2..8529e5b991cba 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -265,8 +265,39 @@ related warnings within the method body. ``format_matches`` accepts an example valid format string as its third argument. For more information, see the Clang attributes documentation. -- Format string checking now supports the compile-time evaluation of format - strings as a fallback mechanism. +- Clang can now verify format strings that can be constant-folded even if they + do not resolve to a string literal. For instance, all of these can now be + verified: + + .. code-block:: c++ + + const char format[] = {'h', 'e', 'l', 'l', 'o', ' ', '%', 's', 0}; + printf(format, "world"); + // no warning + + printf(format, 123); + // warning: format specifies type 'char *' but the argument has type 'int' + + printf(("%"s + "i"s).c_str(), "world"); + // warning: format specifies type 'int' but the argument has type 'char *' + + When the format expression does not evaluate to a string literal, Clang + points diagnostics into a pseudo-file called ``<scratch space>`` that contains + the format string literal as it evaluated, like so: + + .. code-block:: text + + example.c:6:17: warning: format specifies type 'char *' but the argument has type 'int' [-Wformat] + 6 | printf(format, 123); + | ~~~~~~ ^~~ + <scratch space>:1:4: note: format string resolved to a constant string + 1 | "hello %s" + | ^~ + | %d + + This may mean that format strings which were previously unverified (or which + triggered ``-Wformat-nonliteral``) are now verified by ``-Wformat`` and its + allies. - Introduced a new statement attribute ``[[clang::atomic]]`` that enables fine-grained control over atomic code generation on a per-statement basis. diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index e3b73c149b3d5..57287bb8e0115 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -10171,7 +10171,7 @@ def warn_format_bool_as_character : Warning< InGroup<Format>; def note_format_string_defined : Note<"format string is defined here">; def note_format_string_evaluated_to : Note< - "format string was constant-evaluated">; + "format string resolved to a constant string">; def note_format_fix_specifier : Note<"did you mean to use '%0'?">; def note_printf_c_str: Note<"did you mean to call the %0 method?">; def note_format_security_fixit: Note< diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 6c5e8260131bb..e5f41a9f9b294 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -17974,14 +17974,17 @@ static bool EvaluateStringAsLValue(EvalInfo &Info, const Expr *E, String.addArray(Info, E, CAT); else String.addUnsizedArray(Info, E, CharTy); - } else if (Ty->hasPointerRepresentation()) { + return true; + } + + if (Ty->hasPointerRepresentation()) { if (!EvaluatePointer(E, String, Info)) return false; CharTy = Ty->getPointeeType(); - } else { - return false; + return true; } - return true; + + return false; } static const StringLiteral *StringLValueIsLiteral(EvalInfo &Info, diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 0ca41c9758af9..6184fe384fee5 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -5938,15 +5938,22 @@ static void CheckFormatString( llvm::SmallBitVector &CheckedVarArgs, UncoveredArgHandler &UncoveredArg, bool IgnoreStringsWithoutSpecifiers); -enum StringLiteralConstEvalResult { +enum StringLiteralConstantEvaluationResult { SLCER_NotEvaluated, SLCER_NotNullTerminated, SLCER_Evaluated, }; -static StringLiteralConstEvalResult -constEvalStringAsLiteral(Sema &S, const Expr *E, const StringLiteral *&SL, - uint64_t &Offset); +/// Attempt to fold \c E into a constant string that \c checkFormatStringExpr +/// can use. If \c E folds to a string literal, that string literal will be used +/// for diagnostics. If \c E has a constant string value but it does not fold to +/// a literal (for instance, ("%"s + "i"s).c_str() constant-folds to "%i"), a +/// <scratch space> pseudo-source file will be allocated, containing a string +/// literal representation of the constant string, and format diagnostics will +/// point to it. +static StringLiteralConstantEvaluationResult +EvaluateStringAndCreateLiteral(Sema &S, const Expr *E, const StringLiteral *&SL, + uint64_t &Offset); // Determine if an expression is a string literal or constant string. // If this function returns false on the arguments to a function expecting a @@ -6335,7 +6342,7 @@ static StringLiteralCheckType checkFormatStringExpr( uint64_t EvalOffset = 0; const StringLiteral *FakeLiteral = nullptr; - switch (constEvalStringAsLiteral(S, E, FakeLiteral, EvalOffset)) { + switch (EvaluateStringAndCreateLiteral(S, E, FakeLiteral, EvalOffset)) { case SLCER_NotEvaluated: return SLCT_NotALiteral; @@ -6357,9 +6364,9 @@ static StringLiteralCheckType checkFormatStringExpr( } } -static StringLiteralConstEvalResult -constEvalStringAsLiteral(Sema &S, const Expr *E, const StringLiteral *&SL, - uint64_t &Offset) { +static StringLiteralConstantEvaluationResult +EvaluateStringAndCreateLiteral(Sema &S, const Expr *E, const StringLiteral *&SL, + uint64_t &Offset) { // As a last resort, try to constant-evaluate the format string. bool HasNul; auto SER = E->tryEvaluateString(S.Context, &HasNul); @@ -6388,7 +6395,7 @@ constEvalStringAsLiteral(Sema &S, const Expr *E, const StringLiteral *&SL, "<scratch space>", true)); } - auto ScratchFile = S.getSourceManager().createFileID(std::move(MemBuf)); + FileID ScratchFile = S.getSourceManager().createFileID(std::move(MemBuf)); SourceLocation Begin = S.getSourceManager().getLocForStartOfFile(ScratchFile); QualType SLType = S.Context.getStringLiteralArrayType( S.Context.CharTy, SER->getString().size()); _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits