Author: Mariya Podchishchaeva Date: 2025-03-20T13:02:29+01:00 New Revision: 8c6f309023a9c5b6d8488e89ed0be8da7d117a68
URL: https://github.com/llvm/llvm-project/commit/8c6f309023a9c5b6d8488e89ed0be8da7d117a68 DIFF: https://github.com/llvm/llvm-project/commit/8c6f309023a9c5b6d8488e89ed0be8da7d117a68.diff LOG: [clang] Introduce "binary" StringLiteral for #embed data (#127629) StringLiteral is used as internal data of EmbedExpr and we directly use it as an initializer if a single EmbedExpr appears in the initializer list of a char array. It is fast and convenient, but it is causing problems when string literal character values are checked because #embed data values are within the range [0, 2^(char width) - 1], but an ordinary StringLiteral has a possibly signed char type. This PR introduces a new kind of StringLiteral to hold binary data coming from an embedded resource to mitigate these problems. The new kind of StringLiteral is not assumed to have a signed char type. The new kind of StringLiteral also helps to prevent crashes when trying to find StringLiteral token locations since these simply do not exist for binary data. Fixes https://github.com/llvm/llvm-project/issues/119256 Added: clang/test/Preprocessor/embed_constexpr.c Modified: clang/include/clang/AST/Expr.h clang/lib/AST/Expr.cpp clang/lib/Parse/ParseInit.cpp clang/lib/Sema/SemaInit.cpp Removed: ################################################################################ diff --git a/clang/include/clang/AST/Expr.h b/clang/include/clang/AST/Expr.h index 28f5eb283956d..08e34fdf2aa2f 100644 --- a/clang/include/clang/AST/Expr.h +++ b/clang/include/clang/AST/Expr.h @@ -1756,7 +1756,14 @@ enum class StringLiteralKind { UTF8, UTF16, UTF32, - Unevaluated + Unevaluated, + // Binary kind of string literal is used for the data coming via #embed + // directive. 
File's binary contents is transformed to a special kind of + // string literal that in some cases may be used directly as an initializer + // and some features of classic string literals are not applicable to this + // kind of a string literal, for example finding a particular byte's source + // location for better diagnosing. + Binary }; /// StringLiteral - This represents a string literal expression, e.g. "foo" @@ -1888,6 +1895,8 @@ class StringLiteral final int64_t getCodeUnitS(size_t I, uint64_t BitWidth) const { int64_t V = getCodeUnit(I); if (isOrdinary() || isWide()) { + // Ordinary and wide string literals have types that can be signed. + // It is important for checking C23 constexpr initializers. unsigned Width = getCharByteWidth() * BitWidth; llvm::APInt AInt(Width, (uint64_t)V); V = AInt.getSExtValue(); @@ -5029,9 +5038,9 @@ class EmbedExpr final : public Expr { assert(EExpr && CurOffset != ULLONG_MAX && "trying to dereference an invalid iterator"); IntegerLiteral *N = EExpr->FakeChildNode; - StringRef DataRef = EExpr->Data->BinaryData->getBytes(); N->setValue(*EExpr->Ctx, - llvm::APInt(N->getValue().getBitWidth(), DataRef[CurOffset], + llvm::APInt(N->getValue().getBitWidth(), + EExpr->Data->BinaryData->getCodeUnit(CurOffset), N->getType()->isSignedIntegerType())); // We want to return a reference to the fake child node in the // EmbedExpr, not the local variable N. 
diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp index 901ebf9592680..9d5b4a60c9fe7 100644 --- a/clang/lib/AST/Expr.cpp +++ b/clang/lib/AST/Expr.cpp @@ -1104,6 +1104,7 @@ unsigned StringLiteral::mapCharByteWidth(TargetInfo const &Target, switch (SK) { case StringLiteralKind::Ordinary: case StringLiteralKind::UTF8: + case StringLiteralKind::Binary: CharByteWidth = Target.getCharWidth(); break; case StringLiteralKind::Wide: @@ -1216,6 +1217,7 @@ void StringLiteral::outputString(raw_ostream &OS) const { switch (getKind()) { case StringLiteralKind::Unevaluated: case StringLiteralKind::Ordinary: + case StringLiteralKind::Binary: break; // no prefix. case StringLiteralKind::Wide: OS << 'L'; @@ -1332,6 +1334,11 @@ StringLiteral::getLocationOfByte(unsigned ByteNo, const SourceManager &SM, const LangOptions &Features, const TargetInfo &Target, unsigned *StartToken, unsigned *StartTokenByteOffset) const { + // No source location of bytes for binary literals since they don't come from + // source. 
+ if (getKind() == StringLiteralKind::Binary) + return getStrTokenLoc(0); + assert((getKind() == StringLiteralKind::Ordinary || getKind() == StringLiteralKind::UTF8 || getKind() == StringLiteralKind::Unevaluated) && diff --git a/clang/lib/Parse/ParseInit.cpp b/clang/lib/Parse/ParseInit.cpp index 63b1d7bd9db53..471b3eaf28287 100644 --- a/clang/lib/Parse/ParseInit.cpp +++ b/clang/lib/Parse/ParseInit.cpp @@ -445,7 +445,7 @@ ExprResult Parser::createEmbedExpr() { Context.MakeIntValue(Str.size(), Context.getSizeType()); QualType ArrayTy = Context.getConstantArrayType( Ty, ArraySize, nullptr, ArraySizeModifier::Normal, 0); - return StringLiteral::Create(Context, Str, StringLiteralKind::Ordinary, + return StringLiteral::Create(Context, Str, StringLiteralKind::Binary, false, ArrayTy, StartLoc); }; diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index 56ec33fe37bf3..cea121d576c5c 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -106,6 +106,7 @@ static StringInitFailureKind IsStringInit(Expr *Init, const ArrayType *AT, return SIF_None; [[fallthrough]]; case StringLiteralKind::Ordinary: + case StringLiteralKind::Binary: // char array can be initialized with a narrow string. 
// Only allow char x[] = "foo"; not char x[] = L"foo"; if (ElemTy->isCharType()) diff --git a/clang/test/Preprocessor/embed_constexpr.c b/clang/test/Preprocessor/embed_constexpr.c new file mode 100644 index 0000000000000..e444dfec158b5 --- /dev/null +++ b/clang/test/Preprocessor/embed_constexpr.c @@ -0,0 +1,21 @@ +// RUN: %clang_cc1 %s -fsyntax-only --embed-dir=%S/Inputs -verify -std=c23 + +static constexpr unsigned char data[] = { +#embed "big_char.txt" +}; + +static constexpr char data1[] = { +#embed "big_char.txt" // expected-error {{constexpr initializer evaluates to 255 which is not exactly representable in type 'const char'}} +}; + +static constexpr int data2[] = { +#embed "big_char.txt" +}; + +static constexpr unsigned data3[] = { +#embed "big_char.txt" suffix(, -1) // expected-error {{constexpr initializer evaluates to -1 which is not exactly representable in type 'const unsigned int'}} +}; + +static constexpr int data4[] = { +#embed "big_char.txt" suffix(, -1) +}; _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits