GitHub user https://github.com/andykaylor updated pull request https://github.com/llvm/llvm-project/pull/171541 with the following patch series:
>From 13d288e89c4505e46a6ca3b1767c70e65fc8307c Mon Sep 17 00:00:00 2001 From: Adam Smith <[email protected]> Date: Tue, 9 Dec 2025 16:07:08 -0800 Subject: [PATCH 1/4] [CIR] Support wide string literals in CIR codegen Implement support for wide string literals (wchar_t, char16_t, char32_t) in getConstantArrayFromStringLiteral. This migrates the feature from the incubator to upstream. The implementation handles wide character string literals by: - Collecting code units using getCodeUnit() - Creating constant arrays with IntAttr elements - Using ZeroAttr for null-filled strings Add test file wide-string.cpp copied from incubator, expanded with wchar_t test cases. --- clang/lib/CIR/CodeGen/CIRGenModule.cpp | 37 ++++++++++++++++++++++-- clang/test/CIR/CodeGen/wide-string.cpp | 40 ++++++++++++++++++++++++++ 2 files changed, 74 insertions(+), 3 deletions(-) create mode 100644 clang/test/CIR/CodeGen/wide-string.cpp diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp index 41a5d9db83e2b..c0dcd3f55f328 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp @@ -21,7 +21,9 @@ #include "clang/AST/GlobalDecl.h" #include "clang/AST/RecordLayout.h" #include "clang/Basic/SourceManager.h" +#include "clang/CIR/Dialect/IR/CIRAttrs.h" #include "clang/CIR/Dialect/IR/CIRDialect.h" +#include "clang/CIR/Dialect/IR/CIRTypes.h" #include "clang/CIR/Interfaces/CIROpInterfaces.h" #include "clang/CIR/MissingFeatures.h" @@ -31,6 +33,8 @@ #include "mlir/IR/MLIRContext.h" #include "mlir/IR/Verifier.h" +#include <algorithm> + using namespace clang; using namespace clang::CIRGen; @@ -960,9 +964,36 @@ CIRGenModule::getConstantArrayFromStringLiteral(const StringLiteral *e) { return builder.getString(str, eltTy, finalSize); } - errorNYI(e->getSourceRange(), - "getConstantArrayFromStringLiteral: wide characters"); - return mlir::Attribute(); + auto arrayTy = mlir::dyn_cast<cir::ArrayType>(convertType(e->getType())); + 
assert(arrayTy && "string literals must be emitted as an array type"); + + auto arrayEltTy = mlir::dyn_cast<cir::IntType>(arrayTy.getElementType()); + assert(arrayEltTy && + "string literal elements must be emitted as integral type"); + + auto arraySize = arrayTy.getSize(); + auto literalSize = e->getLength(); + + // Collect the code units. + SmallVector<uint32_t, 32> elementValues; + elementValues.reserve(arraySize); + for (unsigned i = 0; i < literalSize; ++i) + elementValues.push_back(e->getCodeUnit(i)); + elementValues.resize(arraySize); + + // If the string is full of null bytes, emit a #cir.zero instead. + if (std::all_of(elementValues.begin(), elementValues.end(), + [](uint32_t x) { return x == 0; })) + return cir::ZeroAttr::get(arrayTy); + + // Otherwise emit a constant array holding the characters. + SmallVector<mlir::Attribute, 32> elements; + elements.reserve(arraySize); + for (uint64_t i = 0; i < arraySize; ++i) + elements.push_back(cir::IntAttr::get(arrayEltTy, elementValues[i])); + + auto elementsAttr = mlir::ArrayAttr::get(&getMLIRContext(), elements); + return builder.getConstArray(elementsAttr, arrayTy); } bool CIRGenModule::supportsCOMDAT() const { diff --git a/clang/test/CIR/CodeGen/wide-string.cpp b/clang/test/CIR/CodeGen/wide-string.cpp new file mode 100644 index 0000000000000..9f145d022a943 --- /dev/null +++ b/clang/test/CIR/CodeGen/wide-string.cpp @@ -0,0 +1,40 @@ +// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir +// RUN: FileCheck --input-file=%t.cir %s + +const char16_t *test_utf16() { + return u"你好世界"; +} + +// CHECK: cir.global "private" constant cir_private dso_local @{{.+}} = #cir.const_array<[#cir.int<20320> : !u16i, #cir.int<22909> : !u16i, #cir.int<19990> : !u16i, #cir.int<30028> : !u16i, #cir.int<0> : !u16i]> : !cir.array<!u16i x 5> + +const char32_t *test_utf32() { + return U"你好世界"; +} + +// CHECK: cir.global "private" constant cir_private dso_local @{{.+}} = 
#cir.const_array<[#cir.int<20320> : !u32i, #cir.int<22909> : !u32i, #cir.int<19990> : !u32i, #cir.int<30028> : !u32i, #cir.int<0> : !u32i]> : !cir.array<!u32i x 5> + +const char16_t *test_zero16() { + return u"\0\0\0\0"; +} + +// CHECK: cir.global "private" constant cir_private dso_local @{{.+}} = #cir.zero : !cir.array<!u16i x 5> + +const char32_t *test_zero32() { + return U"\0\0\0\0"; +} + +// CHECK: cir.global "private" constant cir_private dso_local @{{.+}} = #cir.zero : !cir.array<!u32i x 5> + +#include <stddef.h> + +const wchar_t *test_wchar() { + return L"1234"; +} + +// CHECK: cir.global "private" constant cir_private dso_local @{{.+}} = #cir.const_array<[#cir.int<49> : !s32i, #cir.int<50> : !s32i, #cir.int<51> : !s32i, #cir.int<52> : !s32i, #cir.int<0> : !s32i]> : !cir.array<!s32i x 5> + +const wchar_t *test_wchar_zero() { + return L""; +} + +// CHECK: cir.global "private" constant cir_private dso_local @{{.+}} = #cir.zero : !cir.array<!s32i x 1> >From aa633ba1d1a9ffd253e036dfd8f5bbad7b8a27b1 Mon Sep 17 00:00:00 2001 From: Adam Smith <[email protected]> Date: Thu, 11 Dec 2025 09:38:24 -0800 Subject: [PATCH 2/4] Address reviewer feedback: improve code style and performance - Use mlir::cast<> instead of dyn_cast<> + assert - Use explicit types (uint64_t, unsigned) instead of auto for method returns - Remove unnecessary SmallVector size parameters - Optimize zero-check to happen before building vector (early exit) - Avoid double-looping by building elements directly - Add LLVM and OGCG checks to test file --- clang/lib/CIR/CodeGen/CIRGenModule.cpp | 40 ++++++++++++++------------ clang/test/CIR/CodeGen/wide-string.cpp | 30 ++++++++++++++----- 2 files changed, 44 insertions(+), 26 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp index c0dcd3f55f328..5c19b8b58d0ed 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp @@ -964,33 +964,35 @@ 
CIRGenModule::getConstantArrayFromStringLiteral(const StringLiteral *e) { return builder.getString(str, eltTy, finalSize); } - auto arrayTy = mlir::dyn_cast<cir::ArrayType>(convertType(e->getType())); - assert(arrayTy && "string literals must be emitted as an array type"); + auto arrayTy = mlir::cast<cir::ArrayType>(convertType(e->getType())); - auto arrayEltTy = mlir::dyn_cast<cir::IntType>(arrayTy.getElementType()); - assert(arrayEltTy && - "string literal elements must be emitted as integral type"); + auto arrayEltTy = mlir::cast<cir::IntType>(arrayTy.getElementType()); - auto arraySize = arrayTy.getSize(); - auto literalSize = e->getLength(); + uint64_t arraySize = arrayTy.getSize(); + unsigned literalSize = e->getLength(); - // Collect the code units. - SmallVector<uint32_t, 32> elementValues; - elementValues.reserve(arraySize); - for (unsigned i = 0; i < literalSize; ++i) - elementValues.push_back(e->getCodeUnit(i)); - elementValues.resize(arraySize); + // Check if the string is all null bytes before building the vector. + // In most non-zero cases, this will break out on the first element. + // Padding bytes (if literalSize < arraySize) are implicitly zero. + bool isAllZero = true; + for (unsigned i = 0; i < literalSize; ++i) { + if (e->getCodeUnit(i) != 0) { + isAllZero = false; + break; + } + } - // If the string is full of null bytes, emit a #cir.zero instead. - if (std::all_of(elementValues.begin(), elementValues.end(), - [](uint32_t x) { return x == 0; })) + if (isAllZero) return cir::ZeroAttr::get(arrayTy); // Otherwise emit a constant array holding the characters. - SmallVector<mlir::Attribute, 32> elements; + SmallVector<mlir::Attribute> elements; elements.reserve(arraySize); - for (uint64_t i = 0; i < arraySize; ++i) - elements.push_back(cir::IntAttr::get(arrayEltTy, elementValues[i])); + for (unsigned i = 0; i < literalSize; ++i) + elements.push_back(cir::IntAttr::get(arrayEltTy, e->getCodeUnit(i))); + // Pad with zeros if needed. 
+ for (uint64_t i = literalSize; i < arraySize; ++i) + elements.push_back(cir::IntAttr::get(arrayEltTy, 0)); auto elementsAttr = mlir::ArrayAttr::get(&getMLIRContext(), elements); return builder.getConstArray(elementsAttr, arrayTy); diff --git a/clang/test/CIR/CodeGen/wide-string.cpp b/clang/test/CIR/CodeGen/wide-string.cpp index 9f145d022a943..3ce790f0cff3d 100644 --- a/clang/test/CIR/CodeGen/wide-string.cpp +++ b/clang/test/CIR/CodeGen/wide-string.cpp @@ -1,29 +1,41 @@ // RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir -// RUN: FileCheck --input-file=%t.cir %s +// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s +// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t-cir.ll +// RUN: FileCheck --check-prefix=LLVM --input-file=%t-cir.ll %s +// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t.ll +// RUN: FileCheck --check-prefix=OGCG --input-file=%t.ll %s const char16_t *test_utf16() { return u"你好世界"; } -// CHECK: cir.global "private" constant cir_private dso_local @{{.+}} = #cir.const_array<[#cir.int<20320> : !u16i, #cir.int<22909> : !u16i, #cir.int<19990> : !u16i, #cir.int<30028> : !u16i, #cir.int<0> : !u16i]> : !cir.array<!u16i x 5> +// CIR: cir.global "private" constant cir_private dso_local @{{.+}} = #cir.const_array<[#cir.int<20320> : !u16i, #cir.int<22909> : !u16i, #cir.int<19990> : !u16i, #cir.int<30028> : !u16i, #cir.int<0> : !u16i]> : !cir.array<!u16i x 5> +// LLVM: @{{.+}} = private constant [5 x i16] [i16 20320, i16 22909, i16 19990, i16 30028, i16 0] +// OGCG: @{{.+}} = private unnamed_addr constant [5 x i16] [i16 20320, i16 22909, i16 19990, i16 30028, i16 0] const char32_t *test_utf32() { return U"你好世界"; } -// CHECK: cir.global "private" constant cir_private dso_local @{{.+}} = #cir.const_array<[#cir.int<20320> : !u32i, #cir.int<22909> : !u32i, #cir.int<19990> : !u32i, #cir.int<30028> : !u32i, #cir.int<0> : !u32i]> : 
!cir.array<!u32i x 5> +// CIR: cir.global "private" constant cir_private dso_local @{{.+}} = #cir.const_array<[#cir.int<20320> : !u32i, #cir.int<22909> : !u32i, #cir.int<19990> : !u32i, #cir.int<30028> : !u32i, #cir.int<0> : !u32i]> : !cir.array<!u32i x 5> +// LLVM: @{{.+}} = private constant [5 x i32] [i32 20320, i32 22909, i32 19990, i32 30028, i32 0] +// OGCG: @{{.+}} = private unnamed_addr constant [5 x i32] [i32 20320, i32 22909, i32 19990, i32 30028, i32 0] const char16_t *test_zero16() { return u"\0\0\0\0"; } -// CHECK: cir.global "private" constant cir_private dso_local @{{.+}} = #cir.zero : !cir.array<!u16i x 5> +// CIR: cir.global "private" constant cir_private dso_local @{{.+}} = #cir.zero : !cir.array<!u16i x 5> +// LLVM: @{{.+}} = private constant [5 x i16] zeroinitializer +// OGCG: @{{.+}} = private unnamed_addr constant [5 x i16] zeroinitializer const char32_t *test_zero32() { return U"\0\0\0\0"; } -// CHECK: cir.global "private" constant cir_private dso_local @{{.+}} = #cir.zero : !cir.array<!u32i x 5> +// CIR: cir.global "private" constant cir_private dso_local @{{.+}} = #cir.zero : !cir.array<!u32i x 5> +// LLVM: @{{.+}} = private constant [5 x i32] zeroinitializer +// OGCG: @{{.+}} = private unnamed_addr constant [5 x i32] zeroinitializer #include <stddef.h> @@ -31,10 +43,14 @@ const wchar_t *test_wchar() { return L"1234"; } -// CHECK: cir.global "private" constant cir_private dso_local @{{.+}} = #cir.const_array<[#cir.int<49> : !s32i, #cir.int<50> : !s32i, #cir.int<51> : !s32i, #cir.int<52> : !s32i, #cir.int<0> : !s32i]> : !cir.array<!s32i x 5> +// CIR: cir.global "private" constant cir_private dso_local @{{.+}} = #cir.const_array<[#cir.int<49> : !s32i, #cir.int<50> : !s32i, #cir.int<51> : !s32i, #cir.int<52> : !s32i, #cir.int<0> : !s32i]> : !cir.array<!s32i x 5> +// LLVM: @{{.+}} = private constant [5 x i32] [i32 49, i32 50, i32 51, i32 52, i32 0] +// OGCG: @{{.+}} = private unnamed_addr constant [5 x i32] [i32 49, i32 50, i32 51, i32 52, i32 
0] const wchar_t *test_wchar_zero() { return L""; } -// CHECK: cir.global "private" constant cir_private dso_local @{{.+}} = #cir.zero : !cir.array<!s32i x 1> +// CIR: cir.global "private" constant cir_private dso_local @{{.+}} = #cir.zero : !cir.array<!s32i x 1> +// LLVM: @{{.+}} = private constant [1 x i32] zeroinitializer +// OGCG: @{{.+}} = private unnamed_addr constant [1 x i32] zeroinitializer >From a28651485a5de361496abaee73f5b9e9b2c02f9c Mon Sep 17 00:00:00 2001 From: Adam Smith <[email protected]> Date: Thu, 11 Dec 2025 14:56:25 -0800 Subject: [PATCH 3/4] [CIR] Address reviewer feedback: assert equivalence and improve tests - Replace padding logic with assert that arraySize == literalSize - Improve assert message clarity - Replace #include <stddef.h> with typedef __WCHAR_TYPE__ wchar_t - Add test case using typedef __CHAR16_TYPE__ char16_t - Add explanatory comments for test organization Addresses reviewer comments on PR #171541. --- clang/lib/CIR/CodeGen/CIRGenModule.cpp | 6 ++---- clang/test/CIR/CodeGen/wide-string.cpp | 14 +++++++++++++- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp index 5c19b8b58d0ed..4b403b0bde220 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp @@ -970,10 +970,11 @@ CIRGenModule::getConstantArrayFromStringLiteral(const StringLiteral *e) { uint64_t arraySize = arrayTy.getSize(); unsigned literalSize = e->getLength(); + assert(arraySize == literalSize && + "wide string literal length must match array type size"); // Check if the string is all null bytes before building the vector. // In most non-zero cases, this will break out on the first element. - // Padding bytes (if literalSize < arraySize) are implicitly zero. 
bool isAllZero = true; for (unsigned i = 0; i < literalSize; ++i) { if (e->getCodeUnit(i) != 0) { @@ -990,9 +991,6 @@ CIRGenModule::getConstantArrayFromStringLiteral(const StringLiteral *e) { elements.reserve(arraySize); for (unsigned i = 0; i < literalSize; ++i) elements.push_back(cir::IntAttr::get(arrayEltTy, e->getCodeUnit(i))); - // Pad with zeros if needed. - for (uint64_t i = literalSize; i < arraySize; ++i) - elements.push_back(cir::IntAttr::get(arrayEltTy, 0)); auto elementsAttr = mlir::ArrayAttr::get(&getMLIRContext(), elements); return builder.getConstArray(elementsAttr, arrayTy); diff --git a/clang/test/CIR/CodeGen/wide-string.cpp b/clang/test/CIR/CodeGen/wide-string.cpp index 3ce790f0cff3d..7e007edf9f921 100644 --- a/clang/test/CIR/CodeGen/wide-string.cpp +++ b/clang/test/CIR/CodeGen/wide-string.cpp @@ -5,6 +5,7 @@ // RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t.ll // RUN: FileCheck --check-prefix=OGCG --input-file=%t.ll %s +// Test with built-in char16_t type const char16_t *test_utf16() { return u"你好世界"; } @@ -37,7 +38,7 @@ const char32_t *test_zero32() { // LLVM: @{{.+}} = private constant [5 x i32] zeroinitializer // OGCG: @{{.+}} = private unnamed_addr constant [5 x i32] zeroinitializer -#include <stddef.h> +typedef __WCHAR_TYPE__ wchar_t; const wchar_t *test_wchar() { return L"1234"; @@ -54,3 +55,14 @@ const wchar_t *test_wchar_zero() { // CIR: cir.global "private" constant cir_private dso_local @{{.+}} = #cir.zero : !cir.array<!s32i x 1> // LLVM: @{{.+}} = private constant [1 x i32] zeroinitializer // OGCG: @{{.+}} = private unnamed_addr constant [1 x i32] zeroinitializer + +// Test with typedef'd char16_t to ensure typedef path works correctly +typedef __CHAR16_TYPE__ char16_t; + +const char16_t *test_char16_typedef() { + return u"test"; +} + +// CIR: cir.global "private" constant cir_private dso_local @{{.+}} = #cir.const_array<[#cir.int<116> : !u16i, #cir.int<101> : !u16i, #cir.int<115> : !u16i, #cir.int<116> 
: !u16i, #cir.int<0> : !u16i]> : !cir.array<!u16i x 5> +// LLVM: @{{.+}} = private constant [5 x i16] [i16 116, i16 101, i16 115, i16 116, i16 0] +// OGCG: @{{.+}} = private unnamed_addr constant [5 x i16] [i16 116, i16 101, i16 115, i16 116, i16 0] >From 4cb2b4c83204723dc6faffcae82f8f62ca875484 Mon Sep 17 00:00:00 2001 From: Adam Smith <[email protected]> Date: Fri, 12 Dec 2025 10:13:50 -0800 Subject: [PATCH 4/4] [CIR] Fix wide string literal assertion: account for null terminator - Change assert to arraySize == literalSize + 1 to account for null terminator - Explicitly add null terminator to elements array - Remove typedef statements that conflict with C++17 built-in types Fixes CI assertion failure in PR #171541. --- clang/lib/CIR/CodeGen/CIRGenModule.cpp | 7 +++++-- clang/test/CIR/CodeGen/wide-string.cpp | 5 ----- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp index 4b403b0bde220..8050f92753a2f 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp @@ -970,8 +970,9 @@ CIRGenModule::getConstantArrayFromStringLiteral(const StringLiteral *e) { uint64_t arraySize = arrayTy.getSize(); unsigned literalSize = e->getLength(); - assert(arraySize == literalSize && - "wide string literal length must match array type size"); + assert(arraySize == literalSize + 1 && + "wide string literal array size must be literal length plus null " + "terminator"); // Check if the string is all null bytes before building the vector. // In most non-zero cases, this will break out on the first element. 
@@ -991,6 +992,8 @@ CIRGenModule::getConstantArrayFromStringLiteral(const StringLiteral *e) { elements.reserve(arraySize); for (unsigned i = 0; i < literalSize; ++i) elements.push_back(cir::IntAttr::get(arrayEltTy, e->getCodeUnit(i))); + // Add null terminator + elements.push_back(cir::IntAttr::get(arrayEltTy, 0)); auto elementsAttr = mlir::ArrayAttr::get(&getMLIRContext(), elements); return builder.getConstArray(elementsAttr, arrayTy); diff --git a/clang/test/CIR/CodeGen/wide-string.cpp b/clang/test/CIR/CodeGen/wide-string.cpp index 7e007edf9f921..6d6ed1458b952 100644 --- a/clang/test/CIR/CodeGen/wide-string.cpp +++ b/clang/test/CIR/CodeGen/wide-string.cpp @@ -38,8 +38,6 @@ const char32_t *test_zero32() { // LLVM: @{{.+}} = private constant [5 x i32] zeroinitializer // OGCG: @{{.+}} = private unnamed_addr constant [5 x i32] zeroinitializer -typedef __WCHAR_TYPE__ wchar_t; - const wchar_t *test_wchar() { return L"1234"; } @@ -56,9 +54,6 @@ const wchar_t *test_wchar_zero() { // LLVM: @{{.+}} = private constant [1 x i32] zeroinitializer // OGCG: @{{.+}} = private unnamed_addr constant [1 x i32] zeroinitializer -// Test with typedef'd char16_t to ensure typedef path works correctly -typedef __CHAR16_TYPE__ char16_t; - const char16_t *test_char16_typedef() { return u"test"; } _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
